uaritm committed on Oct 8, 2025

Commit

865a603

verified ·

1 Parent(s): 1da3322

Upload folder using huggingface_hub

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

lora_checkpoints/README.md +209 -0
lora_checkpoints/adapter_config.json +42 -0
lora_checkpoints/adapter_model.safetensors +3 -0
lora_checkpoints/added_tokens.json +3 -0
lora_checkpoints/chat_template.jinja +47 -0
lora_checkpoints/checkpoint-4400/README.md +209 -0
lora_checkpoints/checkpoint-4400/adapter_config.json +42 -0
lora_checkpoints/checkpoint-4400/adapter_model.safetensors +3 -0
lora_checkpoints/checkpoint-4400/added_tokens.json +3 -0
lora_checkpoints/checkpoint-4400/chat_template.jinja +47 -0
lora_checkpoints/checkpoint-4400/optimizer.pt +3 -0
lora_checkpoints/checkpoint-4400/rng_state.pth +3 -0
lora_checkpoints/checkpoint-4400/scaler.pt +3 -0
lora_checkpoints/checkpoint-4400/scheduler.pt +3 -0
lora_checkpoints/checkpoint-4400/special_tokens_map.json +33 -0
lora_checkpoints/checkpoint-4400/tokenizer.model +3 -0
lora_checkpoints/checkpoint-4400/tokenizer_config.json +0 -0
lora_checkpoints/checkpoint-4400/trainer_state.json +3114 -0
lora_checkpoints/checkpoint-4400/training_args.bin +3 -0
lora_checkpoints/checkpoint-4600/README.md +209 -0
lora_checkpoints/checkpoint-4600/adapter_config.json +42 -0
lora_checkpoints/checkpoint-4600/adapter_model.safetensors +3 -0
lora_checkpoints/checkpoint-4600/added_tokens.json +3 -0
lora_checkpoints/checkpoint-4600/chat_template.jinja +47 -0
lora_checkpoints/checkpoint-4600/optimizer.pt +3 -0
lora_checkpoints/checkpoint-4600/rng_state.pth +3 -0
lora_checkpoints/checkpoint-4600/scaler.pt +3 -0
lora_checkpoints/checkpoint-4600/scheduler.pt +3 -0
lora_checkpoints/checkpoint-4600/special_tokens_map.json +33 -0
lora_checkpoints/checkpoint-4600/tokenizer.model +3 -0
lora_checkpoints/checkpoint-4600/tokenizer_config.json +0 -0
lora_checkpoints/checkpoint-4600/trainer_state.json +3254 -0
lora_checkpoints/checkpoint-4600/training_args.bin +3 -0
lora_checkpoints/checkpoint-4800/README.md +209 -0
lora_checkpoints/checkpoint-4800/adapter_config.json +42 -0
lora_checkpoints/checkpoint-4800/adapter_model.safetensors +3 -0
lora_checkpoints/checkpoint-4800/added_tokens.json +3 -0
lora_checkpoints/checkpoint-4800/chat_template.jinja +47 -0
lora_checkpoints/checkpoint-4800/optimizer.pt +3 -0
lora_checkpoints/checkpoint-4800/rng_state.pth +3 -0
lora_checkpoints/checkpoint-4800/scaler.pt +3 -0
lora_checkpoints/checkpoint-4800/scheduler.pt +3 -0
lora_checkpoints/checkpoint-4800/special_tokens_map.json +33 -0
lora_checkpoints/checkpoint-4800/tokenizer.model +3 -0
lora_checkpoints/checkpoint-4800/tokenizer_config.json +0 -0
lora_checkpoints/checkpoint-4800/trainer_state.json +3394 -0
lora_checkpoints/checkpoint-4800/training_args.bin +3 -0
lora_checkpoints/special_tokens_map.json +33 -0
lora_checkpoints/tokenizer.model +3 -0
lora_checkpoints/tokenizer_config.json +0 -0

lora_checkpoints/README.md ADDED Viewed

	@@ -0,0 +1,209 @@

+---
+base_model: uaritm/gemma3_1b_med_qa_ru
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:uaritm/gemma3_1b_med_qa_ru
+- lora
+- sft
+- transformers
+- trl
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.17.1

lora_checkpoints/adapter_config.json ADDED Viewed

	@@ -0,0 +1,42 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "uaritm/gemma3_1b_med_qa_ru",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "k_proj",
+    "o_proj",
+    "q_proj",
+    "gate_proj",
+    "down_proj",
+    "up_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

lora_checkpoints/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b9b4ff0908196780a01d5777bf7ee02d871b7bb99dcb4c70e6b808d975af1aad
+size 52231312

lora_checkpoints/added_tokens.json ADDED Viewed

	@@ -0,0 +1,3 @@

+{
+  "<image_soft_token>": 262144
+}

lora_checkpoints/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,47 @@

+{{ bos_token }}
+{%- if messages[0]['role'] == 'system' -%}
+    {%- if messages[0]['content'] is string -%}
+        {%- set first_user_prefix = messages[0]['content'] + '
+' -%}
+    {%- else -%}
+        {%- set first_user_prefix = messages[0]['content'][0]['text'] + '
+' -%}
+    {%- endif -%}
+    {%- set loop_messages = messages[1:] -%}
+{%- else -%}
+    {%- set first_user_prefix = "" -%}
+    {%- set loop_messages = messages -%}
+{%- endif -%}
+{%- for message in loop_messages -%}
+    {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
+        {{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }}
+    {%- endif -%}
+    {%- if (message['role'] == 'assistant') -%}
+        {%- set role = "model" -%}
+    {%- else -%}
+        {%- set role = message['role'] -%}
+    {%- endif -%}
+    {{ '<start_of_turn>' + role + '
+' + (first_user_prefix if loop.first else "") }}
+    {%- if message['content'] is string -%}
+        {{ message['content'] | trim }}
+    {%- elif message['content'] is iterable -%}
+        {%- for item in message['content'] -%}
+            {%- if item['type'] == 'image' -%}
+                {{ '<start_of_image>' }}
+            {%- elif item['type'] == 'text' -%}
+                {{ item['text'] | trim }}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- else -%}
+        {{ raise_exception("Invalid content type") }}
+    {%- endif -%}
+    {{ '<end_of_turn>
+' }}
+{%- endfor -%}
+{%- if add_generation_prompt -%}
+    {{'<start_of_turn>model
+'}}
+{%- endif -%}

lora_checkpoints/checkpoint-4400/README.md ADDED Viewed

	@@ -0,0 +1,209 @@

+---
+base_model: uaritm/gemma3_1b_med_qa_ru
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:uaritm/gemma3_1b_med_qa_ru
+- lora
+- sft
+- transformers
+- trl
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.17.1

lora_checkpoints/checkpoint-4400/adapter_config.json ADDED Viewed

	@@ -0,0 +1,42 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "uaritm/gemma3_1b_med_qa_ru",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "k_proj",
+    "o_proj",
+    "q_proj",
+    "gate_proj",
+    "down_proj",
+    "up_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

lora_checkpoints/checkpoint-4400/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9050b82063245ef7ac87dfa8d36e9f9bcbf4db241b1ac2d995a8b3122f0c1bab
+size 52231312

lora_checkpoints/checkpoint-4400/added_tokens.json ADDED Viewed

	@@ -0,0 +1,3 @@

+{
+  "<image_soft_token>": 262144
+}

lora_checkpoints/checkpoint-4400/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,47 @@

+{{ bos_token }}
+{%- if messages[0]['role'] == 'system' -%}
+    {%- if messages[0]['content'] is string -%}
+        {%- set first_user_prefix = messages[0]['content'] + '
+' -%}
+    {%- else -%}
+        {%- set first_user_prefix = messages[0]['content'][0]['text'] + '
+' -%}
+    {%- endif -%}
+    {%- set loop_messages = messages[1:] -%}
+{%- else -%}
+    {%- set first_user_prefix = "" -%}
+    {%- set loop_messages = messages -%}
+{%- endif -%}
+{%- for message in loop_messages -%}
+    {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
+        {{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }}
+    {%- endif -%}
+    {%- if (message['role'] == 'assistant') -%}
+        {%- set role = "model" -%}
+    {%- else -%}
+        {%- set role = message['role'] -%}
+    {%- endif -%}
+    {{ '<start_of_turn>' + role + '
+' + (first_user_prefix if loop.first else "") }}
+    {%- if message['content'] is string -%}
+        {{ message['content'] | trim }}
+    {%- elif message['content'] is iterable -%}
+        {%- for item in message['content'] -%}
+            {%- if item['type'] == 'image' -%}
+                {{ '<start_of_image>' }}
+            {%- elif item['type'] == 'text' -%}
+                {{ item['text'] | trim }}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- else -%}
+        {{ raise_exception("Invalid content type") }}
+    {%- endif -%}
+    {{ '<end_of_turn>
+' }}
+{%- endfor -%}
+{%- if add_generation_prompt -%}
+    {{'<start_of_turn>model
+'}}
+{%- endif -%}

lora_checkpoints/checkpoint-4400/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6b106776532c3a8d2ca91a43cb4885171a035baa1d90d48ae73b409db5e0e0f5
+size 104671958

lora_checkpoints/checkpoint-4400/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3d810a67992b58b715a7d9b25a5acdf9e0b832de9c82ec2c6816584522da09bb
+size 14244

lora_checkpoints/checkpoint-4400/scaler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3f52741266f98091dd23c9f0b9999f8607a62de5e710d64dca525fe9ba02fe51
+size 988

lora_checkpoints/checkpoint-4400/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8fb657393d96492d81b7388ad08bb8c9c79899a2d54086f176059c5daff895c7
+size 1064

lora_checkpoints/checkpoint-4400/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "boi_token": "<start_of_image>",
+  "bos_token": {
+    "content": "<bos>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eoi_token": "<end_of_image>",
+  "eos_token": {
+    "content": "<eos>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "image_token": "<image_soft_token>",
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

lora_checkpoints/checkpoint-4400/tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1299c11d7cf632ef3b4e11937501358ada021bbdf7c47638d13c0ee982f2e79c
+size 4689074

lora_checkpoints/checkpoint-4400/tokenizer_config.json ADDED Viewed

The diff for this file is too large to render. See raw diff

lora_checkpoints/checkpoint-4400/trainer_state.json ADDED Viewed

	@@ -0,0 +1,3114 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9024484040507628,
+  "eval_steps": 500,
+  "global_step": 4400,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.00205101910011537,
+      "grad_norm": 1.9277215003967285,
+      "learning_rate": 3.6885245901639347e-06,
+      "loss": 1.4306,
+      "step": 10
+    },
+    {
+      "epoch": 0.00410203820023074,
+      "grad_norm": 0.3513035476207733,
+      "learning_rate": 7.78688524590164e-06,
+      "loss": 1.3524,
+      "step": 20
+    },
+    {
+      "epoch": 0.006153057300346109,
+      "grad_norm": 0.3364648222923279,
+      "learning_rate": 1.1885245901639344e-05,
+      "loss": 1.3188,
+      "step": 30
+    },
+    {
+      "epoch": 0.00820407640046148,
+      "grad_norm": 0.3382512927055359,
+      "learning_rate": 1.598360655737705e-05,
+      "loss": 1.3418,
+      "step": 40
+    },
+    {
+      "epoch": 0.01025509550057685,
+      "grad_norm": 0.360334575176239,
+      "learning_rate": 2.0081967213114755e-05,
+      "loss": 1.3381,
+      "step": 50
+    },
+    {
+      "epoch": 0.012306114600692218,
+      "grad_norm": 0.3408481180667877,
+      "learning_rate": 2.418032786885246e-05,
+      "loss": 1.3365,
+      "step": 60
+    },
+    {
+      "epoch": 0.014357133700807588,
+      "grad_norm": 0.36211535334587097,
+      "learning_rate": 2.8278688524590162e-05,
+      "loss": 1.3314,
+      "step": 70
+    },
+    {
+      "epoch": 0.01640815280092296,
+      "grad_norm": 0.38704580068588257,
+      "learning_rate": 3.237704918032787e-05,
+      "loss": 1.3108,
+      "step": 80
+    },
+    {
+      "epoch": 0.018459171901038327,
+      "grad_norm": 0.44303640723228455,
+      "learning_rate": 3.6475409836065576e-05,
+      "loss": 1.3073,
+      "step": 90
+    },
+    {
+      "epoch": 0.0205101910011537,
+      "grad_norm": 0.4073602557182312,
+      "learning_rate": 4.057377049180328e-05,
+      "loss": 1.2993,
+      "step": 100
+    },
+    {
+      "epoch": 0.022561210101269068,
+      "grad_norm": 0.4478100538253784,
+      "learning_rate": 4.467213114754098e-05,
+      "loss": 1.3413,
+      "step": 110
+    },
+    {
+      "epoch": 0.024612229201384436,
+      "grad_norm": 0.39146170020103455,
+      "learning_rate": 4.8770491803278687e-05,
+      "loss": 1.3168,
+      "step": 120
+    },
+    {
+      "epoch": 0.026663248301499808,
+      "grad_norm": 0.3786431849002838,
+      "learning_rate": 5.28688524590164e-05,
+      "loss": 1.2774,
+      "step": 130
+    },
+    {
+      "epoch": 0.028714267401615177,
+      "grad_norm": 0.4014948904514313,
+      "learning_rate": 5.69672131147541e-05,
+      "loss": 1.346,
+      "step": 140
+    },
+    {
+      "epoch": 0.03076528650173055,
+      "grad_norm": 0.3987842798233032,
+      "learning_rate": 6.10655737704918e-05,
+      "loss": 1.2816,
+      "step": 150
+    },
+    {
+      "epoch": 0.03281630560184592,
+      "grad_norm": 0.3897082507610321,
+      "learning_rate": 6.516393442622951e-05,
+      "loss": 1.3485,
+      "step": 160
+    },
+    {
+      "epoch": 0.034867324701961286,
+      "grad_norm": 0.373279333114624,
+      "learning_rate": 6.926229508196722e-05,
+      "loss": 1.3185,
+      "step": 170
+    },
+    {
+      "epoch": 0.036918343802076654,
+      "grad_norm": 0.3812575340270996,
+      "learning_rate": 7.336065573770491e-05,
+      "loss": 1.3394,
+      "step": 180
+    },
+    {
+      "epoch": 0.03896936290219203,
+      "grad_norm": 0.35926997661590576,
+      "learning_rate": 7.745901639344263e-05,
+      "loss": 1.2821,
+      "step": 190
+    },
+    {
+      "epoch": 0.0410203820023074,
+      "grad_norm": 0.3649434745311737,
+      "learning_rate": 8.155737704918032e-05,
+      "loss": 1.33,
+      "step": 200
+    },
+    {
+      "epoch": 0.04307140110242277,
+      "grad_norm": 0.345662921667099,
+      "learning_rate": 8.565573770491803e-05,
+      "loss": 1.3107,
+      "step": 210
+    },
+    {
+      "epoch": 0.045122420202538135,
+      "grad_norm": 0.37169769406318665,
+      "learning_rate": 8.975409836065574e-05,
+      "loss": 1.309,
+      "step": 220
+    },
+    {
+      "epoch": 0.047173439302653504,
+      "grad_norm": 0.37920281291007996,
+      "learning_rate": 9.385245901639344e-05,
+      "loss": 1.3352,
+      "step": 230
+    },
+    {
+      "epoch": 0.04922445840276887,
+      "grad_norm": 0.35772770643234253,
+      "learning_rate": 9.795081967213115e-05,
+      "loss": 1.2402,
+      "step": 240
+    },
+    {
+      "epoch": 0.05127547750288425,
+      "grad_norm": 0.38790181279182434,
+      "learning_rate": 9.989205526770294e-05,
+      "loss": 1.326,
+      "step": 250
+    },
+    {
+      "epoch": 0.053326496602999617,
+      "grad_norm": 0.3545536696910858,
+      "learning_rate": 9.967616580310882e-05,
+      "loss": 1.3173,
+      "step": 260
+    },
+    {
+      "epoch": 0.055377515703114985,
+      "grad_norm": 0.3845142722129822,
+      "learning_rate": 9.946027633851469e-05,
+      "loss": 1.2949,
+      "step": 270
+    },
+    {
+      "epoch": 0.057428534803230354,
+      "grad_norm": 0.38621339201927185,
+      "learning_rate": 9.924438687392055e-05,
+      "loss": 1.2773,
+      "step": 280
+    },
+    {
+      "epoch": 0.05947955390334572,
+      "grad_norm": 0.38091301918029785,
+      "learning_rate": 9.902849740932643e-05,
+      "loss": 1.3282,
+      "step": 290
+    },
+    {
+      "epoch": 0.0615305730034611,
+      "grad_norm": 0.37546730041503906,
+      "learning_rate": 9.88126079447323e-05,
+      "loss": 1.2862,
+      "step": 300
+    },
+    {
+      "epoch": 0.06358159210357646,
+      "grad_norm": 0.3515011966228485,
+      "learning_rate": 9.859671848013817e-05,
+      "loss": 1.2937,
+      "step": 310
+    },
+    {
+      "epoch": 0.06563261120369183,
+      "grad_norm": 0.3863738775253296,
+      "learning_rate": 9.838082901554406e-05,
+      "loss": 1.3056,
+      "step": 320
+    },
+    {
+      "epoch": 0.06768363030380721,
+      "grad_norm": 0.36615240573883057,
+      "learning_rate": 9.816493955094992e-05,
+      "loss": 1.3062,
+      "step": 330
+    },
+    {
+      "epoch": 0.06973464940392257,
+      "grad_norm": 0.37741243839263916,
+      "learning_rate": 9.794905008635579e-05,
+      "loss": 1.3094,
+      "step": 340
+    },
+    {
+      "epoch": 0.07178566850403795,
+      "grad_norm": 0.38626739382743835,
+      "learning_rate": 9.773316062176167e-05,
+      "loss": 1.2947,
+      "step": 350
+    },
+    {
+      "epoch": 0.07383668760415331,
+      "grad_norm": 0.38667401671409607,
+      "learning_rate": 9.751727115716753e-05,
+      "loss": 1.2976,
+      "step": 360
+    },
+    {
+      "epoch": 0.07588770670426868,
+      "grad_norm": 0.36084800958633423,
+      "learning_rate": 9.730138169257342e-05,
+      "loss": 1.27,
+      "step": 370
+    },
+    {
+      "epoch": 0.07793872580438406,
+      "grad_norm": 0.3754425346851349,
+      "learning_rate": 9.708549222797928e-05,
+      "loss": 1.3243,
+      "step": 380
+    },
+    {
+      "epoch": 0.07998974490449942,
+      "grad_norm": 0.39857473969459534,
+      "learning_rate": 9.686960276338515e-05,
+      "loss": 1.3077,
+      "step": 390
+    },
+    {
+      "epoch": 0.0820407640046148,
+      "grad_norm": 0.3919648230075836,
+      "learning_rate": 9.665371329879103e-05,
+      "loss": 1.2985,
+      "step": 400
+    },
+    {
+      "epoch": 0.08409178310473016,
+      "grad_norm": 0.3675483465194702,
+      "learning_rate": 9.643782383419689e-05,
+      "loss": 1.2946,
+      "step": 410
+    },
+    {
+      "epoch": 0.08614280220484553,
+      "grad_norm": 0.3898465633392334,
+      "learning_rate": 9.622193436960277e-05,
+      "loss": 1.333,
+      "step": 420
+    },
+    {
+      "epoch": 0.08819382130496091,
+      "grad_norm": 0.3681259751319885,
+      "learning_rate": 9.600604490500864e-05,
+      "loss": 1.2968,
+      "step": 430
+    },
+    {
+      "epoch": 0.09024484040507627,
+      "grad_norm": 0.36453816294670105,
+      "learning_rate": 9.57901554404145e-05,
+      "loss": 1.272,
+      "step": 440
+    },
+    {
+      "epoch": 0.09229585950519165,
+      "grad_norm": 0.34828147292137146,
+      "learning_rate": 9.557426597582039e-05,
+      "loss": 1.3245,
+      "step": 450
+    },
+    {
+      "epoch": 0.09434687860530701,
+      "grad_norm": 0.3570501208305359,
+      "learning_rate": 9.535837651122625e-05,
+      "loss": 1.313,
+      "step": 460
+    },
+    {
+      "epoch": 0.09639789770542238,
+      "grad_norm": 0.36692506074905396,
+      "learning_rate": 9.514248704663213e-05,
+      "loss": 1.2915,
+      "step": 470
+    },
+    {
+      "epoch": 0.09844891680553775,
+      "grad_norm": 0.39161381125450134,
+      "learning_rate": 9.4926597582038e-05,
+      "loss": 1.3101,
+      "step": 480
+    },
+    {
+      "epoch": 0.10049993590565312,
+      "grad_norm": 0.3808858394622803,
+      "learning_rate": 9.471070811744387e-05,
+      "loss": 1.3099,
+      "step": 490
+    },
+    {
+      "epoch": 0.1025509550057685,
+      "grad_norm": 0.3541582524776459,
+      "learning_rate": 9.449481865284975e-05,
+      "loss": 1.2772,
+      "step": 500
+    },
+    {
+      "epoch": 0.10460197410588386,
+      "grad_norm": 0.379190593957901,
+      "learning_rate": 9.427892918825562e-05,
+      "loss": 1.2914,
+      "step": 510
+    },
+    {
+      "epoch": 0.10665299320599923,
+      "grad_norm": 0.37727421522140503,
+      "learning_rate": 9.406303972366149e-05,
+      "loss": 1.2888,
+      "step": 520
+    },
+    {
+      "epoch": 0.1087040123061146,
+      "grad_norm": 0.3787306845188141,
+      "learning_rate": 9.384715025906737e-05,
+      "loss": 1.3049,
+      "step": 530
+    },
+    {
+      "epoch": 0.11075503140622997,
+      "grad_norm": 0.3831459581851959,
+      "learning_rate": 9.363126079447323e-05,
+      "loss": 1.2631,
+      "step": 540
+    },
+    {
+      "epoch": 0.11280605050634535,
+      "grad_norm": 0.37274929881095886,
+      "learning_rate": 9.34153713298791e-05,
+      "loss": 1.3313,
+      "step": 550
+    },
+    {
+      "epoch": 0.11485706960646071,
+      "grad_norm": 0.3683277368545532,
+      "learning_rate": 9.319948186528498e-05,
+      "loss": 1.2528,
+      "step": 560
+    },
+    {
+      "epoch": 0.11690808870657608,
+      "grad_norm": 0.39554840326309204,
+      "learning_rate": 9.298359240069085e-05,
+      "loss": 1.2737,
+      "step": 570
+    },
+    {
+      "epoch": 0.11895910780669144,
+      "grad_norm": 0.39166760444641113,
+      "learning_rate": 9.276770293609673e-05,
+      "loss": 1.271,
+      "step": 580
+    },
+    {
+      "epoch": 0.12101012690680682,
+      "grad_norm": 0.384085476398468,
+      "learning_rate": 9.255181347150259e-05,
+      "loss": 1.2921,
+      "step": 590
+    },
+    {
+      "epoch": 0.1230611460069222,
+      "grad_norm": 0.3704201281070709,
+      "learning_rate": 9.233592400690847e-05,
+      "loss": 1.2776,
+      "step": 600
+    },
+    {
+      "epoch": 0.12511216510703757,
+      "grad_norm": 0.3844301998615265,
+      "learning_rate": 9.212003454231434e-05,
+      "loss": 1.3067,
+      "step": 610
+    },
+    {
+      "epoch": 0.12716318420715292,
+      "grad_norm": 0.3971571922302246,
+      "learning_rate": 9.190414507772022e-05,
+      "loss": 1.2792,
+      "step": 620
+    },
+    {
+      "epoch": 0.1292142033072683,
+      "grad_norm": 0.40666353702545166,
+      "learning_rate": 9.168825561312608e-05,
+      "loss": 1.2964,
+      "step": 630
+    },
+    {
+      "epoch": 0.13126522240738367,
+      "grad_norm": 0.38252532482147217,
+      "learning_rate": 9.147236614853195e-05,
+      "loss": 1.2815,
+      "step": 640
+    },
+    {
+      "epoch": 0.13331624150749904,
+      "grad_norm": 0.37795621156692505,
+      "learning_rate": 9.125647668393783e-05,
+      "loss": 1.283,
+      "step": 650
+    },
+    {
+      "epoch": 0.13536726060761442,
+      "grad_norm": 0.4035683572292328,
+      "learning_rate": 9.10405872193437e-05,
+      "loss": 1.288,
+      "step": 660
+    },
+    {
+      "epoch": 0.13741827970772977,
+      "grad_norm": 0.410669207572937,
+      "learning_rate": 9.082469775474958e-05,
+      "loss": 1.2659,
+      "step": 670
+    },
+    {
+      "epoch": 0.13946929880784514,
+      "grad_norm": 0.3809865713119507,
+      "learning_rate": 9.060880829015544e-05,
+      "loss": 1.3133,
+      "step": 680
+    },
+    {
+      "epoch": 0.14152031790796052,
+      "grad_norm": 0.3748447597026825,
+      "learning_rate": 9.039291882556131e-05,
+      "loss": 1.2643,
+      "step": 690
+    },
+    {
+      "epoch": 0.1435713370080759,
+      "grad_norm": 0.39292991161346436,
+      "learning_rate": 9.017702936096719e-05,
+      "loss": 1.2855,
+      "step": 700
+    },
+    {
+      "epoch": 0.14562235610819127,
+      "grad_norm": 0.4399755001068115,
+      "learning_rate": 8.996113989637307e-05,
+      "loss": 1.286,
+      "step": 710
+    },
+    {
+      "epoch": 0.14767337520830662,
+      "grad_norm": 0.42447429895401,
+      "learning_rate": 8.974525043177894e-05,
+      "loss": 1.2736,
+      "step": 720
+    },
+    {
+      "epoch": 0.149724394308422,
+      "grad_norm": 0.37248438596725464,
+      "learning_rate": 8.95293609671848e-05,
+      "loss": 1.2652,
+      "step": 730
+    },
+    {
+      "epoch": 0.15177541340853737,
+      "grad_norm": 0.39122238755226135,
+      "learning_rate": 8.931347150259068e-05,
+      "loss": 1.2814,
+      "step": 740
+    },
+    {
+      "epoch": 0.15382643250865274,
+      "grad_norm": 0.3697800040245056,
+      "learning_rate": 8.909758203799655e-05,
+      "loss": 1.2462,
+      "step": 750
+    },
+    {
+      "epoch": 0.15587745160876812,
+      "grad_norm": 0.3901929259300232,
+      "learning_rate": 8.888169257340241e-05,
+      "loss": 1.2742,
+      "step": 760
+    },
+    {
+      "epoch": 0.15792847070888347,
+      "grad_norm": 0.3833727538585663,
+      "learning_rate": 8.86658031088083e-05,
+      "loss": 1.3015,
+      "step": 770
+    },
+    {
+      "epoch": 0.15997948980899884,
+      "grad_norm": 0.4028802216053009,
+      "learning_rate": 8.844991364421416e-05,
+      "loss": 1.2631,
+      "step": 780
+    },
+    {
+      "epoch": 0.16203050890911422,
+      "grad_norm": 0.39087918400764465,
+      "learning_rate": 8.823402417962004e-05,
+      "loss": 1.2993,
+      "step": 790
+    },
+    {
+      "epoch": 0.1640815280092296,
+      "grad_norm": 0.39453235268592834,
+      "learning_rate": 8.801813471502591e-05,
+      "loss": 1.2544,
+      "step": 800
+    },
+    {
+      "epoch": 0.16613254710934497,
+      "grad_norm": 0.42142602801322937,
+      "learning_rate": 8.780224525043178e-05,
+      "loss": 1.2676,
+      "step": 810
+    },
+    {
+      "epoch": 0.16818356620946032,
+      "grad_norm": 0.36646899580955505,
+      "learning_rate": 8.758635578583767e-05,
+      "loss": 1.2765,
+      "step": 820
+    },
+    {
+      "epoch": 0.1702345853095757,
+      "grad_norm": 0.4253019094467163,
+      "learning_rate": 8.737046632124353e-05,
+      "loss": 1.3003,
+      "step": 830
+    },
+    {
+      "epoch": 0.17228560440969107,
+      "grad_norm": 0.41490674018859863,
+      "learning_rate": 8.715457685664939e-05,
+      "loss": 1.2731,
+      "step": 840
+    },
+    {
+      "epoch": 0.17433662350980644,
+      "grad_norm": 0.405460387468338,
+      "learning_rate": 8.693868739205528e-05,
+      "loss": 1.2122,
+      "step": 850
+    },
+    {
+      "epoch": 0.17638764260992182,
+      "grad_norm": 0.4028235375881195,
+      "learning_rate": 8.672279792746114e-05,
+      "loss": 1.3238,
+      "step": 860
+    },
+    {
+      "epoch": 0.17843866171003717,
+      "grad_norm": 0.38994792103767395,
+      "learning_rate": 8.650690846286701e-05,
+      "loss": 1.2875,
+      "step": 870
+    },
+    {
+      "epoch": 0.18048968081015254,
+      "grad_norm": 0.4099538326263428,
+      "learning_rate": 8.629101899827289e-05,
+      "loss": 1.2807,
+      "step": 880
+    },
+    {
+      "epoch": 0.18254069991026792,
+      "grad_norm": 0.40470021963119507,
+      "learning_rate": 8.607512953367875e-05,
+      "loss": 1.2802,
+      "step": 890
+    },
+    {
+      "epoch": 0.1845917190103833,
+      "grad_norm": 0.4066854417324066,
+      "learning_rate": 8.585924006908464e-05,
+      "loss": 1.2464,
+      "step": 900
+    },
+    {
+      "epoch": 0.18664273811049864,
+      "grad_norm": 0.38739994168281555,
+      "learning_rate": 8.56433506044905e-05,
+      "loss": 1.2831,
+      "step": 910
+    },
+    {
+      "epoch": 0.18869375721061402,
+      "grad_norm": 0.4257420301437378,
+      "learning_rate": 8.542746113989638e-05,
+      "loss": 1.2679,
+      "step": 920
+    },
+    {
+      "epoch": 0.1907447763107294,
+      "grad_norm": 0.41571488976478577,
+      "learning_rate": 8.521157167530225e-05,
+      "loss": 1.2501,
+      "step": 930
+    },
+    {
+      "epoch": 0.19279579541084477,
+      "grad_norm": 0.4178495407104492,
+      "learning_rate": 8.499568221070811e-05,
+      "loss": 1.2657,
+      "step": 940
+    },
+    {
+      "epoch": 0.19484681451096014,
+      "grad_norm": 0.4083455801010132,
+      "learning_rate": 8.477979274611399e-05,
+      "loss": 1.2781,
+      "step": 950
+    },
+    {
+      "epoch": 0.1968978336110755,
+      "grad_norm": 0.4067554175853729,
+      "learning_rate": 8.456390328151986e-05,
+      "loss": 1.2582,
+      "step": 960
+    },
+    {
+      "epoch": 0.19894885271119087,
+      "grad_norm": 0.4067447781562805,
+      "learning_rate": 8.434801381692574e-05,
+      "loss": 1.2948,
+      "step": 970
+    },
+    {
+      "epoch": 0.20099987181130624,
+      "grad_norm": 0.44283562898635864,
+      "learning_rate": 8.413212435233161e-05,
+      "loss": 1.3011,
+      "step": 980
+    },
+    {
+      "epoch": 0.20305089091142162,
+      "grad_norm": 0.41568294167518616,
+      "learning_rate": 8.391623488773748e-05,
+      "loss": 1.2804,
+      "step": 990
+    },
+    {
+      "epoch": 0.205101910011537,
+      "grad_norm": 0.4183642864227295,
+      "learning_rate": 8.370034542314335e-05,
+      "loss": 1.2228,
+      "step": 1000
+    },
+    {
+      "epoch": 0.20715292911165234,
+      "grad_norm": 0.4311917722225189,
+      "learning_rate": 8.348445595854923e-05,
+      "loss": 1.2714,
+      "step": 1010
+    },
+    {
+      "epoch": 0.20920394821176772,
+      "grad_norm": 0.41575828194618225,
+      "learning_rate": 8.32685664939551e-05,
+      "loss": 1.2783,
+      "step": 1020
+    },
+    {
+      "epoch": 0.2112549673118831,
+      "grad_norm": 0.3958878815174103,
+      "learning_rate": 8.305267702936098e-05,
+      "loss": 1.2558,
+      "step": 1030
+    },
+    {
+      "epoch": 0.21330598641199847,
+      "grad_norm": 0.43759557604789734,
+      "learning_rate": 8.283678756476684e-05,
+      "loss": 1.2557,
+      "step": 1040
+    },
+    {
+      "epoch": 0.21535700551211384,
+      "grad_norm": 0.41460636258125305,
+      "learning_rate": 8.262089810017271e-05,
+      "loss": 1.2851,
+      "step": 1050
+    },
+    {
+      "epoch": 0.2174080246122292,
+      "grad_norm": 0.4114689826965332,
+      "learning_rate": 8.240500863557859e-05,
+      "loss": 1.3076,
+      "step": 1060
+    },
+    {
+      "epoch": 0.21945904371234456,
+      "grad_norm": 0.42222094535827637,
+      "learning_rate": 8.218911917098446e-05,
+      "loss": 1.2263,
+      "step": 1070
+    },
+    {
+      "epoch": 0.22151006281245994,
+      "grad_norm": 0.4098639488220215,
+      "learning_rate": 8.197322970639033e-05,
+      "loss": 1.2779,
+      "step": 1080
+    },
+    {
+      "epoch": 0.22356108191257532,
+      "grad_norm": 0.4205043315887451,
+      "learning_rate": 8.175734024179621e-05,
+      "loss": 1.2177,
+      "step": 1090
+    },
+    {
+      "epoch": 0.2256121010126907,
+      "grad_norm": 0.4501648247241974,
+      "learning_rate": 8.154145077720208e-05,
+      "loss": 1.3227,
+      "step": 1100
+    },
+    {
+      "epoch": 0.22766312011280604,
+      "grad_norm": 0.41510599851608276,
+      "learning_rate": 8.132556131260795e-05,
+      "loss": 1.3177,
+      "step": 1110
+    },
+    {
+      "epoch": 0.22971413921292141,
+      "grad_norm": 0.41567444801330566,
+      "learning_rate": 8.110967184801383e-05,
+      "loss": 1.2506,
+      "step": 1120
+    },
+    {
+      "epoch": 0.2317651583130368,
+      "grad_norm": 0.4262779653072357,
+      "learning_rate": 8.089378238341969e-05,
+      "loss": 1.2506,
+      "step": 1130
+    },
+    {
+      "epoch": 0.23381617741315217,
+      "grad_norm": 0.4220465421676636,
+      "learning_rate": 8.067789291882558e-05,
+      "loss": 1.2514,
+      "step": 1140
+    },
+    {
+      "epoch": 0.23586719651326754,
+      "grad_norm": 0.4169275462627411,
+      "learning_rate": 8.046200345423144e-05,
+      "loss": 1.2693,
+      "step": 1150
+    },
+    {
+      "epoch": 0.2379182156133829,
+      "grad_norm": 0.43145328760147095,
+      "learning_rate": 8.02461139896373e-05,
+      "loss": 1.2394,
+      "step": 1160
+    },
+    {
+      "epoch": 0.23996923471349826,
+      "grad_norm": 0.42889878153800964,
+      "learning_rate": 8.003022452504319e-05,
+      "loss": 1.248,
+      "step": 1170
+    },
+    {
+      "epoch": 0.24202025381361364,
+      "grad_norm": 0.41731464862823486,
+      "learning_rate": 7.981433506044905e-05,
+      "loss": 1.2498,
+      "step": 1180
+    },
+    {
+      "epoch": 0.24407127291372901,
+      "grad_norm": 0.4326362609863281,
+      "learning_rate": 7.959844559585493e-05,
+      "loss": 1.265,
+      "step": 1190
+    },
+    {
+      "epoch": 0.2461222920138444,
+      "grad_norm": 0.4242352843284607,
+      "learning_rate": 7.93825561312608e-05,
+      "loss": 1.2672,
+      "step": 1200
+    },
+    {
+      "epoch": 0.24817331111395974,
+      "grad_norm": 0.4441153407096863,
+      "learning_rate": 7.916666666666666e-05,
+      "loss": 1.2944,
+      "step": 1210
+    },
+    {
+      "epoch": 0.25022433021407514,
+      "grad_norm": 0.40912818908691406,
+      "learning_rate": 7.895077720207255e-05,
+      "loss": 1.2702,
+      "step": 1220
+    },
+    {
+      "epoch": 0.2522753493141905,
+      "grad_norm": 0.44539037346839905,
+      "learning_rate": 7.873488773747841e-05,
+      "loss": 1.2228,
+      "step": 1230
+    },
+    {
+      "epoch": 0.25432636841430584,
+      "grad_norm": 0.4299303889274597,
+      "learning_rate": 7.851899827288429e-05,
+      "loss": 1.2328,
+      "step": 1240
+    },
+    {
+      "epoch": 0.25637738751442124,
+      "grad_norm": 0.4408973455429077,
+      "learning_rate": 7.830310880829016e-05,
+      "loss": 1.2358,
+      "step": 1250
+    },
+    {
+      "epoch": 0.2584284066145366,
+      "grad_norm": 0.4100968837738037,
+      "learning_rate": 7.808721934369602e-05,
+      "loss": 1.2458,
+      "step": 1260
+    },
+    {
+      "epoch": 0.260479425714652,
+      "grad_norm": 0.4401489198207855,
+      "learning_rate": 7.787132987910191e-05,
+      "loss": 1.2593,
+      "step": 1270
+    },
+    {
+      "epoch": 0.26253044481476734,
+      "grad_norm": 0.4514229893684387,
+      "learning_rate": 7.765544041450777e-05,
+      "loss": 1.2632,
+      "step": 1280
+    },
+    {
+      "epoch": 0.2645814639148827,
+      "grad_norm": 0.38684791326522827,
+      "learning_rate": 7.743955094991365e-05,
+      "loss": 1.2424,
+      "step": 1290
+    },
+    {
+      "epoch": 0.2666324830149981,
+      "grad_norm": 0.46148189902305603,
+      "learning_rate": 7.722366148531953e-05,
+      "loss": 1.2445,
+      "step": 1300
+    },
+    {
+      "epoch": 0.26868350211511344,
+      "grad_norm": 0.4319213628768921,
+      "learning_rate": 7.700777202072539e-05,
+      "loss": 1.2253,
+      "step": 1310
+    },
+    {
+      "epoch": 0.27073452121522884,
+      "grad_norm": 0.4195545017719269,
+      "learning_rate": 7.679188255613126e-05,
+      "loss": 1.2578,
+      "step": 1320
+    },
+    {
+      "epoch": 0.2727855403153442,
+      "grad_norm": 0.43690159916877747,
+      "learning_rate": 7.657599309153714e-05,
+      "loss": 1.2573,
+      "step": 1330
+    },
+    {
+      "epoch": 0.27483655941545954,
+      "grad_norm": 0.44571492075920105,
+      "learning_rate": 7.636010362694301e-05,
+      "loss": 1.2607,
+      "step": 1340
+    },
+    {
+      "epoch": 0.27688757851557494,
+      "grad_norm": 0.43295958638191223,
+      "learning_rate": 7.614421416234889e-05,
+      "loss": 1.2278,
+      "step": 1350
+    },
+    {
+      "epoch": 0.2789385976156903,
+      "grad_norm": 0.44495707750320435,
+      "learning_rate": 7.592832469775475e-05,
+      "loss": 1.2798,
+      "step": 1360
+    },
+    {
+      "epoch": 0.2809896167158057,
+      "grad_norm": 0.4412330985069275,
+      "learning_rate": 7.571243523316062e-05,
+      "loss": 1.2501,
+      "step": 1370
+    },
+    {
+      "epoch": 0.28304063581592104,
+      "grad_norm": 0.44599953293800354,
+      "learning_rate": 7.54965457685665e-05,
+      "loss": 1.2396,
+      "step": 1380
+    },
+    {
+      "epoch": 0.2850916549160364,
+      "grad_norm": 0.447109580039978,
+      "learning_rate": 7.528065630397237e-05,
+      "loss": 1.2767,
+      "step": 1390
+    },
+    {
+      "epoch": 0.2871426740161518,
+      "grad_norm": 0.44506722688674927,
+      "learning_rate": 7.506476683937824e-05,
+      "loss": 1.2546,
+      "step": 1400
+    },
+    {
+      "epoch": 0.28919369311626714,
+      "grad_norm": 0.44061776995658875,
+      "learning_rate": 7.484887737478411e-05,
+      "loss": 1.2413,
+      "step": 1410
+    },
+    {
+      "epoch": 0.29124471221638254,
+      "grad_norm": 0.45085111260414124,
+      "learning_rate": 7.463298791018999e-05,
+      "loss": 1.2483,
+      "step": 1420
+    },
+    {
+      "epoch": 0.2932957313164979,
+      "grad_norm": 0.4437837600708008,
+      "learning_rate": 7.441709844559586e-05,
+      "loss": 1.252,
+      "step": 1430
+    },
+    {
+      "epoch": 0.29534675041661324,
+      "grad_norm": 0.4294221103191376,
+      "learning_rate": 7.420120898100174e-05,
+      "loss": 1.2386,
+      "step": 1440
+    },
+    {
+      "epoch": 0.29739776951672864,
+      "grad_norm": 0.4780830144882202,
+      "learning_rate": 7.39853195164076e-05,
+      "loss": 1.2639,
+      "step": 1450
+    },
+    {
+      "epoch": 0.299448788616844,
+      "grad_norm": 0.44152942299842834,
+      "learning_rate": 7.376943005181347e-05,
+      "loss": 1.2756,
+      "step": 1460
+    },
+    {
+      "epoch": 0.3014998077169594,
+      "grad_norm": 0.41989192366600037,
+      "learning_rate": 7.355354058721935e-05,
+      "loss": 1.2614,
+      "step": 1470
+    },
+    {
+      "epoch": 0.30355082681707474,
+      "grad_norm": 0.5871754884719849,
+      "learning_rate": 7.333765112262521e-05,
+      "loss": 1.2615,
+      "step": 1480
+    },
+    {
+      "epoch": 0.3056018459171901,
+      "grad_norm": 0.4467261731624603,
+      "learning_rate": 7.31217616580311e-05,
+      "loss": 1.2624,
+      "step": 1490
+    },
+    {
+      "epoch": 0.3076528650173055,
+      "grad_norm": 0.49219033122062683,
+      "learning_rate": 7.290587219343696e-05,
+      "loss": 1.289,
+      "step": 1500
+    },
+    {
+      "epoch": 0.30970388411742084,
+      "grad_norm": 0.4700734317302704,
+      "learning_rate": 7.268998272884284e-05,
+      "loss": 1.242,
+      "step": 1510
+    },
+    {
+      "epoch": 0.31175490321753624,
+      "grad_norm": 0.4607170820236206,
+      "learning_rate": 7.247409326424871e-05,
+      "loss": 1.2554,
+      "step": 1520
+    },
+    {
+      "epoch": 0.3138059223176516,
+      "grad_norm": 0.4335988759994507,
+      "learning_rate": 7.225820379965457e-05,
+      "loss": 1.2423,
+      "step": 1530
+    },
+    {
+      "epoch": 0.31585694141776693,
+      "grad_norm": 0.4366897940635681,
+      "learning_rate": 7.204231433506046e-05,
+      "loss": 1.2219,
+      "step": 1540
+    },
+    {
+      "epoch": 0.31790796051788234,
+      "grad_norm": 0.45856085419654846,
+      "learning_rate": 7.182642487046632e-05,
+      "loss": 1.2189,
+      "step": 1550
+    },
+    {
+      "epoch": 0.3199589796179977,
+      "grad_norm": 0.4563063085079193,
+      "learning_rate": 7.16105354058722e-05,
+      "loss": 1.2696,
+      "step": 1560
+    },
+    {
+      "epoch": 0.3220099987181131,
+      "grad_norm": 0.4276934862136841,
+      "learning_rate": 7.139464594127807e-05,
+      "loss": 1.2659,
+      "step": 1570
+    },
+    {
+      "epoch": 0.32406101781822844,
+      "grad_norm": 0.46200886368751526,
+      "learning_rate": 7.117875647668394e-05,
+      "loss": 1.2261,
+      "step": 1580
+    },
+    {
+      "epoch": 0.3261120369183438,
+      "grad_norm": 0.4863358736038208,
+      "learning_rate": 7.096286701208982e-05,
+      "loss": 1.2292,
+      "step": 1590
+    },
+    {
+      "epoch": 0.3281630560184592,
+      "grad_norm": 0.4537160098552704,
+      "learning_rate": 7.074697754749569e-05,
+      "loss": 1.2453,
+      "step": 1600
+    },
+    {
+      "epoch": 0.33021407511857453,
+      "grad_norm": 0.4507627487182617,
+      "learning_rate": 7.053108808290155e-05,
+      "loss": 1.2081,
+      "step": 1610
+    },
+    {
+      "epoch": 0.33226509421868994,
+      "grad_norm": 0.43197301030158997,
+      "learning_rate": 7.031519861830744e-05,
+      "loss": 1.2757,
+      "step": 1620
+    },
+    {
+      "epoch": 0.3343161133188053,
+      "grad_norm": 0.4551820456981659,
+      "learning_rate": 7.00993091537133e-05,
+      "loss": 1.2751,
+      "step": 1630
+    },
+    {
+      "epoch": 0.33636713241892063,
+      "grad_norm": 0.45099398493766785,
+      "learning_rate": 6.988341968911917e-05,
+      "loss": 1.2583,
+      "step": 1640
+    },
+    {
+      "epoch": 0.33841815151903604,
+      "grad_norm": 0.46787434816360474,
+      "learning_rate": 6.966753022452505e-05,
+      "loss": 1.2448,
+      "step": 1650
+    },
+    {
+      "epoch": 0.3404691706191514,
+      "grad_norm": 0.45500054955482483,
+      "learning_rate": 6.945164075993091e-05,
+      "loss": 1.2394,
+      "step": 1660
+    },
+    {
+      "epoch": 0.3425201897192668,
+      "grad_norm": 0.4682730436325073,
+      "learning_rate": 6.92357512953368e-05,
+      "loss": 1.2287,
+      "step": 1670
+    },
+    {
+      "epoch": 0.34457120881938214,
+      "grad_norm": 0.4615074396133423,
+      "learning_rate": 6.901986183074266e-05,
+      "loss": 1.2042,
+      "step": 1680
+    },
+    {
+      "epoch": 0.3466222279194975,
+      "grad_norm": 0.4548027217388153,
+      "learning_rate": 6.880397236614854e-05,
+      "loss": 1.2671,
+      "step": 1690
+    },
+    {
+      "epoch": 0.3486732470196129,
+      "grad_norm": 0.4783169627189636,
+      "learning_rate": 6.858808290155441e-05,
+      "loss": 1.2533,
+      "step": 1700
+    },
+    {
+      "epoch": 0.35072426611972823,
+      "grad_norm": 0.46452414989471436,
+      "learning_rate": 6.837219343696027e-05,
+      "loss": 1.2681,
+      "step": 1710
+    },
+    {
+      "epoch": 0.35277528521984364,
+      "grad_norm": 0.4663463532924652,
+      "learning_rate": 6.815630397236615e-05,
+      "loss": 1.2561,
+      "step": 1720
+    },
+    {
+      "epoch": 0.354826304319959,
+      "grad_norm": 0.46744370460510254,
+      "learning_rate": 6.794041450777202e-05,
+      "loss": 1.2453,
+      "step": 1730
+    },
+    {
+      "epoch": 0.35687732342007433,
+      "grad_norm": 0.471835732460022,
+      "learning_rate": 6.77245250431779e-05,
+      "loss": 1.2472,
+      "step": 1740
+    },
+    {
+      "epoch": 0.35892834252018974,
+      "grad_norm": 0.4618450701236725,
+      "learning_rate": 6.750863557858377e-05,
+      "loss": 1.2547,
+      "step": 1750
+    },
+    {
+      "epoch": 0.3609793616203051,
+      "grad_norm": 0.4651658833026886,
+      "learning_rate": 6.729274611398963e-05,
+      "loss": 1.2623,
+      "step": 1760
+    },
+    {
+      "epoch": 0.36303038072042043,
+      "grad_norm": 0.46842116117477417,
+      "learning_rate": 6.707685664939551e-05,
+      "loss": 1.2391,
+      "step": 1770
+    },
+    {
+      "epoch": 0.36508139982053583,
+      "grad_norm": 0.45604613423347473,
+      "learning_rate": 6.686096718480138e-05,
+      "loss": 1.2884,
+      "step": 1780
+    },
+    {
+      "epoch": 0.3671324189206512,
+      "grad_norm": 0.4306802451610565,
+      "learning_rate": 6.664507772020726e-05,
+      "loss": 1.2252,
+      "step": 1790
+    },
+    {
+      "epoch": 0.3691834380207666,
+      "grad_norm": 0.4549136757850647,
+      "learning_rate": 6.642918825561312e-05,
+      "loss": 1.2496,
+      "step": 1800
+    },
+    {
+      "epoch": 0.37123445712088193,
+      "grad_norm": 0.47443437576293945,
+      "learning_rate": 6.6213298791019e-05,
+      "loss": 1.2655,
+      "step": 1810
+    },
+    {
+      "epoch": 0.3732854762209973,
+      "grad_norm": 0.46772050857543945,
+      "learning_rate": 6.599740932642487e-05,
+      "loss": 1.2366,
+      "step": 1820
+    },
+    {
+      "epoch": 0.3753364953211127,
+      "grad_norm": 0.4691794216632843,
+      "learning_rate": 6.578151986183075e-05,
+      "loss": 1.2152,
+      "step": 1830
+    },
+    {
+      "epoch": 0.37738751442122803,
+      "grad_norm": 0.43691304326057434,
+      "learning_rate": 6.556563039723662e-05,
+      "loss": 1.2511,
+      "step": 1840
+    },
+    {
+      "epoch": 0.37943853352134344,
+      "grad_norm": 0.4595348536968231,
+      "learning_rate": 6.534974093264248e-05,
+      "loss": 1.2635,
+      "step": 1850
+    },
+    {
+      "epoch": 0.3814895526214588,
+      "grad_norm": 0.44760558009147644,
+      "learning_rate": 6.513385146804836e-05,
+      "loss": 1.2342,
+      "step": 1860
+    },
+    {
+      "epoch": 0.38354057172157413,
+      "grad_norm": 0.4559841454029083,
+      "learning_rate": 6.491796200345423e-05,
+      "loss": 1.2432,
+      "step": 1870
+    },
+    {
+      "epoch": 0.38559159082168953,
+      "grad_norm": 0.4497215449810028,
+      "learning_rate": 6.470207253886011e-05,
+      "loss": 1.2267,
+      "step": 1880
+    },
+    {
+      "epoch": 0.3876426099218049,
+      "grad_norm": 0.4863613247871399,
+      "learning_rate": 6.448618307426598e-05,
+      "loss": 1.254,
+      "step": 1890
+    },
+    {
+      "epoch": 0.3896936290219203,
+      "grad_norm": 0.4500603675842285,
+      "learning_rate": 6.427029360967185e-05,
+      "loss": 1.2214,
+      "step": 1900
+    },
+    {
+      "epoch": 0.39174464812203563,
+      "grad_norm": 0.4400598704814911,
+      "learning_rate": 6.405440414507774e-05,
+      "loss": 1.2352,
+      "step": 1910
+    },
+    {
+      "epoch": 0.393795667222151,
+      "grad_norm": 0.46070367097854614,
+      "learning_rate": 6.38385146804836e-05,
+      "loss": 1.2468,
+      "step": 1920
+    },
+    {
+      "epoch": 0.3958466863222664,
+      "grad_norm": 0.44312766194343567,
+      "learning_rate": 6.362262521588946e-05,
+      "loss": 1.1923,
+      "step": 1930
+    },
+    {
+      "epoch": 0.39789770542238173,
+      "grad_norm": 0.5013573169708252,
+      "learning_rate": 6.340673575129535e-05,
+      "loss": 1.2361,
+      "step": 1940
+    },
+    {
+      "epoch": 0.39994872452249713,
+      "grad_norm": 0.4884537160396576,
+      "learning_rate": 6.319084628670121e-05,
+      "loss": 1.2434,
+      "step": 1950
+    },
+    {
+      "epoch": 0.4019997436226125,
+      "grad_norm": 0.46138620376586914,
+      "learning_rate": 6.297495682210708e-05,
+      "loss": 1.257,
+      "step": 1960
+    },
+    {
+      "epoch": 0.40405076272272783,
+      "grad_norm": 0.4941729009151459,
+      "learning_rate": 6.275906735751296e-05,
+      "loss": 1.2347,
+      "step": 1970
+    },
+    {
+      "epoch": 0.40610178182284323,
+      "grad_norm": 0.4675595760345459,
+      "learning_rate": 6.254317789291882e-05,
+      "loss": 1.2353,
+      "step": 1980
+    },
+    {
+      "epoch": 0.4081528009229586,
+      "grad_norm": 0.47944632172584534,
+      "learning_rate": 6.232728842832471e-05,
+      "loss": 1.2643,
+      "step": 1990
+    },
+    {
+      "epoch": 0.410203820023074,
+      "grad_norm": 0.4476461112499237,
+      "learning_rate": 6.211139896373057e-05,
+      "loss": 1.2558,
+      "step": 2000
+    },
+    {
+      "epoch": 0.41225483912318933,
+      "grad_norm": 0.4706653654575348,
+      "learning_rate": 6.189550949913645e-05,
+      "loss": 1.227,
+      "step": 2010
+    },
+    {
+      "epoch": 0.4143058582233047,
+      "grad_norm": 0.48062801361083984,
+      "learning_rate": 6.167962003454232e-05,
+      "loss": 1.2273,
+      "step": 2020
+    },
+    {
+      "epoch": 0.4163568773234201,
+      "grad_norm": 0.46771204471588135,
+      "learning_rate": 6.146373056994818e-05,
+      "loss": 1.2268,
+      "step": 2030
+    },
+    {
+      "epoch": 0.41840789642353543,
+      "grad_norm": 0.4725424647331238,
+      "learning_rate": 6.124784110535406e-05,
+      "loss": 1.2009,
+      "step": 2040
+    },
+    {
+      "epoch": 0.42045891552365083,
+      "grad_norm": 0.47520384192466736,
+      "learning_rate": 6.1031951640759934e-05,
+      "loss": 1.2511,
+      "step": 2050
+    },
+    {
+      "epoch": 0.4225099346237662,
+      "grad_norm": 0.44635480642318726,
+      "learning_rate": 6.081606217616581e-05,
+      "loss": 1.21,
+      "step": 2060
+    },
+    {
+      "epoch": 0.42456095372388153,
+      "grad_norm": 0.47436651587486267,
+      "learning_rate": 6.060017271157168e-05,
+      "loss": 1.2116,
+      "step": 2070
+    },
+    {
+      "epoch": 0.42661197282399693,
+      "grad_norm": 0.5115741491317749,
+      "learning_rate": 6.0384283246977546e-05,
+      "loss": 1.2778,
+      "step": 2080
+    },
+    {
+      "epoch": 0.4286629919241123,
+      "grad_norm": 0.4488040506839752,
+      "learning_rate": 6.016839378238343e-05,
+      "loss": 1.2242,
+      "step": 2090
+    },
+    {
+      "epoch": 0.4307140110242277,
+      "grad_norm": 0.4834796190261841,
+      "learning_rate": 5.9952504317789296e-05,
+      "loss": 1.2357,
+      "step": 2100
+    },
+    {
+      "epoch": 0.43276503012434303,
+      "grad_norm": 0.45478227734565735,
+      "learning_rate": 5.973661485319517e-05,
+      "loss": 1.2233,
+      "step": 2110
+    },
+    {
+      "epoch": 0.4348160492244584,
+      "grad_norm": 0.4539099633693695,
+      "learning_rate": 5.952072538860104e-05,
+      "loss": 1.2527,
+      "step": 2120
+    },
+    {
+      "epoch": 0.4368670683245738,
+      "grad_norm": 0.47722533345222473,
+      "learning_rate": 5.930483592400691e-05,
+      "loss": 1.2015,
+      "step": 2130
+    },
+    {
+      "epoch": 0.43891808742468913,
+      "grad_norm": 0.472023069858551,
+      "learning_rate": 5.908894645941278e-05,
+      "loss": 1.2222,
+      "step": 2140
+    },
+    {
+      "epoch": 0.44096910652480453,
+      "grad_norm": 0.4648214876651764,
+      "learning_rate": 5.887305699481865e-05,
+      "loss": 1.2112,
+      "step": 2150
+    },
+    {
+      "epoch": 0.4430201256249199,
+      "grad_norm": 0.48654377460479736,
+      "learning_rate": 5.8657167530224534e-05,
+      "loss": 1.227,
+      "step": 2160
+    },
+    {
+      "epoch": 0.44507114472503523,
+      "grad_norm": 0.4997814893722534,
+      "learning_rate": 5.84412780656304e-05,
+      "loss": 1.2721,
+      "step": 2170
+    },
+    {
+      "epoch": 0.44712216382515063,
+      "grad_norm": 0.47997352480888367,
+      "learning_rate": 5.822538860103627e-05,
+      "loss": 1.2018,
+      "step": 2180
+    },
+    {
+      "epoch": 0.449173182925266,
+      "grad_norm": 0.4899247884750366,
+      "learning_rate": 5.8009499136442146e-05,
+      "loss": 1.2599,
+      "step": 2190
+    },
+    {
+      "epoch": 0.4512242020253814,
+      "grad_norm": 0.4752749800682068,
+      "learning_rate": 5.7793609671848014e-05,
+      "loss": 1.2171,
+      "step": 2200
+    },
+    {
+      "epoch": 0.45327522112549673,
+      "grad_norm": 0.4801314175128937,
+      "learning_rate": 5.7577720207253896e-05,
+      "loss": 1.2234,
+      "step": 2210
+    },
+    {
+      "epoch": 0.4553262402256121,
+      "grad_norm": 0.4591893255710602,
+      "learning_rate": 5.7361830742659764e-05,
+      "loss": 1.2242,
+      "step": 2220
+    },
+    {
+      "epoch": 0.4573772593257275,
+      "grad_norm": 0.46896713972091675,
+      "learning_rate": 5.7145941278065626e-05,
+      "loss": 1.2117,
+      "step": 2230
+    },
+    {
+      "epoch": 0.45942827842584283,
+      "grad_norm": 0.4853857755661011,
+      "learning_rate": 5.693005181347151e-05,
+      "loss": 1.2218,
+      "step": 2240
+    },
+    {
+      "epoch": 0.46147929752595823,
+      "grad_norm": 0.4648151993751526,
+      "learning_rate": 5.6714162348877376e-05,
+      "loss": 1.2401,
+      "step": 2250
+    },
+    {
+      "epoch": 0.4635303166260736,
+      "grad_norm": 0.4839739501476288,
+      "learning_rate": 5.649827288428325e-05,
+      "loss": 1.1976,
+      "step": 2260
+    },
+    {
+      "epoch": 0.4655813357261889,
+      "grad_norm": 0.4986715018749237,
+      "learning_rate": 5.628238341968912e-05,
+      "loss": 1.2274,
+      "step": 2270
+    },
+    {
+      "epoch": 0.46763235482630433,
+      "grad_norm": 0.4636840522289276,
+      "learning_rate": 5.606649395509499e-05,
+      "loss": 1.236,
+      "step": 2280
+    },
+    {
+      "epoch": 0.4696833739264197,
+      "grad_norm": 0.5011271834373474,
+      "learning_rate": 5.585060449050087e-05,
+      "loss": 1.2275,
+      "step": 2290
+    },
+    {
+      "epoch": 0.4717343930265351,
+      "grad_norm": 0.4648337662220001,
+      "learning_rate": 5.563471502590674e-05,
+      "loss": 1.2457,
+      "step": 2300
+    },
+    {
+      "epoch": 0.47378541212665043,
+      "grad_norm": 0.47708699107170105,
+      "learning_rate": 5.5418825561312614e-05,
+      "loss": 1.2316,
+      "step": 2310
+    },
+    {
+      "epoch": 0.4758364312267658,
+      "grad_norm": 0.4954835772514343,
+      "learning_rate": 5.520293609671848e-05,
+      "loss": 1.229,
+      "step": 2320
+    },
+    {
+      "epoch": 0.4778874503268812,
+      "grad_norm": 0.4701727330684662,
+      "learning_rate": 5.498704663212435e-05,
+      "loss": 1.248,
+      "step": 2330
+    },
+    {
+      "epoch": 0.47993846942699653,
+      "grad_norm": 0.4796009957790375,
+      "learning_rate": 5.477115716753023e-05,
+      "loss": 1.2248,
+      "step": 2340
+    },
+    {
+      "epoch": 0.48198948852711193,
+      "grad_norm": 0.4906330406665802,
+      "learning_rate": 5.4555267702936094e-05,
+      "loss": 1.2628,
+      "step": 2350
+    },
+    {
+      "epoch": 0.4840405076272273,
+      "grad_norm": 0.47203144431114197,
+      "learning_rate": 5.4339378238341976e-05,
+      "loss": 1.2067,
+      "step": 2360
+    },
+    {
+      "epoch": 0.4860915267273426,
+      "grad_norm": 0.503813624382019,
+      "learning_rate": 5.4123488773747845e-05,
+      "loss": 1.2006,
+      "step": 2370
+    },
+    {
+      "epoch": 0.48814254582745803,
+      "grad_norm": 0.4918235242366791,
+      "learning_rate": 5.390759930915371e-05,
+      "loss": 1.1887,
+      "step": 2380
+    },
+    {
+      "epoch": 0.4901935649275734,
+      "grad_norm": 0.4799112379550934,
+      "learning_rate": 5.369170984455959e-05,
+      "loss": 1.2079,
+      "step": 2390
+    },
+    {
+      "epoch": 0.4922445840276888,
+      "grad_norm": 0.4769650101661682,
+      "learning_rate": 5.347582037996546e-05,
+      "loss": 1.1945,
+      "step": 2400
+    },
+    {
+      "epoch": 0.49429560312780413,
+      "grad_norm": 0.5079638957977295,
+      "learning_rate": 5.325993091537134e-05,
+      "loss": 1.2294,
+      "step": 2410
+    },
+    {
+      "epoch": 0.4963466222279195,
+      "grad_norm": 0.520418643951416,
+      "learning_rate": 5.304404145077721e-05,
+      "loss": 1.2308,
+      "step": 2420
+    },
+    {
+      "epoch": 0.4983976413280349,
+      "grad_norm": 0.4546453058719635,
+      "learning_rate": 5.2828151986183075e-05,
+      "loss": 1.2206,
+      "step": 2430
+    },
+    {
+      "epoch": 0.5004486604281503,
+      "grad_norm": 0.47760534286499023,
+      "learning_rate": 5.261226252158895e-05,
+      "loss": 1.208,
+      "step": 2440
+    },
+    {
+      "epoch": 0.5024996795282656,
+      "grad_norm": 0.5267066955566406,
+      "learning_rate": 5.239637305699482e-05,
+      "loss": 1.2123,
+      "step": 2450
+    },
+    {
+      "epoch": 0.504550698628381,
+      "grad_norm": 0.45763811469078064,
+      "learning_rate": 5.2180483592400694e-05,
+      "loss": 1.2159,
+      "step": 2460
+    },
+    {
+      "epoch": 0.5066017177284964,
+      "grad_norm": 0.4922376871109009,
+      "learning_rate": 5.196459412780656e-05,
+      "loss": 1.2456,
+      "step": 2470
+    },
+    {
+      "epoch": 0.5086527368286117,
+      "grad_norm": 0.47043368220329285,
+      "learning_rate": 5.174870466321243e-05,
+      "loss": 1.2052,
+      "step": 2480
+    },
+    {
+      "epoch": 0.5107037559287271,
+      "grad_norm": 0.5082889795303345,
+      "learning_rate": 5.153281519861831e-05,
+      "loss": 1.2393,
+      "step": 2490
+    },
+    {
+      "epoch": 0.5127547750288425,
+      "grad_norm": 0.4955206513404846,
+      "learning_rate": 5.131692573402418e-05,
+      "loss": 1.2323,
+      "step": 2500
+    },
+    {
+      "epoch": 0.5148057941289578,
+      "grad_norm": 0.48625460267066956,
+      "learning_rate": 5.1101036269430057e-05,
+      "loss": 1.206,
+      "step": 2510
+    },
+    {
+      "epoch": 0.5168568132290732,
+      "grad_norm": 0.49060237407684326,
+      "learning_rate": 5.0885146804835925e-05,
+      "loss": 1.2353,
+      "step": 2520
+    },
+    {
+      "epoch": 0.5189078323291886,
+      "grad_norm": 0.46809640526771545,
+      "learning_rate": 5.0669257340241793e-05,
+      "loss": 1.2287,
+      "step": 2530
+    },
+    {
+      "epoch": 0.520958851429304,
+      "grad_norm": 0.4944596290588379,
+      "learning_rate": 5.0453367875647675e-05,
+      "loss": 1.2413,
+      "step": 2540
+    },
+    {
+      "epoch": 0.5230098705294193,
+      "grad_norm": 0.46914994716644287,
+      "learning_rate": 5.023747841105354e-05,
+      "loss": 1.22,
+      "step": 2550
+    },
+    {
+      "epoch": 0.5250608896295347,
+      "grad_norm": 0.4888727366924286,
+      "learning_rate": 5.002158894645942e-05,
+      "loss": 1.2343,
+      "step": 2560
+    },
+    {
+      "epoch": 0.5271119087296501,
+      "grad_norm": 0.4785778522491455,
+      "learning_rate": 4.980569948186529e-05,
+      "loss": 1.187,
+      "step": 2570
+    },
+    {
+      "epoch": 0.5291629278297654,
+      "grad_norm": 0.4947550594806671,
+      "learning_rate": 4.958981001727116e-05,
+      "loss": 1.2288,
+      "step": 2580
+    },
+    {
+      "epoch": 0.5312139469298808,
+      "grad_norm": 0.5263291597366333,
+      "learning_rate": 4.937392055267703e-05,
+      "loss": 1.2044,
+      "step": 2590
+    },
+    {
+      "epoch": 0.5332649660299962,
+      "grad_norm": 0.49239382147789,
+      "learning_rate": 4.9158031088082906e-05,
+      "loss": 1.1865,
+      "step": 2600
+    },
+    {
+      "epoch": 0.5353159851301115,
+      "grad_norm": 0.48874983191490173,
+      "learning_rate": 4.8942141623488775e-05,
+      "loss": 1.2672,
+      "step": 2610
+    },
+    {
+      "epoch": 0.5373670042302269,
+      "grad_norm": 0.48474863171577454,
+      "learning_rate": 4.872625215889465e-05,
+      "loss": 1.2359,
+      "step": 2620
+    },
+    {
+      "epoch": 0.5394180233303423,
+      "grad_norm": 0.4978977143764496,
+      "learning_rate": 4.851036269430052e-05,
+      "loss": 1.2139,
+      "step": 2630
+    },
+    {
+      "epoch": 0.5414690424304577,
+      "grad_norm": 0.5144924521446228,
+      "learning_rate": 4.829447322970639e-05,
+      "loss": 1.221,
+      "step": 2640
+    },
+    {
+      "epoch": 0.543520061530573,
+      "grad_norm": 0.5082759857177734,
+      "learning_rate": 4.807858376511227e-05,
+      "loss": 1.2209,
+      "step": 2650
+    },
+    {
+      "epoch": 0.5455710806306884,
+      "grad_norm": 0.4933965504169464,
+      "learning_rate": 4.786269430051814e-05,
+      "loss": 1.207,
+      "step": 2660
+    },
+    {
+      "epoch": 0.5476220997308038,
+      "grad_norm": 0.49464166164398193,
+      "learning_rate": 4.7646804835924005e-05,
+      "loss": 1.2398,
+      "step": 2670
+    },
+    {
+      "epoch": 0.5496731188309191,
+      "grad_norm": 0.49377110600471497,
+      "learning_rate": 4.743091537132988e-05,
+      "loss": 1.2451,
+      "step": 2680
+    },
+    {
+      "epoch": 0.5517241379310345,
+      "grad_norm": 0.5111104846000671,
+      "learning_rate": 4.7215025906735756e-05,
+      "loss": 1.2197,
+      "step": 2690
+    },
+    {
+      "epoch": 0.5537751570311499,
+      "grad_norm": 0.47716042399406433,
+      "learning_rate": 4.699913644214163e-05,
+      "loss": 1.1891,
+      "step": 2700
+    },
+    {
+      "epoch": 0.5558261761312652,
+      "grad_norm": 0.5081655383110046,
+      "learning_rate": 4.678324697754749e-05,
+      "loss": 1.2507,
+      "step": 2710
+    },
+    {
+      "epoch": 0.5578771952313806,
+      "grad_norm": 0.49036547541618347,
+      "learning_rate": 4.656735751295337e-05,
+      "loss": 1.1805,
+      "step": 2720
+    },
+    {
+      "epoch": 0.559928214331496,
+      "grad_norm": 0.5139365792274475,
+      "learning_rate": 4.635146804835924e-05,
+      "loss": 1.2361,
+      "step": 2730
+    },
+    {
+      "epoch": 0.5619792334316114,
+      "grad_norm": 0.5098669528961182,
+      "learning_rate": 4.613557858376512e-05,
+      "loss": 1.2409,
+      "step": 2740
+    },
+    {
+      "epoch": 0.5640302525317267,
+      "grad_norm": 0.4786950349807739,
+      "learning_rate": 4.5919689119170986e-05,
+      "loss": 1.2067,
+      "step": 2750
+    },
+    {
+      "epoch": 0.5660812716318421,
+      "grad_norm": 0.5063204169273376,
+      "learning_rate": 4.5703799654576855e-05,
+      "loss": 1.1942,
+      "step": 2760
+    },
+    {
+      "epoch": 0.5681322907319575,
+      "grad_norm": 0.511663556098938,
+      "learning_rate": 4.548791018998273e-05,
+      "loss": 1.2017,
+      "step": 2770
+    },
+    {
+      "epoch": 0.5701833098320728,
+      "grad_norm": 0.48765748739242554,
+      "learning_rate": 4.5272020725388605e-05,
+      "loss": 1.222,
+      "step": 2780
+    },
+    {
+      "epoch": 0.5722343289321882,
+      "grad_norm": 0.49707624316215515,
+      "learning_rate": 4.5056131260794474e-05,
+      "loss": 1.2075,
+      "step": 2790
+    },
+    {
+      "epoch": 0.5742853480323036,
+      "grad_norm": 0.5067517757415771,
+      "learning_rate": 4.484024179620035e-05,
+      "loss": 1.211,
+      "step": 2800
+    },
+    {
+      "epoch": 0.5763363671324189,
+      "grad_norm": 0.4615229368209839,
+      "learning_rate": 4.462435233160622e-05,
+      "loss": 1.2303,
+      "step": 2810
+    },
+    {
+      "epoch": 0.5783873862325343,
+      "grad_norm": 0.4948524236679077,
+      "learning_rate": 4.440846286701209e-05,
+      "loss": 1.2024,
+      "step": 2820
+    },
+    {
+      "epoch": 0.5804384053326497,
+      "grad_norm": 0.5140314102172852,
+      "learning_rate": 4.419257340241796e-05,
+      "loss": 1.2217,
+      "step": 2830
+    },
+    {
+      "epoch": 0.5824894244327651,
+      "grad_norm": 0.5108122825622559,
+      "learning_rate": 4.3976683937823836e-05,
+      "loss": 1.1838,
+      "step": 2840
+    },
+    {
+      "epoch": 0.5845404435328804,
+      "grad_norm": 0.5021159052848816,
+      "learning_rate": 4.376079447322971e-05,
+      "loss": 1.2418,
+      "step": 2850
+    },
+    {
+      "epoch": 0.5865914626329958,
+      "grad_norm": 0.5086933374404907,
+      "learning_rate": 4.354490500863558e-05,
+      "loss": 1.2321,
+      "step": 2860
+    },
+    {
+      "epoch": 0.5886424817331112,
+      "grad_norm": 0.5083547830581665,
+      "learning_rate": 4.332901554404145e-05,
+      "loss": 1.2035,
+      "step": 2870
+    },
+    {
+      "epoch": 0.5906935008332265,
+      "grad_norm": 0.4828626215457916,
+      "learning_rate": 4.311312607944732e-05,
+      "loss": 1.2302,
+      "step": 2880
+    },
+    {
+      "epoch": 0.5927445199333419,
+      "grad_norm": 0.5140969157218933,
+      "learning_rate": 4.28972366148532e-05,
+      "loss": 1.2058,
+      "step": 2890
+    },
+    {
+      "epoch": 0.5947955390334573,
+      "grad_norm": 0.497364342212677,
+      "learning_rate": 4.2681347150259074e-05,
+      "loss": 1.2382,
+      "step": 2900
+    },
+    {
+      "epoch": 0.5968465581335726,
+      "grad_norm": 0.49104997515678406,
+      "learning_rate": 4.246545768566494e-05,
+      "loss": 1.2322,
+      "step": 2910
+    },
+    {
+      "epoch": 0.598897577233688,
+      "grad_norm": 0.521659255027771,
+      "learning_rate": 4.224956822107081e-05,
+      "loss": 1.1868,
+      "step": 2920
+    },
+    {
+      "epoch": 0.6009485963338034,
+      "grad_norm": 0.5175550580024719,
+      "learning_rate": 4.2033678756476686e-05,
+      "loss": 1.2169,
+      "step": 2930
+    },
+    {
+      "epoch": 0.6029996154339188,
+      "grad_norm": 0.4998300075531006,
+      "learning_rate": 4.181778929188256e-05,
+      "loss": 1.2227,
+      "step": 2940
+    },
+    {
+      "epoch": 0.6050506345340341,
+      "grad_norm": 0.4932349622249603,
+      "learning_rate": 4.160189982728843e-05,
+      "loss": 1.2371,
+      "step": 2950
+    },
+    {
+      "epoch": 0.6071016536341495,
+      "grad_norm": 0.5610498189926147,
+      "learning_rate": 4.1386010362694304e-05,
+      "loss": 1.2105,
+      "step": 2960
+    },
+    {
+      "epoch": 0.6091526727342649,
+      "grad_norm": 0.4975990355014801,
+      "learning_rate": 4.117012089810017e-05,
+      "loss": 1.2511,
+      "step": 2970
+    },
+    {
+      "epoch": 0.6112036918343802,
+      "grad_norm": 0.5154693722724915,
+      "learning_rate": 4.095423143350605e-05,
+      "loss": 1.2399,
+      "step": 2980
+    },
+    {
+      "epoch": 0.6132547109344956,
+      "grad_norm": 0.4968002736568451,
+      "learning_rate": 4.0738341968911916e-05,
+      "loss": 1.2041,
+      "step": 2990
+    },
+    {
+      "epoch": 0.615305730034611,
+      "grad_norm": 0.4866868555545807,
+      "learning_rate": 4.052245250431779e-05,
+      "loss": 1.1965,
+      "step": 3000
+    },
+    {
+      "epoch": 0.6173567491347263,
+      "grad_norm": 0.5152925848960876,
+      "learning_rate": 4.030656303972367e-05,
+      "loss": 1.2298,
+      "step": 3010
+    },
+    {
+      "epoch": 0.6194077682348417,
+      "grad_norm": 0.513058602809906,
+      "learning_rate": 4.0090673575129535e-05,
+      "loss": 1.2414,
+      "step": 3020
+    },
+    {
+      "epoch": 0.6214587873349571,
+      "grad_norm": 0.5031930208206177,
+      "learning_rate": 3.987478411053541e-05,
+      "loss": 1.1766,
+      "step": 3030
+    },
+    {
+      "epoch": 0.6235098064350725,
+      "grad_norm": 0.5087730288505554,
+      "learning_rate": 3.965889464594128e-05,
+      "loss": 1.229,
+      "step": 3040
+    },
+    {
+      "epoch": 0.6255608255351878,
+      "grad_norm": 0.4878797233104706,
+      "learning_rate": 3.9443005181347154e-05,
+      "loss": 1.2018,
+      "step": 3050
+    },
+    {
+      "epoch": 0.6276118446353032,
+      "grad_norm": 0.5124858617782593,
+      "learning_rate": 3.922711571675303e-05,
+      "loss": 1.1848,
+      "step": 3060
+    },
+    {
+      "epoch": 0.6296628637354186,
+      "grad_norm": 0.49720969796180725,
+      "learning_rate": 3.90112262521589e-05,
+      "loss": 1.1892,
+      "step": 3070
+    },
+    {
+      "epoch": 0.6317138828355339,
+      "grad_norm": 0.49900123476982117,
+      "learning_rate": 3.8795336787564766e-05,
+      "loss": 1.2027,
+      "step": 3080
+    },
+    {
+      "epoch": 0.6337649019356493,
+      "grad_norm": 0.5007952451705933,
+      "learning_rate": 3.857944732297064e-05,
+      "loss": 1.2373,
+      "step": 3090
+    },
+    {
+      "epoch": 0.6358159210357647,
+      "grad_norm": 0.49481576681137085,
+      "learning_rate": 3.8363557858376516e-05,
+      "loss": 1.2294,
+      "step": 3100
+    },
+    {
+      "epoch": 0.63786694013588,
+      "grad_norm": 0.4979318082332611,
+      "learning_rate": 3.8147668393782385e-05,
+      "loss": 1.2312,
+      "step": 3110
+    },
+    {
+      "epoch": 0.6399179592359954,
+      "grad_norm": 0.49939480423927307,
+      "learning_rate": 3.793177892918825e-05,
+      "loss": 1.2394,
+      "step": 3120
+    },
+    {
+      "epoch": 0.6419689783361108,
+      "grad_norm": 0.5186517834663391,
+      "learning_rate": 3.771588946459413e-05,
+      "loss": 1.199,
+      "step": 3130
+    },
+    {
+      "epoch": 0.6440199974362262,
+      "grad_norm": 0.5386569499969482,
+      "learning_rate": 3.7500000000000003e-05,
+      "loss": 1.1801,
+      "step": 3140
+    },
+    {
+      "epoch": 0.6460710165363415,
+      "grad_norm": 0.5134577751159668,
+      "learning_rate": 3.728411053540587e-05,
+      "loss": 1.2286,
+      "step": 3150
+    },
+    {
+      "epoch": 0.6481220356364569,
+      "grad_norm": 0.5191785097122192,
+      "learning_rate": 3.706822107081175e-05,
+      "loss": 1.2068,
+      "step": 3160
+    },
+    {
+      "epoch": 0.6501730547365723,
+      "grad_norm": 0.4857168197631836,
+      "learning_rate": 3.6852331606217615e-05,
+      "loss": 1.2116,
+      "step": 3170
+    },
+    {
+      "epoch": 0.6522240738366876,
+      "grad_norm": 0.5283413529396057,
+      "learning_rate": 3.663644214162349e-05,
+      "loss": 1.1792,
+      "step": 3180
+    },
+    {
+      "epoch": 0.654275092936803,
+      "grad_norm": 0.528938353061676,
+      "learning_rate": 3.6420552677029366e-05,
+      "loss": 1.1963,
+      "step": 3190
+    },
+    {
+      "epoch": 0.6563261120369184,
+      "grad_norm": 0.5067134499549866,
+      "learning_rate": 3.6204663212435234e-05,
+      "loss": 1.2476,
+      "step": 3200
+    },
+    {
+      "epoch": 0.6583771311370337,
+      "grad_norm": 0.4993511736392975,
+      "learning_rate": 3.598877374784111e-05,
+      "loss": 1.2273,
+      "step": 3210
+    },
+    {
+      "epoch": 0.6604281502371491,
+      "grad_norm": 0.5275943279266357,
+      "learning_rate": 3.577288428324698e-05,
+      "loss": 1.2287,
+      "step": 3220
+    },
+    {
+      "epoch": 0.6624791693372645,
+      "grad_norm": 0.49331194162368774,
+      "learning_rate": 3.555699481865285e-05,
+      "loss": 1.1794,
+      "step": 3230
+    },
+    {
+      "epoch": 0.6645301884373799,
+      "grad_norm": 0.5065453052520752,
+      "learning_rate": 3.534110535405872e-05,
+      "loss": 1.2342,
+      "step": 3240
+    },
+    {
+      "epoch": 0.6665812075374952,
+      "grad_norm": 0.5334459543228149,
+      "learning_rate": 3.51252158894646e-05,
+      "loss": 1.1782,
+      "step": 3250
+    },
+    {
+      "epoch": 0.6686322266376106,
+      "grad_norm": 0.535772979259491,
+      "learning_rate": 3.490932642487047e-05,
+      "loss": 1.2108,
+      "step": 3260
+    },
+    {
+      "epoch": 0.670683245737726,
+      "grad_norm": 0.5377807021141052,
+      "learning_rate": 3.469343696027634e-05,
+      "loss": 1.1903,
+      "step": 3270
+    },
+    {
+      "epoch": 0.6727342648378413,
+      "grad_norm": 0.5266278386116028,
+      "learning_rate": 3.447754749568221e-05,
+      "loss": 1.2183,
+      "step": 3280
+    },
+    {
+      "epoch": 0.6747852839379567,
+      "grad_norm": 0.4987232983112335,
+      "learning_rate": 3.4261658031088084e-05,
+      "loss": 1.1915,
+      "step": 3290
+    },
+    {
+      "epoch": 0.6768363030380721,
+      "grad_norm": 0.5178554058074951,
+      "learning_rate": 3.404576856649396e-05,
+      "loss": 1.179,
+      "step": 3300
+    },
+    {
+      "epoch": 0.6788873221381874,
+      "grad_norm": 0.5086014270782471,
+      "learning_rate": 3.382987910189983e-05,
+      "loss": 1.2298,
+      "step": 3310
+    },
+    {
+      "epoch": 0.6809383412383028,
+      "grad_norm": 0.5420427918434143,
+      "learning_rate": 3.3613989637305696e-05,
+      "loss": 1.2072,
+      "step": 3320
+    },
+    {
+      "epoch": 0.6829893603384182,
+      "grad_norm": 0.5170331001281738,
+      "learning_rate": 3.339810017271157e-05,
+      "loss": 1.2252,
+      "step": 3330
+    },
+    {
+      "epoch": 0.6850403794385336,
+      "grad_norm": 0.48680609464645386,
+      "learning_rate": 3.3182210708117446e-05,
+      "loss": 1.2059,
+      "step": 3340
+    },
+    {
+      "epoch": 0.6870913985386489,
+      "grad_norm": 0.5035340189933777,
+      "learning_rate": 3.296632124352332e-05,
+      "loss": 1.2009,
+      "step": 3350
+    },
+    {
+      "epoch": 0.6891424176387643,
+      "grad_norm": 0.513165295124054,
+      "learning_rate": 3.275043177892919e-05,
+      "loss": 1.1844,
+      "step": 3360
+    },
+    {
+      "epoch": 0.6911934367388797,
+      "grad_norm": 0.5243003368377686,
+      "learning_rate": 3.2534542314335065e-05,
+      "loss": 1.2009,
+      "step": 3370
+    },
+    {
+      "epoch": 0.693244455838995,
+      "grad_norm": 0.5219825506210327,
+      "learning_rate": 3.2318652849740933e-05,
+      "loss": 1.2039,
+      "step": 3380
+    },
+    {
+      "epoch": 0.6952954749391104,
+      "grad_norm": 0.5202507972717285,
+      "learning_rate": 3.210276338514681e-05,
+      "loss": 1.225,
+      "step": 3390
+    },
+    {
+      "epoch": 0.6973464940392258,
+      "grad_norm": 0.5152229070663452,
+      "learning_rate": 3.188687392055268e-05,
+      "loss": 1.1886,
+      "step": 3400
+    },
+    {
+      "epoch": 0.6993975131393411,
+      "grad_norm": 0.5382890701293945,
+      "learning_rate": 3.167098445595855e-05,
+      "loss": 1.2113,
+      "step": 3410
+    },
+    {
+      "epoch": 0.7014485322394565,
+      "grad_norm": 0.5525237917900085,
+      "learning_rate": 3.145509499136443e-05,
+      "loss": 1.2283,
+      "step": 3420
+    },
+    {
+      "epoch": 0.7034995513395719,
+      "grad_norm": 0.5308887958526611,
+      "learning_rate": 3.1239205526770296e-05,
+      "loss": 1.2311,
+      "step": 3430
+    },
+    {
+      "epoch": 0.7055505704396873,
+      "grad_norm": 0.5247687697410583,
+      "learning_rate": 3.1023316062176164e-05,
+      "loss": 1.1946,
+      "step": 3440
+    },
+    {
+      "epoch": 0.7076015895398026,
+      "grad_norm": 0.5322206616401672,
+      "learning_rate": 3.080742659758204e-05,
+      "loss": 1.2198,
+      "step": 3450
+    },
+    {
+      "epoch": 0.709652608639918,
+      "grad_norm": 0.5104162693023682,
+      "learning_rate": 3.0591537132987915e-05,
+      "loss": 1.2105,
+      "step": 3460
+    },
+    {
+      "epoch": 0.7117036277400334,
+      "grad_norm": 0.4890803098678589,
+      "learning_rate": 3.0375647668393786e-05,
+      "loss": 1.2074,
+      "step": 3470
+    },
+    {
+      "epoch": 0.7137546468401487,
+      "grad_norm": 0.529225766658783,
+      "learning_rate": 3.0159758203799655e-05,
+      "loss": 1.2321,
+      "step": 3480
+    },
+    {
+      "epoch": 0.7158056659402641,
+      "grad_norm": 0.5252069234848022,
+      "learning_rate": 2.9943868739205527e-05,
+      "loss": 1.1995,
+      "step": 3490
+    },
+    {
+      "epoch": 0.7178566850403795,
+      "grad_norm": 0.5369967818260193,
+      "learning_rate": 2.9727979274611402e-05,
+      "loss": 1.2234,
+      "step": 3500
+    },
+    {
+      "epoch": 0.7199077041404948,
+      "grad_norm": 0.5053485631942749,
+      "learning_rate": 2.9512089810017274e-05,
+      "loss": 1.2035,
+      "step": 3510
+    },
+    {
+      "epoch": 0.7219587232406102,
+      "grad_norm": 0.5131696462631226,
+      "learning_rate": 2.929620034542315e-05,
+      "loss": 1.2681,
+      "step": 3520
+    },
+    {
+      "epoch": 0.7240097423407256,
+      "grad_norm": 0.5332499742507935,
+      "learning_rate": 2.9080310880829014e-05,
+      "loss": 1.2039,
+      "step": 3530
+    },
+    {
+      "epoch": 0.7260607614408409,
+      "grad_norm": 0.5105617046356201,
+      "learning_rate": 2.886442141623489e-05,
+      "loss": 1.2,
+      "step": 3540
+    },
+    {
+      "epoch": 0.7281117805409563,
+      "grad_norm": 0.5197264552116394,
+      "learning_rate": 2.864853195164076e-05,
+      "loss": 1.1821,
+      "step": 3550
+    },
+    {
+      "epoch": 0.7301627996410717,
+      "grad_norm": 0.505455493927002,
+      "learning_rate": 2.8432642487046636e-05,
+      "loss": 1.2158,
+      "step": 3560
+    },
+    {
+      "epoch": 0.7322138187411871,
+      "grad_norm": 0.5290804505348206,
+      "learning_rate": 2.8216753022452508e-05,
+      "loss": 1.174,
+      "step": 3570
+    },
+    {
+      "epoch": 0.7342648378413024,
+      "grad_norm": 0.5349313020706177,
+      "learning_rate": 2.8000863557858376e-05,
+      "loss": 1.2301,
+      "step": 3580
+    },
+    {
+      "epoch": 0.7363158569414178,
+      "grad_norm": 0.4875812530517578,
+      "learning_rate": 2.7784974093264248e-05,
+      "loss": 1.2015,
+      "step": 3590
+    },
+    {
+      "epoch": 0.7383668760415332,
+      "grad_norm": 0.5164597630500793,
+      "learning_rate": 2.7569084628670123e-05,
+      "loss": 1.2294,
+      "step": 3600
+    },
+    {
+      "epoch": 0.7404178951416485,
+      "grad_norm": 0.5129172801971436,
+      "learning_rate": 2.7353195164075995e-05,
+      "loss": 1.2122,
+      "step": 3610
+    },
+    {
+      "epoch": 0.7424689142417639,
+      "grad_norm": 0.5218586921691895,
+      "learning_rate": 2.713730569948187e-05,
+      "loss": 1.2002,
+      "step": 3620
+    },
+    {
+      "epoch": 0.7445199333418793,
+      "grad_norm": 0.5423296093940735,
+      "learning_rate": 2.6921416234887735e-05,
+      "loss": 1.1685,
+      "step": 3630
+    },
+    {
+      "epoch": 0.7465709524419946,
+      "grad_norm": 0.5151218771934509,
+      "learning_rate": 2.670552677029361e-05,
+      "loss": 1.2167,
+      "step": 3640
+    },
+    {
+      "epoch": 0.74862197154211,
+      "grad_norm": 0.5160235166549683,
+      "learning_rate": 2.6489637305699482e-05,
+      "loss": 1.2269,
+      "step": 3650
+    },
+    {
+      "epoch": 0.7506729906422254,
+      "grad_norm": 0.5056514143943787,
+      "learning_rate": 2.6273747841105357e-05,
+      "loss": 1.2467,
+      "step": 3660
+    },
+    {
+      "epoch": 0.7527240097423408,
+      "grad_norm": 0.52911776304245,
+      "learning_rate": 2.605785837651123e-05,
+      "loss": 1.2182,
+      "step": 3670
+    },
+    {
+      "epoch": 0.7547750288424561,
+      "grad_norm": 0.5172019600868225,
+      "learning_rate": 2.5841968911917097e-05,
+      "loss": 1.1888,
+      "step": 3680
+    },
+    {
+      "epoch": 0.7568260479425715,
+      "grad_norm": 0.5043123960494995,
+      "learning_rate": 2.562607944732297e-05,
+      "loss": 1.2004,
+      "step": 3690
+    },
+    {
+      "epoch": 0.7588770670426869,
+      "grad_norm": 0.5103533267974854,
+      "learning_rate": 2.5410189982728844e-05,
+      "loss": 1.1627,
+      "step": 3700
+    },
+    {
+      "epoch": 0.7609280861428022,
+      "grad_norm": 0.5295760631561279,
+      "learning_rate": 2.5194300518134716e-05,
+      "loss": 1.1604,
+      "step": 3710
+    },
+    {
+      "epoch": 0.7629791052429176,
+      "grad_norm": 0.5427724719047546,
+      "learning_rate": 2.4978411053540588e-05,
+      "loss": 1.1781,
+      "step": 3720
+    },
+    {
+      "epoch": 0.765030124343033,
+      "grad_norm": 0.5164818167686462,
+      "learning_rate": 2.476252158894646e-05,
+      "loss": 1.2208,
+      "step": 3730
+    },
+    {
+      "epoch": 0.7670811434431483,
+      "grad_norm": 0.5196744799613953,
+      "learning_rate": 2.4546632124352335e-05,
+      "loss": 1.1971,
+      "step": 3740
+    },
+    {
+      "epoch": 0.7691321625432637,
+      "grad_norm": 0.5128475427627563,
+      "learning_rate": 2.4330742659758203e-05,
+      "loss": 1.1909,
+      "step": 3750
+    },
+    {
+      "epoch": 0.7711831816433791,
+      "grad_norm": 0.49743902683258057,
+      "learning_rate": 2.411485319516408e-05,
+      "loss": 1.2109,
+      "step": 3760
+    },
+    {
+      "epoch": 0.7732342007434945,
+      "grad_norm": 0.5152381658554077,
+      "learning_rate": 2.3898963730569947e-05,
+      "loss": 1.2228,
+      "step": 3770
+    },
+    {
+      "epoch": 0.7752852198436098,
+      "grad_norm": 0.5446299910545349,
+      "learning_rate": 2.3683074265975822e-05,
+      "loss": 1.1953,
+      "step": 3780
+    },
+    {
+      "epoch": 0.7773362389437252,
+      "grad_norm": 0.5300847291946411,
+      "learning_rate": 2.3467184801381694e-05,
+      "loss": 1.1843,
+      "step": 3790
+    },
+    {
+      "epoch": 0.7793872580438406,
+      "grad_norm": 0.5129801630973816,
+      "learning_rate": 2.3251295336787566e-05,
+      "loss": 1.1809,
+      "step": 3800
+    },
+    {
+      "epoch": 0.7814382771439559,
+      "grad_norm": 0.549198567867279,
+      "learning_rate": 2.3035405872193438e-05,
+      "loss": 1.2099,
+      "step": 3810
+    },
+    {
+      "epoch": 0.7834892962440713,
+      "grad_norm": 0.5118544101715088,
+      "learning_rate": 2.281951640759931e-05,
+      "loss": 1.2149,
+      "step": 3820
+    },
+    {
+      "epoch": 0.7855403153441867,
+      "grad_norm": 0.5479713082313538,
+      "learning_rate": 2.260362694300518e-05,
+      "loss": 1.1771,
+      "step": 3830
+    },
+    {
+      "epoch": 0.787591334444302,
+      "grad_norm": 0.541350245475769,
+      "learning_rate": 2.2387737478411056e-05,
+      "loss": 1.1737,
+      "step": 3840
+    },
+    {
+      "epoch": 0.7896423535444174,
+      "grad_norm": 0.5543351769447327,
+      "learning_rate": 2.2171848013816925e-05,
+      "loss": 1.2233,
+      "step": 3850
+    },
+    {
+      "epoch": 0.7916933726445328,
+      "grad_norm": 0.5010188817977905,
+      "learning_rate": 2.19559585492228e-05,
+      "loss": 1.1938,
+      "step": 3860
+    },
+    {
+      "epoch": 0.7937443917446482,
+      "grad_norm": 0.5245205760002136,
+      "learning_rate": 2.1740069084628672e-05,
+      "loss": 1.2015,
+      "step": 3870
+    },
+    {
+      "epoch": 0.7957954108447635,
+      "grad_norm": 0.5324139595031738,
+      "learning_rate": 2.1524179620034544e-05,
+      "loss": 1.2248,
+      "step": 3880
+    },
+    {
+      "epoch": 0.7978464299448789,
+      "grad_norm": 0.5172831416130066,
+      "learning_rate": 2.1308290155440415e-05,
+      "loss": 1.1992,
+      "step": 3890
+    },
+    {
+      "epoch": 0.7998974490449943,
+      "grad_norm": 0.5434138178825378,
+      "learning_rate": 2.1092400690846287e-05,
+      "loss": 1.1813,
+      "step": 3900
+    },
+    {
+      "epoch": 0.8019484681451096,
+      "grad_norm": 0.5221844911575317,
+      "learning_rate": 2.087651122625216e-05,
+      "loss": 1.1625,
+      "step": 3910
+    },
+    {
+      "epoch": 0.803999487245225,
+      "grad_norm": 0.5027469992637634,
+      "learning_rate": 2.0660621761658034e-05,
+      "loss": 1.181,
+      "step": 3920
+    },
+    {
+      "epoch": 0.8060505063453404,
+      "grad_norm": 0.5298044085502625,
+      "learning_rate": 2.0444732297063903e-05,
+      "loss": 1.2079,
+      "step": 3930
+    },
+    {
+      "epoch": 0.8081015254454557,
+      "grad_norm": 0.5463908910751343,
+      "learning_rate": 2.0228842832469778e-05,
+      "loss": 1.2009,
+      "step": 3940
+    },
+    {
+      "epoch": 0.8101525445455711,
+      "grad_norm": 0.5394027233123779,
+      "learning_rate": 2.0012953367875646e-05,
+      "loss": 1.1931,
+      "step": 3950
+    },
+    {
+      "epoch": 0.8122035636456865,
+      "grad_norm": 0.5041294097900391,
+      "learning_rate": 1.979706390328152e-05,
+      "loss": 1.2107,
+      "step": 3960
+    },
+    {
+      "epoch": 0.8142545827458019,
+      "grad_norm": 0.5223291516304016,
+      "learning_rate": 1.9581174438687393e-05,
+      "loss": 1.1775,
+      "step": 3970
+    },
+    {
+      "epoch": 0.8163056018459172,
+      "grad_norm": 0.5221052169799805,
+      "learning_rate": 1.9365284974093265e-05,
+      "loss": 1.2052,
+      "step": 3980
+    },
+    {
+      "epoch": 0.8183566209460326,
+      "grad_norm": 0.5229529738426208,
+      "learning_rate": 1.9149395509499137e-05,
+      "loss": 1.1922,
+      "step": 3990
+    },
+    {
+      "epoch": 0.820407640046148,
+      "grad_norm": 0.5651980042457581,
+      "learning_rate": 1.893350604490501e-05,
+      "loss": 1.2043,
+      "step": 4000
+    },
+    {
+      "epoch": 0.8224586591462633,
+      "grad_norm": 0.5169751644134521,
+      "learning_rate": 1.871761658031088e-05,
+      "loss": 1.2157,
+      "step": 4010
+    },
+    {
+      "epoch": 0.8245096782463787,
+      "grad_norm": 0.5741276144981384,
+      "learning_rate": 1.8501727115716755e-05,
+      "loss": 1.2112,
+      "step": 4020
+    },
+    {
+      "epoch": 0.8265606973464941,
+      "grad_norm": 0.530596137046814,
+      "learning_rate": 1.8285837651122624e-05,
+      "loss": 1.2535,
+      "step": 4030
+    },
+    {
+      "epoch": 0.8286117164466094,
+      "grad_norm": 0.5436383485794067,
+      "learning_rate": 1.80699481865285e-05,
+      "loss": 1.1789,
+      "step": 4040
+    },
+    {
+      "epoch": 0.8306627355467248,
+      "grad_norm": 0.5238965749740601,
+      "learning_rate": 1.7854058721934368e-05,
+      "loss": 1.1645,
+      "step": 4050
+    },
+    {
+      "epoch": 0.8327137546468402,
+      "grad_norm": 0.5226778388023376,
+      "learning_rate": 1.7638169257340243e-05,
+      "loss": 1.2238,
+      "step": 4060
+    },
+    {
+      "epoch": 0.8347647737469556,
+      "grad_norm": 0.5810254812240601,
+      "learning_rate": 1.7422279792746114e-05,
+      "loss": 1.2212,
+      "step": 4070
+    },
+    {
+      "epoch": 0.8368157928470709,
+      "grad_norm": 0.5228540301322937,
+      "learning_rate": 1.7206390328151986e-05,
+      "loss": 1.2025,
+      "step": 4080
+    },
+    {
+      "epoch": 0.8388668119471863,
+      "grad_norm": 0.5112829804420471,
+      "learning_rate": 1.6990500863557858e-05,
+      "loss": 1.1838,
+      "step": 4090
+    },
+    {
+      "epoch": 0.8409178310473017,
+      "grad_norm": 0.5092179775238037,
+      "learning_rate": 1.6774611398963733e-05,
+      "loss": 1.1981,
+      "step": 4100
+    },
+    {
+      "epoch": 0.842968850147417,
+      "grad_norm": 0.5236721634864807,
+      "learning_rate": 1.65587219343696e-05,
+      "loss": 1.1994,
+      "step": 4110
+    },
+    {
+      "epoch": 0.8450198692475324,
+      "grad_norm": 0.5067551732063293,
+      "learning_rate": 1.6342832469775477e-05,
+      "loss": 1.1758,
+      "step": 4120
+    },
+    {
+      "epoch": 0.8470708883476478,
+      "grad_norm": 0.5471055507659912,
+      "learning_rate": 1.6126943005181345e-05,
+      "loss": 1.2315,
+      "step": 4130
+    },
+    {
+      "epoch": 0.8491219074477631,
+      "grad_norm": 0.514798641204834,
+      "learning_rate": 1.591105354058722e-05,
+      "loss": 1.183,
+      "step": 4140
+    },
+    {
+      "epoch": 0.8511729265478785,
+      "grad_norm": 0.5316623449325562,
+      "learning_rate": 1.5695164075993092e-05,
+      "loss": 1.1997,
+      "step": 4150
+    },
+    {
+      "epoch": 0.8532239456479939,
+      "grad_norm": 0.531896710395813,
+      "learning_rate": 1.5479274611398964e-05,
+      "loss": 1.1967,
+      "step": 4160
+    },
+    {
+      "epoch": 0.8552749647481093,
+      "grad_norm": 0.5044012665748596,
+      "learning_rate": 1.5263385146804836e-05,
+      "loss": 1.2061,
+      "step": 4170
+    },
+    {
+      "epoch": 0.8573259838482246,
+      "grad_norm": 0.547264039516449,
+      "learning_rate": 1.5047495682210708e-05,
+      "loss": 1.1975,
+      "step": 4180
+    },
+    {
+      "epoch": 0.85937700294834,
+      "grad_norm": 0.5514972805976868,
+      "learning_rate": 1.4831606217616581e-05,
+      "loss": 1.2044,
+      "step": 4190
+    },
+    {
+      "epoch": 0.8614280220484554,
+      "grad_norm": 0.5322652459144592,
+      "learning_rate": 1.4615716753022455e-05,
+      "loss": 1.2044,
+      "step": 4200
+    },
+    {
+      "epoch": 0.8634790411485707,
+      "grad_norm": 0.5309359431266785,
+      "learning_rate": 1.4399827288428325e-05,
+      "loss": 1.2066,
+      "step": 4210
+    },
+    {
+      "epoch": 0.8655300602486861,
+      "grad_norm": 0.5314792394638062,
+      "learning_rate": 1.4183937823834198e-05,
+      "loss": 1.2006,
+      "step": 4220
+    },
+    {
+      "epoch": 0.8675810793488015,
+      "grad_norm": 0.5549922585487366,
+      "learning_rate": 1.3968048359240068e-05,
+      "loss": 1.2058,
+      "step": 4230
+    },
+    {
+      "epoch": 0.8696320984489168,
+      "grad_norm": 0.5373049378395081,
+      "learning_rate": 1.3752158894645942e-05,
+      "loss": 1.2002,
+      "step": 4240
+    },
+    {
+      "epoch": 0.8716831175490322,
+      "grad_norm": 0.5322666764259338,
+      "learning_rate": 1.3536269430051815e-05,
+      "loss": 1.215,
+      "step": 4250
+    },
+    {
+      "epoch": 0.8737341366491476,
+      "grad_norm": 0.5549564957618713,
+      "learning_rate": 1.3320379965457685e-05,
+      "loss": 1.2131,
+      "step": 4260
+    },
+    {
+      "epoch": 0.875785155749263,
+      "grad_norm": 0.5308319926261902,
+      "learning_rate": 1.3104490500863559e-05,
+      "loss": 1.2203,
+      "step": 4270
+    },
+    {
+      "epoch": 0.8778361748493783,
+      "grad_norm": 0.5089017152786255,
+      "learning_rate": 1.2888601036269432e-05,
+      "loss": 1.1801,
+      "step": 4280
+    },
+    {
+      "epoch": 0.8798871939494937,
+      "grad_norm": 0.5377966165542603,
+      "learning_rate": 1.2672711571675302e-05,
+      "loss": 1.189,
+      "step": 4290
+    },
+    {
+      "epoch": 0.8819382130496091,
+      "grad_norm": 0.5528485178947449,
+      "learning_rate": 1.2456822107081174e-05,
+      "loss": 1.2197,
+      "step": 4300
+    },
+    {
+      "epoch": 0.8839892321497244,
+      "grad_norm": 0.5241679549217224,
+      "learning_rate": 1.2240932642487048e-05,
+      "loss": 1.1652,
+      "step": 4310
+    },
+    {
+      "epoch": 0.8860402512498398,
+      "grad_norm": 0.5626764893531799,
+      "learning_rate": 1.202504317789292e-05,
+      "loss": 1.1805,
+      "step": 4320
+    },
+    {
+      "epoch": 0.8880912703499552,
+      "grad_norm": 0.5248028635978699,
+      "learning_rate": 1.1809153713298791e-05,
+      "loss": 1.1652,
+      "step": 4330
+    },
+    {
+      "epoch": 0.8901422894500705,
+      "grad_norm": 0.5452848672866821,
+      "learning_rate": 1.1593264248704663e-05,
+      "loss": 1.2171,
+      "step": 4340
+    },
+    {
+      "epoch": 0.8921933085501859,
+      "grad_norm": 0.5505712628364563,
+      "learning_rate": 1.1377374784110537e-05,
+      "loss": 1.1967,
+      "step": 4350
+    },
+    {
+      "epoch": 0.8942443276503013,
+      "grad_norm": 0.5437038540840149,
+      "learning_rate": 1.1161485319516408e-05,
+      "loss": 1.2216,
+      "step": 4360
+    },
+    {
+      "epoch": 0.8962953467504167,
+      "grad_norm": 0.5138014554977417,
+      "learning_rate": 1.094559585492228e-05,
+      "loss": 1.193,
+      "step": 4370
+    },
+    {
+      "epoch": 0.898346365850532,
+      "grad_norm": 0.542080283164978,
+      "learning_rate": 1.0729706390328152e-05,
+      "loss": 1.1677,
+      "step": 4380
+    },
+    {
+      "epoch": 0.9003973849506474,
+      "grad_norm": 0.5166792273521423,
+      "learning_rate": 1.0513816925734024e-05,
+      "loss": 1.2147,
+      "step": 4390
+    },
+    {
+      "epoch": 0.9024484040507628,
+      "grad_norm": 0.536491334438324,
+      "learning_rate": 1.0297927461139897e-05,
+      "loss": 1.2077,
+      "step": 4400
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 4876,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.1227995226985472e+17,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}

lora_checkpoints/checkpoint-4400/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7b8f6520f47933838e96dca56ee883040325b73481aff07afcabf963674a84fe
+size 5624

lora_checkpoints/checkpoint-4600/README.md ADDED Viewed

	@@ -0,0 +1,209 @@

+---
+base_model: uaritm/gemma3_1b_med_qa_ru
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:uaritm/gemma3_1b_med_qa_ru
+- lora
+- sft
+- transformers
+- trl
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.17.1

lora_checkpoints/checkpoint-4600/adapter_config.json ADDED Viewed

	@@ -0,0 +1,42 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "uaritm/gemma3_1b_med_qa_ru",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "k_proj",
+    "o_proj",
+    "q_proj",
+    "gate_proj",
+    "down_proj",
+    "up_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

lora_checkpoints/checkpoint-4600/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:71ffe8eb65fbb93105908f266c3615953eb7f27d7b126202c9cbe9696d56a76a
+size 52231312

lora_checkpoints/checkpoint-4600/added_tokens.json ADDED Viewed

	@@ -0,0 +1,3 @@

+{
+  "<image_soft_token>": 262144
+}

lora_checkpoints/checkpoint-4600/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,47 @@

+{{ bos_token }}
+{%- if messages[0]['role'] == 'system' -%}
+    {%- if messages[0]['content'] is string -%}
+        {%- set first_user_prefix = messages[0]['content'] + '
+' -%}
+    {%- else -%}
+        {%- set first_user_prefix = messages[0]['content'][0]['text'] + '
+' -%}
+    {%- endif -%}
+    {%- set loop_messages = messages[1:] -%}
+{%- else -%}
+    {%- set first_user_prefix = "" -%}
+    {%- set loop_messages = messages -%}
+{%- endif -%}
+{%- for message in loop_messages -%}
+    {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
+        {{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }}
+    {%- endif -%}
+    {%- if (message['role'] == 'assistant') -%}
+        {%- set role = "model" -%}
+    {%- else -%}
+        {%- set role = message['role'] -%}
+    {%- endif -%}
+    {{ '<start_of_turn>' + role + '
+' + (first_user_prefix if loop.first else "") }}
+    {%- if message['content'] is string -%}
+        {{ message['content'] | trim }}
+    {%- elif message['content'] is iterable -%}
+        {%- for item in message['content'] -%}
+            {%- if item['type'] == 'image' -%}
+                {{ '<start_of_image>' }}
+            {%- elif item['type'] == 'text' -%}
+                {{ item['text'] | trim }}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- else -%}
+        {{ raise_exception("Invalid content type") }}
+    {%- endif -%}
+    {{ '<end_of_turn>
+' }}
+{%- endfor -%}
+{%- if add_generation_prompt -%}
+    {{'<start_of_turn>model
+'}}
+{%- endif -%}

lora_checkpoints/checkpoint-4600/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7fe9000272a4d0f562595a138a32bd4c6248f15178ca24507e8825653d6547da
+size 104671958

lora_checkpoints/checkpoint-4600/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bf68f7d2db510e0dece06020a6c1f3e492c8d7f7df52a36128ef0f01be2e4ddf
+size 14244

lora_checkpoints/checkpoint-4600/scaler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:15d60713e239aa461be03135bdbca99b2bfe14ea1d561b3ce05394a2a8b3b9e7
+size 988

lora_checkpoints/checkpoint-4600/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a249874e4ec66b917b0e2b9932feaf87c6ddf577951744ab797e8c3252e36163
+size 1064

lora_checkpoints/checkpoint-4600/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "boi_token": "<start_of_image>",
+  "bos_token": {
+    "content": "<bos>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eoi_token": "<end_of_image>",
+  "eos_token": {
+    "content": "<eos>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "image_token": "<image_soft_token>",
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

lora_checkpoints/checkpoint-4600/tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1299c11d7cf632ef3b4e11937501358ada021bbdf7c47638d13c0ee982f2e79c
+size 4689074

lora_checkpoints/checkpoint-4600/tokenizer_config.json ADDED Viewed

The diff for this file is too large to render. See raw diff

lora_checkpoints/checkpoint-4600/trainer_state.json ADDED Viewed

	@@ -0,0 +1,3254 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9434687860530702,
+  "eval_steps": 500,
+  "global_step": 4600,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.00205101910011537,
+      "grad_norm": 1.9277215003967285,
+      "learning_rate": 3.6885245901639347e-06,
+      "loss": 1.4306,
+      "step": 10
+    },
+    {
+      "epoch": 0.00410203820023074,
+      "grad_norm": 0.3513035476207733,
+      "learning_rate": 7.78688524590164e-06,
+      "loss": 1.3524,
+      "step": 20
+    },
+    {
+      "epoch": 0.006153057300346109,
+      "grad_norm": 0.3364648222923279,
+      "learning_rate": 1.1885245901639344e-05,
+      "loss": 1.3188,
+      "step": 30
+    },
+    {
+      "epoch": 0.00820407640046148,
+      "grad_norm": 0.3382512927055359,
+      "learning_rate": 1.598360655737705e-05,
+      "loss": 1.3418,
+      "step": 40
+    },
+    {
+      "epoch": 0.01025509550057685,
+      "grad_norm": 0.360334575176239,
+      "learning_rate": 2.0081967213114755e-05,
+      "loss": 1.3381,
+      "step": 50
+    },
+    {
+      "epoch": 0.012306114600692218,
+      "grad_norm": 0.3408481180667877,
+      "learning_rate": 2.418032786885246e-05,
+      "loss": 1.3365,
+      "step": 60
+    },
+    {
+      "epoch": 0.014357133700807588,
+      "grad_norm": 0.36211535334587097,
+      "learning_rate": 2.8278688524590162e-05,
+      "loss": 1.3314,
+      "step": 70
+    },
+    {
+      "epoch": 0.01640815280092296,
+      "grad_norm": 0.38704580068588257,
+      "learning_rate": 3.237704918032787e-05,
+      "loss": 1.3108,
+      "step": 80
+    },
+    {
+      "epoch": 0.018459171901038327,
+      "grad_norm": 0.44303640723228455,
+      "learning_rate": 3.6475409836065576e-05,
+      "loss": 1.3073,
+      "step": 90
+    },
+    {
+      "epoch": 0.0205101910011537,
+      "grad_norm": 0.4073602557182312,
+      "learning_rate": 4.057377049180328e-05,
+      "loss": 1.2993,
+      "step": 100
+    },
+    {
+      "epoch": 0.022561210101269068,
+      "grad_norm": 0.4478100538253784,
+      "learning_rate": 4.467213114754098e-05,
+      "loss": 1.3413,
+      "step": 110
+    },
+    {
+      "epoch": 0.024612229201384436,
+      "grad_norm": 0.39146170020103455,
+      "learning_rate": 4.8770491803278687e-05,
+      "loss": 1.3168,
+      "step": 120
+    },
+    {
+      "epoch": 0.026663248301499808,
+      "grad_norm": 0.3786431849002838,
+      "learning_rate": 5.28688524590164e-05,
+      "loss": 1.2774,
+      "step": 130
+    },
+    {
+      "epoch": 0.028714267401615177,
+      "grad_norm": 0.4014948904514313,
+      "learning_rate": 5.69672131147541e-05,
+      "loss": 1.346,
+      "step": 140
+    },
+    {
+      "epoch": 0.03076528650173055,
+      "grad_norm": 0.3987842798233032,
+      "learning_rate": 6.10655737704918e-05,
+      "loss": 1.2816,
+      "step": 150
+    },
+    {
+      "epoch": 0.03281630560184592,
+      "grad_norm": 0.3897082507610321,
+      "learning_rate": 6.516393442622951e-05,
+      "loss": 1.3485,
+      "step": 160
+    },
+    {
+      "epoch": 0.034867324701961286,
+      "grad_norm": 0.373279333114624,
+      "learning_rate": 6.926229508196722e-05,
+      "loss": 1.3185,
+      "step": 170
+    },
+    {
+      "epoch": 0.036918343802076654,
+      "grad_norm": 0.3812575340270996,
+      "learning_rate": 7.336065573770491e-05,
+      "loss": 1.3394,
+      "step": 180
+    },
+    {
+      "epoch": 0.03896936290219203,
+      "grad_norm": 0.35926997661590576,
+      "learning_rate": 7.745901639344263e-05,
+      "loss": 1.2821,
+      "step": 190
+    },
+    {
+      "epoch": 0.0410203820023074,
+      "grad_norm": 0.3649434745311737,
+      "learning_rate": 8.155737704918032e-05,
+      "loss": 1.33,
+      "step": 200
+    },
+    {
+      "epoch": 0.04307140110242277,
+      "grad_norm": 0.345662921667099,
+      "learning_rate": 8.565573770491803e-05,
+      "loss": 1.3107,
+      "step": 210
+    },
+    {
+      "epoch": 0.045122420202538135,
+      "grad_norm": 0.37169769406318665,
+      "learning_rate": 8.975409836065574e-05,
+      "loss": 1.309,
+      "step": 220
+    },
+    {
+      "epoch": 0.047173439302653504,
+      "grad_norm": 0.37920281291007996,
+      "learning_rate": 9.385245901639344e-05,
+      "loss": 1.3352,
+      "step": 230
+    },
+    {
+      "epoch": 0.04922445840276887,
+      "grad_norm": 0.35772770643234253,
+      "learning_rate": 9.795081967213115e-05,
+      "loss": 1.2402,
+      "step": 240
+    },
+    {
+      "epoch": 0.05127547750288425,
+      "grad_norm": 0.38790181279182434,
+      "learning_rate": 9.989205526770294e-05,
+      "loss": 1.326,
+      "step": 250
+    },
+    {
+      "epoch": 0.053326496602999617,
+      "grad_norm": 0.3545536696910858,
+      "learning_rate": 9.967616580310882e-05,
+      "loss": 1.3173,
+      "step": 260
+    },
+    {
+      "epoch": 0.055377515703114985,
+      "grad_norm": 0.3845142722129822,
+      "learning_rate": 9.946027633851469e-05,
+      "loss": 1.2949,
+      "step": 270
+    },
+    {
+      "epoch": 0.057428534803230354,
+      "grad_norm": 0.38621339201927185,
+      "learning_rate": 9.924438687392055e-05,
+      "loss": 1.2773,
+      "step": 280
+    },
+    {
+      "epoch": 0.05947955390334572,
+      "grad_norm": 0.38091301918029785,
+      "learning_rate": 9.902849740932643e-05,
+      "loss": 1.3282,
+      "step": 290
+    },
+    {
+      "epoch": 0.0615305730034611,
+      "grad_norm": 0.37546730041503906,
+      "learning_rate": 9.88126079447323e-05,
+      "loss": 1.2862,
+      "step": 300
+    },
+    {
+      "epoch": 0.06358159210357646,
+      "grad_norm": 0.3515011966228485,
+      "learning_rate": 9.859671848013817e-05,
+      "loss": 1.2937,
+      "step": 310
+    },
+    {
+      "epoch": 0.06563261120369183,
+      "grad_norm": 0.3863738775253296,
+      "learning_rate": 9.838082901554406e-05,
+      "loss": 1.3056,
+      "step": 320
+    },
+    {
+      "epoch": 0.06768363030380721,
+      "grad_norm": 0.36615240573883057,
+      "learning_rate": 9.816493955094992e-05,
+      "loss": 1.3062,
+      "step": 330
+    },
+    {
+      "epoch": 0.06973464940392257,
+      "grad_norm": 0.37741243839263916,
+      "learning_rate": 9.794905008635579e-05,
+      "loss": 1.3094,
+      "step": 340
+    },
+    {
+      "epoch": 0.07178566850403795,
+      "grad_norm": 0.38626739382743835,
+      "learning_rate": 9.773316062176167e-05,
+      "loss": 1.2947,
+      "step": 350
+    },
+    {
+      "epoch": 0.07383668760415331,
+      "grad_norm": 0.38667401671409607,
+      "learning_rate": 9.751727115716753e-05,
+      "loss": 1.2976,
+      "step": 360
+    },
+    {
+      "epoch": 0.07588770670426868,
+      "grad_norm": 0.36084800958633423,
+      "learning_rate": 9.730138169257342e-05,
+      "loss": 1.27,
+      "step": 370
+    },
+    {
+      "epoch": 0.07793872580438406,
+      "grad_norm": 0.3754425346851349,
+      "learning_rate": 9.708549222797928e-05,
+      "loss": 1.3243,
+      "step": 380
+    },
+    {
+      "epoch": 0.07998974490449942,
+      "grad_norm": 0.39857473969459534,
+      "learning_rate": 9.686960276338515e-05,
+      "loss": 1.3077,
+      "step": 390
+    },
+    {
+      "epoch": 0.0820407640046148,
+      "grad_norm": 0.3919648230075836,
+      "learning_rate": 9.665371329879103e-05,
+      "loss": 1.2985,
+      "step": 400
+    },
+    {
+      "epoch": 0.08409178310473016,
+      "grad_norm": 0.3675483465194702,
+      "learning_rate": 9.643782383419689e-05,
+      "loss": 1.2946,
+      "step": 410
+    },
+    {
+      "epoch": 0.08614280220484553,
+      "grad_norm": 0.3898465633392334,
+      "learning_rate": 9.622193436960277e-05,
+      "loss": 1.333,
+      "step": 420
+    },
+    {
+      "epoch": 0.08819382130496091,
+      "grad_norm": 0.3681259751319885,
+      "learning_rate": 9.600604490500864e-05,
+      "loss": 1.2968,
+      "step": 430
+    },
+    {
+      "epoch": 0.09024484040507627,
+      "grad_norm": 0.36453816294670105,
+      "learning_rate": 9.57901554404145e-05,
+      "loss": 1.272,
+      "step": 440
+    },
+    {
+      "epoch": 0.09229585950519165,
+      "grad_norm": 0.34828147292137146,
+      "learning_rate": 9.557426597582039e-05,
+      "loss": 1.3245,
+      "step": 450
+    },
+    {
+      "epoch": 0.09434687860530701,
+      "grad_norm": 0.3570501208305359,
+      "learning_rate": 9.535837651122625e-05,
+      "loss": 1.313,
+      "step": 460
+    },
+    {
+      "epoch": 0.09639789770542238,
+      "grad_norm": 0.36692506074905396,
+      "learning_rate": 9.514248704663213e-05,
+      "loss": 1.2915,
+      "step": 470
+    },
+    {
+      "epoch": 0.09844891680553775,
+      "grad_norm": 0.39161381125450134,
+      "learning_rate": 9.4926597582038e-05,
+      "loss": 1.3101,
+      "step": 480
+    },
+    {
+      "epoch": 0.10049993590565312,
+      "grad_norm": 0.3808858394622803,
+      "learning_rate": 9.471070811744387e-05,
+      "loss": 1.3099,
+      "step": 490
+    },
+    {
+      "epoch": 0.1025509550057685,
+      "grad_norm": 0.3541582524776459,
+      "learning_rate": 9.449481865284975e-05,
+      "loss": 1.2772,
+      "step": 500
+    },
+    {
+      "epoch": 0.10460197410588386,
+      "grad_norm": 0.379190593957901,
+      "learning_rate": 9.427892918825562e-05,
+      "loss": 1.2914,
+      "step": 510
+    },
+    {
+      "epoch": 0.10665299320599923,
+      "grad_norm": 0.37727421522140503,
+      "learning_rate": 9.406303972366149e-05,
+      "loss": 1.2888,
+      "step": 520
+    },
+    {
+      "epoch": 0.1087040123061146,
+      "grad_norm": 0.3787306845188141,
+      "learning_rate": 9.384715025906737e-05,
+      "loss": 1.3049,
+      "step": 530
+    },
+    {
+      "epoch": 0.11075503140622997,
+      "grad_norm": 0.3831459581851959,
+      "learning_rate": 9.363126079447323e-05,
+      "loss": 1.2631,
+      "step": 540
+    },
+    {
+      "epoch": 0.11280605050634535,
+      "grad_norm": 0.37274929881095886,
+      "learning_rate": 9.34153713298791e-05,
+      "loss": 1.3313,
+      "step": 550
+    },
+    {
+      "epoch": 0.11485706960646071,
+      "grad_norm": 0.3683277368545532,
+      "learning_rate": 9.319948186528498e-05,
+      "loss": 1.2528,
+      "step": 560
+    },
+    {
+      "epoch": 0.11690808870657608,
+      "grad_norm": 0.39554840326309204,
+      "learning_rate": 9.298359240069085e-05,
+      "loss": 1.2737,
+      "step": 570
+    },
+    {
+      "epoch": 0.11895910780669144,
+      "grad_norm": 0.39166760444641113,
+      "learning_rate": 9.276770293609673e-05,
+      "loss": 1.271,
+      "step": 580
+    },
+    {
+      "epoch": 0.12101012690680682,
+      "grad_norm": 0.384085476398468,
+      "learning_rate": 9.255181347150259e-05,
+      "loss": 1.2921,
+      "step": 590
+    },
+    {
+      "epoch": 0.1230611460069222,
+      "grad_norm": 0.3704201281070709,
+      "learning_rate": 9.233592400690847e-05,
+      "loss": 1.2776,
+      "step": 600
+    },
+    {
+      "epoch": 0.12511216510703757,
+      "grad_norm": 0.3844301998615265,
+      "learning_rate": 9.212003454231434e-05,
+      "loss": 1.3067,
+      "step": 610
+    },
+    {
+      "epoch": 0.12716318420715292,
+      "grad_norm": 0.3971571922302246,
+      "learning_rate": 9.190414507772022e-05,
+      "loss": 1.2792,
+      "step": 620
+    },
+    {
+      "epoch": 0.1292142033072683,
+      "grad_norm": 0.40666353702545166,
+      "learning_rate": 9.168825561312608e-05,
+      "loss": 1.2964,
+      "step": 630
+    },
+    {
+      "epoch": 0.13126522240738367,
+      "grad_norm": 0.38252532482147217,
+      "learning_rate": 9.147236614853195e-05,
+      "loss": 1.2815,
+      "step": 640
+    },
+    {
+      "epoch": 0.13331624150749904,
+      "grad_norm": 0.37795621156692505,
+      "learning_rate": 9.125647668393783e-05,
+      "loss": 1.283,
+      "step": 650
+    },
+    {
+      "epoch": 0.13536726060761442,
+      "grad_norm": 0.4035683572292328,
+      "learning_rate": 9.10405872193437e-05,
+      "loss": 1.288,
+      "step": 660
+    },
+    {
+      "epoch": 0.13741827970772977,
+      "grad_norm": 0.410669207572937,
+      "learning_rate": 9.082469775474958e-05,
+      "loss": 1.2659,
+      "step": 670
+    },
+    {
+      "epoch": 0.13946929880784514,
+      "grad_norm": 0.3809865713119507,
+      "learning_rate": 9.060880829015544e-05,
+      "loss": 1.3133,
+      "step": 680
+    },
+    {
+      "epoch": 0.14152031790796052,
+      "grad_norm": 0.3748447597026825,
+      "learning_rate": 9.039291882556131e-05,
+      "loss": 1.2643,
+      "step": 690
+    },
+    {
+      "epoch": 0.1435713370080759,
+      "grad_norm": 0.39292991161346436,
+      "learning_rate": 9.017702936096719e-05,
+      "loss": 1.2855,
+      "step": 700
+    },
+    {
+      "epoch": 0.14562235610819127,
+      "grad_norm": 0.4399755001068115,
+      "learning_rate": 8.996113989637307e-05,
+      "loss": 1.286,
+      "step": 710
+    },
+    {
+      "epoch": 0.14767337520830662,
+      "grad_norm": 0.42447429895401,
+      "learning_rate": 8.974525043177894e-05,
+      "loss": 1.2736,
+      "step": 720
+    },
+    {
+      "epoch": 0.149724394308422,
+      "grad_norm": 0.37248438596725464,
+      "learning_rate": 8.95293609671848e-05,
+      "loss": 1.2652,
+      "step": 730
+    },
+    {
+      "epoch": 0.15177541340853737,
+      "grad_norm": 0.39122238755226135,
+      "learning_rate": 8.931347150259068e-05,
+      "loss": 1.2814,
+      "step": 740
+    },
+    {
+      "epoch": 0.15382643250865274,
+      "grad_norm": 0.3697800040245056,
+      "learning_rate": 8.909758203799655e-05,
+      "loss": 1.2462,
+      "step": 750
+    },
+    {
+      "epoch": 0.15587745160876812,
+      "grad_norm": 0.3901929259300232,
+      "learning_rate": 8.888169257340241e-05,
+      "loss": 1.2742,
+      "step": 760
+    },
+    {
+      "epoch": 0.15792847070888347,
+      "grad_norm": 0.3833727538585663,
+      "learning_rate": 8.86658031088083e-05,
+      "loss": 1.3015,
+      "step": 770
+    },
+    {
+      "epoch": 0.15997948980899884,
+      "grad_norm": 0.4028802216053009,
+      "learning_rate": 8.844991364421416e-05,
+      "loss": 1.2631,
+      "step": 780
+    },
+    {
+      "epoch": 0.16203050890911422,
+      "grad_norm": 0.39087918400764465,
+      "learning_rate": 8.823402417962004e-05,
+      "loss": 1.2993,
+      "step": 790
+    },
+    {
+      "epoch": 0.1640815280092296,
+      "grad_norm": 0.39453235268592834,
+      "learning_rate": 8.801813471502591e-05,
+      "loss": 1.2544,
+      "step": 800
+    },
+    {
+      "epoch": 0.16613254710934497,
+      "grad_norm": 0.42142602801322937,
+      "learning_rate": 8.780224525043178e-05,
+      "loss": 1.2676,
+      "step": 810
+    },
+    {
+      "epoch": 0.16818356620946032,
+      "grad_norm": 0.36646899580955505,
+      "learning_rate": 8.758635578583767e-05,
+      "loss": 1.2765,
+      "step": 820
+    },
+    {
+      "epoch": 0.1702345853095757,
+      "grad_norm": 0.4253019094467163,
+      "learning_rate": 8.737046632124353e-05,
+      "loss": 1.3003,
+      "step": 830
+    },
+    {
+      "epoch": 0.17228560440969107,
+      "grad_norm": 0.41490674018859863,
+      "learning_rate": 8.715457685664939e-05,
+      "loss": 1.2731,
+      "step": 840
+    },
+    {
+      "epoch": 0.17433662350980644,
+      "grad_norm": 0.405460387468338,
+      "learning_rate": 8.693868739205528e-05,
+      "loss": 1.2122,
+      "step": 850
+    },
+    {
+      "epoch": 0.17638764260992182,
+      "grad_norm": 0.4028235375881195,
+      "learning_rate": 8.672279792746114e-05,
+      "loss": 1.3238,
+      "step": 860
+    },
+    {
+      "epoch": 0.17843866171003717,
+      "grad_norm": 0.38994792103767395,
+      "learning_rate": 8.650690846286701e-05,
+      "loss": 1.2875,
+      "step": 870
+    },
+    {
+      "epoch": 0.18048968081015254,
+      "grad_norm": 0.4099538326263428,
+      "learning_rate": 8.629101899827289e-05,
+      "loss": 1.2807,
+      "step": 880
+    },
+    {
+      "epoch": 0.18254069991026792,
+      "grad_norm": 0.40470021963119507,
+      "learning_rate": 8.607512953367875e-05,
+      "loss": 1.2802,
+      "step": 890
+    },
+    {
+      "epoch": 0.1845917190103833,
+      "grad_norm": 0.4066854417324066,
+      "learning_rate": 8.585924006908464e-05,
+      "loss": 1.2464,
+      "step": 900
+    },
+    {
+      "epoch": 0.18664273811049864,
+      "grad_norm": 0.38739994168281555,
+      "learning_rate": 8.56433506044905e-05,
+      "loss": 1.2831,
+      "step": 910
+    },
+    {
+      "epoch": 0.18869375721061402,
+      "grad_norm": 0.4257420301437378,
+      "learning_rate": 8.542746113989638e-05,
+      "loss": 1.2679,
+      "step": 920
+    },
+    {
+      "epoch": 0.1907447763107294,
+      "grad_norm": 0.41571488976478577,
+      "learning_rate": 8.521157167530225e-05,
+      "loss": 1.2501,
+      "step": 930
+    },
+    {
+      "epoch": 0.19279579541084477,
+      "grad_norm": 0.4178495407104492,
+      "learning_rate": 8.499568221070811e-05,
+      "loss": 1.2657,
+      "step": 940
+    },
+    {
+      "epoch": 0.19484681451096014,
+      "grad_norm": 0.4083455801010132,
+      "learning_rate": 8.477979274611399e-05,
+      "loss": 1.2781,
+      "step": 950
+    },
+    {
+      "epoch": 0.1968978336110755,
+      "grad_norm": 0.4067554175853729,
+      "learning_rate": 8.456390328151986e-05,
+      "loss": 1.2582,
+      "step": 960
+    },
+    {
+      "epoch": 0.19894885271119087,
+      "grad_norm": 0.4067447781562805,
+      "learning_rate": 8.434801381692574e-05,
+      "loss": 1.2948,
+      "step": 970
+    },
+    {
+      "epoch": 0.20099987181130624,
+      "grad_norm": 0.44283562898635864,
+      "learning_rate": 8.413212435233161e-05,
+      "loss": 1.3011,
+      "step": 980
+    },
+    {
+      "epoch": 0.20305089091142162,
+      "grad_norm": 0.41568294167518616,
+      "learning_rate": 8.391623488773748e-05,
+      "loss": 1.2804,
+      "step": 990
+    },
+    {
+      "epoch": 0.205101910011537,
+      "grad_norm": 0.4183642864227295,
+      "learning_rate": 8.370034542314335e-05,
+      "loss": 1.2228,
+      "step": 1000
+    },
+    {
+      "epoch": 0.20715292911165234,
+      "grad_norm": 0.4311917722225189,
+      "learning_rate": 8.348445595854923e-05,
+      "loss": 1.2714,
+      "step": 1010
+    },
+    {
+      "epoch": 0.20920394821176772,
+      "grad_norm": 0.41575828194618225,
+      "learning_rate": 8.32685664939551e-05,
+      "loss": 1.2783,
+      "step": 1020
+    },
+    {
+      "epoch": 0.2112549673118831,
+      "grad_norm": 0.3958878815174103,
+      "learning_rate": 8.305267702936098e-05,
+      "loss": 1.2558,
+      "step": 1030
+    },
+    {
+      "epoch": 0.21330598641199847,
+      "grad_norm": 0.43759557604789734,
+      "learning_rate": 8.283678756476684e-05,
+      "loss": 1.2557,
+      "step": 1040
+    },
+    {
+      "epoch": 0.21535700551211384,
+      "grad_norm": 0.41460636258125305,
+      "learning_rate": 8.262089810017271e-05,
+      "loss": 1.2851,
+      "step": 1050
+    },
+    {
+      "epoch": 0.2174080246122292,
+      "grad_norm": 0.4114689826965332,
+      "learning_rate": 8.240500863557859e-05,
+      "loss": 1.3076,
+      "step": 1060
+    },
+    {
+      "epoch": 0.21945904371234456,
+      "grad_norm": 0.42222094535827637,
+      "learning_rate": 8.218911917098446e-05,
+      "loss": 1.2263,
+      "step": 1070
+    },
+    {
+      "epoch": 0.22151006281245994,
+      "grad_norm": 0.4098639488220215,
+      "learning_rate": 8.197322970639033e-05,
+      "loss": 1.2779,
+      "step": 1080
+    },
+    {
+      "epoch": 0.22356108191257532,
+      "grad_norm": 0.4205043315887451,
+      "learning_rate": 8.175734024179621e-05,
+      "loss": 1.2177,
+      "step": 1090
+    },
+    {
+      "epoch": 0.2256121010126907,
+      "grad_norm": 0.4501648247241974,
+      "learning_rate": 8.154145077720208e-05,
+      "loss": 1.3227,
+      "step": 1100
+    },
+    {
+      "epoch": 0.22766312011280604,
+      "grad_norm": 0.41510599851608276,
+      "learning_rate": 8.132556131260795e-05,
+      "loss": 1.3177,
+      "step": 1110
+    },
+    {
+      "epoch": 0.22971413921292141,
+      "grad_norm": 0.41567444801330566,
+      "learning_rate": 8.110967184801383e-05,
+      "loss": 1.2506,
+      "step": 1120
+    },
+    {
+      "epoch": 0.2317651583130368,
+      "grad_norm": 0.4262779653072357,
+      "learning_rate": 8.089378238341969e-05,
+      "loss": 1.2506,
+      "step": 1130
+    },
+    {
+      "epoch": 0.23381617741315217,
+      "grad_norm": 0.4220465421676636,
+      "learning_rate": 8.067789291882558e-05,
+      "loss": 1.2514,
+      "step": 1140
+    },
+    {
+      "epoch": 0.23586719651326754,
+      "grad_norm": 0.4169275462627411,
+      "learning_rate": 8.046200345423144e-05,
+      "loss": 1.2693,
+      "step": 1150
+    },
+    {
+      "epoch": 0.2379182156133829,
+      "grad_norm": 0.43145328760147095,
+      "learning_rate": 8.02461139896373e-05,
+      "loss": 1.2394,
+      "step": 1160
+    },
+    {
+      "epoch": 0.23996923471349826,
+      "grad_norm": 0.42889878153800964,
+      "learning_rate": 8.003022452504319e-05,
+      "loss": 1.248,
+      "step": 1170
+    },
+    {
+      "epoch": 0.24202025381361364,
+      "grad_norm": 0.41731464862823486,
+      "learning_rate": 7.981433506044905e-05,
+      "loss": 1.2498,
+      "step": 1180
+    },
+    {
+      "epoch": 0.24407127291372901,
+      "grad_norm": 0.4326362609863281,
+      "learning_rate": 7.959844559585493e-05,
+      "loss": 1.265,
+      "step": 1190
+    },
+    {
+      "epoch": 0.2461222920138444,
+      "grad_norm": 0.4242352843284607,
+      "learning_rate": 7.93825561312608e-05,
+      "loss": 1.2672,
+      "step": 1200
+    },
+    {
+      "epoch": 0.24817331111395974,
+      "grad_norm": 0.4441153407096863,
+      "learning_rate": 7.916666666666666e-05,
+      "loss": 1.2944,
+      "step": 1210
+    },
+    {
+      "epoch": 0.25022433021407514,
+      "grad_norm": 0.40912818908691406,
+      "learning_rate": 7.895077720207255e-05,
+      "loss": 1.2702,
+      "step": 1220
+    },
+    {
+      "epoch": 0.2522753493141905,
+      "grad_norm": 0.44539037346839905,
+      "learning_rate": 7.873488773747841e-05,
+      "loss": 1.2228,
+      "step": 1230
+    },
+    {
+      "epoch": 0.25432636841430584,
+      "grad_norm": 0.4299303889274597,
+      "learning_rate": 7.851899827288429e-05,
+      "loss": 1.2328,
+      "step": 1240
+    },
+    {
+      "epoch": 0.25637738751442124,
+      "grad_norm": 0.4408973455429077,
+      "learning_rate": 7.830310880829016e-05,
+      "loss": 1.2358,
+      "step": 1250
+    },
+    {
+      "epoch": 0.2584284066145366,
+      "grad_norm": 0.4100968837738037,
+      "learning_rate": 7.808721934369602e-05,
+      "loss": 1.2458,
+      "step": 1260
+    },
+    {
+      "epoch": 0.260479425714652,
+      "grad_norm": 0.4401489198207855,
+      "learning_rate": 7.787132987910191e-05,
+      "loss": 1.2593,
+      "step": 1270
+    },
+    {
+      "epoch": 0.26253044481476734,
+      "grad_norm": 0.4514229893684387,
+      "learning_rate": 7.765544041450777e-05,
+      "loss": 1.2632,
+      "step": 1280
+    },
+    {
+      "epoch": 0.2645814639148827,
+      "grad_norm": 0.38684791326522827,
+      "learning_rate": 7.743955094991365e-05,
+      "loss": 1.2424,
+      "step": 1290
+    },
+    {
+      "epoch": 0.2666324830149981,
+      "grad_norm": 0.46148189902305603,
+      "learning_rate": 7.722366148531953e-05,
+      "loss": 1.2445,
+      "step": 1300
+    },
+    {
+      "epoch": 0.26868350211511344,
+      "grad_norm": 0.4319213628768921,
+      "learning_rate": 7.700777202072539e-05,
+      "loss": 1.2253,
+      "step": 1310
+    },
+    {
+      "epoch": 0.27073452121522884,
+      "grad_norm": 0.4195545017719269,
+      "learning_rate": 7.679188255613126e-05,
+      "loss": 1.2578,
+      "step": 1320
+    },
+    {
+      "epoch": 0.2727855403153442,
+      "grad_norm": 0.43690159916877747,
+      "learning_rate": 7.657599309153714e-05,
+      "loss": 1.2573,
+      "step": 1330
+    },
+    {
+      "epoch": 0.27483655941545954,
+      "grad_norm": 0.44571492075920105,
+      "learning_rate": 7.636010362694301e-05,
+      "loss": 1.2607,
+      "step": 1340
+    },
+    {
+      "epoch": 0.27688757851557494,
+      "grad_norm": 0.43295958638191223,
+      "learning_rate": 7.614421416234889e-05,
+      "loss": 1.2278,
+      "step": 1350
+    },
+    {
+      "epoch": 0.2789385976156903,
+      "grad_norm": 0.44495707750320435,
+      "learning_rate": 7.592832469775475e-05,
+      "loss": 1.2798,
+      "step": 1360
+    },
+    {
+      "epoch": 0.2809896167158057,
+      "grad_norm": 0.4412330985069275,
+      "learning_rate": 7.571243523316062e-05,
+      "loss": 1.2501,
+      "step": 1370
+    },
+    {
+      "epoch": 0.28304063581592104,
+      "grad_norm": 0.44599953293800354,
+      "learning_rate": 7.54965457685665e-05,
+      "loss": 1.2396,
+      "step": 1380
+    },
+    {
+      "epoch": 0.2850916549160364,
+      "grad_norm": 0.447109580039978,
+      "learning_rate": 7.528065630397237e-05,
+      "loss": 1.2767,
+      "step": 1390
+    },
+    {
+      "epoch": 0.2871426740161518,
+      "grad_norm": 0.44506722688674927,
+      "learning_rate": 7.506476683937824e-05,
+      "loss": 1.2546,
+      "step": 1400
+    },
+    {
+      "epoch": 0.28919369311626714,
+      "grad_norm": 0.44061776995658875,
+      "learning_rate": 7.484887737478411e-05,
+      "loss": 1.2413,
+      "step": 1410
+    },
+    {
+      "epoch": 0.29124471221638254,
+      "grad_norm": 0.45085111260414124,
+      "learning_rate": 7.463298791018999e-05,
+      "loss": 1.2483,
+      "step": 1420
+    },
+    {
+      "epoch": 0.2932957313164979,
+      "grad_norm": 0.4437837600708008,
+      "learning_rate": 7.441709844559586e-05,
+      "loss": 1.252,
+      "step": 1430
+    },
+    {
+      "epoch": 0.29534675041661324,
+      "grad_norm": 0.4294221103191376,
+      "learning_rate": 7.420120898100174e-05,
+      "loss": 1.2386,
+      "step": 1440
+    },
+    {
+      "epoch": 0.29739776951672864,
+      "grad_norm": 0.4780830144882202,
+      "learning_rate": 7.39853195164076e-05,
+      "loss": 1.2639,
+      "step": 1450
+    },
+    {
+      "epoch": 0.299448788616844,
+      "grad_norm": 0.44152942299842834,
+      "learning_rate": 7.376943005181347e-05,
+      "loss": 1.2756,
+      "step": 1460
+    },
+    {
+      "epoch": 0.3014998077169594,
+      "grad_norm": 0.41989192366600037,
+      "learning_rate": 7.355354058721935e-05,
+      "loss": 1.2614,
+      "step": 1470
+    },
+    {
+      "epoch": 0.30355082681707474,
+      "grad_norm": 0.5871754884719849,
+      "learning_rate": 7.333765112262521e-05,
+      "loss": 1.2615,
+      "step": 1480
+    },
+    {
+      "epoch": 0.3056018459171901,
+      "grad_norm": 0.4467261731624603,
+      "learning_rate": 7.31217616580311e-05,
+      "loss": 1.2624,
+      "step": 1490
+    },
+    {
+      "epoch": 0.3076528650173055,
+      "grad_norm": 0.49219033122062683,
+      "learning_rate": 7.290587219343696e-05,
+      "loss": 1.289,
+      "step": 1500
+    },
+    {
+      "epoch": 0.30970388411742084,
+      "grad_norm": 0.4700734317302704,
+      "learning_rate": 7.268998272884284e-05,
+      "loss": 1.242,
+      "step": 1510
+    },
+    {
+      "epoch": 0.31175490321753624,
+      "grad_norm": 0.4607170820236206,
+      "learning_rate": 7.247409326424871e-05,
+      "loss": 1.2554,
+      "step": 1520
+    },
+    {
+      "epoch": 0.3138059223176516,
+      "grad_norm": 0.4335988759994507,
+      "learning_rate": 7.225820379965457e-05,
+      "loss": 1.2423,
+      "step": 1530
+    },
+    {
+      "epoch": 0.31585694141776693,
+      "grad_norm": 0.4366897940635681,
+      "learning_rate": 7.204231433506046e-05,
+      "loss": 1.2219,
+      "step": 1540
+    },
+    {
+      "epoch": 0.31790796051788234,
+      "grad_norm": 0.45856085419654846,
+      "learning_rate": 7.182642487046632e-05,
+      "loss": 1.2189,
+      "step": 1550
+    },
+    {
+      "epoch": 0.3199589796179977,
+      "grad_norm": 0.4563063085079193,
+      "learning_rate": 7.16105354058722e-05,
+      "loss": 1.2696,
+      "step": 1560
+    },
+    {
+      "epoch": 0.3220099987181131,
+      "grad_norm": 0.4276934862136841,
+      "learning_rate": 7.139464594127807e-05,
+      "loss": 1.2659,
+      "step": 1570
+    },
+    {
+      "epoch": 0.32406101781822844,
+      "grad_norm": 0.46200886368751526,
+      "learning_rate": 7.117875647668394e-05,
+      "loss": 1.2261,
+      "step": 1580
+    },
+    {
+      "epoch": 0.3261120369183438,
+      "grad_norm": 0.4863358736038208,
+      "learning_rate": 7.096286701208982e-05,
+      "loss": 1.2292,
+      "step": 1590
+    },
+    {
+      "epoch": 0.3281630560184592,
+      "grad_norm": 0.4537160098552704,
+      "learning_rate": 7.074697754749569e-05,
+      "loss": 1.2453,
+      "step": 1600
+    },
+    {
+      "epoch": 0.33021407511857453,
+      "grad_norm": 0.4507627487182617,
+      "learning_rate": 7.053108808290155e-05,
+      "loss": 1.2081,
+      "step": 1610
+    },
+    {
+      "epoch": 0.33226509421868994,
+      "grad_norm": 0.43197301030158997,
+      "learning_rate": 7.031519861830744e-05,
+      "loss": 1.2757,
+      "step": 1620
+    },
+    {
+      "epoch": 0.3343161133188053,
+      "grad_norm": 0.4551820456981659,
+      "learning_rate": 7.00993091537133e-05,
+      "loss": 1.2751,
+      "step": 1630
+    },
+    {
+      "epoch": 0.33636713241892063,
+      "grad_norm": 0.45099398493766785,
+      "learning_rate": 6.988341968911917e-05,
+      "loss": 1.2583,
+      "step": 1640
+    },
+    {
+      "epoch": 0.33841815151903604,
+      "grad_norm": 0.46787434816360474,
+      "learning_rate": 6.966753022452505e-05,
+      "loss": 1.2448,
+      "step": 1650
+    },
+    {
+      "epoch": 0.3404691706191514,
+      "grad_norm": 0.45500054955482483,
+      "learning_rate": 6.945164075993091e-05,
+      "loss": 1.2394,
+      "step": 1660
+    },
+    {
+      "epoch": 0.3425201897192668,
+      "grad_norm": 0.4682730436325073,
+      "learning_rate": 6.92357512953368e-05,
+      "loss": 1.2287,
+      "step": 1670
+    },
+    {
+      "epoch": 0.34457120881938214,
+      "grad_norm": 0.4615074396133423,
+      "learning_rate": 6.901986183074266e-05,
+      "loss": 1.2042,
+      "step": 1680
+    },
+    {
+      "epoch": 0.3466222279194975,
+      "grad_norm": 0.4548027217388153,
+      "learning_rate": 6.880397236614854e-05,
+      "loss": 1.2671,
+      "step": 1690
+    },
+    {
+      "epoch": 0.3486732470196129,
+      "grad_norm": 0.4783169627189636,
+      "learning_rate": 6.858808290155441e-05,
+      "loss": 1.2533,
+      "step": 1700
+    },
+    {
+      "epoch": 0.35072426611972823,
+      "grad_norm": 0.46452414989471436,
+      "learning_rate": 6.837219343696027e-05,
+      "loss": 1.2681,
+      "step": 1710
+    },
+    {
+      "epoch": 0.35277528521984364,
+      "grad_norm": 0.4663463532924652,
+      "learning_rate": 6.815630397236615e-05,
+      "loss": 1.2561,
+      "step": 1720
+    },
+    {
+      "epoch": 0.354826304319959,
+      "grad_norm": 0.46744370460510254,
+      "learning_rate": 6.794041450777202e-05,
+      "loss": 1.2453,
+      "step": 1730
+    },
+    {
+      "epoch": 0.35687732342007433,
+      "grad_norm": 0.471835732460022,
+      "learning_rate": 6.77245250431779e-05,
+      "loss": 1.2472,
+      "step": 1740
+    },
+    {
+      "epoch": 0.35892834252018974,
+      "grad_norm": 0.4618450701236725,
+      "learning_rate": 6.750863557858377e-05,
+      "loss": 1.2547,
+      "step": 1750
+    },
+    {
+      "epoch": 0.3609793616203051,
+      "grad_norm": 0.4651658833026886,
+      "learning_rate": 6.729274611398963e-05,
+      "loss": 1.2623,
+      "step": 1760
+    },
+    {
+      "epoch": 0.36303038072042043,
+      "grad_norm": 0.46842116117477417,
+      "learning_rate": 6.707685664939551e-05,
+      "loss": 1.2391,
+      "step": 1770
+    },
+    {
+      "epoch": 0.36508139982053583,
+      "grad_norm": 0.45604613423347473,
+      "learning_rate": 6.686096718480138e-05,
+      "loss": 1.2884,
+      "step": 1780
+    },
+    {
+      "epoch": 0.3671324189206512,
+      "grad_norm": 0.4306802451610565,
+      "learning_rate": 6.664507772020726e-05,
+      "loss": 1.2252,
+      "step": 1790
+    },
+    {
+      "epoch": 0.3691834380207666,
+      "grad_norm": 0.4549136757850647,
+      "learning_rate": 6.642918825561312e-05,
+      "loss": 1.2496,
+      "step": 1800
+    },
+    {
+      "epoch": 0.37123445712088193,
+      "grad_norm": 0.47443437576293945,
+      "learning_rate": 6.6213298791019e-05,
+      "loss": 1.2655,
+      "step": 1810
+    },
+    {
+      "epoch": 0.3732854762209973,
+      "grad_norm": 0.46772050857543945,
+      "learning_rate": 6.599740932642487e-05,
+      "loss": 1.2366,
+      "step": 1820
+    },
+    {
+      "epoch": 0.3753364953211127,
+      "grad_norm": 0.4691794216632843,
+      "learning_rate": 6.578151986183075e-05,
+      "loss": 1.2152,
+      "step": 1830
+    },
+    {
+      "epoch": 0.37738751442122803,
+      "grad_norm": 0.43691304326057434,
+      "learning_rate": 6.556563039723662e-05,
+      "loss": 1.2511,
+      "step": 1840
+    },
+    {
+      "epoch": 0.37943853352134344,
+      "grad_norm": 0.4595348536968231,
+      "learning_rate": 6.534974093264248e-05,
+      "loss": 1.2635,
+      "step": 1850
+    },
+    {
+      "epoch": 0.3814895526214588,
+      "grad_norm": 0.44760558009147644,
+      "learning_rate": 6.513385146804836e-05,
+      "loss": 1.2342,
+      "step": 1860
+    },
+    {
+      "epoch": 0.38354057172157413,
+      "grad_norm": 0.4559841454029083,
+      "learning_rate": 6.491796200345423e-05,
+      "loss": 1.2432,
+      "step": 1870
+    },
+    {
+      "epoch": 0.38559159082168953,
+      "grad_norm": 0.4497215449810028,
+      "learning_rate": 6.470207253886011e-05,
+      "loss": 1.2267,
+      "step": 1880
+    },
+    {
+      "epoch": 0.3876426099218049,
+      "grad_norm": 0.4863613247871399,
+      "learning_rate": 6.448618307426598e-05,
+      "loss": 1.254,
+      "step": 1890
+    },
+    {
+      "epoch": 0.3896936290219203,
+      "grad_norm": 0.4500603675842285,
+      "learning_rate": 6.427029360967185e-05,
+      "loss": 1.2214,
+      "step": 1900
+    },
+    {
+      "epoch": 0.39174464812203563,
+      "grad_norm": 0.4400598704814911,
+      "learning_rate": 6.405440414507774e-05,
+      "loss": 1.2352,
+      "step": 1910
+    },
+    {
+      "epoch": 0.393795667222151,
+      "grad_norm": 0.46070367097854614,
+      "learning_rate": 6.38385146804836e-05,
+      "loss": 1.2468,
+      "step": 1920
+    },
+    {
+      "epoch": 0.3958466863222664,
+      "grad_norm": 0.44312766194343567,
+      "learning_rate": 6.362262521588946e-05,
+      "loss": 1.1923,
+      "step": 1930
+    },
+    {
+      "epoch": 0.39789770542238173,
+      "grad_norm": 0.5013573169708252,
+      "learning_rate": 6.340673575129535e-05,
+      "loss": 1.2361,
+      "step": 1940
+    },
+    {
+      "epoch": 0.39994872452249713,
+      "grad_norm": 0.4884537160396576,
+      "learning_rate": 6.319084628670121e-05,
+      "loss": 1.2434,
+      "step": 1950
+    },
+    {
+      "epoch": 0.4019997436226125,
+      "grad_norm": 0.46138620376586914,
+      "learning_rate": 6.297495682210708e-05,
+      "loss": 1.257,
+      "step": 1960
+    },
+    {
+      "epoch": 0.40405076272272783,
+      "grad_norm": 0.4941729009151459,
+      "learning_rate": 6.275906735751296e-05,
+      "loss": 1.2347,
+      "step": 1970
+    },
+    {
+      "epoch": 0.40610178182284323,
+      "grad_norm": 0.4675595760345459,
+      "learning_rate": 6.254317789291882e-05,
+      "loss": 1.2353,
+      "step": 1980
+    },
+    {
+      "epoch": 0.4081528009229586,
+      "grad_norm": 0.47944632172584534,
+      "learning_rate": 6.232728842832471e-05,
+      "loss": 1.2643,
+      "step": 1990
+    },
+    {
+      "epoch": 0.410203820023074,
+      "grad_norm": 0.4476461112499237,
+      "learning_rate": 6.211139896373057e-05,
+      "loss": 1.2558,
+      "step": 2000
+    },
+    {
+      "epoch": 0.41225483912318933,
+      "grad_norm": 0.4706653654575348,
+      "learning_rate": 6.189550949913645e-05,
+      "loss": 1.227,
+      "step": 2010
+    },
+    {
+      "epoch": 0.4143058582233047,
+      "grad_norm": 0.48062801361083984,
+      "learning_rate": 6.167962003454232e-05,
+      "loss": 1.2273,
+      "step": 2020
+    },
+    {
+      "epoch": 0.4163568773234201,
+      "grad_norm": 0.46771204471588135,
+      "learning_rate": 6.146373056994818e-05,
+      "loss": 1.2268,
+      "step": 2030
+    },
+    {
+      "epoch": 0.41840789642353543,
+      "grad_norm": 0.4725424647331238,
+      "learning_rate": 6.124784110535406e-05,
+      "loss": 1.2009,
+      "step": 2040
+    },
+    {
+      "epoch": 0.42045891552365083,
+      "grad_norm": 0.47520384192466736,
+      "learning_rate": 6.1031951640759934e-05,
+      "loss": 1.2511,
+      "step": 2050
+    },
+    {
+      "epoch": 0.4225099346237662,
+      "grad_norm": 0.44635480642318726,
+      "learning_rate": 6.081606217616581e-05,
+      "loss": 1.21,
+      "step": 2060
+    },
+    {
+      "epoch": 0.42456095372388153,
+      "grad_norm": 0.47436651587486267,
+      "learning_rate": 6.060017271157168e-05,
+      "loss": 1.2116,
+      "step": 2070
+    },
+    {
+      "epoch": 0.42661197282399693,
+      "grad_norm": 0.5115741491317749,
+      "learning_rate": 6.0384283246977546e-05,
+      "loss": 1.2778,
+      "step": 2080
+    },
+    {
+      "epoch": 0.4286629919241123,
+      "grad_norm": 0.4488040506839752,
+      "learning_rate": 6.016839378238343e-05,
+      "loss": 1.2242,
+      "step": 2090
+    },
+    {
+      "epoch": 0.4307140110242277,
+      "grad_norm": 0.4834796190261841,
+      "learning_rate": 5.9952504317789296e-05,
+      "loss": 1.2357,
+      "step": 2100
+    },
+    {
+      "epoch": 0.43276503012434303,
+      "grad_norm": 0.45478227734565735,
+      "learning_rate": 5.973661485319517e-05,
+      "loss": 1.2233,
+      "step": 2110
+    },
+    {
+      "epoch": 0.4348160492244584,
+      "grad_norm": 0.4539099633693695,
+      "learning_rate": 5.952072538860104e-05,
+      "loss": 1.2527,
+      "step": 2120
+    },
+    {
+      "epoch": 0.4368670683245738,
+      "grad_norm": 0.47722533345222473,
+      "learning_rate": 5.930483592400691e-05,
+      "loss": 1.2015,
+      "step": 2130
+    },
+    {
+      "epoch": 0.43891808742468913,
+      "grad_norm": 0.472023069858551,
+      "learning_rate": 5.908894645941278e-05,
+      "loss": 1.2222,
+      "step": 2140
+    },
+    {
+      "epoch": 0.44096910652480453,
+      "grad_norm": 0.4648214876651764,
+      "learning_rate": 5.887305699481865e-05,
+      "loss": 1.2112,
+      "step": 2150
+    },
+    {
+      "epoch": 0.4430201256249199,
+      "grad_norm": 0.48654377460479736,
+      "learning_rate": 5.8657167530224534e-05,
+      "loss": 1.227,
+      "step": 2160
+    },
+    {
+      "epoch": 0.44507114472503523,
+      "grad_norm": 0.4997814893722534,
+      "learning_rate": 5.84412780656304e-05,
+      "loss": 1.2721,
+      "step": 2170
+    },
+    {
+      "epoch": 0.44712216382515063,
+      "grad_norm": 0.47997352480888367,
+      "learning_rate": 5.822538860103627e-05,
+      "loss": 1.2018,
+      "step": 2180
+    },
+    {
+      "epoch": 0.449173182925266,
+      "grad_norm": 0.4899247884750366,
+      "learning_rate": 5.8009499136442146e-05,
+      "loss": 1.2599,
+      "step": 2190
+    },
+    {
+      "epoch": 0.4512242020253814,
+      "grad_norm": 0.4752749800682068,
+      "learning_rate": 5.7793609671848014e-05,
+      "loss": 1.2171,
+      "step": 2200
+    },
+    {
+      "epoch": 0.45327522112549673,
+      "grad_norm": 0.4801314175128937,
+      "learning_rate": 5.7577720207253896e-05,
+      "loss": 1.2234,
+      "step": 2210
+    },
+    {
+      "epoch": 0.4553262402256121,
+      "grad_norm": 0.4591893255710602,
+      "learning_rate": 5.7361830742659764e-05,
+      "loss": 1.2242,
+      "step": 2220
+    },
+    {
+      "epoch": 0.4573772593257275,
+      "grad_norm": 0.46896713972091675,
+      "learning_rate": 5.7145941278065626e-05,
+      "loss": 1.2117,
+      "step": 2230
+    },
+    {
+      "epoch": 0.45942827842584283,
+      "grad_norm": 0.4853857755661011,
+      "learning_rate": 5.693005181347151e-05,
+      "loss": 1.2218,
+      "step": 2240
+    },
+    {
+      "epoch": 0.46147929752595823,
+      "grad_norm": 0.4648151993751526,
+      "learning_rate": 5.6714162348877376e-05,
+      "loss": 1.2401,
+      "step": 2250
+    },
+    {
+      "epoch": 0.4635303166260736,
+      "grad_norm": 0.4839739501476288,
+      "learning_rate": 5.649827288428325e-05,
+      "loss": 1.1976,
+      "step": 2260
+    },
+    {
+      "epoch": 0.4655813357261889,
+      "grad_norm": 0.4986715018749237,
+      "learning_rate": 5.628238341968912e-05,
+      "loss": 1.2274,
+      "step": 2270
+    },
+    {
+      "epoch": 0.46763235482630433,
+      "grad_norm": 0.4636840522289276,
+      "learning_rate": 5.606649395509499e-05,
+      "loss": 1.236,
+      "step": 2280
+    },
+    {
+      "epoch": 0.4696833739264197,
+      "grad_norm": 0.5011271834373474,
+      "learning_rate": 5.585060449050087e-05,
+      "loss": 1.2275,
+      "step": 2290
+    },
+    {
+      "epoch": 0.4717343930265351,
+      "grad_norm": 0.4648337662220001,
+      "learning_rate": 5.563471502590674e-05,
+      "loss": 1.2457,
+      "step": 2300
+    },
+    {
+      "epoch": 0.47378541212665043,
+      "grad_norm": 0.47708699107170105,
+      "learning_rate": 5.5418825561312614e-05,
+      "loss": 1.2316,
+      "step": 2310
+    },
+    {
+      "epoch": 0.4758364312267658,
+      "grad_norm": 0.4954835772514343,
+      "learning_rate": 5.520293609671848e-05,
+      "loss": 1.229,
+      "step": 2320
+    },
+    {
+      "epoch": 0.4778874503268812,
+      "grad_norm": 0.4701727330684662,
+      "learning_rate": 5.498704663212435e-05,
+      "loss": 1.248,
+      "step": 2330
+    },
+    {
+      "epoch": 0.47993846942699653,
+      "grad_norm": 0.4796009957790375,
+      "learning_rate": 5.477115716753023e-05,
+      "loss": 1.2248,
+      "step": 2340
+    },
+    {
+      "epoch": 0.48198948852711193,
+      "grad_norm": 0.4906330406665802,
+      "learning_rate": 5.4555267702936094e-05,
+      "loss": 1.2628,
+      "step": 2350
+    },
+    {
+      "epoch": 0.4840405076272273,
+      "grad_norm": 0.47203144431114197,
+      "learning_rate": 5.4339378238341976e-05,
+      "loss": 1.2067,
+      "step": 2360
+    },
+    {
+      "epoch": 0.4860915267273426,
+      "grad_norm": 0.503813624382019,
+      "learning_rate": 5.4123488773747845e-05,
+      "loss": 1.2006,
+      "step": 2370
+    },
+    {
+      "epoch": 0.48814254582745803,
+      "grad_norm": 0.4918235242366791,
+      "learning_rate": 5.390759930915371e-05,
+      "loss": 1.1887,
+      "step": 2380
+    },
+    {
+      "epoch": 0.4901935649275734,
+      "grad_norm": 0.4799112379550934,
+      "learning_rate": 5.369170984455959e-05,
+      "loss": 1.2079,
+      "step": 2390
+    },
+    {
+      "epoch": 0.4922445840276888,
+      "grad_norm": 0.4769650101661682,
+      "learning_rate": 5.347582037996546e-05,
+      "loss": 1.1945,
+      "step": 2400
+    },
+    {
+      "epoch": 0.49429560312780413,
+      "grad_norm": 0.5079638957977295,
+      "learning_rate": 5.325993091537134e-05,
+      "loss": 1.2294,
+      "step": 2410
+    },
+    {
+      "epoch": 0.4963466222279195,
+      "grad_norm": 0.520418643951416,
+      "learning_rate": 5.304404145077721e-05,
+      "loss": 1.2308,
+      "step": 2420
+    },
+    {
+      "epoch": 0.4983976413280349,
+      "grad_norm": 0.4546453058719635,
+      "learning_rate": 5.2828151986183075e-05,
+      "loss": 1.2206,
+      "step": 2430
+    },
+    {
+      "epoch": 0.5004486604281503,
+      "grad_norm": 0.47760534286499023,
+      "learning_rate": 5.261226252158895e-05,
+      "loss": 1.208,
+      "step": 2440
+    },
+    {
+      "epoch": 0.5024996795282656,
+      "grad_norm": 0.5267066955566406,
+      "learning_rate": 5.239637305699482e-05,
+      "loss": 1.2123,
+      "step": 2450
+    },
+    {
+      "epoch": 0.504550698628381,
+      "grad_norm": 0.45763811469078064,
+      "learning_rate": 5.2180483592400694e-05,
+      "loss": 1.2159,
+      "step": 2460
+    },
+    {
+      "epoch": 0.5066017177284964,
+      "grad_norm": 0.4922376871109009,
+      "learning_rate": 5.196459412780656e-05,
+      "loss": 1.2456,
+      "step": 2470
+    },
+    {
+      "epoch": 0.5086527368286117,
+      "grad_norm": 0.47043368220329285,
+      "learning_rate": 5.174870466321243e-05,
+      "loss": 1.2052,
+      "step": 2480
+    },
+    {
+      "epoch": 0.5107037559287271,
+      "grad_norm": 0.5082889795303345,
+      "learning_rate": 5.153281519861831e-05,
+      "loss": 1.2393,
+      "step": 2490
+    },
+    {
+      "epoch": 0.5127547750288425,
+      "grad_norm": 0.4955206513404846,
+      "learning_rate": 5.131692573402418e-05,
+      "loss": 1.2323,
+      "step": 2500
+    },
+    {
+      "epoch": 0.5148057941289578,
+      "grad_norm": 0.48625460267066956,
+      "learning_rate": 5.1101036269430057e-05,
+      "loss": 1.206,
+      "step": 2510
+    },
+    {
+      "epoch": 0.5168568132290732,
+      "grad_norm": 0.49060237407684326,
+      "learning_rate": 5.0885146804835925e-05,
+      "loss": 1.2353,
+      "step": 2520
+    },
+    {
+      "epoch": 0.5189078323291886,
+      "grad_norm": 0.46809640526771545,
+      "learning_rate": 5.0669257340241793e-05,
+      "loss": 1.2287,
+      "step": 2530
+    },
+    {
+      "epoch": 0.520958851429304,
+      "grad_norm": 0.4944596290588379,
+      "learning_rate": 5.0453367875647675e-05,
+      "loss": 1.2413,
+      "step": 2540
+    },
+    {
+      "epoch": 0.5230098705294193,
+      "grad_norm": 0.46914994716644287,
+      "learning_rate": 5.023747841105354e-05,
+      "loss": 1.22,
+      "step": 2550
+    },
+    {
+      "epoch": 0.5250608896295347,
+      "grad_norm": 0.4888727366924286,
+      "learning_rate": 5.002158894645942e-05,
+      "loss": 1.2343,
+      "step": 2560
+    },
+    {
+      "epoch": 0.5271119087296501,
+      "grad_norm": 0.4785778522491455,
+      "learning_rate": 4.980569948186529e-05,
+      "loss": 1.187,
+      "step": 2570
+    },
+    {
+      "epoch": 0.5291629278297654,
+      "grad_norm": 0.4947550594806671,
+      "learning_rate": 4.958981001727116e-05,
+      "loss": 1.2288,
+      "step": 2580
+    },
+    {
+      "epoch": 0.5312139469298808,
+      "grad_norm": 0.5263291597366333,
+      "learning_rate": 4.937392055267703e-05,
+      "loss": 1.2044,
+      "step": 2590
+    },
+    {
+      "epoch": 0.5332649660299962,
+      "grad_norm": 0.49239382147789,
+      "learning_rate": 4.9158031088082906e-05,
+      "loss": 1.1865,
+      "step": 2600
+    },
+    {
+      "epoch": 0.5353159851301115,
+      "grad_norm": 0.48874983191490173,
+      "learning_rate": 4.8942141623488775e-05,
+      "loss": 1.2672,
+      "step": 2610
+    },
+    {
+      "epoch": 0.5373670042302269,
+      "grad_norm": 0.48474863171577454,
+      "learning_rate": 4.872625215889465e-05,
+      "loss": 1.2359,
+      "step": 2620
+    },
+    {
+      "epoch": 0.5394180233303423,
+      "grad_norm": 0.4978977143764496,
+      "learning_rate": 4.851036269430052e-05,
+      "loss": 1.2139,
+      "step": 2630
+    },
+    {
+      "epoch": 0.5414690424304577,
+      "grad_norm": 0.5144924521446228,
+      "learning_rate": 4.829447322970639e-05,
+      "loss": 1.221,
+      "step": 2640
+    },
+    {
+      "epoch": 0.543520061530573,
+      "grad_norm": 0.5082759857177734,
+      "learning_rate": 4.807858376511227e-05,
+      "loss": 1.2209,
+      "step": 2650
+    },
+    {
+      "epoch": 0.5455710806306884,
+      "grad_norm": 0.4933965504169464,
+      "learning_rate": 4.786269430051814e-05,
+      "loss": 1.207,
+      "step": 2660
+    },
+    {
+      "epoch": 0.5476220997308038,
+      "grad_norm": 0.49464166164398193,
+      "learning_rate": 4.7646804835924005e-05,
+      "loss": 1.2398,
+      "step": 2670
+    },
+    {
+      "epoch": 0.5496731188309191,
+      "grad_norm": 0.49377110600471497,
+      "learning_rate": 4.743091537132988e-05,
+      "loss": 1.2451,
+      "step": 2680
+    },
+    {
+      "epoch": 0.5517241379310345,
+      "grad_norm": 0.5111104846000671,
+      "learning_rate": 4.7215025906735756e-05,
+      "loss": 1.2197,
+      "step": 2690
+    },
+    {
+      "epoch": 0.5537751570311499,
+      "grad_norm": 0.47716042399406433,
+      "learning_rate": 4.699913644214163e-05,
+      "loss": 1.1891,
+      "step": 2700
+    },
+    {
+      "epoch": 0.5558261761312652,
+      "grad_norm": 0.5081655383110046,
+      "learning_rate": 4.678324697754749e-05,
+      "loss": 1.2507,
+      "step": 2710
+    },
+    {
+      "epoch": 0.5578771952313806,
+      "grad_norm": 0.49036547541618347,
+      "learning_rate": 4.656735751295337e-05,
+      "loss": 1.1805,
+      "step": 2720
+    },
+    {
+      "epoch": 0.559928214331496,
+      "grad_norm": 0.5139365792274475,
+      "learning_rate": 4.635146804835924e-05,
+      "loss": 1.2361,
+      "step": 2730
+    },
+    {
+      "epoch": 0.5619792334316114,
+      "grad_norm": 0.5098669528961182,
+      "learning_rate": 4.613557858376512e-05,
+      "loss": 1.2409,
+      "step": 2740
+    },
+    {
+      "epoch": 0.5640302525317267,
+      "grad_norm": 0.4786950349807739,
+      "learning_rate": 4.5919689119170986e-05,
+      "loss": 1.2067,
+      "step": 2750
+    },
+    {
+      "epoch": 0.5660812716318421,
+      "grad_norm": 0.5063204169273376,
+      "learning_rate": 4.5703799654576855e-05,
+      "loss": 1.1942,
+      "step": 2760
+    },
+    {
+      "epoch": 0.5681322907319575,
+      "grad_norm": 0.511663556098938,
+      "learning_rate": 4.548791018998273e-05,
+      "loss": 1.2017,
+      "step": 2770
+    },
+    {
+      "epoch": 0.5701833098320728,
+      "grad_norm": 0.48765748739242554,
+      "learning_rate": 4.5272020725388605e-05,
+      "loss": 1.222,
+      "step": 2780
+    },
+    {
+      "epoch": 0.5722343289321882,
+      "grad_norm": 0.49707624316215515,
+      "learning_rate": 4.5056131260794474e-05,
+      "loss": 1.2075,
+      "step": 2790
+    },
+    {
+      "epoch": 0.5742853480323036,
+      "grad_norm": 0.5067517757415771,
+      "learning_rate": 4.484024179620035e-05,
+      "loss": 1.211,
+      "step": 2800
+    },
+    {
+      "epoch": 0.5763363671324189,
+      "grad_norm": 0.4615229368209839,
+      "learning_rate": 4.462435233160622e-05,
+      "loss": 1.2303,
+      "step": 2810
+    },
+    {
+      "epoch": 0.5783873862325343,
+      "grad_norm": 0.4948524236679077,
+      "learning_rate": 4.440846286701209e-05,
+      "loss": 1.2024,
+      "step": 2820
+    },
+    {
+      "epoch": 0.5804384053326497,
+      "grad_norm": 0.5140314102172852,
+      "learning_rate": 4.419257340241796e-05,
+      "loss": 1.2217,
+      "step": 2830
+    },
+    {
+      "epoch": 0.5824894244327651,
+      "grad_norm": 0.5108122825622559,
+      "learning_rate": 4.3976683937823836e-05,
+      "loss": 1.1838,
+      "step": 2840
+    },
+    {
+      "epoch": 0.5845404435328804,
+      "grad_norm": 0.5021159052848816,
+      "learning_rate": 4.376079447322971e-05,
+      "loss": 1.2418,
+      "step": 2850
+    },
+    {
+      "epoch": 0.5865914626329958,
+      "grad_norm": 0.5086933374404907,
+      "learning_rate": 4.354490500863558e-05,
+      "loss": 1.2321,
+      "step": 2860
+    },
+    {
+      "epoch": 0.5886424817331112,
+      "grad_norm": 0.5083547830581665,
+      "learning_rate": 4.332901554404145e-05,
+      "loss": 1.2035,
+      "step": 2870
+    },
+    {
+      "epoch": 0.5906935008332265,
+      "grad_norm": 0.4828626215457916,
+      "learning_rate": 4.311312607944732e-05,
+      "loss": 1.2302,
+      "step": 2880
+    },
+    {
+      "epoch": 0.5927445199333419,
+      "grad_norm": 0.5140969157218933,
+      "learning_rate": 4.28972366148532e-05,
+      "loss": 1.2058,
+      "step": 2890
+    },
+    {
+      "epoch": 0.5947955390334573,
+      "grad_norm": 0.497364342212677,
+      "learning_rate": 4.2681347150259074e-05,
+      "loss": 1.2382,
+      "step": 2900
+    },
+    {
+      "epoch": 0.5968465581335726,
+      "grad_norm": 0.49104997515678406,
+      "learning_rate": 4.246545768566494e-05,
+      "loss": 1.2322,
+      "step": 2910
+    },
+    {
+      "epoch": 0.598897577233688,
+      "grad_norm": 0.521659255027771,
+      "learning_rate": 4.224956822107081e-05,
+      "loss": 1.1868,
+      "step": 2920
+    },
+    {
+      "epoch": 0.6009485963338034,
+      "grad_norm": 0.5175550580024719,
+      "learning_rate": 4.2033678756476686e-05,
+      "loss": 1.2169,
+      "step": 2930
+    },
+    {
+      "epoch": 0.6029996154339188,
+      "grad_norm": 0.4998300075531006,
+      "learning_rate": 4.181778929188256e-05,
+      "loss": 1.2227,
+      "step": 2940
+    },
+    {
+      "epoch": 0.6050506345340341,
+      "grad_norm": 0.4932349622249603,
+      "learning_rate": 4.160189982728843e-05,
+      "loss": 1.2371,
+      "step": 2950
+    },
+    {
+      "epoch": 0.6071016536341495,
+      "grad_norm": 0.5610498189926147,
+      "learning_rate": 4.1386010362694304e-05,
+      "loss": 1.2105,
+      "step": 2960
+    },
+    {
+      "epoch": 0.6091526727342649,
+      "grad_norm": 0.4975990355014801,
+      "learning_rate": 4.117012089810017e-05,
+      "loss": 1.2511,
+      "step": 2970
+    },
+    {
+      "epoch": 0.6112036918343802,
+      "grad_norm": 0.5154693722724915,
+      "learning_rate": 4.095423143350605e-05,
+      "loss": 1.2399,
+      "step": 2980
+    },
+    {
+      "epoch": 0.6132547109344956,
+      "grad_norm": 0.4968002736568451,
+      "learning_rate": 4.0738341968911916e-05,
+      "loss": 1.2041,
+      "step": 2990
+    },
+    {
+      "epoch": 0.615305730034611,
+      "grad_norm": 0.4866868555545807,
+      "learning_rate": 4.052245250431779e-05,
+      "loss": 1.1965,
+      "step": 3000
+    },
+    {
+      "epoch": 0.6173567491347263,
+      "grad_norm": 0.5152925848960876,
+      "learning_rate": 4.030656303972367e-05,
+      "loss": 1.2298,
+      "step": 3010
+    },
+    {
+      "epoch": 0.6194077682348417,
+      "grad_norm": 0.513058602809906,
+      "learning_rate": 4.0090673575129535e-05,
+      "loss": 1.2414,
+      "step": 3020
+    },
+    {
+      "epoch": 0.6214587873349571,
+      "grad_norm": 0.5031930208206177,
+      "learning_rate": 3.987478411053541e-05,
+      "loss": 1.1766,
+      "step": 3030
+    },
+    {
+      "epoch": 0.6235098064350725,
+      "grad_norm": 0.5087730288505554,
+      "learning_rate": 3.965889464594128e-05,
+      "loss": 1.229,
+      "step": 3040
+    },
+    {
+      "epoch": 0.6255608255351878,
+      "grad_norm": 0.4878797233104706,
+      "learning_rate": 3.9443005181347154e-05,
+      "loss": 1.2018,
+      "step": 3050
+    },
+    {
+      "epoch": 0.6276118446353032,
+      "grad_norm": 0.5124858617782593,
+      "learning_rate": 3.922711571675303e-05,
+      "loss": 1.1848,
+      "step": 3060
+    },
+    {
+      "epoch": 0.6296628637354186,
+      "grad_norm": 0.49720969796180725,
+      "learning_rate": 3.90112262521589e-05,
+      "loss": 1.1892,
+      "step": 3070
+    },
+    {
+      "epoch": 0.6317138828355339,
+      "grad_norm": 0.49900123476982117,
+      "learning_rate": 3.8795336787564766e-05,
+      "loss": 1.2027,
+      "step": 3080
+    },
+    {
+      "epoch": 0.6337649019356493,
+      "grad_norm": 0.5007952451705933,
+      "learning_rate": 3.857944732297064e-05,
+      "loss": 1.2373,
+      "step": 3090
+    },
+    {
+      "epoch": 0.6358159210357647,
+      "grad_norm": 0.49481576681137085,
+      "learning_rate": 3.8363557858376516e-05,
+      "loss": 1.2294,
+      "step": 3100
+    },
+    {
+      "epoch": 0.63786694013588,
+      "grad_norm": 0.4979318082332611,
+      "learning_rate": 3.8147668393782385e-05,
+      "loss": 1.2312,
+      "step": 3110
+    },
+    {
+      "epoch": 0.6399179592359954,
+      "grad_norm": 0.49939480423927307,
+      "learning_rate": 3.793177892918825e-05,
+      "loss": 1.2394,
+      "step": 3120
+    },
+    {
+      "epoch": 0.6419689783361108,
+      "grad_norm": 0.5186517834663391,
+      "learning_rate": 3.771588946459413e-05,
+      "loss": 1.199,
+      "step": 3130
+    },
+    {
+      "epoch": 0.6440199974362262,
+      "grad_norm": 0.5386569499969482,
+      "learning_rate": 3.7500000000000003e-05,
+      "loss": 1.1801,
+      "step": 3140
+    },
+    {
+      "epoch": 0.6460710165363415,
+      "grad_norm": 0.5134577751159668,
+      "learning_rate": 3.728411053540587e-05,
+      "loss": 1.2286,
+      "step": 3150
+    },
+    {
+      "epoch": 0.6481220356364569,
+      "grad_norm": 0.5191785097122192,
+      "learning_rate": 3.706822107081175e-05,
+      "loss": 1.2068,
+      "step": 3160
+    },
+    {
+      "epoch": 0.6501730547365723,
+      "grad_norm": 0.4857168197631836,
+      "learning_rate": 3.6852331606217615e-05,
+      "loss": 1.2116,
+      "step": 3170
+    },
+    {
+      "epoch": 0.6522240738366876,
+      "grad_norm": 0.5283413529396057,
+      "learning_rate": 3.663644214162349e-05,
+      "loss": 1.1792,
+      "step": 3180
+    },
+    {
+      "epoch": 0.654275092936803,
+      "grad_norm": 0.528938353061676,
+      "learning_rate": 3.6420552677029366e-05,
+      "loss": 1.1963,
+      "step": 3190
+    },
+    {
+      "epoch": 0.6563261120369184,
+      "grad_norm": 0.5067134499549866,
+      "learning_rate": 3.6204663212435234e-05,
+      "loss": 1.2476,
+      "step": 3200
+    },
+    {
+      "epoch": 0.6583771311370337,
+      "grad_norm": 0.4993511736392975,
+      "learning_rate": 3.598877374784111e-05,
+      "loss": 1.2273,
+      "step": 3210
+    },
+    {
+      "epoch": 0.6604281502371491,
+      "grad_norm": 0.5275943279266357,
+      "learning_rate": 3.577288428324698e-05,
+      "loss": 1.2287,
+      "step": 3220
+    },
+    {
+      "epoch": 0.6624791693372645,
+      "grad_norm": 0.49331194162368774,
+      "learning_rate": 3.555699481865285e-05,
+      "loss": 1.1794,
+      "step": 3230
+    },
+    {
+      "epoch": 0.6645301884373799,
+      "grad_norm": 0.5065453052520752,
+      "learning_rate": 3.534110535405872e-05,
+      "loss": 1.2342,
+      "step": 3240
+    },
+    {
+      "epoch": 0.6665812075374952,
+      "grad_norm": 0.5334459543228149,
+      "learning_rate": 3.51252158894646e-05,
+      "loss": 1.1782,
+      "step": 3250
+    },
+    {
+      "epoch": 0.6686322266376106,
+      "grad_norm": 0.535772979259491,
+      "learning_rate": 3.490932642487047e-05,
+      "loss": 1.2108,
+      "step": 3260
+    },
+    {
+      "epoch": 0.670683245737726,
+      "grad_norm": 0.5377807021141052,
+      "learning_rate": 3.469343696027634e-05,
+      "loss": 1.1903,
+      "step": 3270
+    },
+    {
+      "epoch": 0.6727342648378413,
+      "grad_norm": 0.5266278386116028,
+      "learning_rate": 3.447754749568221e-05,
+      "loss": 1.2183,
+      "step": 3280
+    },
+    {
+      "epoch": 0.6747852839379567,
+      "grad_norm": 0.4987232983112335,
+      "learning_rate": 3.4261658031088084e-05,
+      "loss": 1.1915,
+      "step": 3290
+    },
+    {
+      "epoch": 0.6768363030380721,
+      "grad_norm": 0.5178554058074951,
+      "learning_rate": 3.404576856649396e-05,
+      "loss": 1.179,
+      "step": 3300
+    },
+    {
+      "epoch": 0.6788873221381874,
+      "grad_norm": 0.5086014270782471,
+      "learning_rate": 3.382987910189983e-05,
+      "loss": 1.2298,
+      "step": 3310
+    },
+    {
+      "epoch": 0.6809383412383028,
+      "grad_norm": 0.5420427918434143,
+      "learning_rate": 3.3613989637305696e-05,
+      "loss": 1.2072,
+      "step": 3320
+    },
+    {
+      "epoch": 0.6829893603384182,
+      "grad_norm": 0.5170331001281738,
+      "learning_rate": 3.339810017271157e-05,
+      "loss": 1.2252,
+      "step": 3330
+    },
+    {
+      "epoch": 0.6850403794385336,
+      "grad_norm": 0.48680609464645386,
+      "learning_rate": 3.3182210708117446e-05,
+      "loss": 1.2059,
+      "step": 3340
+    },
+    {
+      "epoch": 0.6870913985386489,
+      "grad_norm": 0.5035340189933777,
+      "learning_rate": 3.296632124352332e-05,
+      "loss": 1.2009,
+      "step": 3350
+    },
+    {
+      "epoch": 0.6891424176387643,
+      "grad_norm": 0.513165295124054,
+      "learning_rate": 3.275043177892919e-05,
+      "loss": 1.1844,
+      "step": 3360
+    },
+    {
+      "epoch": 0.6911934367388797,
+      "grad_norm": 0.5243003368377686,
+      "learning_rate": 3.2534542314335065e-05,
+      "loss": 1.2009,
+      "step": 3370
+    },
+    {
+      "epoch": 0.693244455838995,
+      "grad_norm": 0.5219825506210327,
+      "learning_rate": 3.2318652849740933e-05,
+      "loss": 1.2039,
+      "step": 3380
+    },
+    {
+      "epoch": 0.6952954749391104,
+      "grad_norm": 0.5202507972717285,
+      "learning_rate": 3.210276338514681e-05,
+      "loss": 1.225,
+      "step": 3390
+    },
+    {
+      "epoch": 0.6973464940392258,
+      "grad_norm": 0.5152229070663452,
+      "learning_rate": 3.188687392055268e-05,
+      "loss": 1.1886,
+      "step": 3400
+    },
+    {
+      "epoch": 0.6993975131393411,
+      "grad_norm": 0.5382890701293945,
+      "learning_rate": 3.167098445595855e-05,
+      "loss": 1.2113,
+      "step": 3410
+    },
+    {
+      "epoch": 0.7014485322394565,
+      "grad_norm": 0.5525237917900085,
+      "learning_rate": 3.145509499136443e-05,
+      "loss": 1.2283,
+      "step": 3420
+    },
+    {
+      "epoch": 0.7034995513395719,
+      "grad_norm": 0.5308887958526611,
+      "learning_rate": 3.1239205526770296e-05,
+      "loss": 1.2311,
+      "step": 3430
+    },
+    {
+      "epoch": 0.7055505704396873,
+      "grad_norm": 0.5247687697410583,
+      "learning_rate": 3.1023316062176164e-05,
+      "loss": 1.1946,
+      "step": 3440
+    },
+    {
+      "epoch": 0.7076015895398026,
+      "grad_norm": 0.5322206616401672,
+      "learning_rate": 3.080742659758204e-05,
+      "loss": 1.2198,
+      "step": 3450
+    },
+    {
+      "epoch": 0.709652608639918,
+      "grad_norm": 0.5104162693023682,
+      "learning_rate": 3.0591537132987915e-05,
+      "loss": 1.2105,
+      "step": 3460
+    },
+    {
+      "epoch": 0.7117036277400334,
+      "grad_norm": 0.4890803098678589,
+      "learning_rate": 3.0375647668393786e-05,
+      "loss": 1.2074,
+      "step": 3470
+    },
+    {
+      "epoch": 0.7137546468401487,
+      "grad_norm": 0.529225766658783,
+      "learning_rate": 3.0159758203799655e-05,
+      "loss": 1.2321,
+      "step": 3480
+    },
+    {
+      "epoch": 0.7158056659402641,
+      "grad_norm": 0.5252069234848022,
+      "learning_rate": 2.9943868739205527e-05,
+      "loss": 1.1995,
+      "step": 3490
+    },
+    {
+      "epoch": 0.7178566850403795,
+      "grad_norm": 0.5369967818260193,
+      "learning_rate": 2.9727979274611402e-05,
+      "loss": 1.2234,
+      "step": 3500
+    },
+    {
+      "epoch": 0.7199077041404948,
+      "grad_norm": 0.5053485631942749,
+      "learning_rate": 2.9512089810017274e-05,
+      "loss": 1.2035,
+      "step": 3510
+    },
+    {
+      "epoch": 0.7219587232406102,
+      "grad_norm": 0.5131696462631226,
+      "learning_rate": 2.929620034542315e-05,
+      "loss": 1.2681,
+      "step": 3520
+    },
+    {
+      "epoch": 0.7240097423407256,
+      "grad_norm": 0.5332499742507935,
+      "learning_rate": 2.9080310880829014e-05,
+      "loss": 1.2039,
+      "step": 3530
+    },
+    {
+      "epoch": 0.7260607614408409,
+      "grad_norm": 0.5105617046356201,
+      "learning_rate": 2.886442141623489e-05,
+      "loss": 1.2,
+      "step": 3540
+    },
+    {
+      "epoch": 0.7281117805409563,
+      "grad_norm": 0.5197264552116394,
+      "learning_rate": 2.864853195164076e-05,
+      "loss": 1.1821,
+      "step": 3550
+    },
+    {
+      "epoch": 0.7301627996410717,
+      "grad_norm": 0.505455493927002,
+      "learning_rate": 2.8432642487046636e-05,
+      "loss": 1.2158,
+      "step": 3560
+    },
+    {
+      "epoch": 0.7322138187411871,
+      "grad_norm": 0.5290804505348206,
+      "learning_rate": 2.8216753022452508e-05,
+      "loss": 1.174,
+      "step": 3570
+    },
+    {
+      "epoch": 0.7342648378413024,
+      "grad_norm": 0.5349313020706177,
+      "learning_rate": 2.8000863557858376e-05,
+      "loss": 1.2301,
+      "step": 3580
+    },
+    {
+      "epoch": 0.7363158569414178,
+      "grad_norm": 0.4875812530517578,
+      "learning_rate": 2.7784974093264248e-05,
+      "loss": 1.2015,
+      "step": 3590
+    },
+    {
+      "epoch": 0.7383668760415332,
+      "grad_norm": 0.5164597630500793,
+      "learning_rate": 2.7569084628670123e-05,
+      "loss": 1.2294,
+      "step": 3600
+    },
+    {
+      "epoch": 0.7404178951416485,
+      "grad_norm": 0.5129172801971436,
+      "learning_rate": 2.7353195164075995e-05,
+      "loss": 1.2122,
+      "step": 3610
+    },
+    {
+      "epoch": 0.7424689142417639,
+      "grad_norm": 0.5218586921691895,
+      "learning_rate": 2.713730569948187e-05,
+      "loss": 1.2002,
+      "step": 3620
+    },
+    {
+      "epoch": 0.7445199333418793,
+      "grad_norm": 0.5423296093940735,
+      "learning_rate": 2.6921416234887735e-05,
+      "loss": 1.1685,
+      "step": 3630
+    },
+    {
+      "epoch": 0.7465709524419946,
+      "grad_norm": 0.5151218771934509,
+      "learning_rate": 2.670552677029361e-05,
+      "loss": 1.2167,
+      "step": 3640
+    },
+    {
+      "epoch": 0.74862197154211,
+      "grad_norm": 0.5160235166549683,
+      "learning_rate": 2.6489637305699482e-05,
+      "loss": 1.2269,
+      "step": 3650
+    },
+    {
+      "epoch": 0.7506729906422254,
+      "grad_norm": 0.5056514143943787,
+      "learning_rate": 2.6273747841105357e-05,
+      "loss": 1.2467,
+      "step": 3660
+    },
+    {
+      "epoch": 0.7527240097423408,
+      "grad_norm": 0.52911776304245,
+      "learning_rate": 2.605785837651123e-05,
+      "loss": 1.2182,
+      "step": 3670
+    },
+    {
+      "epoch": 0.7547750288424561,
+      "grad_norm": 0.5172019600868225,
+      "learning_rate": 2.5841968911917097e-05,
+      "loss": 1.1888,
+      "step": 3680
+    },
+    {
+      "epoch": 0.7568260479425715,
+      "grad_norm": 0.5043123960494995,
+      "learning_rate": 2.562607944732297e-05,
+      "loss": 1.2004,
+      "step": 3690
+    },
+    {
+      "epoch": 0.7588770670426869,
+      "grad_norm": 0.5103533267974854,
+      "learning_rate": 2.5410189982728844e-05,
+      "loss": 1.1627,
+      "step": 3700
+    },
+    {
+      "epoch": 0.7609280861428022,
+      "grad_norm": 0.5295760631561279,
+      "learning_rate": 2.5194300518134716e-05,
+      "loss": 1.1604,
+      "step": 3710
+    },
+    {
+      "epoch": 0.7629791052429176,
+      "grad_norm": 0.5427724719047546,
+      "learning_rate": 2.4978411053540588e-05,
+      "loss": 1.1781,
+      "step": 3720
+    },
+    {
+      "epoch": 0.765030124343033,
+      "grad_norm": 0.5164818167686462,
+      "learning_rate": 2.476252158894646e-05,
+      "loss": 1.2208,
+      "step": 3730
+    },
+    {
+      "epoch": 0.7670811434431483,
+      "grad_norm": 0.5196744799613953,
+      "learning_rate": 2.4546632124352335e-05,
+      "loss": 1.1971,
+      "step": 3740
+    },
+    {
+      "epoch": 0.7691321625432637,
+      "grad_norm": 0.5128475427627563,
+      "learning_rate": 2.4330742659758203e-05,
+      "loss": 1.1909,
+      "step": 3750
+    },
+    {
+      "epoch": 0.7711831816433791,
+      "grad_norm": 0.49743902683258057,
+      "learning_rate": 2.411485319516408e-05,
+      "loss": 1.2109,
+      "step": 3760
+    },
+    {
+      "epoch": 0.7732342007434945,
+      "grad_norm": 0.5152381658554077,
+      "learning_rate": 2.3898963730569947e-05,
+      "loss": 1.2228,
+      "step": 3770
+    },
+    {
+      "epoch": 0.7752852198436098,
+      "grad_norm": 0.5446299910545349,
+      "learning_rate": 2.3683074265975822e-05,
+      "loss": 1.1953,
+      "step": 3780
+    },
+    {
+      "epoch": 0.7773362389437252,
+      "grad_norm": 0.5300847291946411,
+      "learning_rate": 2.3467184801381694e-05,
+      "loss": 1.1843,
+      "step": 3790
+    },
+    {
+      "epoch": 0.7793872580438406,
+      "grad_norm": 0.5129801630973816,
+      "learning_rate": 2.3251295336787566e-05,
+      "loss": 1.1809,
+      "step": 3800
+    },
+    {
+      "epoch": 0.7814382771439559,
+      "grad_norm": 0.549198567867279,
+      "learning_rate": 2.3035405872193438e-05,
+      "loss": 1.2099,
+      "step": 3810
+    },
+    {
+      "epoch": 0.7834892962440713,
+      "grad_norm": 0.5118544101715088,
+      "learning_rate": 2.281951640759931e-05,
+      "loss": 1.2149,
+      "step": 3820
+    },
+    {
+      "epoch": 0.7855403153441867,
+      "grad_norm": 0.5479713082313538,
+      "learning_rate": 2.260362694300518e-05,
+      "loss": 1.1771,
+      "step": 3830
+    },
+    {
+      "epoch": 0.787591334444302,
+      "grad_norm": 0.541350245475769,
+      "learning_rate": 2.2387737478411056e-05,
+      "loss": 1.1737,
+      "step": 3840
+    },
+    {
+      "epoch": 0.7896423535444174,
+      "grad_norm": 0.5543351769447327,
+      "learning_rate": 2.2171848013816925e-05,
+      "loss": 1.2233,
+      "step": 3850
+    },
+    {
+      "epoch": 0.7916933726445328,
+      "grad_norm": 0.5010188817977905,
+      "learning_rate": 2.19559585492228e-05,
+      "loss": 1.1938,
+      "step": 3860
+    },
+    {
+      "epoch": 0.7937443917446482,
+      "grad_norm": 0.5245205760002136,
+      "learning_rate": 2.1740069084628672e-05,
+      "loss": 1.2015,
+      "step": 3870
+    },
+    {
+      "epoch": 0.7957954108447635,
+      "grad_norm": 0.5324139595031738,
+      "learning_rate": 2.1524179620034544e-05,
+      "loss": 1.2248,
+      "step": 3880
+    },
+    {
+      "epoch": 0.7978464299448789,
+      "grad_norm": 0.5172831416130066,
+      "learning_rate": 2.1308290155440415e-05,
+      "loss": 1.1992,
+      "step": 3890
+    },
+    {
+      "epoch": 0.7998974490449943,
+      "grad_norm": 0.5434138178825378,
+      "learning_rate": 2.1092400690846287e-05,
+      "loss": 1.1813,
+      "step": 3900
+    },
+    {
+      "epoch": 0.8019484681451096,
+      "grad_norm": 0.5221844911575317,
+      "learning_rate": 2.087651122625216e-05,
+      "loss": 1.1625,
+      "step": 3910
+    },
+    {
+      "epoch": 0.803999487245225,
+      "grad_norm": 0.5027469992637634,
+      "learning_rate": 2.0660621761658034e-05,
+      "loss": 1.181,
+      "step": 3920
+    },
+    {
+      "epoch": 0.8060505063453404,
+      "grad_norm": 0.5298044085502625,
+      "learning_rate": 2.0444732297063903e-05,
+      "loss": 1.2079,
+      "step": 3930
+    },
+    {
+      "epoch": 0.8081015254454557,
+      "grad_norm": 0.5463908910751343,
+      "learning_rate": 2.0228842832469778e-05,
+      "loss": 1.2009,
+      "step": 3940
+    },
+    {
+      "epoch": 0.8101525445455711,
+      "grad_norm": 0.5394027233123779,
+      "learning_rate": 2.0012953367875646e-05,
+      "loss": 1.1931,
+      "step": 3950
+    },
+    {
+      "epoch": 0.8122035636456865,
+      "grad_norm": 0.5041294097900391,
+      "learning_rate": 1.979706390328152e-05,
+      "loss": 1.2107,
+      "step": 3960
+    },
+    {
+      "epoch": 0.8142545827458019,
+      "grad_norm": 0.5223291516304016,
+      "learning_rate": 1.9581174438687393e-05,
+      "loss": 1.1775,
+      "step": 3970
+    },
+    {
+      "epoch": 0.8163056018459172,
+      "grad_norm": 0.5221052169799805,
+      "learning_rate": 1.9365284974093265e-05,
+      "loss": 1.2052,
+      "step": 3980
+    },
+    {
+      "epoch": 0.8183566209460326,
+      "grad_norm": 0.5229529738426208,
+      "learning_rate": 1.9149395509499137e-05,
+      "loss": 1.1922,
+      "step": 3990
+    },
+    {
+      "epoch": 0.820407640046148,
+      "grad_norm": 0.5651980042457581,
+      "learning_rate": 1.893350604490501e-05,
+      "loss": 1.2043,
+      "step": 4000
+    },
+    {
+      "epoch": 0.8224586591462633,
+      "grad_norm": 0.5169751644134521,
+      "learning_rate": 1.871761658031088e-05,
+      "loss": 1.2157,
+      "step": 4010
+    },
+    {
+      "epoch": 0.8245096782463787,
+      "grad_norm": 0.5741276144981384,
+      "learning_rate": 1.8501727115716755e-05,
+      "loss": 1.2112,
+      "step": 4020
+    },
+    {
+      "epoch": 0.8265606973464941,
+      "grad_norm": 0.530596137046814,
+      "learning_rate": 1.8285837651122624e-05,
+      "loss": 1.2535,
+      "step": 4030
+    },
+    {
+      "epoch": 0.8286117164466094,
+      "grad_norm": 0.5436383485794067,
+      "learning_rate": 1.80699481865285e-05,
+      "loss": 1.1789,
+      "step": 4040
+    },
+    {
+      "epoch": 0.8306627355467248,
+      "grad_norm": 0.5238965749740601,
+      "learning_rate": 1.7854058721934368e-05,
+      "loss": 1.1645,
+      "step": 4050
+    },
+    {
+      "epoch": 0.8327137546468402,
+      "grad_norm": 0.5226778388023376,
+      "learning_rate": 1.7638169257340243e-05,
+      "loss": 1.2238,
+      "step": 4060
+    },
+    {
+      "epoch": 0.8347647737469556,
+      "grad_norm": 0.5810254812240601,
+      "learning_rate": 1.7422279792746114e-05,
+      "loss": 1.2212,
+      "step": 4070
+    },
+    {
+      "epoch": 0.8368157928470709,
+      "grad_norm": 0.5228540301322937,
+      "learning_rate": 1.7206390328151986e-05,
+      "loss": 1.2025,
+      "step": 4080
+    },
+    {
+      "epoch": 0.8388668119471863,
+      "grad_norm": 0.5112829804420471,
+      "learning_rate": 1.6990500863557858e-05,
+      "loss": 1.1838,
+      "step": 4090
+    },
+    {
+      "epoch": 0.8409178310473017,
+      "grad_norm": 0.5092179775238037,
+      "learning_rate": 1.6774611398963733e-05,
+      "loss": 1.1981,
+      "step": 4100
+    },
+    {
+      "epoch": 0.842968850147417,
+      "grad_norm": 0.5236721634864807,
+      "learning_rate": 1.65587219343696e-05,
+      "loss": 1.1994,
+      "step": 4110
+    },
+    {
+      "epoch": 0.8450198692475324,
+      "grad_norm": 0.5067551732063293,
+      "learning_rate": 1.6342832469775477e-05,
+      "loss": 1.1758,
+      "step": 4120
+    },
+    {
+      "epoch": 0.8470708883476478,
+      "grad_norm": 0.5471055507659912,
+      "learning_rate": 1.6126943005181345e-05,
+      "loss": 1.2315,
+      "step": 4130
+    },
+    {
+      "epoch": 0.8491219074477631,
+      "grad_norm": 0.514798641204834,
+      "learning_rate": 1.591105354058722e-05,
+      "loss": 1.183,
+      "step": 4140
+    },
+    {
+      "epoch": 0.8511729265478785,
+      "grad_norm": 0.5316623449325562,
+      "learning_rate": 1.5695164075993092e-05,
+      "loss": 1.1997,
+      "step": 4150
+    },
+    {
+      "epoch": 0.8532239456479939,
+      "grad_norm": 0.531896710395813,
+      "learning_rate": 1.5479274611398964e-05,
+      "loss": 1.1967,
+      "step": 4160
+    },
+    {
+      "epoch": 0.8552749647481093,
+      "grad_norm": 0.5044012665748596,
+      "learning_rate": 1.5263385146804836e-05,
+      "loss": 1.2061,
+      "step": 4170
+    },
+    {
+      "epoch": 0.8573259838482246,
+      "grad_norm": 0.547264039516449,
+      "learning_rate": 1.5047495682210708e-05,
+      "loss": 1.1975,
+      "step": 4180
+    },
+    {
+      "epoch": 0.85937700294834,
+      "grad_norm": 0.5514972805976868,
+      "learning_rate": 1.4831606217616581e-05,
+      "loss": 1.2044,
+      "step": 4190
+    },
+    {
+      "epoch": 0.8614280220484554,
+      "grad_norm": 0.5322652459144592,
+      "learning_rate": 1.4615716753022455e-05,
+      "loss": 1.2044,
+      "step": 4200
+    },
+    {
+      "epoch": 0.8634790411485707,
+      "grad_norm": 0.5309359431266785,
+      "learning_rate": 1.4399827288428325e-05,
+      "loss": 1.2066,
+      "step": 4210
+    },
+    {
+      "epoch": 0.8655300602486861,
+      "grad_norm": 0.5314792394638062,
+      "learning_rate": 1.4183937823834198e-05,
+      "loss": 1.2006,
+      "step": 4220
+    },
+    {
+      "epoch": 0.8675810793488015,
+      "grad_norm": 0.5549922585487366,
+      "learning_rate": 1.3968048359240068e-05,
+      "loss": 1.2058,
+      "step": 4230
+    },
+    {
+      "epoch": 0.8696320984489168,
+      "grad_norm": 0.5373049378395081,
+      "learning_rate": 1.3752158894645942e-05,
+      "loss": 1.2002,
+      "step": 4240
+    },
+    {
+      "epoch": 0.8716831175490322,
+      "grad_norm": 0.5322666764259338,
+      "learning_rate": 1.3536269430051815e-05,
+      "loss": 1.215,
+      "step": 4250
+    },
+    {
+      "epoch": 0.8737341366491476,
+      "grad_norm": 0.5549564957618713,
+      "learning_rate": 1.3320379965457685e-05,
+      "loss": 1.2131,
+      "step": 4260
+    },
+    {
+      "epoch": 0.875785155749263,
+      "grad_norm": 0.5308319926261902,
+      "learning_rate": 1.3104490500863559e-05,
+      "loss": 1.2203,
+      "step": 4270
+    },
+    {
+      "epoch": 0.8778361748493783,
+      "grad_norm": 0.5089017152786255,
+      "learning_rate": 1.2888601036269432e-05,
+      "loss": 1.1801,
+      "step": 4280
+    },
+    {
+      "epoch": 0.8798871939494937,
+      "grad_norm": 0.5377966165542603,
+      "learning_rate": 1.2672711571675302e-05,
+      "loss": 1.189,
+      "step": 4290
+    },
+    {
+      "epoch": 0.8819382130496091,
+      "grad_norm": 0.5528485178947449,
+      "learning_rate": 1.2456822107081174e-05,
+      "loss": 1.2197,
+      "step": 4300
+    },
+    {
+      "epoch": 0.8839892321497244,
+      "grad_norm": 0.5241679549217224,
+      "learning_rate": 1.2240932642487048e-05,
+      "loss": 1.1652,
+      "step": 4310
+    },
+    {
+      "epoch": 0.8860402512498398,
+      "grad_norm": 0.5626764893531799,
+      "learning_rate": 1.202504317789292e-05,
+      "loss": 1.1805,
+      "step": 4320
+    },
+    {
+      "epoch": 0.8880912703499552,
+      "grad_norm": 0.5248028635978699,
+      "learning_rate": 1.1809153713298791e-05,
+      "loss": 1.1652,
+      "step": 4330
+    },
+    {
+      "epoch": 0.8901422894500705,
+      "grad_norm": 0.5452848672866821,
+      "learning_rate": 1.1593264248704663e-05,
+      "loss": 1.2171,
+      "step": 4340
+    },
+    {
+      "epoch": 0.8921933085501859,
+      "grad_norm": 0.5505712628364563,
+      "learning_rate": 1.1377374784110537e-05,
+      "loss": 1.1967,
+      "step": 4350
+    },
+    {
+      "epoch": 0.8942443276503013,
+      "grad_norm": 0.5437038540840149,
+      "learning_rate": 1.1161485319516408e-05,
+      "loss": 1.2216,
+      "step": 4360
+    },
+    {
+      "epoch": 0.8962953467504167,
+      "grad_norm": 0.5138014554977417,
+      "learning_rate": 1.094559585492228e-05,
+      "loss": 1.193,
+      "step": 4370
+    },
+    {
+      "epoch": 0.898346365850532,
+      "grad_norm": 0.542080283164978,
+      "learning_rate": 1.0729706390328152e-05,
+      "loss": 1.1677,
+      "step": 4380
+    },
+    {
+      "epoch": 0.9003973849506474,
+      "grad_norm": 0.5166792273521423,
+      "learning_rate": 1.0513816925734024e-05,
+      "loss": 1.2147,
+      "step": 4390
+    },
+    {
+      "epoch": 0.9024484040507628,
+      "grad_norm": 0.536491334438324,
+      "learning_rate": 1.0297927461139897e-05,
+      "loss": 1.2077,
+      "step": 4400
+    },
+    {
+      "epoch": 0.9044994231508781,
+      "grad_norm": 0.5504462718963623,
+      "learning_rate": 1.0082037996545769e-05,
+      "loss": 1.1913,
+      "step": 4410
+    },
+    {
+      "epoch": 0.9065504422509935,
+      "grad_norm": 0.5299994945526123,
+      "learning_rate": 9.866148531951641e-06,
+      "loss": 1.1987,
+      "step": 4420
+    },
+    {
+      "epoch": 0.9086014613511089,
+      "grad_norm": 0.5432473421096802,
+      "learning_rate": 9.650259067357513e-06,
+      "loss": 1.199,
+      "step": 4430
+    },
+    {
+      "epoch": 0.9106524804512242,
+      "grad_norm": 0.529331386089325,
+      "learning_rate": 9.434369602763386e-06,
+      "loss": 1.214,
+      "step": 4440
+    },
+    {
+      "epoch": 0.9127034995513396,
+      "grad_norm": 0.49785298109054565,
+      "learning_rate": 9.218480138169258e-06,
+      "loss": 1.202,
+      "step": 4450
+    },
+    {
+      "epoch": 0.914754518651455,
+      "grad_norm": 0.5281327962875366,
+      "learning_rate": 9.00259067357513e-06,
+      "loss": 1.1904,
+      "step": 4460
+    },
+    {
+      "epoch": 0.9168055377515704,
+      "grad_norm": 0.5474033951759338,
+      "learning_rate": 8.786701208981002e-06,
+      "loss": 1.1972,
+      "step": 4470
+    },
+    {
+      "epoch": 0.9188565568516857,
+      "grad_norm": 0.5412236452102661,
+      "learning_rate": 8.570811744386873e-06,
+      "loss": 1.1797,
+      "step": 4480
+    },
+    {
+      "epoch": 0.9209075759518011,
+      "grad_norm": 0.5599170923233032,
+      "learning_rate": 8.354922279792747e-06,
+      "loss": 1.176,
+      "step": 4490
+    },
+    {
+      "epoch": 0.9229585950519165,
+      "grad_norm": 0.5590323805809021,
+      "learning_rate": 8.139032815198619e-06,
+      "loss": 1.1863,
+      "step": 4500
+    },
+    {
+      "epoch": 0.9250096141520318,
+      "grad_norm": 0.566150426864624,
+      "learning_rate": 7.92314335060449e-06,
+      "loss": 1.2217,
+      "step": 4510
+    },
+    {
+      "epoch": 0.9270606332521472,
+      "grad_norm": 0.5459644794464111,
+      "learning_rate": 7.707253886010362e-06,
+      "loss": 1.1903,
+      "step": 4520
+    },
+    {
+      "epoch": 0.9291116523522626,
+      "grad_norm": 0.5333088636398315,
+      "learning_rate": 7.491364421416235e-06,
+      "loss": 1.2076,
+      "step": 4530
+    },
+    {
+      "epoch": 0.9311626714523779,
+      "grad_norm": 0.5921478271484375,
+      "learning_rate": 7.2754749568221076e-06,
+      "loss": 1.191,
+      "step": 4540
+    },
+    {
+      "epoch": 0.9332136905524933,
+      "grad_norm": 0.5061055421829224,
+      "learning_rate": 7.059585492227979e-06,
+      "loss": 1.1787,
+      "step": 4550
+    },
+    {
+      "epoch": 0.9352647096526087,
+      "grad_norm": 0.5804794430732727,
+      "learning_rate": 6.843696027633852e-06,
+      "loss": 1.2096,
+      "step": 4560
+    },
+    {
+      "epoch": 0.9373157287527241,
+      "grad_norm": 0.5328559875488281,
+      "learning_rate": 6.627806563039724e-06,
+      "loss": 1.2072,
+      "step": 4570
+    },
+    {
+      "epoch": 0.9393667478528394,
+      "grad_norm": 0.518925130367279,
+      "learning_rate": 6.4119170984455965e-06,
+      "loss": 1.2119,
+      "step": 4580
+    },
+    {
+      "epoch": 0.9414177669529548,
+      "grad_norm": 0.5092957019805908,
+      "learning_rate": 6.196027633851468e-06,
+      "loss": 1.2137,
+      "step": 4590
+    },
+    {
+      "epoch": 0.9434687860530702,
+      "grad_norm": 0.5156581401824951,
+      "learning_rate": 5.980138169257341e-06,
+      "loss": 1.2059,
+      "step": 4600
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 4876,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.173859535124265e+17,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}

lora_checkpoints/checkpoint-4600/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7b8f6520f47933838e96dca56ee883040325b73481aff07afcabf963674a84fe
+size 5624

lora_checkpoints/checkpoint-4800/README.md ADDED Viewed

	@@ -0,0 +1,209 @@

+---
+base_model: uaritm/gemma3_1b_med_qa_ru
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:uaritm/gemma3_1b_med_qa_ru
+- lora
+- sft
+- transformers
+- trl
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.17.1

lora_checkpoints/checkpoint-4800/adapter_config.json ADDED Viewed

	@@ -0,0 +1,42 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "uaritm/gemma3_1b_med_qa_ru",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "k_proj",
+    "o_proj",
+    "q_proj",
+    "gate_proj",
+    "down_proj",
+    "up_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

lora_checkpoints/checkpoint-4800/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:838cb10282d32f7525e548df00df831364d0a56716513f0f666bb587cf01bb04
+size 52231312

lora_checkpoints/checkpoint-4800/added_tokens.json ADDED Viewed

	@@ -0,0 +1,3 @@

+{
+  "<image_soft_token>": 262144
+}

lora_checkpoints/checkpoint-4800/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,47 @@

+{{ bos_token }}
+{%- if messages[0]['role'] == 'system' -%}
+    {%- if messages[0]['content'] is string -%}
+        {%- set first_user_prefix = messages[0]['content'] + '
+' -%}
+    {%- else -%}
+        {%- set first_user_prefix = messages[0]['content'][0]['text'] + '
+' -%}
+    {%- endif -%}
+    {%- set loop_messages = messages[1:] -%}
+{%- else -%}
+    {%- set first_user_prefix = "" -%}
+    {%- set loop_messages = messages -%}
+{%- endif -%}
+{%- for message in loop_messages -%}
+    {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
+        {{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }}
+    {%- endif -%}
+    {%- if (message['role'] == 'assistant') -%}
+        {%- set role = "model" -%}
+    {%- else -%}
+        {%- set role = message['role'] -%}
+    {%- endif -%}
+    {{ '<start_of_turn>' + role + '
+' + (first_user_prefix if loop.first else "") }}
+    {%- if message['content'] is string -%}
+        {{ message['content'] | trim }}
+    {%- elif message['content'] is iterable -%}
+        {%- for item in message['content'] -%}
+            {%- if item['type'] == 'image' -%}
+                {{ '<start_of_image>' }}
+            {%- elif item['type'] == 'text' -%}
+                {{ item['text'] | trim }}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- else -%}
+        {{ raise_exception("Invalid content type") }}
+    {%- endif -%}
+    {{ '<end_of_turn>
+' }}
+{%- endfor -%}
+{%- if add_generation_prompt -%}
+    {{'<start_of_turn>model
+'}}
+{%- endif -%}

lora_checkpoints/checkpoint-4800/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:541c17967fd1cb6da978fa5ce8a107f2b33a846b96278b868f133dabbd3e18c2
+size 104671958

lora_checkpoints/checkpoint-4800/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c56340152e52ff384fdecd489c87b5947889c784ecc63c969b82f3f6c043c7b1
+size 14244

lora_checkpoints/checkpoint-4800/scaler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5da4ea0c1bcacc6c536f51e41f20fb1c9301dc84cb8e04333e56f06168b8cb83
+size 988

lora_checkpoints/checkpoint-4800/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0cd087ae287cc95f1935ac195b488fa11a3ff4d8db1628daeadd0359c3d8cd18
+size 1064

lora_checkpoints/checkpoint-4800/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "boi_token": "<start_of_image>",
+  "bos_token": {
+    "content": "<bos>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eoi_token": "<end_of_image>",
+  "eos_token": {
+    "content": "<eos>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "image_token": "<image_soft_token>",
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

lora_checkpoints/checkpoint-4800/tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1299c11d7cf632ef3b4e11937501358ada021bbdf7c47638d13c0ee982f2e79c
+size 4689074

lora_checkpoints/checkpoint-4800/tokenizer_config.json ADDED Viewed

The diff for this file is too large to render. See raw diff

lora_checkpoints/checkpoint-4800/trainer_state.json ADDED Viewed

	@@ -0,0 +1,3394 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9844891680553776,
+  "eval_steps": 500,
+  "global_step": 4800,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.00205101910011537,
+      "grad_norm": 1.9277215003967285,
+      "learning_rate": 3.6885245901639347e-06,
+      "loss": 1.4306,
+      "step": 10
+    },
+    {
+      "epoch": 0.00410203820023074,
+      "grad_norm": 0.3513035476207733,
+      "learning_rate": 7.78688524590164e-06,
+      "loss": 1.3524,
+      "step": 20
+    },
+    {
+      "epoch": 0.006153057300346109,
+      "grad_norm": 0.3364648222923279,
+      "learning_rate": 1.1885245901639344e-05,
+      "loss": 1.3188,
+      "step": 30
+    },
+    {
+      "epoch": 0.00820407640046148,
+      "grad_norm": 0.3382512927055359,
+      "learning_rate": 1.598360655737705e-05,
+      "loss": 1.3418,
+      "step": 40
+    },
+    {
+      "epoch": 0.01025509550057685,
+      "grad_norm": 0.360334575176239,
+      "learning_rate": 2.0081967213114755e-05,
+      "loss": 1.3381,
+      "step": 50
+    },
+    {
+      "epoch": 0.012306114600692218,
+      "grad_norm": 0.3408481180667877,
+      "learning_rate": 2.418032786885246e-05,
+      "loss": 1.3365,
+      "step": 60
+    },
+    {
+      "epoch": 0.014357133700807588,
+      "grad_norm": 0.36211535334587097,
+      "learning_rate": 2.8278688524590162e-05,
+      "loss": 1.3314,
+      "step": 70
+    },
+    {
+      "epoch": 0.01640815280092296,
+      "grad_norm": 0.38704580068588257,
+      "learning_rate": 3.237704918032787e-05,
+      "loss": 1.3108,
+      "step": 80
+    },
+    {
+      "epoch": 0.018459171901038327,
+      "grad_norm": 0.44303640723228455,
+      "learning_rate": 3.6475409836065576e-05,
+      "loss": 1.3073,
+      "step": 90
+    },
+    {
+      "epoch": 0.0205101910011537,
+      "grad_norm": 0.4073602557182312,
+      "learning_rate": 4.057377049180328e-05,
+      "loss": 1.2993,
+      "step": 100
+    },
+    {
+      "epoch": 0.022561210101269068,
+      "grad_norm": 0.4478100538253784,
+      "learning_rate": 4.467213114754098e-05,
+      "loss": 1.3413,
+      "step": 110
+    },
+    {
+      "epoch": 0.024612229201384436,
+      "grad_norm": 0.39146170020103455,
+      "learning_rate": 4.8770491803278687e-05,
+      "loss": 1.3168,
+      "step": 120
+    },
+    {
+      "epoch": 0.026663248301499808,
+      "grad_norm": 0.3786431849002838,
+      "learning_rate": 5.28688524590164e-05,
+      "loss": 1.2774,
+      "step": 130
+    },
+    {
+      "epoch": 0.028714267401615177,
+      "grad_norm": 0.4014948904514313,
+      "learning_rate": 5.69672131147541e-05,
+      "loss": 1.346,
+      "step": 140
+    },
+    {
+      "epoch": 0.03076528650173055,
+      "grad_norm": 0.3987842798233032,
+      "learning_rate": 6.10655737704918e-05,
+      "loss": 1.2816,
+      "step": 150
+    },
+    {
+      "epoch": 0.03281630560184592,
+      "grad_norm": 0.3897082507610321,
+      "learning_rate": 6.516393442622951e-05,
+      "loss": 1.3485,
+      "step": 160
+    },
+    {
+      "epoch": 0.034867324701961286,
+      "grad_norm": 0.373279333114624,
+      "learning_rate": 6.926229508196722e-05,
+      "loss": 1.3185,
+      "step": 170
+    },
+    {
+      "epoch": 0.036918343802076654,
+      "grad_norm": 0.3812575340270996,
+      "learning_rate": 7.336065573770491e-05,
+      "loss": 1.3394,
+      "step": 180
+    },
+    {
+      "epoch": 0.03896936290219203,
+      "grad_norm": 0.35926997661590576,
+      "learning_rate": 7.745901639344263e-05,
+      "loss": 1.2821,
+      "step": 190
+    },
+    {
+      "epoch": 0.0410203820023074,
+      "grad_norm": 0.3649434745311737,
+      "learning_rate": 8.155737704918032e-05,
+      "loss": 1.33,
+      "step": 200
+    },
+    {
+      "epoch": 0.04307140110242277,
+      "grad_norm": 0.345662921667099,
+      "learning_rate": 8.565573770491803e-05,
+      "loss": 1.3107,
+      "step": 210
+    },
+    {
+      "epoch": 0.045122420202538135,
+      "grad_norm": 0.37169769406318665,
+      "learning_rate": 8.975409836065574e-05,
+      "loss": 1.309,
+      "step": 220
+    },
+    {
+      "epoch": 0.047173439302653504,
+      "grad_norm": 0.37920281291007996,
+      "learning_rate": 9.385245901639344e-05,
+      "loss": 1.3352,
+      "step": 230
+    },
+    {
+      "epoch": 0.04922445840276887,
+      "grad_norm": 0.35772770643234253,
+      "learning_rate": 9.795081967213115e-05,
+      "loss": 1.2402,
+      "step": 240
+    },
+    {
+      "epoch": 0.05127547750288425,
+      "grad_norm": 0.38790181279182434,
+      "learning_rate": 9.989205526770294e-05,
+      "loss": 1.326,
+      "step": 250
+    },
+    {
+      "epoch": 0.053326496602999617,
+      "grad_norm": 0.3545536696910858,
+      "learning_rate": 9.967616580310882e-05,
+      "loss": 1.3173,
+      "step": 260
+    },
+    {
+      "epoch": 0.055377515703114985,
+      "grad_norm": 0.3845142722129822,
+      "learning_rate": 9.946027633851469e-05,
+      "loss": 1.2949,
+      "step": 270
+    },
+    {
+      "epoch": 0.057428534803230354,
+      "grad_norm": 0.38621339201927185,
+      "learning_rate": 9.924438687392055e-05,
+      "loss": 1.2773,
+      "step": 280
+    },
+    {
+      "epoch": 0.05947955390334572,
+      "grad_norm": 0.38091301918029785,
+      "learning_rate": 9.902849740932643e-05,
+      "loss": 1.3282,
+      "step": 290
+    },
+    {
+      "epoch": 0.0615305730034611,
+      "grad_norm": 0.37546730041503906,
+      "learning_rate": 9.88126079447323e-05,
+      "loss": 1.2862,
+      "step": 300
+    },
+    {
+      "epoch": 0.06358159210357646,
+      "grad_norm": 0.3515011966228485,
+      "learning_rate": 9.859671848013817e-05,
+      "loss": 1.2937,
+      "step": 310
+    },
+    {
+      "epoch": 0.06563261120369183,
+      "grad_norm": 0.3863738775253296,
+      "learning_rate": 9.838082901554406e-05,
+      "loss": 1.3056,
+      "step": 320
+    },
+    {
+      "epoch": 0.06768363030380721,
+      "grad_norm": 0.36615240573883057,
+      "learning_rate": 9.816493955094992e-05,
+      "loss": 1.3062,
+      "step": 330
+    },
+    {
+      "epoch": 0.06973464940392257,
+      "grad_norm": 0.37741243839263916,
+      "learning_rate": 9.794905008635579e-05,
+      "loss": 1.3094,
+      "step": 340
+    },
+    {
+      "epoch": 0.07178566850403795,
+      "grad_norm": 0.38626739382743835,
+      "learning_rate": 9.773316062176167e-05,
+      "loss": 1.2947,
+      "step": 350
+    },
+    {
+      "epoch": 0.07383668760415331,
+      "grad_norm": 0.38667401671409607,
+      "learning_rate": 9.751727115716753e-05,
+      "loss": 1.2976,
+      "step": 360
+    },
+    {
+      "epoch": 0.07588770670426868,
+      "grad_norm": 0.36084800958633423,
+      "learning_rate": 9.730138169257342e-05,
+      "loss": 1.27,
+      "step": 370
+    },
+    {
+      "epoch": 0.07793872580438406,
+      "grad_norm": 0.3754425346851349,
+      "learning_rate": 9.708549222797928e-05,
+      "loss": 1.3243,
+      "step": 380
+    },
+    {
+      "epoch": 0.07998974490449942,
+      "grad_norm": 0.39857473969459534,
+      "learning_rate": 9.686960276338515e-05,
+      "loss": 1.3077,
+      "step": 390
+    },
+    {
+      "epoch": 0.0820407640046148,
+      "grad_norm": 0.3919648230075836,
+      "learning_rate": 9.665371329879103e-05,
+      "loss": 1.2985,
+      "step": 400
+    },
+    {
+      "epoch": 0.08409178310473016,
+      "grad_norm": 0.3675483465194702,
+      "learning_rate": 9.643782383419689e-05,
+      "loss": 1.2946,
+      "step": 410
+    },
+    {
+      "epoch": 0.08614280220484553,
+      "grad_norm": 0.3898465633392334,
+      "learning_rate": 9.622193436960277e-05,
+      "loss": 1.333,
+      "step": 420
+    },
+    {
+      "epoch": 0.08819382130496091,
+      "grad_norm": 0.3681259751319885,
+      "learning_rate": 9.600604490500864e-05,
+      "loss": 1.2968,
+      "step": 430
+    },
+    {
+      "epoch": 0.09024484040507627,
+      "grad_norm": 0.36453816294670105,
+      "learning_rate": 9.57901554404145e-05,
+      "loss": 1.272,
+      "step": 440
+    },
+    {
+      "epoch": 0.09229585950519165,
+      "grad_norm": 0.34828147292137146,
+      "learning_rate": 9.557426597582039e-05,
+      "loss": 1.3245,
+      "step": 450
+    },
+    {
+      "epoch": 0.09434687860530701,
+      "grad_norm": 0.3570501208305359,
+      "learning_rate": 9.535837651122625e-05,
+      "loss": 1.313,
+      "step": 460
+    },
+    {
+      "epoch": 0.09639789770542238,
+      "grad_norm": 0.36692506074905396,
+      "learning_rate": 9.514248704663213e-05,
+      "loss": 1.2915,
+      "step": 470
+    },
+    {
+      "epoch": 0.09844891680553775,
+      "grad_norm": 0.39161381125450134,
+      "learning_rate": 9.4926597582038e-05,
+      "loss": 1.3101,
+      "step": 480
+    },
+    {
+      "epoch": 0.10049993590565312,
+      "grad_norm": 0.3808858394622803,
+      "learning_rate": 9.471070811744387e-05,
+      "loss": 1.3099,
+      "step": 490
+    },
+    {
+      "epoch": 0.1025509550057685,
+      "grad_norm": 0.3541582524776459,
+      "learning_rate": 9.449481865284975e-05,
+      "loss": 1.2772,
+      "step": 500
+    },
+    {
+      "epoch": 0.10460197410588386,
+      "grad_norm": 0.379190593957901,
+      "learning_rate": 9.427892918825562e-05,
+      "loss": 1.2914,
+      "step": 510
+    },
+    {
+      "epoch": 0.10665299320599923,
+      "grad_norm": 0.37727421522140503,
+      "learning_rate": 9.406303972366149e-05,
+      "loss": 1.2888,
+      "step": 520
+    },
+    {
+      "epoch": 0.1087040123061146,
+      "grad_norm": 0.3787306845188141,
+      "learning_rate": 9.384715025906737e-05,
+      "loss": 1.3049,
+      "step": 530
+    },
+    {
+      "epoch": 0.11075503140622997,
+      "grad_norm": 0.3831459581851959,
+      "learning_rate": 9.363126079447323e-05,
+      "loss": 1.2631,
+      "step": 540
+    },
+    {
+      "epoch": 0.11280605050634535,
+      "grad_norm": 0.37274929881095886,
+      "learning_rate": 9.34153713298791e-05,
+      "loss": 1.3313,
+      "step": 550
+    },
+    {
+      "epoch": 0.11485706960646071,
+      "grad_norm": 0.3683277368545532,
+      "learning_rate": 9.319948186528498e-05,
+      "loss": 1.2528,
+      "step": 560
+    },
+    {
+      "epoch": 0.11690808870657608,
+      "grad_norm": 0.39554840326309204,
+      "learning_rate": 9.298359240069085e-05,
+      "loss": 1.2737,
+      "step": 570
+    },
+    {
+      "epoch": 0.11895910780669144,
+      "grad_norm": 0.39166760444641113,
+      "learning_rate": 9.276770293609673e-05,
+      "loss": 1.271,
+      "step": 580
+    },
+    {
+      "epoch": 0.12101012690680682,
+      "grad_norm": 0.384085476398468,
+      "learning_rate": 9.255181347150259e-05,
+      "loss": 1.2921,
+      "step": 590
+    },
+    {
+      "epoch": 0.1230611460069222,
+      "grad_norm": 0.3704201281070709,
+      "learning_rate": 9.233592400690847e-05,
+      "loss": 1.2776,
+      "step": 600
+    },
+    {
+      "epoch": 0.12511216510703757,
+      "grad_norm": 0.3844301998615265,
+      "learning_rate": 9.212003454231434e-05,
+      "loss": 1.3067,
+      "step": 610
+    },
+    {
+      "epoch": 0.12716318420715292,
+      "grad_norm": 0.3971571922302246,
+      "learning_rate": 9.190414507772022e-05,
+      "loss": 1.2792,
+      "step": 620
+    },
+    {
+      "epoch": 0.1292142033072683,
+      "grad_norm": 0.40666353702545166,
+      "learning_rate": 9.168825561312608e-05,
+      "loss": 1.2964,
+      "step": 630
+    },
+    {
+      "epoch": 0.13126522240738367,
+      "grad_norm": 0.38252532482147217,
+      "learning_rate": 9.147236614853195e-05,
+      "loss": 1.2815,
+      "step": 640
+    },
+    {
+      "epoch": 0.13331624150749904,
+      "grad_norm": 0.37795621156692505,
+      "learning_rate": 9.125647668393783e-05,
+      "loss": 1.283,
+      "step": 650
+    },
+    {
+      "epoch": 0.13536726060761442,
+      "grad_norm": 0.4035683572292328,
+      "learning_rate": 9.10405872193437e-05,
+      "loss": 1.288,
+      "step": 660
+    },
+    {
+      "epoch": 0.13741827970772977,
+      "grad_norm": 0.410669207572937,
+      "learning_rate": 9.082469775474958e-05,
+      "loss": 1.2659,
+      "step": 670
+    },
+    {
+      "epoch": 0.13946929880784514,
+      "grad_norm": 0.3809865713119507,
+      "learning_rate": 9.060880829015544e-05,
+      "loss": 1.3133,
+      "step": 680
+    },
+    {
+      "epoch": 0.14152031790796052,
+      "grad_norm": 0.3748447597026825,
+      "learning_rate": 9.039291882556131e-05,
+      "loss": 1.2643,
+      "step": 690
+    },
+    {
+      "epoch": 0.1435713370080759,
+      "grad_norm": 0.39292991161346436,
+      "learning_rate": 9.017702936096719e-05,
+      "loss": 1.2855,
+      "step": 700
+    },
+    {
+      "epoch": 0.14562235610819127,
+      "grad_norm": 0.4399755001068115,
+      "learning_rate": 8.996113989637307e-05,
+      "loss": 1.286,
+      "step": 710
+    },
+    {
+      "epoch": 0.14767337520830662,
+      "grad_norm": 0.42447429895401,
+      "learning_rate": 8.974525043177894e-05,
+      "loss": 1.2736,
+      "step": 720
+    },
+    {
+      "epoch": 0.149724394308422,
+      "grad_norm": 0.37248438596725464,
+      "learning_rate": 8.95293609671848e-05,
+      "loss": 1.2652,
+      "step": 730
+    },
+    {
+      "epoch": 0.15177541340853737,
+      "grad_norm": 0.39122238755226135,
+      "learning_rate": 8.931347150259068e-05,
+      "loss": 1.2814,
+      "step": 740
+    },
+    {
+      "epoch": 0.15382643250865274,
+      "grad_norm": 0.3697800040245056,
+      "learning_rate": 8.909758203799655e-05,
+      "loss": 1.2462,
+      "step": 750
+    },
+    {
+      "epoch": 0.15587745160876812,
+      "grad_norm": 0.3901929259300232,
+      "learning_rate": 8.888169257340241e-05,
+      "loss": 1.2742,
+      "step": 760
+    },
+    {
+      "epoch": 0.15792847070888347,
+      "grad_norm": 0.3833727538585663,
+      "learning_rate": 8.86658031088083e-05,
+      "loss": 1.3015,
+      "step": 770
+    },
+    {
+      "epoch": 0.15997948980899884,
+      "grad_norm": 0.4028802216053009,
+      "learning_rate": 8.844991364421416e-05,
+      "loss": 1.2631,
+      "step": 780
+    },
+    {
+      "epoch": 0.16203050890911422,
+      "grad_norm": 0.39087918400764465,
+      "learning_rate": 8.823402417962004e-05,
+      "loss": 1.2993,
+      "step": 790
+    },
+    {
+      "epoch": 0.1640815280092296,
+      "grad_norm": 0.39453235268592834,
+      "learning_rate": 8.801813471502591e-05,
+      "loss": 1.2544,
+      "step": 800
+    },
+    {
+      "epoch": 0.16613254710934497,
+      "grad_norm": 0.42142602801322937,
+      "learning_rate": 8.780224525043178e-05,
+      "loss": 1.2676,
+      "step": 810
+    },
+    {
+      "epoch": 0.16818356620946032,
+      "grad_norm": 0.36646899580955505,
+      "learning_rate": 8.758635578583767e-05,
+      "loss": 1.2765,
+      "step": 820
+    },
+    {
+      "epoch": 0.1702345853095757,
+      "grad_norm": 0.4253019094467163,
+      "learning_rate": 8.737046632124353e-05,
+      "loss": 1.3003,
+      "step": 830
+    },
+    {
+      "epoch": 0.17228560440969107,
+      "grad_norm": 0.41490674018859863,
+      "learning_rate": 8.715457685664939e-05,
+      "loss": 1.2731,
+      "step": 840
+    },
+    {
+      "epoch": 0.17433662350980644,
+      "grad_norm": 0.405460387468338,
+      "learning_rate": 8.693868739205528e-05,
+      "loss": 1.2122,
+      "step": 850
+    },
+    {
+      "epoch": 0.17638764260992182,
+      "grad_norm": 0.4028235375881195,
+      "learning_rate": 8.672279792746114e-05,
+      "loss": 1.3238,
+      "step": 860
+    },
+    {
+      "epoch": 0.17843866171003717,
+      "grad_norm": 0.38994792103767395,
+      "learning_rate": 8.650690846286701e-05,
+      "loss": 1.2875,
+      "step": 870
+    },
+    {
+      "epoch": 0.18048968081015254,
+      "grad_norm": 0.4099538326263428,
+      "learning_rate": 8.629101899827289e-05,
+      "loss": 1.2807,
+      "step": 880
+    },
+    {
+      "epoch": 0.18254069991026792,
+      "grad_norm": 0.40470021963119507,
+      "learning_rate": 8.607512953367875e-05,
+      "loss": 1.2802,
+      "step": 890
+    },
+    {
+      "epoch": 0.1845917190103833,
+      "grad_norm": 0.4066854417324066,
+      "learning_rate": 8.585924006908464e-05,
+      "loss": 1.2464,
+      "step": 900
+    },
+    {
+      "epoch": 0.18664273811049864,
+      "grad_norm": 0.38739994168281555,
+      "learning_rate": 8.56433506044905e-05,
+      "loss": 1.2831,
+      "step": 910
+    },
+    {
+      "epoch": 0.18869375721061402,
+      "grad_norm": 0.4257420301437378,
+      "learning_rate": 8.542746113989638e-05,
+      "loss": 1.2679,
+      "step": 920
+    },
+    {
+      "epoch": 0.1907447763107294,
+      "grad_norm": 0.41571488976478577,
+      "learning_rate": 8.521157167530225e-05,
+      "loss": 1.2501,
+      "step": 930
+    },
+    {
+      "epoch": 0.19279579541084477,
+      "grad_norm": 0.4178495407104492,
+      "learning_rate": 8.499568221070811e-05,
+      "loss": 1.2657,
+      "step": 940
+    },
+    {
+      "epoch": 0.19484681451096014,
+      "grad_norm": 0.4083455801010132,
+      "learning_rate": 8.477979274611399e-05,
+      "loss": 1.2781,
+      "step": 950
+    },
+    {
+      "epoch": 0.1968978336110755,
+      "grad_norm": 0.4067554175853729,
+      "learning_rate": 8.456390328151986e-05,
+      "loss": 1.2582,
+      "step": 960
+    },
+    {
+      "epoch": 0.19894885271119087,
+      "grad_norm": 0.4067447781562805,
+      "learning_rate": 8.434801381692574e-05,
+      "loss": 1.2948,
+      "step": 970
+    },
+    {
+      "epoch": 0.20099987181130624,
+      "grad_norm": 0.44283562898635864,
+      "learning_rate": 8.413212435233161e-05,
+      "loss": 1.3011,
+      "step": 980
+    },
+    {
+      "epoch": 0.20305089091142162,
+      "grad_norm": 0.41568294167518616,
+      "learning_rate": 8.391623488773748e-05,
+      "loss": 1.2804,
+      "step": 990
+    },
+    {
+      "epoch": 0.205101910011537,
+      "grad_norm": 0.4183642864227295,
+      "learning_rate": 8.370034542314335e-05,
+      "loss": 1.2228,
+      "step": 1000
+    },
+    {
+      "epoch": 0.20715292911165234,
+      "grad_norm": 0.4311917722225189,
+      "learning_rate": 8.348445595854923e-05,
+      "loss": 1.2714,
+      "step": 1010
+    },
+    {
+      "epoch": 0.20920394821176772,
+      "grad_norm": 0.41575828194618225,
+      "learning_rate": 8.32685664939551e-05,
+      "loss": 1.2783,
+      "step": 1020
+    },
+    {
+      "epoch": 0.2112549673118831,
+      "grad_norm": 0.3958878815174103,
+      "learning_rate": 8.305267702936098e-05,
+      "loss": 1.2558,
+      "step": 1030
+    },
+    {
+      "epoch": 0.21330598641199847,
+      "grad_norm": 0.43759557604789734,
+      "learning_rate": 8.283678756476684e-05,
+      "loss": 1.2557,
+      "step": 1040
+    },
+    {
+      "epoch": 0.21535700551211384,
+      "grad_norm": 0.41460636258125305,
+      "learning_rate": 8.262089810017271e-05,
+      "loss": 1.2851,
+      "step": 1050
+    },
+    {
+      "epoch": 0.2174080246122292,
+      "grad_norm": 0.4114689826965332,
+      "learning_rate": 8.240500863557859e-05,
+      "loss": 1.3076,
+      "step": 1060
+    },
+    {
+      "epoch": 0.21945904371234456,
+      "grad_norm": 0.42222094535827637,
+      "learning_rate": 8.218911917098446e-05,
+      "loss": 1.2263,
+      "step": 1070
+    },
+    {
+      "epoch": 0.22151006281245994,
+      "grad_norm": 0.4098639488220215,
+      "learning_rate": 8.197322970639033e-05,
+      "loss": 1.2779,
+      "step": 1080
+    },
+    {
+      "epoch": 0.22356108191257532,
+      "grad_norm": 0.4205043315887451,
+      "learning_rate": 8.175734024179621e-05,
+      "loss": 1.2177,
+      "step": 1090
+    },
+    {
+      "epoch": 0.2256121010126907,
+      "grad_norm": 0.4501648247241974,
+      "learning_rate": 8.154145077720208e-05,
+      "loss": 1.3227,
+      "step": 1100
+    },
+    {
+      "epoch": 0.22766312011280604,
+      "grad_norm": 0.41510599851608276,
+      "learning_rate": 8.132556131260795e-05,
+      "loss": 1.3177,
+      "step": 1110
+    },
+    {
+      "epoch": 0.22971413921292141,
+      "grad_norm": 0.41567444801330566,
+      "learning_rate": 8.110967184801383e-05,
+      "loss": 1.2506,
+      "step": 1120
+    },
+    {
+      "epoch": 0.2317651583130368,
+      "grad_norm": 0.4262779653072357,
+      "learning_rate": 8.089378238341969e-05,
+      "loss": 1.2506,
+      "step": 1130
+    },
+    {
+      "epoch": 0.23381617741315217,
+      "grad_norm": 0.4220465421676636,
+      "learning_rate": 8.067789291882558e-05,
+      "loss": 1.2514,
+      "step": 1140
+    },
+    {
+      "epoch": 0.23586719651326754,
+      "grad_norm": 0.4169275462627411,
+      "learning_rate": 8.046200345423144e-05,
+      "loss": 1.2693,
+      "step": 1150
+    },
+    {
+      "epoch": 0.2379182156133829,
+      "grad_norm": 0.43145328760147095,
+      "learning_rate": 8.02461139896373e-05,
+      "loss": 1.2394,
+      "step": 1160
+    },
+    {
+      "epoch": 0.23996923471349826,
+      "grad_norm": 0.42889878153800964,
+      "learning_rate": 8.003022452504319e-05,
+      "loss": 1.248,
+      "step": 1170
+    },
+    {
+      "epoch": 0.24202025381361364,
+      "grad_norm": 0.41731464862823486,
+      "learning_rate": 7.981433506044905e-05,
+      "loss": 1.2498,
+      "step": 1180
+    },
+    {
+      "epoch": 0.24407127291372901,
+      "grad_norm": 0.4326362609863281,
+      "learning_rate": 7.959844559585493e-05,
+      "loss": 1.265,
+      "step": 1190
+    },
+    {
+      "epoch": 0.2461222920138444,
+      "grad_norm": 0.4242352843284607,
+      "learning_rate": 7.93825561312608e-05,
+      "loss": 1.2672,
+      "step": 1200
+    },
+    {
+      "epoch": 0.24817331111395974,
+      "grad_norm": 0.4441153407096863,
+      "learning_rate": 7.916666666666666e-05,
+      "loss": 1.2944,
+      "step": 1210
+    },
+    {
+      "epoch": 0.25022433021407514,
+      "grad_norm": 0.40912818908691406,
+      "learning_rate": 7.895077720207255e-05,
+      "loss": 1.2702,
+      "step": 1220
+    },
+    {
+      "epoch": 0.2522753493141905,
+      "grad_norm": 0.44539037346839905,
+      "learning_rate": 7.873488773747841e-05,
+      "loss": 1.2228,
+      "step": 1230
+    },
+    {
+      "epoch": 0.25432636841430584,
+      "grad_norm": 0.4299303889274597,
+      "learning_rate": 7.851899827288429e-05,
+      "loss": 1.2328,
+      "step": 1240
+    },
+    {
+      "epoch": 0.25637738751442124,
+      "grad_norm": 0.4408973455429077,
+      "learning_rate": 7.830310880829016e-05,
+      "loss": 1.2358,
+      "step": 1250
+    },
+    {
+      "epoch": 0.2584284066145366,
+      "grad_norm": 0.4100968837738037,
+      "learning_rate": 7.808721934369602e-05,
+      "loss": 1.2458,
+      "step": 1260
+    },
+    {
+      "epoch": 0.260479425714652,
+      "grad_norm": 0.4401489198207855,
+      "learning_rate": 7.787132987910191e-05,
+      "loss": 1.2593,
+      "step": 1270
+    },
+    {
+      "epoch": 0.26253044481476734,
+      "grad_norm": 0.4514229893684387,
+      "learning_rate": 7.765544041450777e-05,
+      "loss": 1.2632,
+      "step": 1280
+    },
+    {
+      "epoch": 0.2645814639148827,
+      "grad_norm": 0.38684791326522827,
+      "learning_rate": 7.743955094991365e-05,
+      "loss": 1.2424,
+      "step": 1290
+    },
+    {
+      "epoch": 0.2666324830149981,
+      "grad_norm": 0.46148189902305603,
+      "learning_rate": 7.722366148531953e-05,
+      "loss": 1.2445,
+      "step": 1300
+    },
+    {
+      "epoch": 0.26868350211511344,
+      "grad_norm": 0.4319213628768921,
+      "learning_rate": 7.700777202072539e-05,
+      "loss": 1.2253,
+      "step": 1310
+    },
+    {
+      "epoch": 0.27073452121522884,
+      "grad_norm": 0.4195545017719269,
+      "learning_rate": 7.679188255613126e-05,
+      "loss": 1.2578,
+      "step": 1320
+    },
+    {
+      "epoch": 0.2727855403153442,
+      "grad_norm": 0.43690159916877747,
+      "learning_rate": 7.657599309153714e-05,
+      "loss": 1.2573,
+      "step": 1330
+    },
+    {
+      "epoch": 0.27483655941545954,
+      "grad_norm": 0.44571492075920105,
+      "learning_rate": 7.636010362694301e-05,
+      "loss": 1.2607,
+      "step": 1340
+    },
+    {
+      "epoch": 0.27688757851557494,
+      "grad_norm": 0.43295958638191223,
+      "learning_rate": 7.614421416234889e-05,
+      "loss": 1.2278,
+      "step": 1350
+    },
+    {
+      "epoch": 0.2789385976156903,
+      "grad_norm": 0.44495707750320435,
+      "learning_rate": 7.592832469775475e-05,
+      "loss": 1.2798,
+      "step": 1360
+    },
+    {
+      "epoch": 0.2809896167158057,
+      "grad_norm": 0.4412330985069275,
+      "learning_rate": 7.571243523316062e-05,
+      "loss": 1.2501,
+      "step": 1370
+    },
+    {
+      "epoch": 0.28304063581592104,
+      "grad_norm": 0.44599953293800354,
+      "learning_rate": 7.54965457685665e-05,
+      "loss": 1.2396,
+      "step": 1380
+    },
+    {
+      "epoch": 0.2850916549160364,
+      "grad_norm": 0.447109580039978,
+      "learning_rate": 7.528065630397237e-05,
+      "loss": 1.2767,
+      "step": 1390
+    },
+    {
+      "epoch": 0.2871426740161518,
+      "grad_norm": 0.44506722688674927,
+      "learning_rate": 7.506476683937824e-05,
+      "loss": 1.2546,
+      "step": 1400
+    },
+    {
+      "epoch": 0.28919369311626714,
+      "grad_norm": 0.44061776995658875,
+      "learning_rate": 7.484887737478411e-05,
+      "loss": 1.2413,
+      "step": 1410
+    },
+    {
+      "epoch": 0.29124471221638254,
+      "grad_norm": 0.45085111260414124,
+      "learning_rate": 7.463298791018999e-05,
+      "loss": 1.2483,
+      "step": 1420
+    },
+    {
+      "epoch": 0.2932957313164979,
+      "grad_norm": 0.4437837600708008,
+      "learning_rate": 7.441709844559586e-05,
+      "loss": 1.252,
+      "step": 1430
+    },
+    {
+      "epoch": 0.29534675041661324,
+      "grad_norm": 0.4294221103191376,
+      "learning_rate": 7.420120898100174e-05,
+      "loss": 1.2386,
+      "step": 1440
+    },
+    {
+      "epoch": 0.29739776951672864,
+      "grad_norm": 0.4780830144882202,
+      "learning_rate": 7.39853195164076e-05,
+      "loss": 1.2639,
+      "step": 1450
+    },
+    {
+      "epoch": 0.299448788616844,
+      "grad_norm": 0.44152942299842834,
+      "learning_rate": 7.376943005181347e-05,
+      "loss": 1.2756,
+      "step": 1460
+    },
+    {
+      "epoch": 0.3014998077169594,
+      "grad_norm": 0.41989192366600037,
+      "learning_rate": 7.355354058721935e-05,
+      "loss": 1.2614,
+      "step": 1470
+    },
+    {
+      "epoch": 0.30355082681707474,
+      "grad_norm": 0.5871754884719849,
+      "learning_rate": 7.333765112262521e-05,
+      "loss": 1.2615,
+      "step": 1480
+    },
+    {
+      "epoch": 0.3056018459171901,
+      "grad_norm": 0.4467261731624603,
+      "learning_rate": 7.31217616580311e-05,
+      "loss": 1.2624,
+      "step": 1490
+    },
+    {
+      "epoch": 0.3076528650173055,
+      "grad_norm": 0.49219033122062683,
+      "learning_rate": 7.290587219343696e-05,
+      "loss": 1.289,
+      "step": 1500
+    },
+    {
+      "epoch": 0.30970388411742084,
+      "grad_norm": 0.4700734317302704,
+      "learning_rate": 7.268998272884284e-05,
+      "loss": 1.242,
+      "step": 1510
+    },
+    {
+      "epoch": 0.31175490321753624,
+      "grad_norm": 0.4607170820236206,
+      "learning_rate": 7.247409326424871e-05,
+      "loss": 1.2554,
+      "step": 1520
+    },
+    {
+      "epoch": 0.3138059223176516,
+      "grad_norm": 0.4335988759994507,
+      "learning_rate": 7.225820379965457e-05,
+      "loss": 1.2423,
+      "step": 1530
+    },
+    {
+      "epoch": 0.31585694141776693,
+      "grad_norm": 0.4366897940635681,
+      "learning_rate": 7.204231433506046e-05,
+      "loss": 1.2219,
+      "step": 1540
+    },
+    {
+      "epoch": 0.31790796051788234,
+      "grad_norm": 0.45856085419654846,
+      "learning_rate": 7.182642487046632e-05,
+      "loss": 1.2189,
+      "step": 1550
+    },
+    {
+      "epoch": 0.3199589796179977,
+      "grad_norm": 0.4563063085079193,
+      "learning_rate": 7.16105354058722e-05,
+      "loss": 1.2696,
+      "step": 1560
+    },
+    {
+      "epoch": 0.3220099987181131,
+      "grad_norm": 0.4276934862136841,
+      "learning_rate": 7.139464594127807e-05,
+      "loss": 1.2659,
+      "step": 1570
+    },
+    {
+      "epoch": 0.32406101781822844,
+      "grad_norm": 0.46200886368751526,
+      "learning_rate": 7.117875647668394e-05,
+      "loss": 1.2261,
+      "step": 1580
+    },
+    {
+      "epoch": 0.3261120369183438,
+      "grad_norm": 0.4863358736038208,
+      "learning_rate": 7.096286701208982e-05,
+      "loss": 1.2292,
+      "step": 1590
+    },
+    {
+      "epoch": 0.3281630560184592,
+      "grad_norm": 0.4537160098552704,
+      "learning_rate": 7.074697754749569e-05,
+      "loss": 1.2453,
+      "step": 1600
+    },
+    {
+      "epoch": 0.33021407511857453,
+      "grad_norm": 0.4507627487182617,
+      "learning_rate": 7.053108808290155e-05,
+      "loss": 1.2081,
+      "step": 1610
+    },
+    {
+      "epoch": 0.33226509421868994,
+      "grad_norm": 0.43197301030158997,
+      "learning_rate": 7.031519861830744e-05,
+      "loss": 1.2757,
+      "step": 1620
+    },
+    {
+      "epoch": 0.3343161133188053,
+      "grad_norm": 0.4551820456981659,
+      "learning_rate": 7.00993091537133e-05,
+      "loss": 1.2751,
+      "step": 1630
+    },
+    {
+      "epoch": 0.33636713241892063,
+      "grad_norm": 0.45099398493766785,
+      "learning_rate": 6.988341968911917e-05,
+      "loss": 1.2583,
+      "step": 1640
+    },
+    {
+      "epoch": 0.33841815151903604,
+      "grad_norm": 0.46787434816360474,
+      "learning_rate": 6.966753022452505e-05,
+      "loss": 1.2448,
+      "step": 1650
+    },
+    {
+      "epoch": 0.3404691706191514,
+      "grad_norm": 0.45500054955482483,
+      "learning_rate": 6.945164075993091e-05,
+      "loss": 1.2394,
+      "step": 1660
+    },
+    {
+      "epoch": 0.3425201897192668,
+      "grad_norm": 0.4682730436325073,
+      "learning_rate": 6.92357512953368e-05,
+      "loss": 1.2287,
+      "step": 1670
+    },
+    {
+      "epoch": 0.34457120881938214,
+      "grad_norm": 0.4615074396133423,
+      "learning_rate": 6.901986183074266e-05,
+      "loss": 1.2042,
+      "step": 1680
+    },
+    {
+      "epoch": 0.3466222279194975,
+      "grad_norm": 0.4548027217388153,
+      "learning_rate": 6.880397236614854e-05,
+      "loss": 1.2671,
+      "step": 1690
+    },
+    {
+      "epoch": 0.3486732470196129,
+      "grad_norm": 0.4783169627189636,
+      "learning_rate": 6.858808290155441e-05,
+      "loss": 1.2533,
+      "step": 1700
+    },
+    {
+      "epoch": 0.35072426611972823,
+      "grad_norm": 0.46452414989471436,
+      "learning_rate": 6.837219343696027e-05,
+      "loss": 1.2681,
+      "step": 1710
+    },
+    {
+      "epoch": 0.35277528521984364,
+      "grad_norm": 0.4663463532924652,
+      "learning_rate": 6.815630397236615e-05,
+      "loss": 1.2561,
+      "step": 1720
+    },
+    {
+      "epoch": 0.354826304319959,
+      "grad_norm": 0.46744370460510254,
+      "learning_rate": 6.794041450777202e-05,
+      "loss": 1.2453,
+      "step": 1730
+    },
+    {
+      "epoch": 0.35687732342007433,
+      "grad_norm": 0.471835732460022,
+      "learning_rate": 6.77245250431779e-05,
+      "loss": 1.2472,
+      "step": 1740
+    },
+    {
+      "epoch": 0.35892834252018974,
+      "grad_norm": 0.4618450701236725,
+      "learning_rate": 6.750863557858377e-05,
+      "loss": 1.2547,
+      "step": 1750
+    },
+    {
+      "epoch": 0.3609793616203051,
+      "grad_norm": 0.4651658833026886,
+      "learning_rate": 6.729274611398963e-05,
+      "loss": 1.2623,
+      "step": 1760
+    },
+    {
+      "epoch": 0.36303038072042043,
+      "grad_norm": 0.46842116117477417,
+      "learning_rate": 6.707685664939551e-05,
+      "loss": 1.2391,
+      "step": 1770
+    },
+    {
+      "epoch": 0.36508139982053583,
+      "grad_norm": 0.45604613423347473,
+      "learning_rate": 6.686096718480138e-05,
+      "loss": 1.2884,
+      "step": 1780
+    },
+    {
+      "epoch": 0.3671324189206512,
+      "grad_norm": 0.4306802451610565,
+      "learning_rate": 6.664507772020726e-05,
+      "loss": 1.2252,
+      "step": 1790
+    },
+    {
+      "epoch": 0.3691834380207666,
+      "grad_norm": 0.4549136757850647,
+      "learning_rate": 6.642918825561312e-05,
+      "loss": 1.2496,
+      "step": 1800
+    },
+    {
+      "epoch": 0.37123445712088193,
+      "grad_norm": 0.47443437576293945,
+      "learning_rate": 6.6213298791019e-05,
+      "loss": 1.2655,
+      "step": 1810
+    },
+    {
+      "epoch": 0.3732854762209973,
+      "grad_norm": 0.46772050857543945,
+      "learning_rate": 6.599740932642487e-05,
+      "loss": 1.2366,
+      "step": 1820
+    },
+    {
+      "epoch": 0.3753364953211127,
+      "grad_norm": 0.4691794216632843,
+      "learning_rate": 6.578151986183075e-05,
+      "loss": 1.2152,
+      "step": 1830
+    },
+    {
+      "epoch": 0.37738751442122803,
+      "grad_norm": 0.43691304326057434,
+      "learning_rate": 6.556563039723662e-05,
+      "loss": 1.2511,
+      "step": 1840
+    },
+    {
+      "epoch": 0.37943853352134344,
+      "grad_norm": 0.4595348536968231,
+      "learning_rate": 6.534974093264248e-05,
+      "loss": 1.2635,
+      "step": 1850
+    },
+    {
+      "epoch": 0.3814895526214588,
+      "grad_norm": 0.44760558009147644,
+      "learning_rate": 6.513385146804836e-05,
+      "loss": 1.2342,
+      "step": 1860
+    },
+    {
+      "epoch": 0.38354057172157413,
+      "grad_norm": 0.4559841454029083,
+      "learning_rate": 6.491796200345423e-05,
+      "loss": 1.2432,
+      "step": 1870
+    },
+    {
+      "epoch": 0.38559159082168953,
+      "grad_norm": 0.4497215449810028,
+      "learning_rate": 6.470207253886011e-05,
+      "loss": 1.2267,
+      "step": 1880
+    },
+    {
+      "epoch": 0.3876426099218049,
+      "grad_norm": 0.4863613247871399,
+      "learning_rate": 6.448618307426598e-05,
+      "loss": 1.254,
+      "step": 1890
+    },
+    {
+      "epoch": 0.3896936290219203,
+      "grad_norm": 0.4500603675842285,
+      "learning_rate": 6.427029360967185e-05,
+      "loss": 1.2214,
+      "step": 1900
+    },
+    {
+      "epoch": 0.39174464812203563,
+      "grad_norm": 0.4400598704814911,
+      "learning_rate": 6.405440414507774e-05,
+      "loss": 1.2352,
+      "step": 1910
+    },
+    {
+      "epoch": 0.393795667222151,
+      "grad_norm": 0.46070367097854614,
+      "learning_rate": 6.38385146804836e-05,
+      "loss": 1.2468,
+      "step": 1920
+    },
+    {
+      "epoch": 0.3958466863222664,
+      "grad_norm": 0.44312766194343567,
+      "learning_rate": 6.362262521588946e-05,
+      "loss": 1.1923,
+      "step": 1930
+    },
+    {
+      "epoch": 0.39789770542238173,
+      "grad_norm": 0.5013573169708252,
+      "learning_rate": 6.340673575129535e-05,
+      "loss": 1.2361,
+      "step": 1940
+    },
+    {
+      "epoch": 0.39994872452249713,
+      "grad_norm": 0.4884537160396576,
+      "learning_rate": 6.319084628670121e-05,
+      "loss": 1.2434,
+      "step": 1950
+    },
+    {
+      "epoch": 0.4019997436226125,
+      "grad_norm": 0.46138620376586914,
+      "learning_rate": 6.297495682210708e-05,
+      "loss": 1.257,
+      "step": 1960
+    },
+    {
+      "epoch": 0.40405076272272783,
+      "grad_norm": 0.4941729009151459,
+      "learning_rate": 6.275906735751296e-05,
+      "loss": 1.2347,
+      "step": 1970
+    },
+    {
+      "epoch": 0.40610178182284323,
+      "grad_norm": 0.4675595760345459,
+      "learning_rate": 6.254317789291882e-05,
+      "loss": 1.2353,
+      "step": 1980
+    },
+    {
+      "epoch": 0.4081528009229586,
+      "grad_norm": 0.47944632172584534,
+      "learning_rate": 6.232728842832471e-05,
+      "loss": 1.2643,
+      "step": 1990
+    },
+    {
+      "epoch": 0.410203820023074,
+      "grad_norm": 0.4476461112499237,
+      "learning_rate": 6.211139896373057e-05,
+      "loss": 1.2558,
+      "step": 2000
+    },
+    {
+      "epoch": 0.41225483912318933,
+      "grad_norm": 0.4706653654575348,
+      "learning_rate": 6.189550949913645e-05,
+      "loss": 1.227,
+      "step": 2010
+    },
+    {
+      "epoch": 0.4143058582233047,
+      "grad_norm": 0.48062801361083984,
+      "learning_rate": 6.167962003454232e-05,
+      "loss": 1.2273,
+      "step": 2020
+    },
+    {
+      "epoch": 0.4163568773234201,
+      "grad_norm": 0.46771204471588135,
+      "learning_rate": 6.146373056994818e-05,
+      "loss": 1.2268,
+      "step": 2030
+    },
+    {
+      "epoch": 0.41840789642353543,
+      "grad_norm": 0.4725424647331238,
+      "learning_rate": 6.124784110535406e-05,
+      "loss": 1.2009,
+      "step": 2040
+    },
+    {
+      "epoch": 0.42045891552365083,
+      "grad_norm": 0.47520384192466736,
+      "learning_rate": 6.1031951640759934e-05,
+      "loss": 1.2511,
+      "step": 2050
+    },
+    {
+      "epoch": 0.4225099346237662,
+      "grad_norm": 0.44635480642318726,
+      "learning_rate": 6.081606217616581e-05,
+      "loss": 1.21,
+      "step": 2060
+    },
+    {
+      "epoch": 0.42456095372388153,
+      "grad_norm": 0.47436651587486267,
+      "learning_rate": 6.060017271157168e-05,
+      "loss": 1.2116,
+      "step": 2070
+    },
+    {
+      "epoch": 0.42661197282399693,
+      "grad_norm": 0.5115741491317749,
+      "learning_rate": 6.0384283246977546e-05,
+      "loss": 1.2778,
+      "step": 2080
+    },
+    {
+      "epoch": 0.4286629919241123,
+      "grad_norm": 0.4488040506839752,
+      "learning_rate": 6.016839378238343e-05,
+      "loss": 1.2242,
+      "step": 2090
+    },
+    {
+      "epoch": 0.4307140110242277,
+      "grad_norm": 0.4834796190261841,
+      "learning_rate": 5.9952504317789296e-05,
+      "loss": 1.2357,
+      "step": 2100
+    },
+    {
+      "epoch": 0.43276503012434303,
+      "grad_norm": 0.45478227734565735,
+      "learning_rate": 5.973661485319517e-05,
+      "loss": 1.2233,
+      "step": 2110
+    },
+    {
+      "epoch": 0.4348160492244584,
+      "grad_norm": 0.4539099633693695,
+      "learning_rate": 5.952072538860104e-05,
+      "loss": 1.2527,
+      "step": 2120
+    },
+    {
+      "epoch": 0.4368670683245738,
+      "grad_norm": 0.47722533345222473,
+      "learning_rate": 5.930483592400691e-05,
+      "loss": 1.2015,
+      "step": 2130
+    },
+    {
+      "epoch": 0.43891808742468913,
+      "grad_norm": 0.472023069858551,
+      "learning_rate": 5.908894645941278e-05,
+      "loss": 1.2222,
+      "step": 2140
+    },
+    {
+      "epoch": 0.44096910652480453,
+      "grad_norm": 0.4648214876651764,
+      "learning_rate": 5.887305699481865e-05,
+      "loss": 1.2112,
+      "step": 2150
+    },
+    {
+      "epoch": 0.4430201256249199,
+      "grad_norm": 0.48654377460479736,
+      "learning_rate": 5.8657167530224534e-05,
+      "loss": 1.227,
+      "step": 2160
+    },
+    {
+      "epoch": 0.44507114472503523,
+      "grad_norm": 0.4997814893722534,
+      "learning_rate": 5.84412780656304e-05,
+      "loss": 1.2721,
+      "step": 2170
+    },
+    {
+      "epoch": 0.44712216382515063,
+      "grad_norm": 0.47997352480888367,
+      "learning_rate": 5.822538860103627e-05,
+      "loss": 1.2018,
+      "step": 2180
+    },
+    {
+      "epoch": 0.449173182925266,
+      "grad_norm": 0.4899247884750366,
+      "learning_rate": 5.8009499136442146e-05,
+      "loss": 1.2599,
+      "step": 2190
+    },
+    {
+      "epoch": 0.4512242020253814,
+      "grad_norm": 0.4752749800682068,
+      "learning_rate": 5.7793609671848014e-05,
+      "loss": 1.2171,
+      "step": 2200
+    },
+    {
+      "epoch": 0.45327522112549673,
+      "grad_norm": 0.4801314175128937,
+      "learning_rate": 5.7577720207253896e-05,
+      "loss": 1.2234,
+      "step": 2210
+    },
+    {
+      "epoch": 0.4553262402256121,
+      "grad_norm": 0.4591893255710602,
+      "learning_rate": 5.7361830742659764e-05,
+      "loss": 1.2242,
+      "step": 2220
+    },
+    {
+      "epoch": 0.4573772593257275,
+      "grad_norm": 0.46896713972091675,
+      "learning_rate": 5.7145941278065626e-05,
+      "loss": 1.2117,
+      "step": 2230
+    },
+    {
+      "epoch": 0.45942827842584283,
+      "grad_norm": 0.4853857755661011,
+      "learning_rate": 5.693005181347151e-05,
+      "loss": 1.2218,
+      "step": 2240
+    },
+    {
+      "epoch": 0.46147929752595823,
+      "grad_norm": 0.4648151993751526,
+      "learning_rate": 5.6714162348877376e-05,
+      "loss": 1.2401,
+      "step": 2250
+    },
+    {
+      "epoch": 0.4635303166260736,
+      "grad_norm": 0.4839739501476288,
+      "learning_rate": 5.649827288428325e-05,
+      "loss": 1.1976,
+      "step": 2260
+    },
+    {
+      "epoch": 0.4655813357261889,
+      "grad_norm": 0.4986715018749237,
+      "learning_rate": 5.628238341968912e-05,
+      "loss": 1.2274,
+      "step": 2270
+    },
+    {
+      "epoch": 0.46763235482630433,
+      "grad_norm": 0.4636840522289276,
+      "learning_rate": 5.606649395509499e-05,
+      "loss": 1.236,
+      "step": 2280
+    },
+    {
+      "epoch": 0.4696833739264197,
+      "grad_norm": 0.5011271834373474,
+      "learning_rate": 5.585060449050087e-05,
+      "loss": 1.2275,
+      "step": 2290
+    },
+    {
+      "epoch": 0.4717343930265351,
+      "grad_norm": 0.4648337662220001,
+      "learning_rate": 5.563471502590674e-05,
+      "loss": 1.2457,
+      "step": 2300
+    },
+    {
+      "epoch": 0.47378541212665043,
+      "grad_norm": 0.47708699107170105,
+      "learning_rate": 5.5418825561312614e-05,
+      "loss": 1.2316,
+      "step": 2310
+    },
+    {
+      "epoch": 0.4758364312267658,
+      "grad_norm": 0.4954835772514343,
+      "learning_rate": 5.520293609671848e-05,
+      "loss": 1.229,
+      "step": 2320
+    },
+    {
+      "epoch": 0.4778874503268812,
+      "grad_norm": 0.4701727330684662,
+      "learning_rate": 5.498704663212435e-05,
+      "loss": 1.248,
+      "step": 2330
+    },
+    {
+      "epoch": 0.47993846942699653,
+      "grad_norm": 0.4796009957790375,
+      "learning_rate": 5.477115716753023e-05,
+      "loss": 1.2248,
+      "step": 2340
+    },
+    {
+      "epoch": 0.48198948852711193,
+      "grad_norm": 0.4906330406665802,
+      "learning_rate": 5.4555267702936094e-05,
+      "loss": 1.2628,
+      "step": 2350
+    },
+    {
+      "epoch": 0.4840405076272273,
+      "grad_norm": 0.47203144431114197,
+      "learning_rate": 5.4339378238341976e-05,
+      "loss": 1.2067,
+      "step": 2360
+    },
+    {
+      "epoch": 0.4860915267273426,
+      "grad_norm": 0.503813624382019,
+      "learning_rate": 5.4123488773747845e-05,
+      "loss": 1.2006,
+      "step": 2370
+    },
+    {
+      "epoch": 0.48814254582745803,
+      "grad_norm": 0.4918235242366791,
+      "learning_rate": 5.390759930915371e-05,
+      "loss": 1.1887,
+      "step": 2380
+    },
+    {
+      "epoch": 0.4901935649275734,
+      "grad_norm": 0.4799112379550934,
+      "learning_rate": 5.369170984455959e-05,
+      "loss": 1.2079,
+      "step": 2390
+    },
+    {
+      "epoch": 0.4922445840276888,
+      "grad_norm": 0.4769650101661682,
+      "learning_rate": 5.347582037996546e-05,
+      "loss": 1.1945,
+      "step": 2400
+    },
+    {
+      "epoch": 0.49429560312780413,
+      "grad_norm": 0.5079638957977295,
+      "learning_rate": 5.325993091537134e-05,
+      "loss": 1.2294,
+      "step": 2410
+    },
+    {
+      "epoch": 0.4963466222279195,
+      "grad_norm": 0.520418643951416,
+      "learning_rate": 5.304404145077721e-05,
+      "loss": 1.2308,
+      "step": 2420
+    },
+    {
+      "epoch": 0.4983976413280349,
+      "grad_norm": 0.4546453058719635,
+      "learning_rate": 5.2828151986183075e-05,
+      "loss": 1.2206,
+      "step": 2430
+    },
+    {
+      "epoch": 0.5004486604281503,
+      "grad_norm": 0.47760534286499023,
+      "learning_rate": 5.261226252158895e-05,
+      "loss": 1.208,
+      "step": 2440
+    },
+    {
+      "epoch": 0.5024996795282656,
+      "grad_norm": 0.5267066955566406,
+      "learning_rate": 5.239637305699482e-05,
+      "loss": 1.2123,
+      "step": 2450
+    },
+    {
+      "epoch": 0.504550698628381,
+      "grad_norm": 0.45763811469078064,
+      "learning_rate": 5.2180483592400694e-05,
+      "loss": 1.2159,
+      "step": 2460
+    },
+    {
+      "epoch": 0.5066017177284964,
+      "grad_norm": 0.4922376871109009,
+      "learning_rate": 5.196459412780656e-05,
+      "loss": 1.2456,
+      "step": 2470
+    },
+    {
+      "epoch": 0.5086527368286117,
+      "grad_norm": 0.47043368220329285,
+      "learning_rate": 5.174870466321243e-05,
+      "loss": 1.2052,
+      "step": 2480
+    },
+    {
+      "epoch": 0.5107037559287271,
+      "grad_norm": 0.5082889795303345,
+      "learning_rate": 5.153281519861831e-05,
+      "loss": 1.2393,
+      "step": 2490
+    },
+    {
+      "epoch": 0.5127547750288425,
+      "grad_norm": 0.4955206513404846,
+      "learning_rate": 5.131692573402418e-05,
+      "loss": 1.2323,
+      "step": 2500
+    },
+    {
+      "epoch": 0.5148057941289578,
+      "grad_norm": 0.48625460267066956,
+      "learning_rate": 5.1101036269430057e-05,
+      "loss": 1.206,
+      "step": 2510
+    },
+    {
+      "epoch": 0.5168568132290732,
+      "grad_norm": 0.49060237407684326,
+      "learning_rate": 5.0885146804835925e-05,
+      "loss": 1.2353,
+      "step": 2520
+    },
+    {
+      "epoch": 0.5189078323291886,
+      "grad_norm": 0.46809640526771545,
+      "learning_rate": 5.0669257340241793e-05,
+      "loss": 1.2287,
+      "step": 2530
+    },
+    {
+      "epoch": 0.520958851429304,
+      "grad_norm": 0.4944596290588379,
+      "learning_rate": 5.0453367875647675e-05,
+      "loss": 1.2413,
+      "step": 2540
+    },
+    {
+      "epoch": 0.5230098705294193,
+      "grad_norm": 0.46914994716644287,
+      "learning_rate": 5.023747841105354e-05,
+      "loss": 1.22,
+      "step": 2550
+    },
+    {
+      "epoch": 0.5250608896295347,
+      "grad_norm": 0.4888727366924286,
+      "learning_rate": 5.002158894645942e-05,
+      "loss": 1.2343,
+      "step": 2560
+    },
+    {
+      "epoch": 0.5271119087296501,
+      "grad_norm": 0.4785778522491455,
+      "learning_rate": 4.980569948186529e-05,
+      "loss": 1.187,
+      "step": 2570
+    },
+    {
+      "epoch": 0.5291629278297654,
+      "grad_norm": 0.4947550594806671,
+      "learning_rate": 4.958981001727116e-05,
+      "loss": 1.2288,
+      "step": 2580
+    },
+    {
+      "epoch": 0.5312139469298808,
+      "grad_norm": 0.5263291597366333,
+      "learning_rate": 4.937392055267703e-05,
+      "loss": 1.2044,
+      "step": 2590
+    },
+    {
+      "epoch": 0.5332649660299962,
+      "grad_norm": 0.49239382147789,
+      "learning_rate": 4.9158031088082906e-05,
+      "loss": 1.1865,
+      "step": 2600
+    },
+    {
+      "epoch": 0.5353159851301115,
+      "grad_norm": 0.48874983191490173,
+      "learning_rate": 4.8942141623488775e-05,
+      "loss": 1.2672,
+      "step": 2610
+    },
+    {
+      "epoch": 0.5373670042302269,
+      "grad_norm": 0.48474863171577454,
+      "learning_rate": 4.872625215889465e-05,
+      "loss": 1.2359,
+      "step": 2620
+    },
+    {
+      "epoch": 0.5394180233303423,
+      "grad_norm": 0.4978977143764496,
+      "learning_rate": 4.851036269430052e-05,
+      "loss": 1.2139,
+      "step": 2630
+    },
+    {
+      "epoch": 0.5414690424304577,
+      "grad_norm": 0.5144924521446228,
+      "learning_rate": 4.829447322970639e-05,
+      "loss": 1.221,
+      "step": 2640
+    },
+    {
+      "epoch": 0.543520061530573,
+      "grad_norm": 0.5082759857177734,
+      "learning_rate": 4.807858376511227e-05,
+      "loss": 1.2209,
+      "step": 2650
+    },
+    {
+      "epoch": 0.5455710806306884,
+      "grad_norm": 0.4933965504169464,
+      "learning_rate": 4.786269430051814e-05,
+      "loss": 1.207,
+      "step": 2660
+    },
+    {
+      "epoch": 0.5476220997308038,
+      "grad_norm": 0.49464166164398193,
+      "learning_rate": 4.7646804835924005e-05,
+      "loss": 1.2398,
+      "step": 2670
+    },
+    {
+      "epoch": 0.5496731188309191,
+      "grad_norm": 0.49377110600471497,
+      "learning_rate": 4.743091537132988e-05,
+      "loss": 1.2451,
+      "step": 2680
+    },
+    {
+      "epoch": 0.5517241379310345,
+      "grad_norm": 0.5111104846000671,
+      "learning_rate": 4.7215025906735756e-05,
+      "loss": 1.2197,
+      "step": 2690
+    },
+    {
+      "epoch": 0.5537751570311499,
+      "grad_norm": 0.47716042399406433,
+      "learning_rate": 4.699913644214163e-05,
+      "loss": 1.1891,
+      "step": 2700
+    },
+    {
+      "epoch": 0.5558261761312652,
+      "grad_norm": 0.5081655383110046,
+      "learning_rate": 4.678324697754749e-05,
+      "loss": 1.2507,
+      "step": 2710
+    },
+    {
+      "epoch": 0.5578771952313806,
+      "grad_norm": 0.49036547541618347,
+      "learning_rate": 4.656735751295337e-05,
+      "loss": 1.1805,
+      "step": 2720
+    },
+    {
+      "epoch": 0.559928214331496,
+      "grad_norm": 0.5139365792274475,
+      "learning_rate": 4.635146804835924e-05,
+      "loss": 1.2361,
+      "step": 2730
+    },
+    {
+      "epoch": 0.5619792334316114,
+      "grad_norm": 0.5098669528961182,
+      "learning_rate": 4.613557858376512e-05,
+      "loss": 1.2409,
+      "step": 2740
+    },
+    {
+      "epoch": 0.5640302525317267,
+      "grad_norm": 0.4786950349807739,
+      "learning_rate": 4.5919689119170986e-05,
+      "loss": 1.2067,
+      "step": 2750
+    },
+    {
+      "epoch": 0.5660812716318421,
+      "grad_norm": 0.5063204169273376,
+      "learning_rate": 4.5703799654576855e-05,
+      "loss": 1.1942,
+      "step": 2760
+    },
+    {
+      "epoch": 0.5681322907319575,
+      "grad_norm": 0.511663556098938,
+      "learning_rate": 4.548791018998273e-05,
+      "loss": 1.2017,
+      "step": 2770
+    },
+    {
+      "epoch": 0.5701833098320728,
+      "grad_norm": 0.48765748739242554,
+      "learning_rate": 4.5272020725388605e-05,
+      "loss": 1.222,
+      "step": 2780
+    },
+    {
+      "epoch": 0.5722343289321882,
+      "grad_norm": 0.49707624316215515,
+      "learning_rate": 4.5056131260794474e-05,
+      "loss": 1.2075,
+      "step": 2790
+    },
+    {
+      "epoch": 0.5742853480323036,
+      "grad_norm": 0.5067517757415771,
+      "learning_rate": 4.484024179620035e-05,
+      "loss": 1.211,
+      "step": 2800
+    },
+    {
+      "epoch": 0.5763363671324189,
+      "grad_norm": 0.4615229368209839,
+      "learning_rate": 4.462435233160622e-05,
+      "loss": 1.2303,
+      "step": 2810
+    },
+    {
+      "epoch": 0.5783873862325343,
+      "grad_norm": 0.4948524236679077,
+      "learning_rate": 4.440846286701209e-05,
+      "loss": 1.2024,
+      "step": 2820
+    },
+    {
+      "epoch": 0.5804384053326497,
+      "grad_norm": 0.5140314102172852,
+      "learning_rate": 4.419257340241796e-05,
+      "loss": 1.2217,
+      "step": 2830
+    },
+    {
+      "epoch": 0.5824894244327651,
+      "grad_norm": 0.5108122825622559,
+      "learning_rate": 4.3976683937823836e-05,
+      "loss": 1.1838,
+      "step": 2840
+    },
+    {
+      "epoch": 0.5845404435328804,
+      "grad_norm": 0.5021159052848816,
+      "learning_rate": 4.376079447322971e-05,
+      "loss": 1.2418,
+      "step": 2850
+    },
+    {
+      "epoch": 0.5865914626329958,
+      "grad_norm": 0.5086933374404907,
+      "learning_rate": 4.354490500863558e-05,
+      "loss": 1.2321,
+      "step": 2860
+    },
+    {
+      "epoch": 0.5886424817331112,
+      "grad_norm": 0.5083547830581665,
+      "learning_rate": 4.332901554404145e-05,
+      "loss": 1.2035,
+      "step": 2870
+    },
+    {
+      "epoch": 0.5906935008332265,
+      "grad_norm": 0.4828626215457916,
+      "learning_rate": 4.311312607944732e-05,
+      "loss": 1.2302,
+      "step": 2880
+    },
+    {
+      "epoch": 0.5927445199333419,
+      "grad_norm": 0.5140969157218933,
+      "learning_rate": 4.28972366148532e-05,
+      "loss": 1.2058,
+      "step": 2890
+    },
+    {
+      "epoch": 0.5947955390334573,
+      "grad_norm": 0.497364342212677,
+      "learning_rate": 4.2681347150259074e-05,
+      "loss": 1.2382,
+      "step": 2900
+    },
+    {
+      "epoch": 0.5968465581335726,
+      "grad_norm": 0.49104997515678406,
+      "learning_rate": 4.246545768566494e-05,
+      "loss": 1.2322,
+      "step": 2910
+    },
+    {
+      "epoch": 0.598897577233688,
+      "grad_norm": 0.521659255027771,
+      "learning_rate": 4.224956822107081e-05,
+      "loss": 1.1868,
+      "step": 2920
+    },
+    {
+      "epoch": 0.6009485963338034,
+      "grad_norm": 0.5175550580024719,
+      "learning_rate": 4.2033678756476686e-05,
+      "loss": 1.2169,
+      "step": 2930
+    },
+    {
+      "epoch": 0.6029996154339188,
+      "grad_norm": 0.4998300075531006,
+      "learning_rate": 4.181778929188256e-05,
+      "loss": 1.2227,
+      "step": 2940
+    },
+    {
+      "epoch": 0.6050506345340341,
+      "grad_norm": 0.4932349622249603,
+      "learning_rate": 4.160189982728843e-05,
+      "loss": 1.2371,
+      "step": 2950
+    },
+    {
+      "epoch": 0.6071016536341495,
+      "grad_norm": 0.5610498189926147,
+      "learning_rate": 4.1386010362694304e-05,
+      "loss": 1.2105,
+      "step": 2960
+    },
+    {
+      "epoch": 0.6091526727342649,
+      "grad_norm": 0.4975990355014801,
+      "learning_rate": 4.117012089810017e-05,
+      "loss": 1.2511,
+      "step": 2970
+    },
+    {
+      "epoch": 0.6112036918343802,
+      "grad_norm": 0.5154693722724915,
+      "learning_rate": 4.095423143350605e-05,
+      "loss": 1.2399,
+      "step": 2980
+    },
+    {
+      "epoch": 0.6132547109344956,
+      "grad_norm": 0.4968002736568451,
+      "learning_rate": 4.0738341968911916e-05,
+      "loss": 1.2041,
+      "step": 2990
+    },
+    {
+      "epoch": 0.615305730034611,
+      "grad_norm": 0.4866868555545807,
+      "learning_rate": 4.052245250431779e-05,
+      "loss": 1.1965,
+      "step": 3000
+    },
+    {
+      "epoch": 0.6173567491347263,
+      "grad_norm": 0.5152925848960876,
+      "learning_rate": 4.030656303972367e-05,
+      "loss": 1.2298,
+      "step": 3010
+    },
+    {
+      "epoch": 0.6194077682348417,
+      "grad_norm": 0.513058602809906,
+      "learning_rate": 4.0090673575129535e-05,
+      "loss": 1.2414,
+      "step": 3020
+    },
+    {
+      "epoch": 0.6214587873349571,
+      "grad_norm": 0.5031930208206177,
+      "learning_rate": 3.987478411053541e-05,
+      "loss": 1.1766,
+      "step": 3030
+    },
+    {
+      "epoch": 0.6235098064350725,
+      "grad_norm": 0.5087730288505554,
+      "learning_rate": 3.965889464594128e-05,
+      "loss": 1.229,
+      "step": 3040
+    },
+    {
+      "epoch": 0.6255608255351878,
+      "grad_norm": 0.4878797233104706,
+      "learning_rate": 3.9443005181347154e-05,
+      "loss": 1.2018,
+      "step": 3050
+    },
+    {
+      "epoch": 0.6276118446353032,
+      "grad_norm": 0.5124858617782593,
+      "learning_rate": 3.922711571675303e-05,
+      "loss": 1.1848,
+      "step": 3060
+    },
+    {
+      "epoch": 0.6296628637354186,
+      "grad_norm": 0.49720969796180725,
+      "learning_rate": 3.90112262521589e-05,
+      "loss": 1.1892,
+      "step": 3070
+    },
+    {
+      "epoch": 0.6317138828355339,
+      "grad_norm": 0.49900123476982117,
+      "learning_rate": 3.8795336787564766e-05,
+      "loss": 1.2027,
+      "step": 3080
+    },
+    {
+      "epoch": 0.6337649019356493,
+      "grad_norm": 0.5007952451705933,
+      "learning_rate": 3.857944732297064e-05,
+      "loss": 1.2373,
+      "step": 3090
+    },
+    {
+      "epoch": 0.6358159210357647,
+      "grad_norm": 0.49481576681137085,
+      "learning_rate": 3.8363557858376516e-05,
+      "loss": 1.2294,
+      "step": 3100
+    },
+    {
+      "epoch": 0.63786694013588,
+      "grad_norm": 0.4979318082332611,
+      "learning_rate": 3.8147668393782385e-05,
+      "loss": 1.2312,
+      "step": 3110
+    },
+    {
+      "epoch": 0.6399179592359954,
+      "grad_norm": 0.49939480423927307,
+      "learning_rate": 3.793177892918825e-05,
+      "loss": 1.2394,
+      "step": 3120
+    },
+    {
+      "epoch": 0.6419689783361108,
+      "grad_norm": 0.5186517834663391,
+      "learning_rate": 3.771588946459413e-05,
+      "loss": 1.199,
+      "step": 3130
+    },
+    {
+      "epoch": 0.6440199974362262,
+      "grad_norm": 0.5386569499969482,
+      "learning_rate": 3.7500000000000003e-05,
+      "loss": 1.1801,
+      "step": 3140
+    },
+    {
+      "epoch": 0.6460710165363415,
+      "grad_norm": 0.5134577751159668,
+      "learning_rate": 3.728411053540587e-05,
+      "loss": 1.2286,
+      "step": 3150
+    },
+    {
+      "epoch": 0.6481220356364569,
+      "grad_norm": 0.5191785097122192,
+      "learning_rate": 3.706822107081175e-05,
+      "loss": 1.2068,
+      "step": 3160
+    },
+    {
+      "epoch": 0.6501730547365723,
+      "grad_norm": 0.4857168197631836,
+      "learning_rate": 3.6852331606217615e-05,
+      "loss": 1.2116,
+      "step": 3170
+    },
+    {
+      "epoch": 0.6522240738366876,
+      "grad_norm": 0.5283413529396057,
+      "learning_rate": 3.663644214162349e-05,
+      "loss": 1.1792,
+      "step": 3180
+    },
+    {
+      "epoch": 0.654275092936803,
+      "grad_norm": 0.528938353061676,
+      "learning_rate": 3.6420552677029366e-05,
+      "loss": 1.1963,
+      "step": 3190
+    },
+    {
+      "epoch": 0.6563261120369184,
+      "grad_norm": 0.5067134499549866,
+      "learning_rate": 3.6204663212435234e-05,
+      "loss": 1.2476,
+      "step": 3200
+    },
+    {
+      "epoch": 0.6583771311370337,
+      "grad_norm": 0.4993511736392975,
+      "learning_rate": 3.598877374784111e-05,
+      "loss": 1.2273,
+      "step": 3210
+    },
+    {
+      "epoch": 0.6604281502371491,
+      "grad_norm": 0.5275943279266357,
+      "learning_rate": 3.577288428324698e-05,
+      "loss": 1.2287,
+      "step": 3220
+    },
+    {
+      "epoch": 0.6624791693372645,
+      "grad_norm": 0.49331194162368774,
+      "learning_rate": 3.555699481865285e-05,
+      "loss": 1.1794,
+      "step": 3230
+    },
+    {
+      "epoch": 0.6645301884373799,
+      "grad_norm": 0.5065453052520752,
+      "learning_rate": 3.534110535405872e-05,
+      "loss": 1.2342,
+      "step": 3240
+    },
+    {
+      "epoch": 0.6665812075374952,
+      "grad_norm": 0.5334459543228149,
+      "learning_rate": 3.51252158894646e-05,
+      "loss": 1.1782,
+      "step": 3250
+    },
+    {
+      "epoch": 0.6686322266376106,
+      "grad_norm": 0.535772979259491,
+      "learning_rate": 3.490932642487047e-05,
+      "loss": 1.2108,
+      "step": 3260
+    },
+    {
+      "epoch": 0.670683245737726,
+      "grad_norm": 0.5377807021141052,
+      "learning_rate": 3.469343696027634e-05,
+      "loss": 1.1903,
+      "step": 3270
+    },
+    {
+      "epoch": 0.6727342648378413,
+      "grad_norm": 0.5266278386116028,
+      "learning_rate": 3.447754749568221e-05,
+      "loss": 1.2183,
+      "step": 3280
+    },
+    {
+      "epoch": 0.6747852839379567,
+      "grad_norm": 0.4987232983112335,
+      "learning_rate": 3.4261658031088084e-05,
+      "loss": 1.1915,
+      "step": 3290
+    },
+    {
+      "epoch": 0.6768363030380721,
+      "grad_norm": 0.5178554058074951,
+      "learning_rate": 3.404576856649396e-05,
+      "loss": 1.179,
+      "step": 3300
+    },
+    {
+      "epoch": 0.6788873221381874,
+      "grad_norm": 0.5086014270782471,
+      "learning_rate": 3.382987910189983e-05,
+      "loss": 1.2298,
+      "step": 3310
+    },
+    {
+      "epoch": 0.6809383412383028,
+      "grad_norm": 0.5420427918434143,
+      "learning_rate": 3.3613989637305696e-05,
+      "loss": 1.2072,
+      "step": 3320
+    },
+    {
+      "epoch": 0.6829893603384182,
+      "grad_norm": 0.5170331001281738,
+      "learning_rate": 3.339810017271157e-05,
+      "loss": 1.2252,
+      "step": 3330
+    },
+    {
+      "epoch": 0.6850403794385336,
+      "grad_norm": 0.48680609464645386,
+      "learning_rate": 3.3182210708117446e-05,
+      "loss": 1.2059,
+      "step": 3340
+    },
+    {
+      "epoch": 0.6870913985386489,
+      "grad_norm": 0.5035340189933777,
+      "learning_rate": 3.296632124352332e-05,
+      "loss": 1.2009,
+      "step": 3350
+    },
+    {
+      "epoch": 0.6891424176387643,
+      "grad_norm": 0.513165295124054,
+      "learning_rate": 3.275043177892919e-05,
+      "loss": 1.1844,
+      "step": 3360
+    },
+    {
+      "epoch": 0.6911934367388797,
+      "grad_norm": 0.5243003368377686,
+      "learning_rate": 3.2534542314335065e-05,
+      "loss": 1.2009,
+      "step": 3370
+    },
+    {
+      "epoch": 0.693244455838995,
+      "grad_norm": 0.5219825506210327,
+      "learning_rate": 3.2318652849740933e-05,
+      "loss": 1.2039,
+      "step": 3380
+    },
+    {
+      "epoch": 0.6952954749391104,
+      "grad_norm": 0.5202507972717285,
+      "learning_rate": 3.210276338514681e-05,
+      "loss": 1.225,
+      "step": 3390
+    },
+    {
+      "epoch": 0.6973464940392258,
+      "grad_norm": 0.5152229070663452,
+      "learning_rate": 3.188687392055268e-05,
+      "loss": 1.1886,
+      "step": 3400
+    },
+    {
+      "epoch": 0.6993975131393411,
+      "grad_norm": 0.5382890701293945,
+      "learning_rate": 3.167098445595855e-05,
+      "loss": 1.2113,
+      "step": 3410
+    },
+    {
+      "epoch": 0.7014485322394565,
+      "grad_norm": 0.5525237917900085,
+      "learning_rate": 3.145509499136443e-05,
+      "loss": 1.2283,
+      "step": 3420
+    },
+    {
+      "epoch": 0.7034995513395719,
+      "grad_norm": 0.5308887958526611,
+      "learning_rate": 3.1239205526770296e-05,
+      "loss": 1.2311,
+      "step": 3430
+    },
+    {
+      "epoch": 0.7055505704396873,
+      "grad_norm": 0.5247687697410583,
+      "learning_rate": 3.1023316062176164e-05,
+      "loss": 1.1946,
+      "step": 3440
+    },
+    {
+      "epoch": 0.7076015895398026,
+      "grad_norm": 0.5322206616401672,
+      "learning_rate": 3.080742659758204e-05,
+      "loss": 1.2198,
+      "step": 3450
+    },
+    {
+      "epoch": 0.709652608639918,
+      "grad_norm": 0.5104162693023682,
+      "learning_rate": 3.0591537132987915e-05,
+      "loss": 1.2105,
+      "step": 3460
+    },
+    {
+      "epoch": 0.7117036277400334,
+      "grad_norm": 0.4890803098678589,
+      "learning_rate": 3.0375647668393786e-05,
+      "loss": 1.2074,
+      "step": 3470
+    },
+    {
+      "epoch": 0.7137546468401487,
+      "grad_norm": 0.529225766658783,
+      "learning_rate": 3.0159758203799655e-05,
+      "loss": 1.2321,
+      "step": 3480
+    },
+    {
+      "epoch": 0.7158056659402641,
+      "grad_norm": 0.5252069234848022,
+      "learning_rate": 2.9943868739205527e-05,
+      "loss": 1.1995,
+      "step": 3490
+    },
+    {
+      "epoch": 0.7178566850403795,
+      "grad_norm": 0.5369967818260193,
+      "learning_rate": 2.9727979274611402e-05,
+      "loss": 1.2234,
+      "step": 3500
+    },
+    {
+      "epoch": 0.7199077041404948,
+      "grad_norm": 0.5053485631942749,
+      "learning_rate": 2.9512089810017274e-05,
+      "loss": 1.2035,
+      "step": 3510
+    },
+    {
+      "epoch": 0.7219587232406102,
+      "grad_norm": 0.5131696462631226,
+      "learning_rate": 2.929620034542315e-05,
+      "loss": 1.2681,
+      "step": 3520
+    },
+    {
+      "epoch": 0.7240097423407256,
+      "grad_norm": 0.5332499742507935,
+      "learning_rate": 2.9080310880829014e-05,
+      "loss": 1.2039,
+      "step": 3530
+    },
+    {
+      "epoch": 0.7260607614408409,
+      "grad_norm": 0.5105617046356201,
+      "learning_rate": 2.886442141623489e-05,
+      "loss": 1.2,
+      "step": 3540
+    },
+    {
+      "epoch": 0.7281117805409563,
+      "grad_norm": 0.5197264552116394,
+      "learning_rate": 2.864853195164076e-05,
+      "loss": 1.1821,
+      "step": 3550
+    },
+    {
+      "epoch": 0.7301627996410717,
+      "grad_norm": 0.505455493927002,
+      "learning_rate": 2.8432642487046636e-05,
+      "loss": 1.2158,
+      "step": 3560
+    },
+    {
+      "epoch": 0.7322138187411871,
+      "grad_norm": 0.5290804505348206,
+      "learning_rate": 2.8216753022452508e-05,
+      "loss": 1.174,
+      "step": 3570
+    },
+    {
+      "epoch": 0.7342648378413024,
+      "grad_norm": 0.5349313020706177,
+      "learning_rate": 2.8000863557858376e-05,
+      "loss": 1.2301,
+      "step": 3580
+    },
+    {
+      "epoch": 0.7363158569414178,
+      "grad_norm": 0.4875812530517578,
+      "learning_rate": 2.7784974093264248e-05,
+      "loss": 1.2015,
+      "step": 3590
+    },
+    {
+      "epoch": 0.7383668760415332,
+      "grad_norm": 0.5164597630500793,
+      "learning_rate": 2.7569084628670123e-05,
+      "loss": 1.2294,
+      "step": 3600
+    },
+    {
+      "epoch": 0.7404178951416485,
+      "grad_norm": 0.5129172801971436,
+      "learning_rate": 2.7353195164075995e-05,
+      "loss": 1.2122,
+      "step": 3610
+    },
+    {
+      "epoch": 0.7424689142417639,
+      "grad_norm": 0.5218586921691895,
+      "learning_rate": 2.713730569948187e-05,
+      "loss": 1.2002,
+      "step": 3620
+    },
+    {
+      "epoch": 0.7445199333418793,
+      "grad_norm": 0.5423296093940735,
+      "learning_rate": 2.6921416234887735e-05,
+      "loss": 1.1685,
+      "step": 3630
+    },
+    {
+      "epoch": 0.7465709524419946,
+      "grad_norm": 0.5151218771934509,
+      "learning_rate": 2.670552677029361e-05,
+      "loss": 1.2167,
+      "step": 3640
+    },
+    {
+      "epoch": 0.74862197154211,
+      "grad_norm": 0.5160235166549683,
+      "learning_rate": 2.6489637305699482e-05,
+      "loss": 1.2269,
+      "step": 3650
+    },
+    {
+      "epoch": 0.7506729906422254,
+      "grad_norm": 0.5056514143943787,
+      "learning_rate": 2.6273747841105357e-05,
+      "loss": 1.2467,
+      "step": 3660
+    },
+    {
+      "epoch": 0.7527240097423408,
+      "grad_norm": 0.52911776304245,
+      "learning_rate": 2.605785837651123e-05,
+      "loss": 1.2182,
+      "step": 3670
+    },
+    {
+      "epoch": 0.7547750288424561,
+      "grad_norm": 0.5172019600868225,
+      "learning_rate": 2.5841968911917097e-05,
+      "loss": 1.1888,
+      "step": 3680
+    },
+    {
+      "epoch": 0.7568260479425715,
+      "grad_norm": 0.5043123960494995,
+      "learning_rate": 2.562607944732297e-05,
+      "loss": 1.2004,
+      "step": 3690
+    },
+    {
+      "epoch": 0.7588770670426869,
+      "grad_norm": 0.5103533267974854,
+      "learning_rate": 2.5410189982728844e-05,
+      "loss": 1.1627,
+      "step": 3700
+    },
+    {
+      "epoch": 0.7609280861428022,
+      "grad_norm": 0.5295760631561279,
+      "learning_rate": 2.5194300518134716e-05,
+      "loss": 1.1604,
+      "step": 3710
+    },
+    {
+      "epoch": 0.7629791052429176,
+      "grad_norm": 0.5427724719047546,
+      "learning_rate": 2.4978411053540588e-05,
+      "loss": 1.1781,
+      "step": 3720
+    },
+    {
+      "epoch": 0.765030124343033,
+      "grad_norm": 0.5164818167686462,
+      "learning_rate": 2.476252158894646e-05,
+      "loss": 1.2208,
+      "step": 3730
+    },
+    {
+      "epoch": 0.7670811434431483,
+      "grad_norm": 0.5196744799613953,
+      "learning_rate": 2.4546632124352335e-05,
+      "loss": 1.1971,
+      "step": 3740
+    },
+    {
+      "epoch": 0.7691321625432637,
+      "grad_norm": 0.5128475427627563,
+      "learning_rate": 2.4330742659758203e-05,
+      "loss": 1.1909,
+      "step": 3750
+    },
+    {
+      "epoch": 0.7711831816433791,
+      "grad_norm": 0.49743902683258057,
+      "learning_rate": 2.411485319516408e-05,
+      "loss": 1.2109,
+      "step": 3760
+    },
+    {
+      "epoch": 0.7732342007434945,
+      "grad_norm": 0.5152381658554077,
+      "learning_rate": 2.3898963730569947e-05,
+      "loss": 1.2228,
+      "step": 3770
+    },
+    {
+      "epoch": 0.7752852198436098,
+      "grad_norm": 0.5446299910545349,
+      "learning_rate": 2.3683074265975822e-05,
+      "loss": 1.1953,
+      "step": 3780
+    },
+    {
+      "epoch": 0.7773362389437252,
+      "grad_norm": 0.5300847291946411,
+      "learning_rate": 2.3467184801381694e-05,
+      "loss": 1.1843,
+      "step": 3790
+    },
+    {
+      "epoch": 0.7793872580438406,
+      "grad_norm": 0.5129801630973816,
+      "learning_rate": 2.3251295336787566e-05,
+      "loss": 1.1809,
+      "step": 3800
+    },
+    {
+      "epoch": 0.7814382771439559,
+      "grad_norm": 0.549198567867279,
+      "learning_rate": 2.3035405872193438e-05,
+      "loss": 1.2099,
+      "step": 3810
+    },
+    {
+      "epoch": 0.7834892962440713,
+      "grad_norm": 0.5118544101715088,
+      "learning_rate": 2.281951640759931e-05,
+      "loss": 1.2149,
+      "step": 3820
+    },
+    {
+      "epoch": 0.7855403153441867,
+      "grad_norm": 0.5479713082313538,
+      "learning_rate": 2.260362694300518e-05,
+      "loss": 1.1771,
+      "step": 3830
+    },
+    {
+      "epoch": 0.787591334444302,
+      "grad_norm": 0.541350245475769,
+      "learning_rate": 2.2387737478411056e-05,
+      "loss": 1.1737,
+      "step": 3840
+    },
+    {
+      "epoch": 0.7896423535444174,
+      "grad_norm": 0.5543351769447327,
+      "learning_rate": 2.2171848013816925e-05,
+      "loss": 1.2233,
+      "step": 3850
+    },
+    {
+      "epoch": 0.7916933726445328,
+      "grad_norm": 0.5010188817977905,
+      "learning_rate": 2.19559585492228e-05,
+      "loss": 1.1938,
+      "step": 3860
+    },
+    {
+      "epoch": 0.7937443917446482,
+      "grad_norm": 0.5245205760002136,
+      "learning_rate": 2.1740069084628672e-05,
+      "loss": 1.2015,
+      "step": 3870
+    },
+    {
+      "epoch": 0.7957954108447635,
+      "grad_norm": 0.5324139595031738,
+      "learning_rate": 2.1524179620034544e-05,
+      "loss": 1.2248,
+      "step": 3880
+    },
+    {
+      "epoch": 0.7978464299448789,
+      "grad_norm": 0.5172831416130066,
+      "learning_rate": 2.1308290155440415e-05,
+      "loss": 1.1992,
+      "step": 3890
+    },
+    {
+      "epoch": 0.7998974490449943,
+      "grad_norm": 0.5434138178825378,
+      "learning_rate": 2.1092400690846287e-05,
+      "loss": 1.1813,
+      "step": 3900
+    },
+    {
+      "epoch": 0.8019484681451096,
+      "grad_norm": 0.5221844911575317,
+      "learning_rate": 2.087651122625216e-05,
+      "loss": 1.1625,
+      "step": 3910
+    },
+    {
+      "epoch": 0.803999487245225,
+      "grad_norm": 0.5027469992637634,
+      "learning_rate": 2.0660621761658034e-05,
+      "loss": 1.181,
+      "step": 3920
+    },
+    {
+      "epoch": 0.8060505063453404,
+      "grad_norm": 0.5298044085502625,
+      "learning_rate": 2.0444732297063903e-05,
+      "loss": 1.2079,
+      "step": 3930
+    },
+    {
+      "epoch": 0.8081015254454557,
+      "grad_norm": 0.5463908910751343,
+      "learning_rate": 2.0228842832469778e-05,
+      "loss": 1.2009,
+      "step": 3940
+    },
+    {
+      "epoch": 0.8101525445455711,
+      "grad_norm": 0.5394027233123779,
+      "learning_rate": 2.0012953367875646e-05,
+      "loss": 1.1931,
+      "step": 3950
+    },
+    {
+      "epoch": 0.8122035636456865,
+      "grad_norm": 0.5041294097900391,
+      "learning_rate": 1.979706390328152e-05,
+      "loss": 1.2107,
+      "step": 3960
+    },
+    {
+      "epoch": 0.8142545827458019,
+      "grad_norm": 0.5223291516304016,
+      "learning_rate": 1.9581174438687393e-05,
+      "loss": 1.1775,
+      "step": 3970
+    },
+    {
+      "epoch": 0.8163056018459172,
+      "grad_norm": 0.5221052169799805,
+      "learning_rate": 1.9365284974093265e-05,
+      "loss": 1.2052,
+      "step": 3980
+    },
+    {
+      "epoch": 0.8183566209460326,
+      "grad_norm": 0.5229529738426208,
+      "learning_rate": 1.9149395509499137e-05,
+      "loss": 1.1922,
+      "step": 3990
+    },
+    {
+      "epoch": 0.820407640046148,
+      "grad_norm": 0.5651980042457581,
+      "learning_rate": 1.893350604490501e-05,
+      "loss": 1.2043,
+      "step": 4000
+    },
+    {
+      "epoch": 0.8224586591462633,
+      "grad_norm": 0.5169751644134521,
+      "learning_rate": 1.871761658031088e-05,
+      "loss": 1.2157,
+      "step": 4010
+    },
+    {
+      "epoch": 0.8245096782463787,
+      "grad_norm": 0.5741276144981384,
+      "learning_rate": 1.8501727115716755e-05,
+      "loss": 1.2112,
+      "step": 4020
+    },
+    {
+      "epoch": 0.8265606973464941,
+      "grad_norm": 0.530596137046814,
+      "learning_rate": 1.8285837651122624e-05,
+      "loss": 1.2535,
+      "step": 4030
+    },
+    {
+      "epoch": 0.8286117164466094,
+      "grad_norm": 0.5436383485794067,
+      "learning_rate": 1.80699481865285e-05,
+      "loss": 1.1789,
+      "step": 4040
+    },
+    {
+      "epoch": 0.8306627355467248,
+      "grad_norm": 0.5238965749740601,
+      "learning_rate": 1.7854058721934368e-05,
+      "loss": 1.1645,
+      "step": 4050
+    },
+    {
+      "epoch": 0.8327137546468402,
+      "grad_norm": 0.5226778388023376,
+      "learning_rate": 1.7638169257340243e-05,
+      "loss": 1.2238,
+      "step": 4060
+    },
+    {
+      "epoch": 0.8347647737469556,
+      "grad_norm": 0.5810254812240601,
+      "learning_rate": 1.7422279792746114e-05,
+      "loss": 1.2212,
+      "step": 4070
+    },
+    {
+      "epoch": 0.8368157928470709,
+      "grad_norm": 0.5228540301322937,
+      "learning_rate": 1.7206390328151986e-05,
+      "loss": 1.2025,
+      "step": 4080
+    },
+    {
+      "epoch": 0.8388668119471863,
+      "grad_norm": 0.5112829804420471,
+      "learning_rate": 1.6990500863557858e-05,
+      "loss": 1.1838,
+      "step": 4090
+    },
+    {
+      "epoch": 0.8409178310473017,
+      "grad_norm": 0.5092179775238037,
+      "learning_rate": 1.6774611398963733e-05,
+      "loss": 1.1981,
+      "step": 4100
+    },
+    {
+      "epoch": 0.842968850147417,
+      "grad_norm": 0.5236721634864807,
+      "learning_rate": 1.65587219343696e-05,
+      "loss": 1.1994,
+      "step": 4110
+    },
+    {
+      "epoch": 0.8450198692475324,
+      "grad_norm": 0.5067551732063293,
+      "learning_rate": 1.6342832469775477e-05,
+      "loss": 1.1758,
+      "step": 4120
+    },
+    {
+      "epoch": 0.8470708883476478,
+      "grad_norm": 0.5471055507659912,
+      "learning_rate": 1.6126943005181345e-05,
+      "loss": 1.2315,
+      "step": 4130
+    },
+    {
+      "epoch": 0.8491219074477631,
+      "grad_norm": 0.514798641204834,
+      "learning_rate": 1.591105354058722e-05,
+      "loss": 1.183,
+      "step": 4140
+    },
+    {
+      "epoch": 0.8511729265478785,
+      "grad_norm": 0.5316623449325562,
+      "learning_rate": 1.5695164075993092e-05,
+      "loss": 1.1997,
+      "step": 4150
+    },
+    {
+      "epoch": 0.8532239456479939,
+      "grad_norm": 0.531896710395813,
+      "learning_rate": 1.5479274611398964e-05,
+      "loss": 1.1967,
+      "step": 4160
+    },
+    {
+      "epoch": 0.8552749647481093,
+      "grad_norm": 0.5044012665748596,
+      "learning_rate": 1.5263385146804836e-05,
+      "loss": 1.2061,
+      "step": 4170
+    },
+    {
+      "epoch": 0.8573259838482246,
+      "grad_norm": 0.547264039516449,
+      "learning_rate": 1.5047495682210708e-05,
+      "loss": 1.1975,
+      "step": 4180
+    },
+    {
+      "epoch": 0.85937700294834,
+      "grad_norm": 0.5514972805976868,
+      "learning_rate": 1.4831606217616581e-05,
+      "loss": 1.2044,
+      "step": 4190
+    },
+    {
+      "epoch": 0.8614280220484554,
+      "grad_norm": 0.5322652459144592,
+      "learning_rate": 1.4615716753022455e-05,
+      "loss": 1.2044,
+      "step": 4200
+    },
+    {
+      "epoch": 0.8634790411485707,
+      "grad_norm": 0.5309359431266785,
+      "learning_rate": 1.4399827288428325e-05,
+      "loss": 1.2066,
+      "step": 4210
+    },
+    {
+      "epoch": 0.8655300602486861,
+      "grad_norm": 0.5314792394638062,
+      "learning_rate": 1.4183937823834198e-05,
+      "loss": 1.2006,
+      "step": 4220
+    },
+    {
+      "epoch": 0.8675810793488015,
+      "grad_norm": 0.5549922585487366,
+      "learning_rate": 1.3968048359240068e-05,
+      "loss": 1.2058,
+      "step": 4230
+    },
+    {
+      "epoch": 0.8696320984489168,
+      "grad_norm": 0.5373049378395081,
+      "learning_rate": 1.3752158894645942e-05,
+      "loss": 1.2002,
+      "step": 4240
+    },
+    {
+      "epoch": 0.8716831175490322,
+      "grad_norm": 0.5322666764259338,
+      "learning_rate": 1.3536269430051815e-05,
+      "loss": 1.215,
+      "step": 4250
+    },
+    {
+      "epoch": 0.8737341366491476,
+      "grad_norm": 0.5549564957618713,
+      "learning_rate": 1.3320379965457685e-05,
+      "loss": 1.2131,
+      "step": 4260
+    },
+    {
+      "epoch": 0.875785155749263,
+      "grad_norm": 0.5308319926261902,
+      "learning_rate": 1.3104490500863559e-05,
+      "loss": 1.2203,
+      "step": 4270
+    },
+    {
+      "epoch": 0.8778361748493783,
+      "grad_norm": 0.5089017152786255,
+      "learning_rate": 1.2888601036269432e-05,
+      "loss": 1.1801,
+      "step": 4280
+    },
+    {
+      "epoch": 0.8798871939494937,
+      "grad_norm": 0.5377966165542603,
+      "learning_rate": 1.2672711571675302e-05,
+      "loss": 1.189,
+      "step": 4290
+    },
+    {
+      "epoch": 0.8819382130496091,
+      "grad_norm": 0.5528485178947449,
+      "learning_rate": 1.2456822107081174e-05,
+      "loss": 1.2197,
+      "step": 4300
+    },
+    {
+      "epoch": 0.8839892321497244,
+      "grad_norm": 0.5241679549217224,
+      "learning_rate": 1.2240932642487048e-05,
+      "loss": 1.1652,
+      "step": 4310
+    },
+    {
+      "epoch": 0.8860402512498398,
+      "grad_norm": 0.5626764893531799,
+      "learning_rate": 1.202504317789292e-05,
+      "loss": 1.1805,
+      "step": 4320
+    },
+    {
+      "epoch": 0.8880912703499552,
+      "grad_norm": 0.5248028635978699,
+      "learning_rate": 1.1809153713298791e-05,
+      "loss": 1.1652,
+      "step": 4330
+    },
+    {
+      "epoch": 0.8901422894500705,
+      "grad_norm": 0.5452848672866821,
+      "learning_rate": 1.1593264248704663e-05,
+      "loss": 1.2171,
+      "step": 4340
+    },
+    {
+      "epoch": 0.8921933085501859,
+      "grad_norm": 0.5505712628364563,
+      "learning_rate": 1.1377374784110537e-05,
+      "loss": 1.1967,
+      "step": 4350
+    },
+    {
+      "epoch": 0.8942443276503013,
+      "grad_norm": 0.5437038540840149,
+      "learning_rate": 1.1161485319516408e-05,
+      "loss": 1.2216,
+      "step": 4360
+    },
+    {
+      "epoch": 0.8962953467504167,
+      "grad_norm": 0.5138014554977417,
+      "learning_rate": 1.094559585492228e-05,
+      "loss": 1.193,
+      "step": 4370
+    },
+    {
+      "epoch": 0.898346365850532,
+      "grad_norm": 0.542080283164978,
+      "learning_rate": 1.0729706390328152e-05,
+      "loss": 1.1677,
+      "step": 4380
+    },
+    {
+      "epoch": 0.9003973849506474,
+      "grad_norm": 0.5166792273521423,
+      "learning_rate": 1.0513816925734024e-05,
+      "loss": 1.2147,
+      "step": 4390
+    },
+    {
+      "epoch": 0.9024484040507628,
+      "grad_norm": 0.536491334438324,
+      "learning_rate": 1.0297927461139897e-05,
+      "loss": 1.2077,
+      "step": 4400
+    },
+    {
+      "epoch": 0.9044994231508781,
+      "grad_norm": 0.5504462718963623,
+      "learning_rate": 1.0082037996545769e-05,
+      "loss": 1.1913,
+      "step": 4410
+    },
+    {
+      "epoch": 0.9065504422509935,
+      "grad_norm": 0.5299994945526123,
+      "learning_rate": 9.866148531951641e-06,
+      "loss": 1.1987,
+      "step": 4420
+    },
+    {
+      "epoch": 0.9086014613511089,
+      "grad_norm": 0.5432473421096802,
+      "learning_rate": 9.650259067357513e-06,
+      "loss": 1.199,
+      "step": 4430
+    },
+    {
+      "epoch": 0.9106524804512242,
+      "grad_norm": 0.529331386089325,
+      "learning_rate": 9.434369602763386e-06,
+      "loss": 1.214,
+      "step": 4440
+    },
+    {
+      "epoch": 0.9127034995513396,
+      "grad_norm": 0.49785298109054565,
+      "learning_rate": 9.218480138169258e-06,
+      "loss": 1.202,
+      "step": 4450
+    },
+    {
+      "epoch": 0.914754518651455,
+      "grad_norm": 0.5281327962875366,
+      "learning_rate": 9.00259067357513e-06,
+      "loss": 1.1904,
+      "step": 4460
+    },
+    {
+      "epoch": 0.9168055377515704,
+      "grad_norm": 0.5474033951759338,
+      "learning_rate": 8.786701208981002e-06,
+      "loss": 1.1972,
+      "step": 4470
+    },
+    {
+      "epoch": 0.9188565568516857,
+      "grad_norm": 0.5412236452102661,
+      "learning_rate": 8.570811744386873e-06,
+      "loss": 1.1797,
+      "step": 4480
+    },
+    {
+      "epoch": 0.9209075759518011,
+      "grad_norm": 0.5599170923233032,
+      "learning_rate": 8.354922279792747e-06,
+      "loss": 1.176,
+      "step": 4490
+    },
+    {
+      "epoch": 0.9229585950519165,
+      "grad_norm": 0.5590323805809021,
+      "learning_rate": 8.139032815198619e-06,
+      "loss": 1.1863,
+      "step": 4500
+    },
+    {
+      "epoch": 0.9250096141520318,
+      "grad_norm": 0.566150426864624,
+      "learning_rate": 7.92314335060449e-06,
+      "loss": 1.2217,
+      "step": 4510
+    },
+    {
+      "epoch": 0.9270606332521472,
+      "grad_norm": 0.5459644794464111,
+      "learning_rate": 7.707253886010362e-06,
+      "loss": 1.1903,
+      "step": 4520
+    },
+    {
+      "epoch": 0.9291116523522626,
+      "grad_norm": 0.5333088636398315,
+      "learning_rate": 7.491364421416235e-06,
+      "loss": 1.2076,
+      "step": 4530
+    },
+    {
+      "epoch": 0.9311626714523779,
+      "grad_norm": 0.5921478271484375,
+      "learning_rate": 7.2754749568221076e-06,
+      "loss": 1.191,
+      "step": 4540
+    },
+    {
+      "epoch": 0.9332136905524933,
+      "grad_norm": 0.5061055421829224,
+      "learning_rate": 7.059585492227979e-06,
+      "loss": 1.1787,
+      "step": 4550
+    },
+    {
+      "epoch": 0.9352647096526087,
+      "grad_norm": 0.5804794430732727,
+      "learning_rate": 6.843696027633852e-06,
+      "loss": 1.2096,
+      "step": 4560
+    },
+    {
+      "epoch": 0.9373157287527241,
+      "grad_norm": 0.5328559875488281,
+      "learning_rate": 6.627806563039724e-06,
+      "loss": 1.2072,
+      "step": 4570
+    },
+    {
+      "epoch": 0.9393667478528394,
+      "grad_norm": 0.518925130367279,
+      "learning_rate": 6.4119170984455965e-06,
+      "loss": 1.2119,
+      "step": 4580
+    },
+    {
+      "epoch": 0.9414177669529548,
+      "grad_norm": 0.5092957019805908,
+      "learning_rate": 6.196027633851468e-06,
+      "loss": 1.2137,
+      "step": 4590
+    },
+    {
+      "epoch": 0.9434687860530702,
+      "grad_norm": 0.5156581401824951,
+      "learning_rate": 5.980138169257341e-06,
+      "loss": 1.2059,
+      "step": 4600
+    },
+    {
+      "epoch": 0.9455198051531855,
+      "grad_norm": 0.5467930436134338,
+      "learning_rate": 5.764248704663213e-06,
+      "loss": 1.2111,
+      "step": 4610
+    },
+    {
+      "epoch": 0.9475708242533009,
+      "grad_norm": 0.5478744506835938,
+      "learning_rate": 5.548359240069085e-06,
+      "loss": 1.2014,
+      "step": 4620
+    },
+    {
+      "epoch": 0.9496218433534163,
+      "grad_norm": 0.5648489594459534,
+      "learning_rate": 5.332469775474957e-06,
+      "loss": 1.179,
+      "step": 4630
+    },
+    {
+      "epoch": 0.9516728624535316,
+      "grad_norm": 0.5315075516700745,
+      "learning_rate": 5.11658031088083e-06,
+      "loss": 1.2005,
+      "step": 4640
+    },
+    {
+      "epoch": 0.953723881553647,
+      "grad_norm": 0.5487618446350098,
+      "learning_rate": 4.9006908462867016e-06,
+      "loss": 1.2175,
+      "step": 4650
+    },
+    {
+      "epoch": 0.9557749006537624,
+      "grad_norm": 0.5332956314086914,
+      "learning_rate": 4.684801381692574e-06,
+      "loss": 1.167,
+      "step": 4660
+    },
+    {
+      "epoch": 0.9578259197538777,
+      "grad_norm": 0.5483719110488892,
+      "learning_rate": 4.468911917098446e-06,
+      "loss": 1.1857,
+      "step": 4670
+    },
+    {
+      "epoch": 0.9598769388539931,
+      "grad_norm": 0.5331635475158691,
+      "learning_rate": 4.253022452504319e-06,
+      "loss": 1.209,
+      "step": 4680
+    },
+    {
+      "epoch": 0.9619279579541085,
+      "grad_norm": 0.5277014970779419,
+      "learning_rate": 4.0371329879101905e-06,
+      "loss": 1.1785,
+      "step": 4690
+    },
+    {
+      "epoch": 0.9639789770542239,
+      "grad_norm": 0.5312276482582092,
+      "learning_rate": 3.821243523316062e-06,
+      "loss": 1.1873,
+      "step": 4700
+    },
+    {
+      "epoch": 0.9660299961543392,
+      "grad_norm": 0.532839834690094,
+      "learning_rate": 3.6053540587219345e-06,
+      "loss": 1.2065,
+      "step": 4710
+    },
+    {
+      "epoch": 0.9680810152544546,
+      "grad_norm": 0.5413607954978943,
+      "learning_rate": 3.3894645941278067e-06,
+      "loss": 1.2088,
+      "step": 4720
+    },
+    {
+      "epoch": 0.97013203435457,
+      "grad_norm": 0.5259295105934143,
+      "learning_rate": 3.173575129533679e-06,
+      "loss": 1.2008,
+      "step": 4730
+    },
+    {
+      "epoch": 0.9721830534546853,
+      "grad_norm": 0.5716482996940613,
+      "learning_rate": 2.957685664939551e-06,
+      "loss": 1.199,
+      "step": 4740
+    },
+    {
+      "epoch": 0.9742340725548007,
+      "grad_norm": 0.5410735607147217,
+      "learning_rate": 2.7417962003454234e-06,
+      "loss": 1.2012,
+      "step": 4750
+    },
+    {
+      "epoch": 0.9762850916549161,
+      "grad_norm": 0.5225376486778259,
+      "learning_rate": 2.5259067357512956e-06,
+      "loss": 1.1964,
+      "step": 4760
+    },
+    {
+      "epoch": 0.9783361107550314,
+      "grad_norm": 0.5672590732574463,
+      "learning_rate": 2.310017271157168e-06,
+      "loss": 1.1853,
+      "step": 4770
+    },
+    {
+      "epoch": 0.9803871298551468,
+      "grad_norm": 0.5533677339553833,
+      "learning_rate": 2.09412780656304e-06,
+      "loss": 1.1863,
+      "step": 4780
+    },
+    {
+      "epoch": 0.9824381489552622,
+      "grad_norm": 0.5407289862632751,
+      "learning_rate": 1.878238341968912e-06,
+      "loss": 1.2232,
+      "step": 4790
+    },
+    {
+      "epoch": 0.9844891680553776,
+      "grad_norm": 0.558232843875885,
+      "learning_rate": 1.6623488773747842e-06,
+      "loss": 1.1719,
+      "step": 4800
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 4876,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.2249860917047091e+17,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}

lora_checkpoints/checkpoint-4800/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7b8f6520f47933838e96dca56ee883040325b73481aff07afcabf963674a84fe
+size 5624

lora_checkpoints/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "boi_token": "<start_of_image>",
+  "bos_token": {
+    "content": "<bos>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eoi_token": "<end_of_image>",
+  "eos_token": {
+    "content": "<eos>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "image_token": "<image_soft_token>",
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

lora_checkpoints/tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1299c11d7cf632ef3b4e11937501358ada021bbdf7c47638d13c0ee982f2e79c
+size 4689074

lora_checkpoints/tokenizer_config.json ADDED Viewed

The diff for this file is too large to render. See raw diff