diff --git a/.gitattributes b/.gitattributes
index a6344aac8c09253b3b630fb776ae94478aa0275b..b7d7fc7d946ecdea9688cdc0285c2cb3b893cb83 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -33,3 +33,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+outputs/checkpoint-500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+outputs/checkpoint-1500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+outputs/checkpoint-1000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+outputs/checkpoint-2000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+outputs/checkpoint-313/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+outputs/checkpoint-2500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
diff --git a/outputs/.ipynb_checkpoints/README-checkpoint.md b/outputs/.ipynb_checkpoints/README-checkpoint.md
new file mode 100644
index 0000000000000000000000000000000000000000..3c31d8bb65aa32ebf0ae5090690923003aa1ea22
--- /dev/null
+++ b/outputs/.ipynb_checkpoints/README-checkpoint.md
@@ -0,0 +1,59 @@
+---
+base_model: unsloth/gpt-oss-20b-unsloth-bnb-4bit
+library_name: transformers
+model_name: outputs
+tags:
+- generated_from_trainer
+- trl
+- sft
+- unsloth
+licence: license
+---
+
+# Model Card for outputs
+
+This model is a fine-tuned version of [unsloth/gpt-oss-20b-unsloth-bnb-4bit](https://huggingface.co/unsloth/gpt-oss-20b-unsloth-bnb-4bit).
+It has been trained using [TRL](https://github.com/huggingface/trl).
+
+## Quick start
+
+```python
+from transformers import pipeline
+
+question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
+generator = pipeline("text-generation", model="None", device="cuda")  # TODO: replace "None" with this model's Hub repo ID or a local path to the trained model
+output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
+print(output["generated_text"])
+```
+
+## Training procedure
+
+ 
+
+
+This model was trained with SFT.
+
+### Framework versions
+
+- TRL: 0.22.2
+- Transformers: 4.55.4
+- Pytorch: 2.8.0+cu128
+- Datasets: 3.6.0
+- Tokenizers: 0.21.4
+
+## Citations
+
+
+
+Cite TRL as:
+    
+```bibtex
+@misc{vonwerra2022trl,
+	title        = {{TRL: Transformer Reinforcement Learning}},
+	author       = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec},
+	year         = 2020,
+	journal      = {GitHub repository},
+	publisher    = {GitHub},
+	howpublished = {\url{https://github.com/huggingface/trl}}
+}
+```
\ No newline at end of file
diff --git a/outputs/README.md b/outputs/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..0bf58439fd19bd30ce4c842fa359ce0d570c835d
--- /dev/null
+++ b/outputs/README.md
@@ -0,0 +1,59 @@
+---
+base_model: unsloth/gpt-oss-20b-unsloth-bnb-4bit
+library_name: transformers
+model_name: outputs
+tags:
+- generated_from_trainer
+- unsloth
+- sft
+- trl
+licence: license
+---
+
+# Model Card for outputs
+
+This model is a fine-tuned version of [unsloth/gpt-oss-20b-unsloth-bnb-4bit](https://huggingface.co/unsloth/gpt-oss-20b-unsloth-bnb-4bit).
+It has been trained using [TRL](https://github.com/huggingface/trl).
+
+## Quick start
+
+```python
+from transformers import pipeline
+
+question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
+generator = pipeline("text-generation", model="None", device="cuda")  # TODO: replace "None" with this model's Hub repo ID or a local path to the trained model
+output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
+print(output["generated_text"])
+```
+
+## Training procedure
+
+ 
+
+
+This model was trained with SFT.
+
+### Framework versions
+
+- TRL: 0.22.2
+- Transformers: 4.55.4
+- Pytorch: 2.8.0
+- Datasets: 3.6.0
+- Tokenizers: 0.21.4
+
+## Citations
+
+
+
+Cite TRL as:
+    
+```bibtex
+@misc{vonwerra2022trl,
+	title        = {{TRL: Transformer Reinforcement Learning}},
+	author       = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec},
+	year         = 2020,
+	journal      = {GitHub repository},
+	publisher    = {GitHub},
+	howpublished = {\url{https://github.com/huggingface/trl}}
+}
+```
\ No newline at end of file
diff --git a/outputs/checkpoint-1000/README.md b/outputs/checkpoint-1000/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3abf956c074d00f34a12693c8d6da9738211d7c7
--- /dev/null
+++ b/outputs/checkpoint-1000/README.md
@@ -0,0 +1,209 @@
+---
+base_model: unsloth/gpt-oss-20b-unsloth-bnb-4bit
+library_name: peft
+tags:
+- base_model:adapter:unsloth/gpt-oss-20b-unsloth-bnb-4bit
+- lora
+- sft
+- transformers
+- trl
+- unsloth
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.17.1
\ No newline at end of file
diff --git a/outputs/checkpoint-1000/adapter_config.json b/outputs/checkpoint-1000/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..e285b9b6e018b5b9f23736d6699eb1a4267764e7
--- /dev/null
+++ b/outputs/checkpoint-1000/adapter_config.json
@@ -0,0 +1,45 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": {
+    "base_model_class": "GptOssForCausalLM",
+    "parent_library": "transformers.models.gpt_oss.modeling_gpt_oss"
+  },
+  "base_model_name_or_path": "unsloth/gpt-oss-20b-unsloth-bnb-4bit",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "o_proj",
+    "v_proj",
+    "up_proj",
+    "down_proj",
+    "gate_proj",
+    "k_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": null,
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/outputs/checkpoint-1000/chat_template.jinja b/outputs/checkpoint-1000/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..a3650f886e98b2834c25727759c8e0ab8495f316
--- /dev/null
+++ b/outputs/checkpoint-1000/chat_template.jinja
@@ -0,0 +1,315 @@
+{# Copyright 2025-present Unsloth. Apache 2.0 License. Unsloth chat template fixes. Edited from ggml-org & OpenAI #}
+{#-
+  In addition to the normal inputs of `messages` and `tools`, this template also accepts the
+  following kwargs:
+  - "builtin_tools": A list, can contain "browser" and/or "python".
+  - "model_identity": A string that optionally describes the model identity.
+  - "reasoning_effort": A string that describes the reasoning effort, defaults to "medium".
+ #}
+
+{#- Tool Definition Rendering ============================================== #}
+{%- macro render_typescript_type(param_spec, required_params, is_nullable=false) -%}{#- Renders one JSON-Schema-style parameter spec as a TypeScript-like type string. required_params is only forwarded to the recursive calls and is_nullable is accepted but never read in this body. -#}
+    {%- if param_spec.type == "array" -%}
+        {%- if param_spec['items'] -%}
+            {%- if param_spec['items']['type'] == "string" -%}
+                {{- "string[]" }}
+            {%- elif param_spec['items']['type'] == "number" -%}
+                {{- "number[]" }}
+            {%- elif param_spec['items']['type'] == "integer" -%}
+                {{- "number[]" }}
+            {%- elif param_spec['items']['type'] == "boolean" -%}
+                {{- "boolean[]" }}
+            {%- else -%}
+                {%- set inner_type = render_typescript_type(param_spec['items'], required_params) -%}
+                {%- if inner_type == "object | object" or inner_type|length > 50 -%}
+                    {{- "any[]" }}
+                {%- else -%}
+                    {{- inner_type + "[]" }}
+                {%- endif -%}
+            {%- endif -%}
+            {%- if param_spec.nullable -%}
+                {{- " | null" }}
+            {%- endif -%}
+        {%- else -%}
+            {{- "any[]" }}
+            {%- if param_spec.nullable -%}
+                {{- " | null" }}
+            {%- endif -%}
+        {%- endif -%}
+    {%- elif param_spec.type is defined and param_spec.type is iterable and param_spec.type is not string and param_spec.type is not mapping and param_spec.type[0] is defined -%}
+        {#- Handle a list of type names like ["object", "object"] from Union[dict, list]: joined with " | " #}
+        {%- if param_spec.type | length > 1 -%}
+            {{- param_spec.type | join(" | ") }}
+        {%- else -%}
+            {{- param_spec.type[0] }}
+        {%- endif -%}
+    {%- elif param_spec.oneOf -%}
+        {#- Handle oneOf schemas: a union of more than one variant that includes an object falls back to "any". The flag lives in a namespace() because a plain set inside a for body does not persist after the loop #}
+        {%- set oneof_flags = namespace(has_object=false) -%}
+        {%- for variant in param_spec.oneOf -%}
+            {%- if variant.type == "object" -%}
+                {%- set oneof_flags.has_object = true -%}
+            {%- endif -%}
+        {%- endfor -%}
+        {%- if oneof_flags.has_object and param_spec.oneOf|length > 1 -%}
+            {{- "any" }}
+        {%- else -%}
+            {%- for variant in param_spec.oneOf -%}
+                {{- render_typescript_type(variant, required_params) -}}
+                {%- if variant.description %}
+                    {{- "// " + variant.description }}
+                {%- endif -%}
+                {%- if variant.default is defined %}
+                    {{ "// default: " + variant.default|tojson }}
+                {%- endif -%}
+                {%- if not loop.last %}
+                    {{- " | " }}
+                {% endif -%}
+            {%- endfor -%}
+        {%- endif -%}
+    {%- elif param_spec.type == "string" -%}
+        {%- if param_spec.enum -%}
+            {{- '"' + param_spec.enum|join('" | "') + '"' -}}
+        {%- else -%}
+            {{- "string" }}
+            {%- if param_spec.nullable %}
+                {{- " | null" }}
+            {%- endif -%}
+        {%- endif -%}
+    {%- elif param_spec.type == "number" -%}
+        {{- "number" }}
+    {%- elif param_spec.type == "integer" -%}
+        {{- "number" }}
+    {%- elif param_spec.type == "boolean" -%}
+        {{- "boolean" }}
+
+    {%- elif param_spec.type == "object" -%}
+        {%- if param_spec.properties -%}
+            {{- "{\n" }}
+            {%- for prop_name, prop_spec in param_spec.properties.items() -%}
+                {{- prop_name -}}
+                {%- if prop_name not in (param_spec.required or []) -%}
+                    {{- "?" }}
+                {%- endif -%}
+                {{- ": " }}
+                {{ render_typescript_type(prop_spec, param_spec.required or []) }}
+                {%- if not loop.last -%}
+                    {{-", " }}
+                {%- endif -%}
+            {%- endfor -%}
+            {{- "}" }}
+        {%- else -%}
+            {{- "object" }}
+        {%- endif -%}
+    {%- else -%}
+        {{- "any" }}
+    {%- endif -%}
+{%- endmacro -%}
+
+{%- macro render_tool_namespace(namespace_name, tools) -%}{#- Renders a "## <namespace_name>" heading and a "namespace <namespace_name>" block holding one "type <name> = (...) => any;" declaration per entry of tools (each entry's .function is used). Description comment lines are emitted only when the description is a string, and enum/oneOf defaults are stringified before concatenation, so a missing description or a non-string default no longer aborts rendering. -#}
+    {{- "## " + namespace_name + "\n\n" }}
+    {{- "namespace " + namespace_name + " {\n\n" }}
+    {%- for tool in tools %}
+        {%- set tool = tool.function %}
+        {{- ("// " + tool.description + "\n") if tool.description is string else "" }}
+        {{- "type "+ tool.name + " = " }}
+        {%- if tool.parameters and tool.parameters.properties -%}
+            {{- "(_: " }}
+            {{- "{\n" }}
+            {%- for param_name, param_spec in tool.parameters.properties.items() %}
+                {{- ("// " + param_spec.description + "\n") if param_spec.description is string else "" }}
+                {{- param_name }}
+                {%- if param_name not in (tool.parameters.required or []) -%}
+                    {{- "?" }}
+                {%- endif -%}
+                {{- ": " }}
+                {{- render_typescript_type(param_spec, tool.parameters.required or []) }}
+                {%- if param_spec.default is defined -%}
+                    {%- if param_spec.enum %}
+                        {{- ", // default: " + param_spec.default|string }}
+                    {%- elif param_spec.oneOf %}
+                        {{- "// default: " + param_spec.default|string }}
+                    {%- else %}
+                        {{- ", // default: " + param_spec.default|tojson }}
+                    {%- endif -%}
+                {%- endif -%}
+                {%- if not loop.last %}
+                    {{- ",\n" }}
+                {%- else %}
+                    {{- "\n" }}
+                {%- endif -%}
+            {%- endfor %}
+            {{- "}) => any;\n\n" }}
+        {%- else -%}
+            {{- "() => any;\n\n" }}
+        {%- endif -%}
+    {%- endfor %}
+    {{- "} // namespace " + namespace_name }}
+{%- endmacro -%}
+
+{%- macro render_builtin_tools(browser_tool, python_tool) -%}{#- Emits the fixed "## browser" section (search/open/find declarations) when browser_tool is truthy and the fixed "## python" section when python_tool is truthy; emits nothing when both are falsy. All text is literal and independent of any tool schema. -#}
+    {%- if browser_tool %}
+        {{- "## browser\n\n" }}
+        {{- "// Tool for browsing.\n" }}
+        {{- "// The `cursor` appears in brackets before each browsing display: `[{cursor}]`.\n" }}
+        {{- "// Cite information from the tool using the following format:\n" }}
+        {{- "// `【{cursor}†L{line_start}(-L{line_end})?】`, for example: `【6†L9-L11】` or `【8†L3】`.\n" }}
+        {{- "// Do not quote more than 10 words directly from the tool output.\n" }}
+        {{- "// sources=web (default: web)\n" }}
+        {{- "namespace browser {\n\n" }}
+        {{- "// Searches for information related to `query` and displays `topn` results.\n" }}
+        {{- "type search = (_: {\n" }}
+        {{- "query: string,\n" }}
+        {{- "topn?: number, // default: 10\n" }}
+        {{- "source?: string,\n" }}
+        {{- "}) => any;\n\n" }}
+        {{- "// Opens the link `id` from the page indicated by `cursor` starting at line number `loc`, showing `num_lines` lines.\n" }}
+        {{- "// Valid link ids are displayed with the formatting: `【{id}†.*】`.\n" }}
+        {{- "// If `cursor` is not provided, the most recent page is implied.\n" }}
+        {{- "// If `id` is a string, it is treated as a fully qualified URL associated with `source`.\n" }}
+        {{- "// If `loc` is not provided, the viewport will be positioned at the beginning of the document or centered on the most relevant passage, if available.\n" }}
+        {{- "// Use this function without `id` to scroll to a new location of an opened page.\n" }}
+        {{- "type open = (_: {\n" }}
+        {{- "id?: number | string, // default: -1\n" }}
+        {{- "cursor?: number, // default: -1\n" }}
+        {{- "loc?: number, // default: -1\n" }}
+        {{- "num_lines?: number, // default: -1\n" }}
+        {{- "view_source?: boolean, // default: false\n" }}
+        {{- "source?: string,\n" }}
+        {{- "}) => any;\n\n" }}
+        {{- "// Finds exact matches of `pattern` in the current page, or the page given by `cursor`.\n" }}
+        {{- "type find = (_: {\n" }}
+        {{- "pattern: string,\n" }}
+        {{- "cursor?: number, // default: -1\n" }}
+        {{- "}) => any;\n\n" }}
+        {{- "} // namespace browser\n\n" }}
+    {%- endif -%}
+
+    {%- if python_tool %}
+        {{- "## python\n\n" }}
+        {{- "Use this tool to execute Python code in your chain of thought. The code will not be shown to the user. This tool should be used for internal reasoning, but not for code that is intended to be visible to the user (e.g. when creating plots, tables, or files).\n\n" }}
+        {{- "When you send a message containing Python code to python, it will be executed in a stateful Jupyter notebook environment. python will respond with the output of the execution or time out after 120.0 seconds. The drive at '/mnt/data' can be used to save and persist user files. Internet access for this session is UNKNOWN. Depends on the cluster.\n\n" }}
+    {%- endif -%}
+{%- endmacro -%}
+
+{#- System Message Construction ============================================ #}
+{%- macro build_system_message() -%}{#- Builds the system message body: identity line, knowledge cutoff, current date via strftime_now (not defined in this template, so it must be supplied by the renderer), reasoning effort (defaults to "medium"), the optional built-in tools section, and the valid-channels line. NOTE(review): the default identity literal ends with a newline but a caller-supplied model_identity is emitted as-is, so it runs straight into "Knowledge cutoff" unless it already ends with one; confirm this is intended. -#}
+    {%- if model_identity is not defined %}
+        {{- "You are ChatGPT, a large language model trained by OpenAI.\n" -}}
+    {%- else %}
+        {{- model_identity }}
+    {%- endif %}
+    {{- "Knowledge cutoff: 2024-06\n" }}
+    {{- "Current date: " + strftime_now("%Y-%m-%d") + "\n\n" }}
+    {%- if reasoning_effort is not defined %}
+        {%- set reasoning_effort = "medium" %}
+    {%- endif %}
+    {{- "Reasoning: " + reasoning_effort + "\n\n" }}
+    {%- if builtin_tools is defined %}
+        {{- "# Tools\n\n" }}
+        {%- set available_builtin_tools = namespace(browser=false, python=false) %}
+        {%- for tool in builtin_tools %}
+            {%- if tool == "browser" %}
+                {%- set available_builtin_tools.browser = true %}
+            {%- elif tool == "python" %}
+                {%- set available_builtin_tools.python = true %}
+            {%- endif %}
+        {%- endfor %}
+        {{- render_builtin_tools(available_builtin_tools.browser, available_builtin_tools.python) }}
+    {%- endif -%}
+    {{- "# Valid channels: analysis, commentary, final. Channel must be included for every message." }}
+    {%- if tools is defined -%}
+        {{- "\nCalls to these tools must go to the commentary channel: 'functions'." }}
+    {%- endif -%}
+{%- endmacro -%}
+
+{#- Main Template Logic ================================================= #}
+{#- Defaults for model_identity and reasoning_effort are applied inside build_system_message() above #}
+
+{#- Render system message: always emitted and built by build_system_message(), never taken from messages #}
+{{- "<|start|>system<|message|>" }}
+{{- build_system_message() }}
+{{- "<|end|>" }}
+
+{#- Extract developer message: a leading "developer" or "system" message becomes the developer block and is removed from the loop. NOTE(review): reads messages[0] unconditionally, so this presumably fails on an empty messages list; confirm #}
+{%- if messages[0].role == "developer" or messages[0].role == "system" %}
+    {%- set developer_message = messages[0].content %}
+    {%- set loop_messages = messages[1:] %}
+{%- else %}
+    {%- set developer_message = "" %}
+    {%- set loop_messages = messages %}
+{%- endif %}
+
+{#- Render developer message #}
+{%- if developer_message or tools %}
+    {{- "<|start|>developer<|message|>" }}
+    {%- if developer_message %}
+        {{- "# Instructions\n\n" }}
+        {{- developer_message }}
+    {%- endif %}
+    {%- if tools -%}
+        {{- "\n\n" }}
+        {{- "# Tools\n\n" }}
+        {{- render_tool_namespace("functions", tools) }}
+    {%- endif -%}
+    {{- "<|end|>" }}
+{%- endif %}
+
+{#- Render messages #}
+{%- set last_tool_call = namespace(name=none) %}
+{%- for message in loop_messages -%}
+    {#- At this point only assistant/user/tool messages should remain; any other role is rendered as a user turn by the final else branch #}
+    {%- if message.role == 'assistant' -%}
+        {%- if "tool_calls" in message %}
+            {#- We assume max 1 tool call per message, and so we infer the tool call name #}
+            {#- in "tool" messages from the most recent assistant tool call name #}
+            {%- set tool_call = message.tool_calls[0] %}
+            {%- if tool_call.function %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {%- if message.content %}
+                {{- "<|start|>assistant<|channel|>analysis<|message|>" + message.content + "<|end|>" }}
+            {%- endif %}
+            {{- "<|start|>assistant to=" }}
+            {{- "functions." + tool_call.name + "<|channel|>commentary json<|message|>" }}
+            {{- tool_call.arguments|tojson }}
+            {{- "<|call|>" }}
+            {%- set last_tool_call.name = tool_call.name %}
+        {%- elif "thinking" in message and loop.last and not add_generation_prompt %}
+            {#- Only render the CoT if the final turn is an assistant turn and add_generation_prompt is false #}
+            {#- This is a situation that should only occur in training, never in inference. #}
+            {{- "<|start|>assistant<|channel|>analysis<|message|>" + message.thinking + "<|end|>" }}
+            {#- <|return|> indicates the end of generation, but <|end|> does not #}
+            {#- <|return|> should never be an input to the model, but we include it as the final token #}
+            {#- when training, so the model learns to emit it. #}
+            {{- "<|start|>assistant<|channel|>final<|message|>" + message.content + "<|return|>" }}
+            {%- set last_tool_call.name = none %}
+        {%- elif "thinking" in message %}
+            {#- CoT is dropped during all previous turns, so we never render it for inference #}
+            {{- "<|start|>assistant<|channel|>final<|message|>" + message.content + "<|end|>" }}
+            {%- set last_tool_call.name = none %}
+        {%- elif loop.last and not add_generation_prompt %}
+            {#- <|return|> indicates the end of generation, but <|end|> does not #}
+            {#- <|return|> should never be an input to the model, but we include it as the final token #}
+            {#- when training, so the model learns to emit it. #}
+            {{- "<|start|>assistant<|message|>" + message.content + "<|return|>" }}
+        {%- else %}
+            {{- "<|start|>assistant<|message|>" + message.content + "<|end|>" }}
+            {%- set last_tool_call.name = none %}
+        {%- endif %}
+    {%- elif message.role == 'tool' -%}
+        {%- if last_tool_call.name is none %}
+            {{- raise_exception("Message has tool role, but there was no previous assistant message with a tool call!") }}
+        {%- endif %}
+        {{- "<|start|>functions." + last_tool_call.name }}
+        {{- " to=assistant<|channel|>commentary<|message|>" + message.content|tojson + "<|end|>" }}
+    {%- else -%}
+        {{- "<|start|>user<|message|>" + message.content + "<|end|>" }}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Generation prompt #}
+{%- if add_generation_prompt -%}
+<|start|>assistant
+{%- endif -%}
+{# Copyright 2025-present Unsloth. Apache 2.0 License. Unsloth chat template fixes. Edited from ggml-org & OpenAI #}
\ No newline at end of file
diff --git a/outputs/checkpoint-1000/special_tokens_map.json b/outputs/checkpoint-1000/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..6fba18753f4d09dbb8fcdf1482daff36b963d639
--- /dev/null
+++ b/outputs/checkpoint-1000/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+  "bos_token": {
+    "content": "<|startoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|return|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|reserved_200017|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/outputs/checkpoint-1000/tokenizer.json b/outputs/checkpoint-1000/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..6ec3ef1795cbbda6b7cb7d1f114919cbe3fdd647
--- /dev/null
+++ b/outputs/checkpoint-1000/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0614fe83cadab421296e664e1f48f4261fa8fef6e03e63bb75c20f38e37d07d3
+size 27868174
diff --git a/outputs/checkpoint-1000/tokenizer_config.json b/outputs/checkpoint-1000/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..482ae30d27a74c38d2228e69dd37c529fc485a45
--- /dev/null
+++ b/outputs/checkpoint-1000/tokenizer_config.json
@@ -0,0 +1,185 @@
+{
+  "added_tokens_decoder": {
+    "199998": {
+      "content": "<|startoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "199999": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200000": {
+      "content": "<|reserved_200000|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200001": {
+      "content": "<|reserved_200001|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200002": {
+      "content": "<|return|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200003": {
+      "content": "<|constrain|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200004": {
+      "content": "<|reserved_200004|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200005": {
+      "content": "<|channel|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200006": {
+      "content": "<|start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200007": {
+      "content": "<|end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200008": {
+      "content": "<|message|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200009": {
+      "content": "<|reserved_200009|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200010": {
+      "content": "<|reserved_200010|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200011": {
+      "content": "<|reserved_200011|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200012": {
+      "content": "<|call|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200013": {
+      "content": "<|reserved_200013|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200014": {
+      "content": "<|reserved_200014|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200015": {
+      "content": "<|reserved_200015|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200016": {
+      "content": "<|reserved_200016|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200017": {
+      "content": "<|reserved_200017|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200018": {
+      "content": "<|endofprompt|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<|startoftext|>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|return|>",
+  "extra_special_tokens": {},
+  "model_input_names": [
+    "input_ids",
+    "attention_mask"
+  ],
+  "model_max_length": 131072,
+  "pad_token": "<|reserved_200017|>",
+  "padding_side": "right",
+  "tokenizer_class": "PreTrainedTokenizerFast",
+  "unk_token": null
+}
diff --git a/outputs/checkpoint-1000/trainer_state.json b/outputs/checkpoint-1000/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..172192f0d56d38a9f55c5ff8477f045dc5515bdc
--- /dev/null
+++ b/outputs/checkpoint-1000/trainer_state.json
@@ -0,0 +1,7034 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.3886136209074128,
+  "eval_steps": 500,
+  "global_step": 1000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0032,
+      "grad_norm": 13.684800148010254,
+      "learning_rate": 0.0,
+      "loss": 2.3276,
+      "step": 1
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 13.660787582397461,
+      "learning_rate": 4e-05,
+      "loss": 2.2792,
+      "step": 2
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 13.35280704498291,
+      "learning_rate": 8e-05,
+      "loss": 2.4151,
+      "step": 3
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 6.15027379989624,
+      "learning_rate": 0.00012,
+      "loss": 1.7812,
+      "step": 4
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 1.3168226480484009,
+      "learning_rate": 0.00016,
+      "loss": 1.4536,
+      "step": 5
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.9872580170631409,
+      "learning_rate": 0.0002,
+      "loss": 1.4171,
+      "step": 6
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.7496100664138794,
+      "learning_rate": 0.00019935064935064936,
+      "loss": 1.4168,
+      "step": 7
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.7376005053520203,
+      "learning_rate": 0.00019870129870129872,
+      "loss": 1.3659,
+      "step": 8
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.5281137824058533,
+      "learning_rate": 0.00019805194805194807,
+      "loss": 1.2566,
+      "step": 9
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.5485746264457703,
+      "learning_rate": 0.00019740259740259742,
+      "loss": 1.3761,
+      "step": 10
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.5506592392921448,
+      "learning_rate": 0.00019675324675324675,
+      "loss": 1.3327,
+      "step": 11
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.49382686614990234,
+      "learning_rate": 0.00019610389610389613,
+      "loss": 1.3727,
+      "step": 12
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.36203011870384216,
+      "learning_rate": 0.00019545454545454548,
+      "loss": 1.1515,
+      "step": 13
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.3528599739074707,
+      "learning_rate": 0.0001948051948051948,
+      "loss": 1.2636,
+      "step": 14
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.31244418025016785,
+      "learning_rate": 0.00019415584415584416,
+      "loss": 1.1873,
+      "step": 15
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.3379523754119873,
+      "learning_rate": 0.00019350649350649354,
+      "loss": 1.2657,
+      "step": 16
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.3025083839893341,
+      "learning_rate": 0.00019285714285714286,
+      "loss": 1.2846,
+      "step": 17
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.2560190260410309,
+      "learning_rate": 0.00019220779220779222,
+      "loss": 1.1587,
+      "step": 18
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.2554129958152771,
+      "learning_rate": 0.00019155844155844157,
+      "loss": 1.2812,
+      "step": 19
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.22662702202796936,
+      "learning_rate": 0.00019090909090909092,
+      "loss": 1.1664,
+      "step": 20
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.2515714168548584,
+      "learning_rate": 0.00019025974025974027,
+      "loss": 1.2177,
+      "step": 21
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.24396637082099915,
+      "learning_rate": 0.00018961038961038963,
+      "loss": 1.2053,
+      "step": 22
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.24488303065299988,
+      "learning_rate": 0.00018896103896103895,
+      "loss": 1.2074,
+      "step": 23
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.2168620079755783,
+      "learning_rate": 0.00018831168831168833,
+      "loss": 1.1284,
+      "step": 24
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.24021224677562714,
+      "learning_rate": 0.00018766233766233769,
+      "loss": 1.2169,
+      "step": 25
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.20057056844234467,
+      "learning_rate": 0.000187012987012987,
+      "loss": 1.1031,
+      "step": 26
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.19900795817375183,
+      "learning_rate": 0.00018636363636363636,
+      "loss": 1.1004,
+      "step": 27
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.2019268423318863,
+      "learning_rate": 0.00018571428571428572,
+      "loss": 1.1476,
+      "step": 28
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.1996479034423828,
+      "learning_rate": 0.00018506493506493507,
+      "loss": 1.1455,
+      "step": 29
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.25262022018432617,
+      "learning_rate": 0.00018441558441558442,
+      "loss": 1.1025,
+      "step": 30
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.225438192486763,
+      "learning_rate": 0.00018376623376623378,
+      "loss": 1.1954,
+      "step": 31
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.17834505438804626,
+      "learning_rate": 0.00018311688311688313,
+      "loss": 1.0934,
+      "step": 32
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.20071206986904144,
+      "learning_rate": 0.00018246753246753248,
+      "loss": 1.0488,
+      "step": 33
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.1920139640569687,
+      "learning_rate": 0.00018181818181818183,
+      "loss": 1.123,
+      "step": 34
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.18714852631092072,
+      "learning_rate": 0.0001811688311688312,
+      "loss": 1.0798,
+      "step": 35
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.18315713107585907,
+      "learning_rate": 0.00018051948051948054,
+      "loss": 1.1107,
+      "step": 36
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.19156870245933533,
+      "learning_rate": 0.00017987012987012987,
+      "loss": 1.1125,
+      "step": 37
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.21527768671512604,
+      "learning_rate": 0.00017922077922077922,
+      "loss": 1.1346,
+      "step": 38
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.1871163249015808,
+      "learning_rate": 0.0001785714285714286,
+      "loss": 1.0742,
+      "step": 39
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.17750784754753113,
+      "learning_rate": 0.00017792207792207792,
+      "loss": 1.1323,
+      "step": 40
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.177419051527977,
+      "learning_rate": 0.00017727272727272728,
+      "loss": 1.1405,
+      "step": 41
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.16714292764663696,
+      "learning_rate": 0.00017662337662337663,
+      "loss": 1.1084,
+      "step": 42
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.1610356718301773,
+      "learning_rate": 0.00017597402597402598,
+      "loss": 1.1125,
+      "step": 43
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.2548656761646271,
+      "learning_rate": 0.00017532467532467534,
+      "loss": 1.1114,
+      "step": 44
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.1731044203042984,
+      "learning_rate": 0.0001746753246753247,
+      "loss": 1.1197,
+      "step": 45
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.1739533394575119,
+      "learning_rate": 0.00017402597402597401,
+      "loss": 1.1777,
+      "step": 46
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.2178352177143097,
+      "learning_rate": 0.0001733766233766234,
+      "loss": 1.1111,
+      "step": 47
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.17247150838375092,
+      "learning_rate": 0.00017272727272727275,
+      "loss": 1.1253,
+      "step": 48
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.18075324594974518,
+      "learning_rate": 0.00017207792207792207,
+      "loss": 1.1358,
+      "step": 49
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.15898071229457855,
+      "learning_rate": 0.00017142857142857143,
+      "loss": 1.0606,
+      "step": 50
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.16518613696098328,
+      "learning_rate": 0.0001707792207792208,
+      "loss": 1.0944,
+      "step": 51
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.16035063564777374,
+      "learning_rate": 0.00017012987012987013,
+      "loss": 1.0554,
+      "step": 52
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.1686483472585678,
+      "learning_rate": 0.00016948051948051948,
+      "loss": 1.0384,
+      "step": 53
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.16575631499290466,
+      "learning_rate": 0.00016883116883116884,
+      "loss": 1.0243,
+      "step": 54
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.16840039193630219,
+      "learning_rate": 0.0001681818181818182,
+      "loss": 1.117,
+      "step": 55
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.17616064846515656,
+      "learning_rate": 0.00016753246753246754,
+      "loss": 1.0743,
+      "step": 56
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.168218195438385,
+      "learning_rate": 0.0001668831168831169,
+      "loss": 1.0627,
+      "step": 57
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.17026656866073608,
+      "learning_rate": 0.00016623376623376625,
+      "loss": 1.0059,
+      "step": 58
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.16454458236694336,
+      "learning_rate": 0.0001655844155844156,
+      "loss": 0.9943,
+      "step": 59
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.17185136675834656,
+      "learning_rate": 0.00016493506493506495,
+      "loss": 1.1545,
+      "step": 60
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.17822986841201782,
+      "learning_rate": 0.00016428571428571428,
+      "loss": 1.073,
+      "step": 61
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.1676608771085739,
+      "learning_rate": 0.00016363636363636366,
+      "loss": 1.0886,
+      "step": 62
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.1727771908044815,
+      "learning_rate": 0.000162987012987013,
+      "loss": 1.0432,
+      "step": 63
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.17827573418617249,
+      "learning_rate": 0.00016233766233766234,
+      "loss": 1.083,
+      "step": 64
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.19807517528533936,
+      "learning_rate": 0.0001616883116883117,
+      "loss": 1.1208,
+      "step": 65
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.17693684995174408,
+      "learning_rate": 0.00016103896103896104,
+      "loss": 1.089,
+      "step": 66
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.15489234030246735,
+      "learning_rate": 0.0001603896103896104,
+      "loss": 0.9707,
+      "step": 67
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.16443990170955658,
+      "learning_rate": 0.00015974025974025975,
+      "loss": 1.0643,
+      "step": 68
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.2051103413105011,
+      "learning_rate": 0.0001590909090909091,
+      "loss": 1.1246,
+      "step": 69
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.18824075162410736,
+      "learning_rate": 0.00015844155844155845,
+      "loss": 1.0855,
+      "step": 70
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.18659448623657227,
+      "learning_rate": 0.0001577922077922078,
+      "loss": 1.1412,
+      "step": 71
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.1854114979505539,
+      "learning_rate": 0.00015714285714285716,
+      "loss": 1.0249,
+      "step": 72
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.1876193732023239,
+      "learning_rate": 0.00015649350649350649,
+      "loss": 1.1029,
+      "step": 73
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.1888684630393982,
+      "learning_rate": 0.00015584415584415587,
+      "loss": 1.0789,
+      "step": 74
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.20240606367588043,
+      "learning_rate": 0.0001551948051948052,
+      "loss": 1.0495,
+      "step": 75
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.232120081782341,
+      "learning_rate": 0.00015454545454545454,
+      "loss": 1.0735,
+      "step": 76
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.16897843778133392,
+      "learning_rate": 0.0001538961038961039,
+      "loss": 1.0164,
+      "step": 77
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.18796634674072266,
+      "learning_rate": 0.00015324675324675325,
+      "loss": 1.0676,
+      "step": 78
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.19574032723903656,
+      "learning_rate": 0.0001525974025974026,
+      "loss": 1.0456,
+      "step": 79
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.18007811903953552,
+      "learning_rate": 0.00015194805194805196,
+      "loss": 1.0894,
+      "step": 80
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.18932929635047913,
+      "learning_rate": 0.0001512987012987013,
+      "loss": 1.0729,
+      "step": 81
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.20614288747310638,
+      "learning_rate": 0.00015064935064935066,
+      "loss": 1.0854,
+      "step": 82
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.19291089475154877,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 1.1217,
+      "step": 83
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.18916529417037964,
+      "learning_rate": 0.00014935064935064934,
+      "loss": 1.0963,
+      "step": 84
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.20306220650672913,
+      "learning_rate": 0.00014870129870129872,
+      "loss": 1.0898,
+      "step": 85
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.17870067059993744,
+      "learning_rate": 0.00014805194805194807,
+      "loss": 1.0213,
+      "step": 86
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.18411923944950104,
+      "learning_rate": 0.0001474025974025974,
+      "loss": 1.0844,
+      "step": 87
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.18788227438926697,
+      "learning_rate": 0.00014675324675324675,
+      "loss": 1.0338,
+      "step": 88
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.23874884843826294,
+      "learning_rate": 0.00014610389610389613,
+      "loss": 1.1118,
+      "step": 89
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.19380499422550201,
+      "learning_rate": 0.00014545454545454546,
+      "loss": 1.0464,
+      "step": 90
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.18968750536441803,
+      "learning_rate": 0.0001448051948051948,
+      "loss": 1.0569,
+      "step": 91
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.19545753300189972,
+      "learning_rate": 0.00014415584415584416,
+      "loss": 1.1225,
+      "step": 92
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.19170494377613068,
+      "learning_rate": 0.00014350649350649352,
+      "loss": 1.0602,
+      "step": 93
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.17953918874263763,
+      "learning_rate": 0.00014285714285714287,
+      "loss": 1.032,
+      "step": 94
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.1822536289691925,
+      "learning_rate": 0.00014220779220779222,
+      "loss": 1.0559,
+      "step": 95
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.18591298162937164,
+      "learning_rate": 0.00014155844155844155,
+      "loss": 1.031,
+      "step": 96
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.2129002958536148,
+      "learning_rate": 0.00014090909090909093,
+      "loss": 1.1391,
+      "step": 97
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.18386681377887726,
+      "learning_rate": 0.00014025974025974028,
+      "loss": 0.9919,
+      "step": 98
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.18314239382743835,
+      "learning_rate": 0.0001396103896103896,
+      "loss": 1.0445,
+      "step": 99
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.1999066174030304,
+      "learning_rate": 0.00013896103896103896,
+      "loss": 1.0538,
+      "step": 100
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.18741188943386078,
+      "learning_rate": 0.00013831168831168834,
+      "loss": 1.0722,
+      "step": 101
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.19351010024547577,
+      "learning_rate": 0.00013766233766233766,
+      "loss": 1.0491,
+      "step": 102
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.18859203159809113,
+      "learning_rate": 0.00013701298701298702,
+      "loss": 1.0593,
+      "step": 103
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.1962767392396927,
+      "learning_rate": 0.00013636363636363637,
+      "loss": 1.1344,
+      "step": 104
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.20819440484046936,
+      "learning_rate": 0.00013571428571428572,
+      "loss": 1.1137,
+      "step": 105
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.19590184092521667,
+      "learning_rate": 0.00013506493506493507,
+      "loss": 1.0624,
+      "step": 106
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.18631424009799957,
+      "learning_rate": 0.00013441558441558443,
+      "loss": 1.0587,
+      "step": 107
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.19572143256664276,
+      "learning_rate": 0.00013376623376623375,
+      "loss": 1.0494,
+      "step": 108
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.1910988837480545,
+      "learning_rate": 0.00013311688311688313,
+      "loss": 1.0481,
+      "step": 109
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.19455869495868683,
+      "learning_rate": 0.00013246753246753249,
+      "loss": 1.029,
+      "step": 110
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.18669827282428741,
+      "learning_rate": 0.0001318181818181818,
+      "loss": 1.0513,
+      "step": 111
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.17523664236068726,
+      "learning_rate": 0.0001311688311688312,
+      "loss": 1.0126,
+      "step": 112
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.17929129302501678,
+      "learning_rate": 0.00013051948051948052,
+      "loss": 1.0717,
+      "step": 113
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.19380168616771698,
+      "learning_rate": 0.00012987012987012987,
+      "loss": 1.0324,
+      "step": 114
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.18090228736400604,
+      "learning_rate": 0.00012922077922077922,
+      "loss": 1.0515,
+      "step": 115
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.2067340910434723,
+      "learning_rate": 0.00012857142857142858,
+      "loss": 1.0939,
+      "step": 116
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.1880485862493515,
+      "learning_rate": 0.00012792207792207793,
+      "loss": 1.0986,
+      "step": 117
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.182168647646904,
+      "learning_rate": 0.00012727272727272728,
+      "loss": 1.0109,
+      "step": 118
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.20187129080295563,
+      "learning_rate": 0.00012662337662337663,
+      "loss": 1.0668,
+      "step": 119
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.2082669734954834,
+      "learning_rate": 0.000125974025974026,
+      "loss": 1.054,
+      "step": 120
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.18294434249401093,
+      "learning_rate": 0.00012532467532467534,
+      "loss": 1.0397,
+      "step": 121
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.20515067875385284,
+      "learning_rate": 0.00012467532467532467,
+      "loss": 1.1092,
+      "step": 122
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.1758790761232376,
+      "learning_rate": 0.00012402597402597402,
+      "loss": 0.9755,
+      "step": 123
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.2170792669057846,
+      "learning_rate": 0.0001233766233766234,
+      "loss": 1.0434,
+      "step": 124
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.202157124876976,
+      "learning_rate": 0.00012272727272727272,
+      "loss": 1.1129,
+      "step": 125
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.18556398153305054,
+      "learning_rate": 0.00012207792207792208,
+      "loss": 1.0665,
+      "step": 126
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.20196087658405304,
+      "learning_rate": 0.00012142857142857143,
+      "loss": 1.1,
+      "step": 127
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.1921566128730774,
+      "learning_rate": 0.0001207792207792208,
+      "loss": 1.0918,
+      "step": 128
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.18866224586963654,
+      "learning_rate": 0.00012012987012987014,
+      "loss": 1.0014,
+      "step": 129
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.207601398229599,
+      "learning_rate": 0.00011948051948051949,
+      "loss": 1.0726,
+      "step": 130
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.21592366695404053,
+      "learning_rate": 0.00011883116883116883,
+      "loss": 1.1379,
+      "step": 131
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.2016124576330185,
+      "learning_rate": 0.0001181818181818182,
+      "loss": 1.1428,
+      "step": 132
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.20478437840938568,
+      "learning_rate": 0.00011753246753246753,
+      "loss": 1.121,
+      "step": 133
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.22730594873428345,
+      "learning_rate": 0.00011688311688311689,
+      "loss": 1.0319,
+      "step": 134
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.22592711448669434,
+      "learning_rate": 0.00011623376623376625,
+      "loss": 1.1264,
+      "step": 135
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.20035041868686676,
+      "learning_rate": 0.00011558441558441559,
+      "loss": 1.0686,
+      "step": 136
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.20648567378520966,
+      "learning_rate": 0.00011493506493506494,
+      "loss": 1.0817,
+      "step": 137
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.21222743391990662,
+      "learning_rate": 0.00011428571428571428,
+      "loss": 1.0678,
+      "step": 138
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.2075391560792923,
+      "learning_rate": 0.00011363636363636365,
+      "loss": 1.0897,
+      "step": 139
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.1964101791381836,
+      "learning_rate": 0.000112987012987013,
+      "loss": 1.0906,
+      "step": 140
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.22406511008739471,
+      "learning_rate": 0.00011233766233766234,
+      "loss": 1.0594,
+      "step": 141
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.23787978291511536,
+      "learning_rate": 0.00011168831168831168,
+      "loss": 1.1053,
+      "step": 142
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.21196185052394867,
+      "learning_rate": 0.00011103896103896105,
+      "loss": 1.0923,
+      "step": 143
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.21042804419994354,
+      "learning_rate": 0.0001103896103896104,
+      "loss": 1.0381,
+      "step": 144
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.2267436534166336,
+      "learning_rate": 0.00010974025974025974,
+      "loss": 1.0818,
+      "step": 145
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.23742735385894775,
+      "learning_rate": 0.00010909090909090909,
+      "loss": 1.0872,
+      "step": 146
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.17787213623523712,
+      "learning_rate": 0.00010844155844155846,
+      "loss": 1.03,
+      "step": 147
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.22422832250595093,
+      "learning_rate": 0.0001077922077922078,
+      "loss": 1.0738,
+      "step": 148
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.22946301102638245,
+      "learning_rate": 0.00010714285714285715,
+      "loss": 1.0274,
+      "step": 149
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.2137996405363083,
+      "learning_rate": 0.00010649350649350649,
+      "loss": 1.0539,
+      "step": 150
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.1748756766319275,
+      "learning_rate": 0.00010584415584415586,
+      "loss": 1.0355,
+      "step": 151
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.22275175154209137,
+      "learning_rate": 0.0001051948051948052,
+      "loss": 1.1696,
+      "step": 152
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.20996077358722687,
+      "learning_rate": 0.00010454545454545455,
+      "loss": 1.0303,
+      "step": 153
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.1945938766002655,
+      "learning_rate": 0.00010389610389610389,
+      "loss": 0.9747,
+      "step": 154
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.1970377266407013,
+      "learning_rate": 0.00010324675324675325,
+      "loss": 1.0358,
+      "step": 155
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.18814732134342194,
+      "learning_rate": 0.00010259740259740261,
+      "loss": 0.9612,
+      "step": 156
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.2153233289718628,
+      "learning_rate": 0.00010194805194805195,
+      "loss": 1.0749,
+      "step": 157
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.21788008511066437,
+      "learning_rate": 0.0001012987012987013,
+      "loss": 1.0883,
+      "step": 158
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.214650496840477,
+      "learning_rate": 0.00010064935064935067,
+      "loss": 1.0539,
+      "step": 159
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.19312834739685059,
+      "learning_rate": 0.0001,
+      "loss": 1.0657,
+      "step": 160
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.19916598498821259,
+      "learning_rate": 9.935064935064936e-05,
+      "loss": 1.0478,
+      "step": 161
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.2057606726884842,
+      "learning_rate": 9.870129870129871e-05,
+      "loss": 1.0094,
+      "step": 162
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.22159607708454132,
+      "learning_rate": 9.805194805194806e-05,
+      "loss": 1.0952,
+      "step": 163
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.18274275958538055,
+      "learning_rate": 9.74025974025974e-05,
+      "loss": 1.0065,
+      "step": 164
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.19835162162780762,
+      "learning_rate": 9.675324675324677e-05,
+      "loss": 1.0742,
+      "step": 165
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.2114904820919037,
+      "learning_rate": 9.610389610389611e-05,
+      "loss": 1.1109,
+      "step": 166
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.21488523483276367,
+      "learning_rate": 9.545454545454546e-05,
+      "loss": 1.0465,
+      "step": 167
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.19870303571224213,
+      "learning_rate": 9.480519480519481e-05,
+      "loss": 1.0318,
+      "step": 168
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.20413029193878174,
+      "learning_rate": 9.415584415584417e-05,
+      "loss": 1.0817,
+      "step": 169
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.1847231239080429,
+      "learning_rate": 9.35064935064935e-05,
+      "loss": 1.0144,
+      "step": 170
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.2715964913368225,
+      "learning_rate": 9.285714285714286e-05,
+      "loss": 0.9832,
+      "step": 171
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.2225002497434616,
+      "learning_rate": 9.220779220779221e-05,
+      "loss": 1.1051,
+      "step": 172
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.22931510210037231,
+      "learning_rate": 9.155844155844156e-05,
+      "loss": 1.1042,
+      "step": 173
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.21848627924919128,
+      "learning_rate": 9.090909090909092e-05,
+      "loss": 1.1151,
+      "step": 174
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.19852259755134583,
+      "learning_rate": 9.025974025974027e-05,
+      "loss": 1.0889,
+      "step": 175
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.2080363780260086,
+      "learning_rate": 8.961038961038961e-05,
+      "loss": 1.0777,
+      "step": 176
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.22391024231910706,
+      "learning_rate": 8.896103896103896e-05,
+      "loss": 1.1092,
+      "step": 177
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.21793846786022186,
+      "learning_rate": 8.831168831168831e-05,
+      "loss": 1.044,
+      "step": 178
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.2009749859571457,
+      "learning_rate": 8.766233766233767e-05,
+      "loss": 1.0198,
+      "step": 179
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.19432318210601807,
+      "learning_rate": 8.701298701298701e-05,
+      "loss": 1.075,
+      "step": 180
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.18634547293186188,
+      "learning_rate": 8.636363636363637e-05,
+      "loss": 0.9964,
+      "step": 181
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.1947103589773178,
+      "learning_rate": 8.571428571428571e-05,
+      "loss": 1.0025,
+      "step": 182
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.23098671436309814,
+      "learning_rate": 8.506493506493507e-05,
+      "loss": 1.0562,
+      "step": 183
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.19686414301395416,
+      "learning_rate": 8.441558441558442e-05,
+      "loss": 1.0285,
+      "step": 184
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.19852428138256073,
+      "learning_rate": 8.376623376623377e-05,
+      "loss": 1.0054,
+      "step": 185
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.21483510732650757,
+      "learning_rate": 8.311688311688312e-05,
+      "loss": 1.108,
+      "step": 186
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.23313644528388977,
+      "learning_rate": 8.246753246753248e-05,
+      "loss": 1.1383,
+      "step": 187
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.21453145146369934,
+      "learning_rate": 8.181818181818183e-05,
+      "loss": 1.0911,
+      "step": 188
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.20268195867538452,
+      "learning_rate": 8.116883116883117e-05,
+      "loss": 1.0145,
+      "step": 189
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.20576398074626923,
+      "learning_rate": 8.051948051948052e-05,
+      "loss": 1.0829,
+      "step": 190
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.21732626855373383,
+      "learning_rate": 7.987012987012987e-05,
+      "loss": 1.0152,
+      "step": 191
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.22046895325183868,
+      "learning_rate": 7.922077922077923e-05,
+      "loss": 1.1311,
+      "step": 192
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.19727715849876404,
+      "learning_rate": 7.857142857142858e-05,
+      "loss": 1.0364,
+      "step": 193
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.20861488580703735,
+      "learning_rate": 7.792207792207793e-05,
+      "loss": 1.0435,
+      "step": 194
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.18545083701610565,
+      "learning_rate": 7.727272727272727e-05,
+      "loss": 1.0299,
+      "step": 195
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.19965052604675293,
+      "learning_rate": 7.662337662337662e-05,
+      "loss": 1.0511,
+      "step": 196
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.23673909902572632,
+      "learning_rate": 7.597402597402598e-05,
+      "loss": 1.081,
+      "step": 197
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.17583179473876953,
+      "learning_rate": 7.532467532467533e-05,
+      "loss": 0.9808,
+      "step": 198
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.2129366099834442,
+      "learning_rate": 7.467532467532467e-05,
+      "loss": 1.0522,
+      "step": 199
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.21679140627384186,
+      "learning_rate": 7.402597402597404e-05,
+      "loss": 1.0567,
+      "step": 200
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.2032000720500946,
+      "learning_rate": 7.337662337662338e-05,
+      "loss": 1.0466,
+      "step": 201
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.1887970268726349,
+      "learning_rate": 7.272727272727273e-05,
+      "loss": 1.0329,
+      "step": 202
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.21060192584991455,
+      "learning_rate": 7.207792207792208e-05,
+      "loss": 1.1021,
+      "step": 203
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.21191425621509552,
+      "learning_rate": 7.142857142857143e-05,
+      "loss": 0.99,
+      "step": 204
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.1995989829301834,
+      "learning_rate": 7.077922077922077e-05,
+      "loss": 1.0526,
+      "step": 205
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.1849513053894043,
+      "learning_rate": 7.012987012987014e-05,
+      "loss": 0.9998,
+      "step": 206
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.1948779672384262,
+      "learning_rate": 6.948051948051948e-05,
+      "loss": 1.075,
+      "step": 207
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.20374052226543427,
+      "learning_rate": 6.883116883116883e-05,
+      "loss": 1.0933,
+      "step": 208
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.2102465033531189,
+      "learning_rate": 6.818181818181818e-05,
+      "loss": 1.1123,
+      "step": 209
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.21376173198223114,
+      "learning_rate": 6.753246753246754e-05,
+      "loss": 1.1233,
+      "step": 210
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.20934203267097473,
+      "learning_rate": 6.688311688311688e-05,
+      "loss": 1.1374,
+      "step": 211
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.18604128062725067,
+      "learning_rate": 6.623376623376624e-05,
+      "loss": 1.0213,
+      "step": 212
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.19644233584403992,
+      "learning_rate": 6.55844155844156e-05,
+      "loss": 1.0046,
+      "step": 213
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.18479463458061218,
+      "learning_rate": 6.493506493506494e-05,
+      "loss": 0.9792,
+      "step": 214
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.1945149153470993,
+      "learning_rate": 6.428571428571429e-05,
+      "loss": 1.0584,
+      "step": 215
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.2070147544145584,
+      "learning_rate": 6.363636363636364e-05,
+      "loss": 1.071,
+      "step": 216
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.19645985960960388,
+      "learning_rate": 6.2987012987013e-05,
+      "loss": 1.0721,
+      "step": 217
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.1960117667913437,
+      "learning_rate": 6.233766233766233e-05,
+      "loss": 1.071,
+      "step": 218
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.20168261229991913,
+      "learning_rate": 6.16883116883117e-05,
+      "loss": 1.0808,
+      "step": 219
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.21254412829875946,
+      "learning_rate": 6.103896103896104e-05,
+      "loss": 1.0287,
+      "step": 220
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.21271063387393951,
+      "learning_rate": 6.03896103896104e-05,
+      "loss": 1.0605,
+      "step": 221
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.2081408053636551,
+      "learning_rate": 5.9740259740259744e-05,
+      "loss": 1.091,
+      "step": 222
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.21113798022270203,
+      "learning_rate": 5.90909090909091e-05,
+      "loss": 1.1323,
+      "step": 223
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.20670844614505768,
+      "learning_rate": 5.844155844155844e-05,
+      "loss": 1.0955,
+      "step": 224
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.2010120451450348,
+      "learning_rate": 5.7792207792207796e-05,
+      "loss": 1.1068,
+      "step": 225
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.20379121601581573,
+      "learning_rate": 5.714285714285714e-05,
+      "loss": 1.0419,
+      "step": 226
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.22799807786941528,
+      "learning_rate": 5.64935064935065e-05,
+      "loss": 1.0904,
+      "step": 227
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.2005995213985443,
+      "learning_rate": 5.584415584415584e-05,
+      "loss": 1.078,
+      "step": 228
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.20329605042934418,
+      "learning_rate": 5.51948051948052e-05,
+      "loss": 1.0245,
+      "step": 229
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.19283504784107208,
+      "learning_rate": 5.4545454545454546e-05,
+      "loss": 1.0367,
+      "step": 230
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.20624355971813202,
+      "learning_rate": 5.38961038961039e-05,
+      "loss": 1.1046,
+      "step": 231
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.21362991631031036,
+      "learning_rate": 5.3246753246753245e-05,
+      "loss": 1.1104,
+      "step": 232
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.20447863638401031,
+      "learning_rate": 5.25974025974026e-05,
+      "loss": 1.0514,
+      "step": 233
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.1974381059408188,
+      "learning_rate": 5.1948051948051944e-05,
+      "loss": 1.0048,
+      "step": 234
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.21237170696258545,
+      "learning_rate": 5.1298701298701304e-05,
+      "loss": 1.1299,
+      "step": 235
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.21224971115589142,
+      "learning_rate": 5.064935064935065e-05,
+      "loss": 1.05,
+      "step": 236
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.19865018129348755,
+      "learning_rate": 5e-05,
+      "loss": 1.0665,
+      "step": 237
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.19199275970458984,
+      "learning_rate": 4.9350649350649355e-05,
+      "loss": 0.9531,
+      "step": 238
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.19573214650154114,
+      "learning_rate": 4.87012987012987e-05,
+      "loss": 1.0318,
+      "step": 239
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.21338805556297302,
+      "learning_rate": 4.8051948051948054e-05,
+      "loss": 1.0343,
+      "step": 240
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.2254691869020462,
+      "learning_rate": 4.740259740259741e-05,
+      "loss": 1.0472,
+      "step": 241
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.18101665377616882,
+      "learning_rate": 4.675324675324675e-05,
+      "loss": 1.017,
+      "step": 242
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.22090592980384827,
+      "learning_rate": 4.6103896103896106e-05,
+      "loss": 1.0389,
+      "step": 243
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.20865507423877716,
+      "learning_rate": 4.545454545454546e-05,
+      "loss": 1.0369,
+      "step": 244
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.21619610488414764,
+      "learning_rate": 4.4805194805194805e-05,
+      "loss": 1.109,
+      "step": 245
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.21694771945476532,
+      "learning_rate": 4.415584415584416e-05,
+      "loss": 1.0525,
+      "step": 246
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.2182662934064865,
+      "learning_rate": 4.3506493506493503e-05,
+      "loss": 1.0331,
+      "step": 247
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.2026486098766327,
+      "learning_rate": 4.2857142857142856e-05,
+      "loss": 1.027,
+      "step": 248
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.19606547057628632,
+      "learning_rate": 4.220779220779221e-05,
+      "loss": 1.0242,
+      "step": 249
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.22107470035552979,
+      "learning_rate": 4.155844155844156e-05,
+      "loss": 1.0924,
+      "step": 250
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.19960008561611176,
+      "learning_rate": 4.0909090909090915e-05,
+      "loss": 1.0384,
+      "step": 251
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.1945488154888153,
+      "learning_rate": 4.025974025974026e-05,
+      "loss": 1.0673,
+      "step": 252
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.22067414224147797,
+      "learning_rate": 3.9610389610389614e-05,
+      "loss": 1.0426,
+      "step": 253
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.19010980427265167,
+      "learning_rate": 3.8961038961038966e-05,
+      "loss": 1.0617,
+      "step": 254
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.18781176209449768,
+      "learning_rate": 3.831168831168831e-05,
+      "loss": 1.0243,
+      "step": 255
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.20388829708099365,
+      "learning_rate": 3.7662337662337665e-05,
+      "loss": 1.0476,
+      "step": 256
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.19911155104637146,
+      "learning_rate": 3.701298701298702e-05,
+      "loss": 1.0324,
+      "step": 257
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.19884039461612701,
+      "learning_rate": 3.6363636363636364e-05,
+      "loss": 1.0242,
+      "step": 258
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.19036105275154114,
+      "learning_rate": 3.571428571428572e-05,
+      "loss": 1.0323,
+      "step": 259
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.20039844512939453,
+      "learning_rate": 3.506493506493507e-05,
+      "loss": 1.0749,
+      "step": 260
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.1899934560060501,
+      "learning_rate": 3.4415584415584416e-05,
+      "loss": 1.0115,
+      "step": 261
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.20019090175628662,
+      "learning_rate": 3.376623376623377e-05,
+      "loss": 1.0782,
+      "step": 262
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.2020583152770996,
+      "learning_rate": 3.311688311688312e-05,
+      "loss": 1.0687,
+      "step": 263
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.21407337486743927,
+      "learning_rate": 3.246753246753247e-05,
+      "loss": 1.1015,
+      "step": 264
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.1871640682220459,
+      "learning_rate": 3.181818181818182e-05,
+      "loss": 0.9637,
+      "step": 265
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.21622811257839203,
+      "learning_rate": 3.1168831168831166e-05,
+      "loss": 1.1222,
+      "step": 266
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.22504661977291107,
+      "learning_rate": 3.051948051948052e-05,
+      "loss": 1.132,
+      "step": 267
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.19177629053592682,
+      "learning_rate": 2.9870129870129872e-05,
+      "loss": 1.0281,
+      "step": 268
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.1970544159412384,
+      "learning_rate": 2.922077922077922e-05,
+      "loss": 1.0393,
+      "step": 269
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.21554522216320038,
+      "learning_rate": 2.857142857142857e-05,
+      "loss": 1.074,
+      "step": 270
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.21131229400634766,
+      "learning_rate": 2.792207792207792e-05,
+      "loss": 1.054,
+      "step": 271
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.19816523790359497,
+      "learning_rate": 2.7272727272727273e-05,
+      "loss": 1.0456,
+      "step": 272
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.21075209975242615,
+      "learning_rate": 2.6623376623376623e-05,
+      "loss": 1.0758,
+      "step": 273
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.2296527624130249,
+      "learning_rate": 2.5974025974025972e-05,
+      "loss": 1.0917,
+      "step": 274
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.19722610712051392,
+      "learning_rate": 2.5324675324675325e-05,
+      "loss": 1.0704,
+      "step": 275
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.18721099197864532,
+      "learning_rate": 2.4675324675324678e-05,
+      "loss": 0.9919,
+      "step": 276
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.20244193077087402,
+      "learning_rate": 2.4025974025974027e-05,
+      "loss": 1.0368,
+      "step": 277
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.19518914818763733,
+      "learning_rate": 2.3376623376623376e-05,
+      "loss": 1.0436,
+      "step": 278
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.19650357961654663,
+      "learning_rate": 2.272727272727273e-05,
+      "loss": 1.0306,
+      "step": 279
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.20320096611976624,
+      "learning_rate": 2.207792207792208e-05,
+      "loss": 1.0941,
+      "step": 280
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.18296951055526733,
+      "learning_rate": 2.1428571428571428e-05,
+      "loss": 0.9802,
+      "step": 281
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.21357610821723938,
+      "learning_rate": 2.077922077922078e-05,
+      "loss": 1.0449,
+      "step": 282
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.193921759724617,
+      "learning_rate": 2.012987012987013e-05,
+      "loss": 1.0116,
+      "step": 283
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.1953902244567871,
+      "learning_rate": 1.9480519480519483e-05,
+      "loss": 1.0105,
+      "step": 284
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.19440975785255432,
+      "learning_rate": 1.8831168831168833e-05,
+      "loss": 0.9952,
+      "step": 285
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.21054105460643768,
+      "learning_rate": 1.8181818181818182e-05,
+      "loss": 1.0701,
+      "step": 286
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.18844804167747498,
+      "learning_rate": 1.7532467532467535e-05,
+      "loss": 1.0146,
+      "step": 287
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.2067311704158783,
+      "learning_rate": 1.6883116883116884e-05,
+      "loss": 1.0781,
+      "step": 288
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.1941213756799698,
+      "learning_rate": 1.6233766233766234e-05,
+      "loss": 0.9814,
+      "step": 289
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.22726193070411682,
+      "learning_rate": 1.5584415584415583e-05,
+      "loss": 1.1431,
+      "step": 290
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.18025581538677216,
+      "learning_rate": 1.4935064935064936e-05,
+      "loss": 0.9649,
+      "step": 291
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.21535000205039978,
+      "learning_rate": 1.4285714285714285e-05,
+      "loss": 1.0441,
+      "step": 292
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.20014546811580658,
+      "learning_rate": 1.3636363636363637e-05,
+      "loss": 1.0166,
+      "step": 293
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.22738787531852722,
+      "learning_rate": 1.2987012987012986e-05,
+      "loss": 1.0564,
+      "step": 294
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.2020861804485321,
+      "learning_rate": 1.2337662337662339e-05,
+      "loss": 1.1241,
+      "step": 295
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.19888809323310852,
+      "learning_rate": 1.1688311688311688e-05,
+      "loss": 1.1114,
+      "step": 296
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.20912377536296844,
+      "learning_rate": 1.103896103896104e-05,
+      "loss": 1.0971,
+      "step": 297
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.21206621825695038,
+      "learning_rate": 1.038961038961039e-05,
+      "loss": 1.0601,
+      "step": 298
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.18667680025100708,
+      "learning_rate": 9.740259740259742e-06,
+      "loss": 1.0291,
+      "step": 299
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.21125559508800507,
+      "learning_rate": 9.090909090909091e-06,
+      "loss": 1.0483,
+      "step": 300
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.21776145696640015,
+      "learning_rate": 8.441558441558442e-06,
+      "loss": 0.9912,
+      "step": 301
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.20144303143024445,
+      "learning_rate": 7.792207792207792e-06,
+      "loss": 1.0357,
+      "step": 302
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.1984029859304428,
+      "learning_rate": 7.142857142857143e-06,
+      "loss": 1.0648,
+      "step": 303
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.17972829937934875,
+      "learning_rate": 6.493506493506493e-06,
+      "loss": 1.0033,
+      "step": 304
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.1818286031484604,
+      "learning_rate": 5.844155844155844e-06,
+      "loss": 0.997,
+      "step": 305
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.19670912623405457,
+      "learning_rate": 5.194805194805195e-06,
+      "loss": 1.0256,
+      "step": 306
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.20527283847332,
+      "learning_rate": 4.5454545454545455e-06,
+      "loss": 1.0348,
+      "step": 307
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.19025909900665283,
+      "learning_rate": 3.896103896103896e-06,
+      "loss": 1.0682,
+      "step": 308
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.19544818997383118,
+      "learning_rate": 3.2467532467532465e-06,
+      "loss": 0.9872,
+      "step": 309
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.22112183272838593,
+      "learning_rate": 2.5974025974025976e-06,
+      "loss": 1.0661,
+      "step": 310
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.23328153789043427,
+      "learning_rate": 1.948051948051948e-06,
+      "loss": 1.0691,
+      "step": 311
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.20181375741958618,
+      "learning_rate": 1.2987012987012988e-06,
+      "loss": 0.9416,
+      "step": 312
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.29312625527381897,
+      "learning_rate": 6.493506493506494e-07,
+      "loss": 1.1216,
+      "step": 313
+    },
+    {
+      "epoch": 0.12202467696492762,
+      "grad_norm": 0.2231415957212448,
+      "learning_rate": 0.0,
+      "loss": 1.0468,
+      "step": 314
+    },
+    {
+      "epoch": 0.12241329058583503,
+      "grad_norm": 0.22263288497924805,
+      "learning_rate": 0.00017594394706111328,
+      "loss": 1.0399,
+      "step": 315
+    },
+    {
+      "epoch": 0.12280190420674245,
+      "grad_norm": 0.22909891605377197,
+      "learning_rate": 0.00017586609575710393,
+      "loss": 1.1069,
+      "step": 316
+    },
+    {
+      "epoch": 0.12319051782764986,
+      "grad_norm": 0.23951445519924164,
+      "learning_rate": 0.0001757882444530946,
+      "loss": 1.1036,
+      "step": 317
+    },
+    {
+      "epoch": 0.12357913144855727,
+      "grad_norm": 0.2409268021583557,
+      "learning_rate": 0.00017571039314908526,
+      "loss": 1.1114,
+      "step": 318
+    },
+    {
+      "epoch": 0.12396774506946469,
+      "grad_norm": 0.23753899335861206,
+      "learning_rate": 0.00017563254184507592,
+      "loss": 1.1297,
+      "step": 319
+    },
+    {
+      "epoch": 0.12435635869037209,
+      "grad_norm": 0.2823902666568756,
+      "learning_rate": 0.00017555469054106657,
+      "loss": 1.1293,
+      "step": 320
+    },
+    {
+      "epoch": 0.12474497231127951,
+      "grad_norm": 0.24093545973300934,
+      "learning_rate": 0.00017547683923705722,
+      "loss": 1.0678,
+      "step": 321
+    },
+    {
+      "epoch": 0.12513358593218693,
+      "grad_norm": 0.22565563023090363,
+      "learning_rate": 0.0001753989879330479,
+      "loss": 1.1408,
+      "step": 322
+    },
+    {
+      "epoch": 0.12552219955309435,
+      "grad_norm": 0.22569572925567627,
+      "learning_rate": 0.00017532113662903855,
+      "loss": 1.0543,
+      "step": 323
+    },
+    {
+      "epoch": 0.12591081317400174,
+      "grad_norm": 0.24962866306304932,
+      "learning_rate": 0.0001752432853250292,
+      "loss": 1.0818,
+      "step": 324
+    },
+    {
+      "epoch": 0.12629942679490916,
+      "grad_norm": 0.22184576094150543,
+      "learning_rate": 0.00017516543402101986,
+      "loss": 1.0835,
+      "step": 325
+    },
+    {
+      "epoch": 0.12668804041581658,
+      "grad_norm": 0.2572194039821625,
+      "learning_rate": 0.0001750875827170105,
+      "loss": 1.0767,
+      "step": 326
+    },
+    {
+      "epoch": 0.127076654036724,
+      "grad_norm": 0.24131342768669128,
+      "learning_rate": 0.00017500973141300116,
+      "loss": 1.0981,
+      "step": 327
+    },
+    {
+      "epoch": 0.1274652676576314,
+      "grad_norm": 0.2386389970779419,
+      "learning_rate": 0.00017493188010899184,
+      "loss": 1.0828,
+      "step": 328
+    },
+    {
+      "epoch": 0.1278538812785388,
+      "grad_norm": 0.2654125690460205,
+      "learning_rate": 0.0001748540288049825,
+      "loss": 1.1266,
+      "step": 329
+    },
+    {
+      "epoch": 0.12824249489944622,
+      "grad_norm": 0.2925739884376526,
+      "learning_rate": 0.00017477617750097314,
+      "loss": 1.0983,
+      "step": 330
+    },
+    {
+      "epoch": 0.12863110852035364,
+      "grad_norm": 0.26589342951774597,
+      "learning_rate": 0.0001746983261969638,
+      "loss": 1.1029,
+      "step": 331
+    },
+    {
+      "epoch": 0.12901972214126106,
+      "grad_norm": 0.24565957486629486,
+      "learning_rate": 0.00017462047489295445,
+      "loss": 1.0975,
+      "step": 332
+    },
+    {
+      "epoch": 0.12940833576216845,
+      "grad_norm": 0.2459682673215866,
+      "learning_rate": 0.00017454262358894513,
+      "loss": 1.0566,
+      "step": 333
+    },
+    {
+      "epoch": 0.12979694938307587,
+      "grad_norm": 0.23349183797836304,
+      "learning_rate": 0.00017446477228493578,
+      "loss": 1.0833,
+      "step": 334
+    },
+    {
+      "epoch": 0.1301855630039833,
+      "grad_norm": 0.26166337728500366,
+      "learning_rate": 0.00017438692098092643,
+      "loss": 1.1598,
+      "step": 335
+    },
+    {
+      "epoch": 0.1305741766248907,
+      "grad_norm": 0.24188168346881866,
+      "learning_rate": 0.00017430906967691708,
+      "loss": 1.0728,
+      "step": 336
+    },
+    {
+      "epoch": 0.13096279024579813,
+      "grad_norm": 0.22922398149967194,
+      "learning_rate": 0.00017423121837290773,
+      "loss": 1.0311,
+      "step": 337
+    },
+    {
+      "epoch": 0.13135140386670552,
+      "grad_norm": 0.2652754485607147,
+      "learning_rate": 0.00017415336706889841,
+      "loss": 1.1096,
+      "step": 338
+    },
+    {
+      "epoch": 0.13174001748761294,
+      "grad_norm": 0.2355881780385971,
+      "learning_rate": 0.00017407551576488907,
+      "loss": 1.0964,
+      "step": 339
+    },
+    {
+      "epoch": 0.13212863110852036,
+      "grad_norm": 0.244523823261261,
+      "learning_rate": 0.00017399766446087972,
+      "loss": 1.142,
+      "step": 340
+    },
+    {
+      "epoch": 0.13251724472942777,
+      "grad_norm": 0.24705976247787476,
+      "learning_rate": 0.00017391981315687037,
+      "loss": 1.0943,
+      "step": 341
+    },
+    {
+      "epoch": 0.13290585835033517,
+      "grad_norm": 0.22817552089691162,
+      "learning_rate": 0.00017384196185286102,
+      "loss": 1.0621,
+      "step": 342
+    },
+    {
+      "epoch": 0.13329447197124258,
+      "grad_norm": 0.22605225443840027,
+      "learning_rate": 0.0001737641105488517,
+      "loss": 1.0714,
+      "step": 343
+    },
+    {
+      "epoch": 0.13368308559215,
+      "grad_norm": 0.2584545314311981,
+      "learning_rate": 0.00017368625924484235,
+      "loss": 1.1367,
+      "step": 344
+    },
+    {
+      "epoch": 0.13407169921305742,
+      "grad_norm": 0.2248220443725586,
+      "learning_rate": 0.000173608407940833,
+      "loss": 1.0872,
+      "step": 345
+    },
+    {
+      "epoch": 0.13446031283396484,
+      "grad_norm": 0.2141868770122528,
+      "learning_rate": 0.00017353055663682368,
+      "loss": 1.0572,
+      "step": 346
+    },
+    {
+      "epoch": 0.13484892645487223,
+      "grad_norm": 0.2615523934364319,
+      "learning_rate": 0.00017345270533281434,
+      "loss": 1.1048,
+      "step": 347
+    },
+    {
+      "epoch": 0.13523754007577965,
+      "grad_norm": 0.22990448772907257,
+      "learning_rate": 0.000173374854028805,
+      "loss": 1.0528,
+      "step": 348
+    },
+    {
+      "epoch": 0.13562615369668707,
+      "grad_norm": 0.2132262885570526,
+      "learning_rate": 0.00017329700272479564,
+      "loss": 1.0476,
+      "step": 349
+    },
+    {
+      "epoch": 0.1360147673175945,
+      "grad_norm": 0.2578272819519043,
+      "learning_rate": 0.00017321915142078632,
+      "loss": 1.0852,
+      "step": 350
+    },
+    {
+      "epoch": 0.1364033809385019,
+      "grad_norm": 0.22881457209587097,
+      "learning_rate": 0.00017314130011677697,
+      "loss": 1.1017,
+      "step": 351
+    },
+    {
+      "epoch": 0.1367919945594093,
+      "grad_norm": 0.21067696809768677,
+      "learning_rate": 0.00017306344881276762,
+      "loss": 1.0444,
+      "step": 352
+    },
+    {
+      "epoch": 0.13718060818031672,
+      "grad_norm": 0.2304215282201767,
+      "learning_rate": 0.0001729855975087583,
+      "loss": 1.0737,
+      "step": 353
+    },
+    {
+      "epoch": 0.13756922180122413,
+      "grad_norm": 0.2031925916671753,
+      "learning_rate": 0.00017290774620474895,
+      "loss": 1.0036,
+      "step": 354
+    },
+    {
+      "epoch": 0.13795783542213155,
+      "grad_norm": 0.27281051874160767,
+      "learning_rate": 0.0001728298949007396,
+      "loss": 1.148,
+      "step": 355
+    },
+    {
+      "epoch": 0.13834644904303897,
+      "grad_norm": 0.204191654920578,
+      "learning_rate": 0.00017275204359673026,
+      "loss": 0.9607,
+      "step": 356
+    },
+    {
+      "epoch": 0.13873506266394636,
+      "grad_norm": 0.221976637840271,
+      "learning_rate": 0.0001726741922927209,
+      "loss": 1.1068,
+      "step": 357
+    },
+    {
+      "epoch": 0.13912367628485378,
+      "grad_norm": 0.20831729471683502,
+      "learning_rate": 0.0001725963409887116,
+      "loss": 1.034,
+      "step": 358
+    },
+    {
+      "epoch": 0.1395122899057612,
+      "grad_norm": 0.21639779210090637,
+      "learning_rate": 0.00017251848968470224,
+      "loss": 1.0613,
+      "step": 359
+    },
+    {
+      "epoch": 0.13990090352666862,
+      "grad_norm": 0.1959424465894699,
+      "learning_rate": 0.0001724406383806929,
+      "loss": 1.0506,
+      "step": 360
+    },
+    {
+      "epoch": 0.140289517147576,
+      "grad_norm": 0.2044398933649063,
+      "learning_rate": 0.00017236278707668355,
+      "loss": 1.0316,
+      "step": 361
+    },
+    {
+      "epoch": 0.14067813076848343,
+      "grad_norm": 0.21483004093170166,
+      "learning_rate": 0.0001722849357726742,
+      "loss": 1.0361,
+      "step": 362
+    },
+    {
+      "epoch": 0.14106674438939085,
+      "grad_norm": 0.237701416015625,
+      "learning_rate": 0.00017220708446866485,
+      "loss": 1.1264,
+      "step": 363
+    },
+    {
+      "epoch": 0.14145535801029827,
+      "grad_norm": 0.20750795304775238,
+      "learning_rate": 0.00017212923316465553,
+      "loss": 1.0523,
+      "step": 364
+    },
+    {
+      "epoch": 0.14184397163120568,
+      "grad_norm": 0.2252965271472931,
+      "learning_rate": 0.00017205138186064618,
+      "loss": 1.0764,
+      "step": 365
+    },
+    {
+      "epoch": 0.14223258525211308,
+      "grad_norm": 0.2033565789461136,
+      "learning_rate": 0.00017197353055663683,
+      "loss": 1.064,
+      "step": 366
+    },
+    {
+      "epoch": 0.1426211988730205,
+      "grad_norm": 0.21123190224170685,
+      "learning_rate": 0.00017189567925262749,
+      "loss": 1.0515,
+      "step": 367
+    },
+    {
+      "epoch": 0.1430098124939279,
+      "grad_norm": 0.20646221935749054,
+      "learning_rate": 0.00017181782794861814,
+      "loss": 1.0617,
+      "step": 368
+    },
+    {
+      "epoch": 0.14339842611483533,
+      "grad_norm": 0.2079589068889618,
+      "learning_rate": 0.00017173997664460882,
+      "loss": 1.0569,
+      "step": 369
+    },
+    {
+      "epoch": 0.14378703973574275,
+      "grad_norm": 0.216246098279953,
+      "learning_rate": 0.00017166212534059947,
+      "loss": 1.0986,
+      "step": 370
+    },
+    {
+      "epoch": 0.14417565335665014,
+      "grad_norm": 0.20711806416511536,
+      "learning_rate": 0.00017158427403659012,
+      "loss": 1.1342,
+      "step": 371
+    },
+    {
+      "epoch": 0.14456426697755756,
+      "grad_norm": 0.235435351729393,
+      "learning_rate": 0.00017150642273258077,
+      "loss": 1.1082,
+      "step": 372
+    },
+    {
+      "epoch": 0.14495288059846498,
+      "grad_norm": 0.2273191511631012,
+      "learning_rate": 0.00017142857142857143,
+      "loss": 1.1064,
+      "step": 373
+    },
+    {
+      "epoch": 0.1453414942193724,
+      "grad_norm": 0.2075672745704651,
+      "learning_rate": 0.0001713507201245621,
+      "loss": 1.0536,
+      "step": 374
+    },
+    {
+      "epoch": 0.14573010784027982,
+      "grad_norm": 0.20764274895191193,
+      "learning_rate": 0.00017127286882055276,
+      "loss": 1.0673,
+      "step": 375
+    },
+    {
+      "epoch": 0.1461187214611872,
+      "grad_norm": 0.2441243678331375,
+      "learning_rate": 0.0001711950175165434,
+      "loss": 1.1271,
+      "step": 376
+    },
+    {
+      "epoch": 0.14650733508209463,
+      "grad_norm": 0.2383374124765396,
+      "learning_rate": 0.00017111716621253406,
+      "loss": 1.083,
+      "step": 377
+    },
+    {
+      "epoch": 0.14689594870300204,
+      "grad_norm": 0.2172410786151886,
+      "learning_rate": 0.0001710393149085247,
+      "loss": 1.0605,
+      "step": 378
+    },
+    {
+      "epoch": 0.14728456232390946,
+      "grad_norm": 0.22591541707515717,
+      "learning_rate": 0.0001709614636045154,
+      "loss": 1.0931,
+      "step": 379
+    },
+    {
+      "epoch": 0.14767317594481685,
+      "grad_norm": 0.23099495470523834,
+      "learning_rate": 0.00017088361230050604,
+      "loss": 1.1021,
+      "step": 380
+    },
+    {
+      "epoch": 0.14806178956572427,
+      "grad_norm": 0.21461094915866852,
+      "learning_rate": 0.0001708057609964967,
+      "loss": 1.0959,
+      "step": 381
+    },
+    {
+      "epoch": 0.1484504031866317,
+      "grad_norm": 0.21557241678237915,
+      "learning_rate": 0.00017072790969248735,
+      "loss": 1.0155,
+      "step": 382
+    },
+    {
+      "epoch": 0.1488390168075391,
+      "grad_norm": 0.234396293759346,
+      "learning_rate": 0.000170650058388478,
+      "loss": 1.1289,
+      "step": 383
+    },
+    {
+      "epoch": 0.14922763042844653,
+      "grad_norm": 0.22895503044128418,
+      "learning_rate": 0.00017057220708446868,
+      "loss": 0.9919,
+      "step": 384
+    },
+    {
+      "epoch": 0.14961624404935392,
+      "grad_norm": 0.2054683268070221,
+      "learning_rate": 0.00017049435578045933,
+      "loss": 1.0607,
+      "step": 385
+    },
+    {
+      "epoch": 0.15000485767026134,
+      "grad_norm": 0.25569215416908264,
+      "learning_rate": 0.00017041650447644998,
+      "loss": 1.0517,
+      "step": 386
+    },
+    {
+      "epoch": 0.15039347129116876,
+      "grad_norm": 0.2222641259431839,
+      "learning_rate": 0.00017033865317244064,
+      "loss": 1.0404,
+      "step": 387
+    },
+    {
+      "epoch": 0.15078208491207618,
+      "grad_norm": 0.20501169562339783,
+      "learning_rate": 0.0001702608018684313,
+      "loss": 0.9897,
+      "step": 388
+    },
+    {
+      "epoch": 0.1511706985329836,
+      "grad_norm": 0.22080403566360474,
+      "learning_rate": 0.00017018295056442197,
+      "loss": 1.1013,
+      "step": 389
+    },
+    {
+      "epoch": 0.15155931215389098,
+      "grad_norm": 0.21218529343605042,
+      "learning_rate": 0.00017010509926041262,
+      "loss": 1.0541,
+      "step": 390
+    },
+    {
+      "epoch": 0.1519479257747984,
+      "grad_norm": 0.23064807057380676,
+      "learning_rate": 0.00017002724795640327,
+      "loss": 1.037,
+      "step": 391
+    },
+    {
+      "epoch": 0.15233653939570582,
+      "grad_norm": 0.21164493262767792,
+      "learning_rate": 0.00016994939665239392,
+      "loss": 1.0769,
+      "step": 392
+    },
+    {
+      "epoch": 0.15272515301661324,
+      "grad_norm": 0.22565549612045288,
+      "learning_rate": 0.00016987154534838457,
+      "loss": 1.0638,
+      "step": 393
+    },
+    {
+      "epoch": 0.15311376663752063,
+      "grad_norm": 0.22492647171020508,
+      "learning_rate": 0.00016979369404437525,
+      "loss": 1.063,
+      "step": 394
+    },
+    {
+      "epoch": 0.15350238025842805,
+      "grad_norm": 0.22335395216941833,
+      "learning_rate": 0.0001697158427403659,
+      "loss": 1.1032,
+      "step": 395
+    },
+    {
+      "epoch": 0.15389099387933547,
+      "grad_norm": 0.2164154201745987,
+      "learning_rate": 0.00016963799143635656,
+      "loss": 1.1275,
+      "step": 396
+    },
+    {
+      "epoch": 0.1542796075002429,
+      "grad_norm": 0.22547736763954163,
+      "learning_rate": 0.0001695601401323472,
+      "loss": 1.1324,
+      "step": 397
+    },
+    {
+      "epoch": 0.1546682211211503,
+      "grad_norm": 0.2028045952320099,
+      "learning_rate": 0.0001694822888283379,
+      "loss": 1.0057,
+      "step": 398
+    },
+    {
+      "epoch": 0.1550568347420577,
+      "grad_norm": 0.20770573616027832,
+      "learning_rate": 0.00016940443752432854,
+      "loss": 1.0311,
+      "step": 399
+    },
+    {
+      "epoch": 0.15544544836296512,
+      "grad_norm": 0.2231476902961731,
+      "learning_rate": 0.0001693265862203192,
+      "loss": 1.0535,
+      "step": 400
+    },
+    {
+      "epoch": 0.15583406198387253,
+      "grad_norm": 0.21618099510669708,
+      "learning_rate": 0.00016924873491630987,
+      "loss": 1.0616,
+      "step": 401
+    },
+    {
+      "epoch": 0.15622267560477995,
+      "grad_norm": 0.24024419486522675,
+      "learning_rate": 0.00016917088361230052,
+      "loss": 1.1324,
+      "step": 402
+    },
+    {
+      "epoch": 0.15661128922568737,
+      "grad_norm": 0.2002171128988266,
+      "learning_rate": 0.00016909303230829118,
+      "loss": 1.015,
+      "step": 403
+    },
+    {
+      "epoch": 0.15699990284659476,
+      "grad_norm": 0.21771477162837982,
+      "learning_rate": 0.00016901518100428183,
+      "loss": 1.0817,
+      "step": 404
+    },
+    {
+      "epoch": 0.15738851646750218,
+      "grad_norm": 0.22052259743213654,
+      "learning_rate": 0.0001689373297002725,
+      "loss": 1.0836,
+      "step": 405
+    },
+    {
+      "epoch": 0.1577771300884096,
+      "grad_norm": 0.1964062750339508,
+      "learning_rate": 0.00016885947839626316,
+      "loss": 1.0505,
+      "step": 406
+    },
+    {
+      "epoch": 0.15816574370931702,
+      "grad_norm": 0.22714298963546753,
+      "learning_rate": 0.0001687816270922538,
+      "loss": 1.0702,
+      "step": 407
+    },
+    {
+      "epoch": 0.15855435733022444,
+      "grad_norm": 0.20647728443145752,
+      "learning_rate": 0.00016870377578824446,
+      "loss": 1.0349,
+      "step": 408
+    },
+    {
+      "epoch": 0.15894297095113183,
+      "grad_norm": 0.2355160117149353,
+      "learning_rate": 0.00016862592448423512,
+      "loss": 1.0305,
+      "step": 409
+    },
+    {
+      "epoch": 0.15933158457203925,
+      "grad_norm": 0.22890770435333252,
+      "learning_rate": 0.0001685480731802258,
+      "loss": 1.0854,
+      "step": 410
+    },
+    {
+      "epoch": 0.15972019819294667,
+      "grad_norm": 0.21947838366031647,
+      "learning_rate": 0.00016847022187621645,
+      "loss": 1.0948,
+      "step": 411
+    },
+    {
+      "epoch": 0.16010881181385409,
+      "grad_norm": 0.22334899008274078,
+      "learning_rate": 0.0001683923705722071,
+      "loss": 1.006,
+      "step": 412
+    },
+    {
+      "epoch": 0.16049742543476148,
+      "grad_norm": 0.22324936091899872,
+      "learning_rate": 0.00016831451926819775,
+      "loss": 1.0402,
+      "step": 413
+    },
+    {
+      "epoch": 0.1608860390556689,
+      "grad_norm": 0.21462097764015198,
+      "learning_rate": 0.0001682366679641884,
+      "loss": 1.077,
+      "step": 414
+    },
+    {
+      "epoch": 0.1612746526765763,
+      "grad_norm": 0.24567006528377533,
+      "learning_rate": 0.00016815881666017908,
+      "loss": 1.15,
+      "step": 415
+    },
+    {
+      "epoch": 0.16166326629748373,
+      "grad_norm": 0.26437243819236755,
+      "learning_rate": 0.00016808096535616973,
+      "loss": 1.1251,
+      "step": 416
+    },
+    {
+      "epoch": 0.16205187991839115,
+      "grad_norm": 0.2217959761619568,
+      "learning_rate": 0.00016800311405216039,
+      "loss": 1.1103,
+      "step": 417
+    },
+    {
+      "epoch": 0.16244049353929854,
+      "grad_norm": 0.24402475357055664,
+      "learning_rate": 0.00016792526274815104,
+      "loss": 1.0672,
+      "step": 418
+    },
+    {
+      "epoch": 0.16282910716020596,
+      "grad_norm": 0.21609526872634888,
+      "learning_rate": 0.0001678474114441417,
+      "loss": 1.0291,
+      "step": 419
+    },
+    {
+      "epoch": 0.16321772078111338,
+      "grad_norm": 0.20054642856121063,
+      "learning_rate": 0.00016776956014013237,
+      "loss": 1.0704,
+      "step": 420
+    },
+    {
+      "epoch": 0.1636063344020208,
+      "grad_norm": 0.22864869236946106,
+      "learning_rate": 0.00016769170883612302,
+      "loss": 1.0612,
+      "step": 421
+    },
+    {
+      "epoch": 0.16399494802292822,
+      "grad_norm": 0.22651974856853485,
+      "learning_rate": 0.00016761385753211367,
+      "loss": 1.0749,
+      "step": 422
+    },
+    {
+      "epoch": 0.1643835616438356,
+      "grad_norm": 0.21587328612804413,
+      "learning_rate": 0.00016753600622810433,
+      "loss": 1.0398,
+      "step": 423
+    },
+    {
+      "epoch": 0.16477217526474303,
+      "grad_norm": 0.1953774094581604,
+      "learning_rate": 0.00016745815492409498,
+      "loss": 1.0275,
+      "step": 424
+    },
+    {
+      "epoch": 0.16516078888565044,
+      "grad_norm": 0.21803410351276398,
+      "learning_rate": 0.00016738030362008566,
+      "loss": 1.1219,
+      "step": 425
+    },
+    {
+      "epoch": 0.16554940250655786,
+      "grad_norm": 0.2034682035446167,
+      "learning_rate": 0.0001673024523160763,
+      "loss": 1.0342,
+      "step": 426
+    },
+    {
+      "epoch": 0.16593801612746525,
+      "grad_norm": 0.20135951042175293,
+      "learning_rate": 0.00016722460101206696,
+      "loss": 0.9802,
+      "step": 427
+    },
+    {
+      "epoch": 0.16632662974837267,
+      "grad_norm": 0.23310376703739166,
+      "learning_rate": 0.0001671467497080576,
+      "loss": 1.0789,
+      "step": 428
+    },
+    {
+      "epoch": 0.1667152433692801,
+      "grad_norm": 0.21475404500961304,
+      "learning_rate": 0.00016706889840404827,
+      "loss": 1.0416,
+      "step": 429
+    },
+    {
+      "epoch": 0.1671038569901875,
+      "grad_norm": 0.21661072969436646,
+      "learning_rate": 0.00016699104710003894,
+      "loss": 1.0568,
+      "step": 430
+    },
+    {
+      "epoch": 0.16749247061109493,
+      "grad_norm": 0.20310629904270172,
+      "learning_rate": 0.0001669131957960296,
+      "loss": 0.9968,
+      "step": 431
+    },
+    {
+      "epoch": 0.16788108423200232,
+      "grad_norm": 0.2596947252750397,
+      "learning_rate": 0.00016683534449202025,
+      "loss": 1.0478,
+      "step": 432
+    },
+    {
+      "epoch": 0.16826969785290974,
+      "grad_norm": 0.22226987779140472,
+      "learning_rate": 0.0001667574931880109,
+      "loss": 1.0898,
+      "step": 433
+    },
+    {
+      "epoch": 0.16865831147381716,
+      "grad_norm": 0.22499911487102509,
+      "learning_rate": 0.00016667964188400155,
+      "loss": 1.07,
+      "step": 434
+    },
+    {
+      "epoch": 0.16904692509472458,
+      "grad_norm": 0.2717292308807373,
+      "learning_rate": 0.0001666017905799922,
+      "loss": 1.0562,
+      "step": 435
+    },
+    {
+      "epoch": 0.169435538715632,
+      "grad_norm": 0.22052323818206787,
+      "learning_rate": 0.00016652393927598288,
+      "loss": 1.0732,
+      "step": 436
+    },
+    {
+      "epoch": 0.16982415233653939,
+      "grad_norm": 0.21741728484630585,
+      "learning_rate": 0.00016644608797197354,
+      "loss": 1.0409,
+      "step": 437
+    },
+    {
+      "epoch": 0.1702127659574468,
+      "grad_norm": 0.20701193809509277,
+      "learning_rate": 0.0001663682366679642,
+      "loss": 1.0731,
+      "step": 438
+    },
+    {
+      "epoch": 0.17060137957835422,
+      "grad_norm": 0.22071130573749542,
+      "learning_rate": 0.00016629038536395484,
+      "loss": 1.0992,
+      "step": 439
+    },
+    {
+      "epoch": 0.17098999319926164,
+      "grad_norm": 0.20261412858963013,
+      "learning_rate": 0.0001662125340599455,
+      "loss": 1.0051,
+      "step": 440
+    },
+    {
+      "epoch": 0.17137860682016906,
+      "grad_norm": 0.2082947939634323,
+      "learning_rate": 0.00016613468275593617,
+      "loss": 1.0477,
+      "step": 441
+    },
+    {
+      "epoch": 0.17176722044107645,
+      "grad_norm": 0.22534717619419098,
+      "learning_rate": 0.00016605683145192682,
+      "loss": 1.041,
+      "step": 442
+    },
+    {
+      "epoch": 0.17215583406198387,
+      "grad_norm": 0.21547731757164001,
+      "learning_rate": 0.00016597898014791748,
+      "loss": 1.0528,
+      "step": 443
+    },
+    {
+      "epoch": 0.1725444476828913,
+      "grad_norm": 0.24141089618206024,
+      "learning_rate": 0.00016590112884390813,
+      "loss": 1.0928,
+      "step": 444
+    },
+    {
+      "epoch": 0.1729330613037987,
+      "grad_norm": 0.21910884976387024,
+      "learning_rate": 0.00016582327753989878,
+      "loss": 1.063,
+      "step": 445
+    },
+    {
+      "epoch": 0.1733216749247061,
+      "grad_norm": 0.21782316267490387,
+      "learning_rate": 0.00016574542623588946,
+      "loss": 1.0976,
+      "step": 446
+    },
+    {
+      "epoch": 0.17371028854561352,
+      "grad_norm": 0.21771778166294098,
+      "learning_rate": 0.0001656675749318801,
+      "loss": 1.0677,
+      "step": 447
+    },
+    {
+      "epoch": 0.17409890216652094,
+      "grad_norm": 0.22117659449577332,
+      "learning_rate": 0.00016558972362787076,
+      "loss": 1.0669,
+      "step": 448
+    },
+    {
+      "epoch": 0.17448751578742835,
+      "grad_norm": 0.21918092668056488,
+      "learning_rate": 0.00016551187232386141,
+      "loss": 1.0955,
+      "step": 449
+    },
+    {
+      "epoch": 0.17487612940833577,
+      "grad_norm": 0.22027818858623505,
+      "learning_rate": 0.0001654340210198521,
+      "loss": 1.0201,
+      "step": 450
+    },
+    {
+      "epoch": 0.17526474302924316,
+      "grad_norm": 0.2042885720729828,
+      "learning_rate": 0.00016535616971584275,
+      "loss": 1.0881,
+      "step": 451
+    },
+    {
+      "epoch": 0.17565335665015058,
+      "grad_norm": 0.21788261830806732,
+      "learning_rate": 0.0001652783184118334,
+      "loss": 1.0918,
+      "step": 452
+    },
+    {
+      "epoch": 0.176041970271058,
+      "grad_norm": 0.23332571983337402,
+      "learning_rate": 0.00016520046710782408,
+      "loss": 1.091,
+      "step": 453
+    },
+    {
+      "epoch": 0.17643058389196542,
+      "grad_norm": 0.20204192399978638,
+      "learning_rate": 0.00016512261580381473,
+      "loss": 1.0366,
+      "step": 454
+    },
+    {
+      "epoch": 0.17681919751287284,
+      "grad_norm": 0.21761906147003174,
+      "learning_rate": 0.00016504476449980538,
+      "loss": 1.0131,
+      "step": 455
+    },
+    {
+      "epoch": 0.17720781113378023,
+      "grad_norm": 0.2152051478624344,
+      "learning_rate": 0.00016496691319579606,
+      "loss": 1.0868,
+      "step": 456
+    },
+    {
+      "epoch": 0.17759642475468765,
+      "grad_norm": 0.22776494920253754,
+      "learning_rate": 0.0001648890618917867,
+      "loss": 1.0807,
+      "step": 457
+    },
+    {
+      "epoch": 0.17798503837559507,
+      "grad_norm": 0.2171342968940735,
+      "learning_rate": 0.00016481121058777736,
+      "loss": 1.0537,
+      "step": 458
+    },
+    {
+      "epoch": 0.17837365199650249,
+      "grad_norm": 0.2046273946762085,
+      "learning_rate": 0.00016473335928376802,
+      "loss": 1.0097,
+      "step": 459
+    },
+    {
+      "epoch": 0.17876226561740988,
+      "grad_norm": 0.2047681361436844,
+      "learning_rate": 0.00016465550797975867,
+      "loss": 1.0204,
+      "step": 460
+    },
+    {
+      "epoch": 0.1791508792383173,
+      "grad_norm": 0.1876862645149231,
+      "learning_rate": 0.00016457765667574935,
+      "loss": 0.9383,
+      "step": 461
+    },
+    {
+      "epoch": 0.17953949285922471,
+      "grad_norm": 0.218430757522583,
+      "learning_rate": 0.00016449980537174,
+      "loss": 1.0721,
+      "step": 462
+    },
+    {
+      "epoch": 0.17992810648013213,
+      "grad_norm": 0.2245480865240097,
+      "learning_rate": 0.00016442195406773065,
+      "loss": 1.0859,
+      "step": 463
+    },
+    {
+      "epoch": 0.18031672010103955,
+      "grad_norm": 0.22577151656150818,
+      "learning_rate": 0.0001643441027637213,
+      "loss": 1.0825,
+      "step": 464
+    },
+    {
+      "epoch": 0.18070533372194694,
+      "grad_norm": 0.20132745802402496,
+      "learning_rate": 0.00016426625145971196,
+      "loss": 1.0615,
+      "step": 465
+    },
+    {
+      "epoch": 0.18109394734285436,
+      "grad_norm": 0.2277505248785019,
+      "learning_rate": 0.00016418840015570263,
+      "loss": 1.0426,
+      "step": 466
+    },
+    {
+      "epoch": 0.18148256096376178,
+      "grad_norm": 0.22540105879306793,
+      "learning_rate": 0.0001641105488516933,
+      "loss": 1.0481,
+      "step": 467
+    },
+    {
+      "epoch": 0.1818711745846692,
+      "grad_norm": 0.20358088612556458,
+      "learning_rate": 0.00016403269754768394,
+      "loss": 1.0286,
+      "step": 468
+    },
+    {
+      "epoch": 0.18225978820557662,
+      "grad_norm": 0.22534145414829254,
+      "learning_rate": 0.0001639548462436746,
+      "loss": 1.1183,
+      "step": 469
+    },
+    {
+      "epoch": 0.182648401826484,
+      "grad_norm": 0.2188873142004013,
+      "learning_rate": 0.00016387699493966524,
+      "loss": 1.0439,
+      "step": 470
+    },
+    {
+      "epoch": 0.18303701544739143,
+      "grad_norm": 0.2128048539161682,
+      "learning_rate": 0.00016379914363565592,
+      "loss": 1.027,
+      "step": 471
+    },
+    {
+      "epoch": 0.18342562906829885,
+      "grad_norm": 0.2518141567707062,
+      "learning_rate": 0.00016372129233164657,
+      "loss": 1.0468,
+      "step": 472
+    },
+    {
+      "epoch": 0.18381424268920626,
+      "grad_norm": 0.2189142256975174,
+      "learning_rate": 0.00016364344102763723,
+      "loss": 1.0581,
+      "step": 473
+    },
+    {
+      "epoch": 0.18420285631011368,
+      "grad_norm": 0.31266725063323975,
+      "learning_rate": 0.00016356558972362788,
+      "loss": 1.0554,
+      "step": 474
+    },
+    {
+      "epoch": 0.18459146993102107,
+      "grad_norm": 0.21343916654586792,
+      "learning_rate": 0.00016348773841961853,
+      "loss": 1.0795,
+      "step": 475
+    },
+    {
+      "epoch": 0.1849800835519285,
+      "grad_norm": 0.22907280921936035,
+      "learning_rate": 0.00016340988711560918,
+      "loss": 1.0304,
+      "step": 476
+    },
+    {
+      "epoch": 0.1853686971728359,
+      "grad_norm": 0.2105257511138916,
+      "learning_rate": 0.00016333203581159986,
+      "loss": 1.0231,
+      "step": 477
+    },
+    {
+      "epoch": 0.18575731079374333,
+      "grad_norm": 0.19537831842899323,
+      "learning_rate": 0.00016325418450759051,
+      "loss": 1.0103,
+      "step": 478
+    },
+    {
+      "epoch": 0.18614592441465072,
+      "grad_norm": 0.20522372424602509,
+      "learning_rate": 0.00016317633320358117,
+      "loss": 1.0196,
+      "step": 479
+    },
+    {
+      "epoch": 0.18653453803555814,
+      "grad_norm": 0.21646477282047272,
+      "learning_rate": 0.00016309848189957182,
+      "loss": 1.0579,
+      "step": 480
+    },
+    {
+      "epoch": 0.18692315165646556,
+      "grad_norm": 0.21077193319797516,
+      "learning_rate": 0.00016302063059556247,
+      "loss": 1.0638,
+      "step": 481
+    },
+    {
+      "epoch": 0.18731176527737298,
+      "grad_norm": 0.20357473194599152,
+      "learning_rate": 0.00016294277929155315,
+      "loss": 1.0635,
+      "step": 482
+    },
+    {
+      "epoch": 0.1877003788982804,
+      "grad_norm": 0.2188001275062561,
+      "learning_rate": 0.0001628649279875438,
+      "loss": 1.0267,
+      "step": 483
+    },
+    {
+      "epoch": 0.1880889925191878,
+      "grad_norm": 0.2128928154706955,
+      "learning_rate": 0.00016278707668353445,
+      "loss": 0.9706,
+      "step": 484
+    },
+    {
+      "epoch": 0.1884776061400952,
+      "grad_norm": 0.22081372141838074,
+      "learning_rate": 0.0001627092253795251,
+      "loss": 1.08,
+      "step": 485
+    },
+    {
+      "epoch": 0.18886621976100262,
+      "grad_norm": 0.2250615805387497,
+      "learning_rate": 0.00016263137407551576,
+      "loss": 1.1451,
+      "step": 486
+    },
+    {
+      "epoch": 0.18925483338191004,
+      "grad_norm": 0.1984967589378357,
+      "learning_rate": 0.00016255352277150644,
+      "loss": 1.0744,
+      "step": 487
+    },
+    {
+      "epoch": 0.18964344700281746,
+      "grad_norm": 0.20778900384902954,
+      "learning_rate": 0.0001624756714674971,
+      "loss": 1.0623,
+      "step": 488
+    },
+    {
+      "epoch": 0.19003206062372485,
+      "grad_norm": 0.2026563137769699,
+      "learning_rate": 0.00016239782016348774,
+      "loss": 1.0714,
+      "step": 489
+    },
+    {
+      "epoch": 0.19042067424463227,
+      "grad_norm": 0.21598374843597412,
+      "learning_rate": 0.0001623199688594784,
+      "loss": 1.0869,
+      "step": 490
+    },
+    {
+      "epoch": 0.1908092878655397,
+      "grad_norm": 0.18944978713989258,
+      "learning_rate": 0.00016224211755546904,
+      "loss": 1.055,
+      "step": 491
+    },
+    {
+      "epoch": 0.1911979014864471,
+      "grad_norm": 0.20698946714401245,
+      "learning_rate": 0.00016216426625145972,
+      "loss": 1.0392,
+      "step": 492
+    },
+    {
+      "epoch": 0.1915865151073545,
+      "grad_norm": 0.22395353019237518,
+      "learning_rate": 0.00016208641494745038,
+      "loss": 1.0681,
+      "step": 493
+    },
+    {
+      "epoch": 0.19197512872826192,
+      "grad_norm": 0.22372962534427643,
+      "learning_rate": 0.00016200856364344103,
+      "loss": 1.0767,
+      "step": 494
+    },
+    {
+      "epoch": 0.19236374234916934,
+      "grad_norm": 0.2066701054573059,
+      "learning_rate": 0.00016193071233943168,
+      "loss": 1.0061,
+      "step": 495
+    },
+    {
+      "epoch": 0.19275235597007676,
+      "grad_norm": 0.19716408848762512,
+      "learning_rate": 0.00016185286103542233,
+      "loss": 1.039,
+      "step": 496
+    },
+    {
+      "epoch": 0.19314096959098417,
+      "grad_norm": 0.22159601747989655,
+      "learning_rate": 0.000161775009731413,
+      "loss": 1.0832,
+      "step": 497
+    },
+    {
+      "epoch": 0.19352958321189156,
+      "grad_norm": 0.21509626507759094,
+      "learning_rate": 0.00016169715842740366,
+      "loss": 1.0264,
+      "step": 498
+    },
+    {
+      "epoch": 0.19391819683279898,
+      "grad_norm": 0.21598199009895325,
+      "learning_rate": 0.00016161930712339431,
+      "loss": 1.049,
+      "step": 499
+    },
+    {
+      "epoch": 0.1943068104537064,
+      "grad_norm": 0.20279590785503387,
+      "learning_rate": 0.00016154145581938497,
+      "loss": 1.0505,
+      "step": 500
+    },
+    {
+      "epoch": 0.19469542407461382,
+      "grad_norm": 0.21796855330467224,
+      "learning_rate": 0.00016146360451537565,
+      "loss": 1.0885,
+      "step": 501
+    },
+    {
+      "epoch": 0.19508403769552124,
+      "grad_norm": 0.22128933668136597,
+      "learning_rate": 0.0001613857532113663,
+      "loss": 1.0903,
+      "step": 502
+    },
+    {
+      "epoch": 0.19547265131642863,
+      "grad_norm": 0.2032536417245865,
+      "learning_rate": 0.00016130790190735695,
+      "loss": 1.0285,
+      "step": 503
+    },
+    {
+      "epoch": 0.19586126493733605,
+      "grad_norm": 0.23738974332809448,
+      "learning_rate": 0.0001612300506033476,
+      "loss": 1.1188,
+      "step": 504
+    },
+    {
+      "epoch": 0.19624987855824347,
+      "grad_norm": 0.19614790380001068,
+      "learning_rate": 0.00016115219929933828,
+      "loss": 1.04,
+      "step": 505
+    },
+    {
+      "epoch": 0.1966384921791509,
+      "grad_norm": 0.2198178917169571,
+      "learning_rate": 0.00016107434799532893,
+      "loss": 1.0696,
+      "step": 506
+    },
+    {
+      "epoch": 0.1970271058000583,
+      "grad_norm": 0.18814648687839508,
+      "learning_rate": 0.00016099649669131959,
+      "loss": 1.0203,
+      "step": 507
+    },
+    {
+      "epoch": 0.1974157194209657,
+      "grad_norm": 0.20699037611484528,
+      "learning_rate": 0.00016091864538731026,
+      "loss": 1.1074,
+      "step": 508
+    },
+    {
+      "epoch": 0.19780433304187311,
+      "grad_norm": 0.21490445733070374,
+      "learning_rate": 0.00016084079408330092,
+      "loss": 1.0682,
+      "step": 509
+    },
+    {
+      "epoch": 0.19819294666278053,
+      "grad_norm": 0.2363848090171814,
+      "learning_rate": 0.00016076294277929157,
+      "loss": 1.0408,
+      "step": 510
+    },
+    {
+      "epoch": 0.19858156028368795,
+      "grad_norm": 0.20186659693717957,
+      "learning_rate": 0.00016068509147528222,
+      "loss": 1.026,
+      "step": 511
+    },
+    {
+      "epoch": 0.19897017390459534,
+      "grad_norm": 0.21564024686813354,
+      "learning_rate": 0.00016060724017127287,
+      "loss": 1.0418,
+      "step": 512
+    },
+    {
+      "epoch": 0.19935878752550276,
+      "grad_norm": 0.19151560962200165,
+      "learning_rate": 0.00016052938886726355,
+      "loss": 1.0037,
+      "step": 513
+    },
+    {
+      "epoch": 0.19974740114641018,
+      "grad_norm": 0.21038194000720978,
+      "learning_rate": 0.0001604515375632542,
+      "loss": 1.0545,
+      "step": 514
+    },
+    {
+      "epoch": 0.2001360147673176,
+      "grad_norm": 0.20496582984924316,
+      "learning_rate": 0.00016037368625924486,
+      "loss": 1.0543,
+      "step": 515
+    },
+    {
+      "epoch": 0.20052462838822502,
+      "grad_norm": 0.20689113438129425,
+      "learning_rate": 0.0001602958349552355,
+      "loss": 1.0905,
+      "step": 516
+    },
+    {
+      "epoch": 0.2009132420091324,
+      "grad_norm": 0.2284041792154312,
+      "learning_rate": 0.00016021798365122616,
+      "loss": 1.0717,
+      "step": 517
+    },
+    {
+      "epoch": 0.20130185563003983,
+      "grad_norm": 0.23457761108875275,
+      "learning_rate": 0.00016014013234721684,
+      "loss": 1.106,
+      "step": 518
+    },
+    {
+      "epoch": 0.20169046925094725,
+      "grad_norm": 0.2088528722524643,
+      "learning_rate": 0.0001600622810432075,
+      "loss": 1.0428,
+      "step": 519
+    },
+    {
+      "epoch": 0.20207908287185467,
+      "grad_norm": 0.2170068770647049,
+      "learning_rate": 0.00015998442973919814,
+      "loss": 0.9875,
+      "step": 520
+    },
+    {
+      "epoch": 0.20246769649276208,
+      "grad_norm": 0.2270561158657074,
+      "learning_rate": 0.0001599065784351888,
+      "loss": 1.0676,
+      "step": 521
+    },
+    {
+      "epoch": 0.20285631011366947,
+      "grad_norm": 0.2151324599981308,
+      "learning_rate": 0.00015982872713117945,
+      "loss": 1.0675,
+      "step": 522
+    },
+    {
+      "epoch": 0.2032449237345769,
+      "grad_norm": 0.23113249242305756,
+      "learning_rate": 0.00015975087582717013,
+      "loss": 1.0608,
+      "step": 523
+    },
+    {
+      "epoch": 0.2036335373554843,
+      "grad_norm": 0.2587106227874756,
+      "learning_rate": 0.00015967302452316078,
+      "loss": 1.0867,
+      "step": 524
+    },
+    {
+      "epoch": 0.20402215097639173,
+      "grad_norm": 0.21842992305755615,
+      "learning_rate": 0.00015959517321915143,
+      "loss": 1.0726,
+      "step": 525
+    },
+    {
+      "epoch": 0.20441076459729912,
+      "grad_norm": 0.20867805182933807,
+      "learning_rate": 0.00015951732191514208,
+      "loss": 1.0578,
+      "step": 526
+    },
+    {
+      "epoch": 0.20479937821820654,
+      "grad_norm": 0.2396962195634842,
+      "learning_rate": 0.00015943947061113273,
+      "loss": 1.0292,
+      "step": 527
+    },
+    {
+      "epoch": 0.20518799183911396,
+      "grad_norm": 0.221155047416687,
+      "learning_rate": 0.00015936161930712341,
+      "loss": 1.0019,
+      "step": 528
+    },
+    {
+      "epoch": 0.20557660546002138,
+      "grad_norm": 0.20032119750976562,
+      "learning_rate": 0.00015928376800311407,
+      "loss": 1.0435,
+      "step": 529
+    },
+    {
+      "epoch": 0.2059652190809288,
+      "grad_norm": 0.24095888435840607,
+      "learning_rate": 0.00015920591669910472,
+      "loss": 1.0355,
+      "step": 530
+    },
+    {
+      "epoch": 0.2063538327018362,
+      "grad_norm": 0.2286604344844818,
+      "learning_rate": 0.00015912806539509537,
+      "loss": 0.9989,
+      "step": 531
+    },
+    {
+      "epoch": 0.2067424463227436,
+      "grad_norm": 0.21537137031555176,
+      "learning_rate": 0.00015905021409108602,
+      "loss": 1.0642,
+      "step": 532
+    },
+    {
+      "epoch": 0.20713105994365102,
+      "grad_norm": 0.22447925806045532,
+      "learning_rate": 0.0001589723627870767,
+      "loss": 1.1244,
+      "step": 533
+    },
+    {
+      "epoch": 0.20751967356455844,
+      "grad_norm": 0.21077273786067963,
+      "learning_rate": 0.00015889451148306735,
+      "loss": 1.0167,
+      "step": 534
+    },
+    {
+      "epoch": 0.20790828718546586,
+      "grad_norm": 0.22340558469295502,
+      "learning_rate": 0.000158816660179058,
+      "loss": 1.0991,
+      "step": 535
+    },
+    {
+      "epoch": 0.20829690080637325,
+      "grad_norm": 0.223599374294281,
+      "learning_rate": 0.00015873880887504866,
+      "loss": 1.086,
+      "step": 536
+    },
+    {
+      "epoch": 0.20868551442728067,
+      "grad_norm": 0.2615208923816681,
+      "learning_rate": 0.0001586609575710393,
+      "loss": 1.0584,
+      "step": 537
+    },
+    {
+      "epoch": 0.2090741280481881,
+      "grad_norm": 0.2085907757282257,
+      "learning_rate": 0.00015858310626703,
+      "loss": 1.0994,
+      "step": 538
+    },
+    {
+      "epoch": 0.2094627416690955,
+      "grad_norm": 0.2170211672782898,
+      "learning_rate": 0.00015850525496302064,
+      "loss": 1.1105,
+      "step": 539
+    },
+    {
+      "epoch": 0.20985135529000293,
+      "grad_norm": 0.21978625655174255,
+      "learning_rate": 0.0001584274036590113,
+      "loss": 1.002,
+      "step": 540
+    },
+    {
+      "epoch": 0.21023996891091032,
+      "grad_norm": 0.23684021830558777,
+      "learning_rate": 0.00015834955235500194,
+      "loss": 1.1216,
+      "step": 541
+    },
+    {
+      "epoch": 0.21062858253181774,
+      "grad_norm": 0.220269113779068,
+      "learning_rate": 0.0001582717010509926,
+      "loss": 1.0773,
+      "step": 542
+    },
+    {
+      "epoch": 0.21101719615272516,
+      "grad_norm": 0.22447973489761353,
+      "learning_rate": 0.00015819384974698328,
+      "loss": 1.0941,
+      "step": 543
+    },
+    {
+      "epoch": 0.21140580977363257,
+      "grad_norm": 0.22435730695724487,
+      "learning_rate": 0.00015811599844297393,
+      "loss": 1.0138,
+      "step": 544
+    },
+    {
+      "epoch": 0.21179442339453997,
+      "grad_norm": 0.2230793684720993,
+      "learning_rate": 0.00015803814713896458,
+      "loss": 1.0343,
+      "step": 545
+    },
+    {
+      "epoch": 0.21218303701544738,
+      "grad_norm": 0.23491905629634857,
+      "learning_rate": 0.00015796029583495523,
+      "loss": 1.11,
+      "step": 546
+    },
+    {
+      "epoch": 0.2125716506363548,
+      "grad_norm": 0.213560551404953,
+      "learning_rate": 0.00015788244453094588,
+      "loss": 1.0615,
+      "step": 547
+    },
+    {
+      "epoch": 0.21296026425726222,
+      "grad_norm": 0.21392837166786194,
+      "learning_rate": 0.00015780459322693654,
+      "loss": 1.0872,
+      "step": 548
+    },
+    {
+      "epoch": 0.21334887787816964,
+      "grad_norm": 0.20007692277431488,
+      "learning_rate": 0.00015772674192292722,
+      "loss": 1.0394,
+      "step": 549
+    },
+    {
+      "epoch": 0.21373749149907703,
+      "grad_norm": 0.1969841718673706,
+      "learning_rate": 0.00015764889061891787,
+      "loss": 1.0381,
+      "step": 550
+    },
+    {
+      "epoch": 0.21412610511998445,
+      "grad_norm": 0.21874025464057922,
+      "learning_rate": 0.00015757103931490852,
+      "loss": 1.0822,
+      "step": 551
+    },
+    {
+      "epoch": 0.21451471874089187,
+      "grad_norm": 0.21824273467063904,
+      "learning_rate": 0.00015749318801089917,
+      "loss": 1.0802,
+      "step": 552
+    },
+    {
+      "epoch": 0.2149033323617993,
+      "grad_norm": 0.20942047238349915,
+      "learning_rate": 0.00015741533670688985,
+      "loss": 1.0634,
+      "step": 553
+    },
+    {
+      "epoch": 0.2152919459827067,
+      "grad_norm": 0.1940152943134308,
+      "learning_rate": 0.0001573374854028805,
+      "loss": 1.0264,
+      "step": 554
+    },
+    {
+      "epoch": 0.2156805596036141,
+      "grad_norm": 0.19859059154987335,
+      "learning_rate": 0.00015725963409887115,
+      "loss": 0.9701,
+      "step": 555
+    },
+    {
+      "epoch": 0.21606917322452152,
+      "grad_norm": 0.22239404916763306,
+      "learning_rate": 0.0001571817827948618,
+      "loss": 1.1282,
+      "step": 556
+    },
+    {
+      "epoch": 0.21645778684542893,
+      "grad_norm": 0.23820599913597107,
+      "learning_rate": 0.00015710393149085249,
+      "loss": 1.1123,
+      "step": 557
+    },
+    {
+      "epoch": 0.21684640046633635,
+      "grad_norm": 0.21279917657375336,
+      "learning_rate": 0.00015702608018684314,
+      "loss": 1.0542,
+      "step": 558
+    },
+    {
+      "epoch": 0.21723501408724374,
+      "grad_norm": 0.2065514773130417,
+      "learning_rate": 0.0001569482288828338,
+      "loss": 1.0685,
+      "step": 559
+    },
+    {
+      "epoch": 0.21762362770815116,
+      "grad_norm": 0.20130831003189087,
+      "learning_rate": 0.00015687037757882447,
+      "loss": 0.9869,
+      "step": 560
+    },
+    {
+      "epoch": 0.21801224132905858,
+      "grad_norm": 0.2187541127204895,
+      "learning_rate": 0.00015679252627481512,
+      "loss": 1.1095,
+      "step": 561
+    },
+    {
+      "epoch": 0.218400854949966,
+      "grad_norm": 0.21028277277946472,
+      "learning_rate": 0.00015671467497080577,
+      "loss": 1.0804,
+      "step": 562
+    },
+    {
+      "epoch": 0.21878946857087342,
+      "grad_norm": 0.8187636733055115,
+      "learning_rate": 0.00015663682366679643,
+      "loss": 1.0782,
+      "step": 563
+    },
+    {
+      "epoch": 0.2191780821917808,
+      "grad_norm": 0.20059974491596222,
+      "learning_rate": 0.0001565589723627871,
+      "loss": 1.0279,
+      "step": 564
+    },
+    {
+      "epoch": 0.21956669581268823,
+      "grad_norm": 0.20440839231014252,
+      "learning_rate": 0.00015648112105877776,
+      "loss": 0.9863,
+      "step": 565
+    },
+    {
+      "epoch": 0.21995530943359565,
+      "grad_norm": 0.21423624455928802,
+      "learning_rate": 0.0001564032697547684,
+      "loss": 1.0685,
+      "step": 566
+    },
+    {
+      "epoch": 0.22034392305450307,
+      "grad_norm": 0.22430062294006348,
+      "learning_rate": 0.00015632541845075906,
+      "loss": 1.0761,
+      "step": 567
+    },
+    {
+      "epoch": 0.22073253667541048,
+      "grad_norm": 0.22782258689403534,
+      "learning_rate": 0.0001562475671467497,
+      "loss": 1.1024,
+      "step": 568
+    },
+    {
+      "epoch": 0.22112115029631788,
+      "grad_norm": 0.21150320768356323,
+      "learning_rate": 0.0001561697158427404,
+      "loss": 1.0621,
+      "step": 569
+    },
+    {
+      "epoch": 0.2215097639172253,
+      "grad_norm": 0.20342351496219635,
+      "learning_rate": 0.00015609186453873104,
+      "loss": 1.0667,
+      "step": 570
+    },
+    {
+      "epoch": 0.2218983775381327,
+      "grad_norm": 0.22866711020469666,
+      "learning_rate": 0.0001560140132347217,
+      "loss": 1.0631,
+      "step": 571
+    },
+    {
+      "epoch": 0.22228699115904013,
+      "grad_norm": 0.2200063169002533,
+      "learning_rate": 0.00015593616193071235,
+      "loss": 1.0448,
+      "step": 572
+    },
+    {
+      "epoch": 0.22267560477994755,
+      "grad_norm": 0.19440248608589172,
+      "learning_rate": 0.000155858310626703,
+      "loss": 1.037,
+      "step": 573
+    },
+    {
+      "epoch": 0.22306421840085494,
+      "grad_norm": 0.205752432346344,
+      "learning_rate": 0.00015578045932269368,
+      "loss": 1.0465,
+      "step": 574
+    },
+    {
+      "epoch": 0.22345283202176236,
+      "grad_norm": 0.22247998416423798,
+      "learning_rate": 0.00015570260801868433,
+      "loss": 0.997,
+      "step": 575
+    },
+    {
+      "epoch": 0.22384144564266978,
+      "grad_norm": 0.22199274599552155,
+      "learning_rate": 0.00015562475671467498,
+      "loss": 1.0178,
+      "step": 576
+    },
+    {
+      "epoch": 0.2242300592635772,
+      "grad_norm": 0.2114989310503006,
+      "learning_rate": 0.00015554690541066564,
+      "loss": 1.0457,
+      "step": 577
+    },
+    {
+      "epoch": 0.2246186728844846,
+      "grad_norm": 0.24248506128787994,
+      "learning_rate": 0.0001554690541066563,
+      "loss": 1.002,
+      "step": 578
+    },
+    {
+      "epoch": 0.225007286505392,
+      "grad_norm": 0.2565505802631378,
+      "learning_rate": 0.00015539120280264697,
+      "loss": 1.0541,
+      "step": 579
+    },
+    {
+      "epoch": 0.22539590012629943,
+      "grad_norm": 0.22799409925937653,
+      "learning_rate": 0.00015531335149863762,
+      "loss": 1.0788,
+      "step": 580
+    },
+    {
+      "epoch": 0.22578451374720684,
+      "grad_norm": 0.2196080982685089,
+      "learning_rate": 0.00015523550019462827,
+      "loss": 1.0877,
+      "step": 581
+    },
+    {
+      "epoch": 0.22617312736811426,
+      "grad_norm": 0.21992824971675873,
+      "learning_rate": 0.00015515764889061892,
+      "loss": 1.0213,
+      "step": 582
+    },
+    {
+      "epoch": 0.22656174098902165,
+      "grad_norm": 0.22793298959732056,
+      "learning_rate": 0.00015507979758660957,
+      "loss": 1.0633,
+      "step": 583
+    },
+    {
+      "epoch": 0.22695035460992907,
+      "grad_norm": 0.21707972884178162,
+      "learning_rate": 0.00015500194628260023,
+      "loss": 1.081,
+      "step": 584
+    },
+    {
+      "epoch": 0.2273389682308365,
+      "grad_norm": 0.220685675740242,
+      "learning_rate": 0.0001549240949785909,
+      "loss": 1.0658,
+      "step": 585
+    },
+    {
+      "epoch": 0.2277275818517439,
+      "grad_norm": 0.22576668858528137,
+      "learning_rate": 0.00015484624367458156,
+      "loss": 1.0795,
+      "step": 586
+    },
+    {
+      "epoch": 0.22811619547265133,
+      "grad_norm": 0.21778982877731323,
+      "learning_rate": 0.0001547683923705722,
+      "loss": 1.033,
+      "step": 587
+    },
+    {
+      "epoch": 0.22850480909355872,
+      "grad_norm": 0.22748610377311707,
+      "learning_rate": 0.00015469054106656286,
+      "loss": 1.0948,
+      "step": 588
+    },
+    {
+      "epoch": 0.22889342271446614,
+      "grad_norm": 0.21561284363269806,
+      "learning_rate": 0.00015461268976255351,
+      "loss": 1.0022,
+      "step": 589
+    },
+    {
+      "epoch": 0.22928203633537356,
+      "grad_norm": 0.2419756054878235,
+      "learning_rate": 0.0001545348384585442,
+      "loss": 1.0786,
+      "step": 590
+    },
+    {
+      "epoch": 0.22967064995628098,
+      "grad_norm": 0.20479315519332886,
+      "learning_rate": 0.00015445698715453485,
+      "loss": 1.027,
+      "step": 591
+    },
+    {
+      "epoch": 0.2300592635771884,
+      "grad_norm": 0.21365883946418762,
+      "learning_rate": 0.0001543791358505255,
+      "loss": 1.0773,
+      "step": 592
+    },
+    {
+      "epoch": 0.23044787719809579,
+      "grad_norm": 0.23133166134357452,
+      "learning_rate": 0.00015430128454651615,
+      "loss": 1.0877,
+      "step": 593
+    },
+    {
+      "epoch": 0.2308364908190032,
+      "grad_norm": 0.2110515981912613,
+      "learning_rate": 0.0001542234332425068,
+      "loss": 1.0509,
+      "step": 594
+    },
+    {
+      "epoch": 0.23122510443991062,
+      "grad_norm": 0.20658442378044128,
+      "learning_rate": 0.00015414558193849748,
+      "loss": 1.0623,
+      "step": 595
+    },
+    {
+      "epoch": 0.23161371806081804,
+      "grad_norm": 0.21831996738910675,
+      "learning_rate": 0.00015406773063448813,
+      "loss": 1.021,
+      "step": 596
+    },
+    {
+      "epoch": 0.23200233168172543,
+      "grad_norm": 0.23015642166137695,
+      "learning_rate": 0.00015398987933047878,
+      "loss": 1.0358,
+      "step": 597
+    },
+    {
+      "epoch": 0.23239094530263285,
+      "grad_norm": 0.23071645200252533,
+      "learning_rate": 0.00015391202802646944,
+      "loss": 1.1255,
+      "step": 598
+    },
+    {
+      "epoch": 0.23277955892354027,
+      "grad_norm": 0.19513486325740814,
+      "learning_rate": 0.0001538341767224601,
+      "loss": 1.0189,
+      "step": 599
+    },
+    {
+      "epoch": 0.2331681725444477,
+      "grad_norm": 0.20821452140808105,
+      "learning_rate": 0.00015375632541845077,
+      "loss": 1.0843,
+      "step": 600
+    },
+    {
+      "epoch": 0.2335567861653551,
+      "grad_norm": 0.20563223958015442,
+      "learning_rate": 0.00015367847411444142,
+      "loss": 1.0012,
+      "step": 601
+    },
+    {
+      "epoch": 0.2339453997862625,
+      "grad_norm": 0.22674202919006348,
+      "learning_rate": 0.00015360062281043207,
+      "loss": 1.0371,
+      "step": 602
+    },
+    {
+      "epoch": 0.23433401340716992,
+      "grad_norm": 0.20744135975837708,
+      "learning_rate": 0.00015352277150642272,
+      "loss": 1.0466,
+      "step": 603
+    },
+    {
+      "epoch": 0.23472262702807734,
+      "grad_norm": 0.22103577852249146,
+      "learning_rate": 0.00015344492020241338,
+      "loss": 1.0942,
+      "step": 604
+    },
+    {
+      "epoch": 0.23511124064898475,
+      "grad_norm": 0.20643098652362823,
+      "learning_rate": 0.00015336706889840406,
+      "loss": 1.0682,
+      "step": 605
+    },
+    {
+      "epoch": 0.23549985426989217,
+      "grad_norm": 0.23436777293682098,
+      "learning_rate": 0.0001532892175943947,
+      "loss": 1.0613,
+      "step": 606
+    },
+    {
+      "epoch": 0.23588846789079956,
+      "grad_norm": 0.21898899972438812,
+      "learning_rate": 0.00015321136629038536,
+      "loss": 1.0571,
+      "step": 607
+    },
+    {
+      "epoch": 0.23627708151170698,
+      "grad_norm": 0.20569247007369995,
+      "learning_rate": 0.00015313351498637604,
+      "loss": 1.061,
+      "step": 608
+    },
+    {
+      "epoch": 0.2366656951326144,
+      "grad_norm": 0.2099207490682602,
+      "learning_rate": 0.0001530556636823667,
+      "loss": 1.0776,
+      "step": 609
+    },
+    {
+      "epoch": 0.23705430875352182,
+      "grad_norm": 0.20078738033771515,
+      "learning_rate": 0.00015297781237835734,
+      "loss": 1.0341,
+      "step": 610
+    },
+    {
+      "epoch": 0.2374429223744292,
+      "grad_norm": 0.20327065885066986,
+      "learning_rate": 0.000152899961074348,
+      "loss": 1.0168,
+      "step": 611
+    },
+    {
+      "epoch": 0.23783153599533663,
+      "grad_norm": 0.21741214394569397,
+      "learning_rate": 0.00015282210977033867,
+      "loss": 1.0726,
+      "step": 612
+    },
+    {
+      "epoch": 0.23822014961624405,
+      "grad_norm": 0.2065727263689041,
+      "learning_rate": 0.00015274425846632933,
+      "loss": 1.0474,
+      "step": 613
+    },
+    {
+      "epoch": 0.23860876323715147,
+      "grad_norm": 0.21241194009780884,
+      "learning_rate": 0.00015266640716231998,
+      "loss": 1.0666,
+      "step": 614
+    },
+    {
+      "epoch": 0.23899737685805889,
+      "grad_norm": 0.2194201797246933,
+      "learning_rate": 0.00015258855585831066,
+      "loss": 1.1411,
+      "step": 615
+    },
+    {
+      "epoch": 0.23938599047896628,
+      "grad_norm": 0.21537193655967712,
+      "learning_rate": 0.0001525107045543013,
+      "loss": 1.081,
+      "step": 616
+    },
+    {
+      "epoch": 0.2397746040998737,
+      "grad_norm": 0.21125951409339905,
+      "learning_rate": 0.00015243285325029196,
+      "loss": 1.0679,
+      "step": 617
+    },
+    {
+      "epoch": 0.2401632177207811,
+      "grad_norm": 0.21342721581459045,
+      "learning_rate": 0.0001523550019462826,
+      "loss": 1.0564,
+      "step": 618
+    },
+    {
+      "epoch": 0.24055183134168853,
+      "grad_norm": 0.2223503291606903,
+      "learning_rate": 0.00015227715064227327,
+      "loss": 1.1163,
+      "step": 619
+    },
+    {
+      "epoch": 0.24094044496259595,
+      "grad_norm": 0.21626527607440948,
+      "learning_rate": 0.00015219929933826394,
+      "loss": 1.0793,
+      "step": 620
+    },
+    {
+      "epoch": 0.24132905858350334,
+      "grad_norm": 0.21899500489234924,
+      "learning_rate": 0.0001521214480342546,
+      "loss": 1.0864,
+      "step": 621
+    },
+    {
+      "epoch": 0.24171767220441076,
+      "grad_norm": 0.2499915212392807,
+      "learning_rate": 0.00015204359673024525,
+      "loss": 1.1381,
+      "step": 622
+    },
+    {
+      "epoch": 0.24210628582531818,
+      "grad_norm": 0.2108345925807953,
+      "learning_rate": 0.0001519657454262359,
+      "loss": 1.0534,
+      "step": 623
+    },
+    {
+      "epoch": 0.2424948994462256,
+      "grad_norm": 0.2224910855293274,
+      "learning_rate": 0.00015188789412222655,
+      "loss": 1.0235,
+      "step": 624
+    },
+    {
+      "epoch": 0.24288351306713302,
+      "grad_norm": 0.22163094580173492,
+      "learning_rate": 0.0001518100428182172,
+      "loss": 1.0143,
+      "step": 625
+    },
+    {
+      "epoch": 0.2432721266880404,
+      "grad_norm": 0.20709283649921417,
+      "learning_rate": 0.00015173219151420788,
+      "loss": 1.0506,
+      "step": 626
+    },
+    {
+      "epoch": 0.24366074030894783,
+      "grad_norm": 0.2112802267074585,
+      "learning_rate": 0.00015165434021019854,
+      "loss": 1.0692,
+      "step": 627
+    },
+    {
+      "epoch": 0.24404935392985525,
+      "grad_norm": 0.23622830212116241,
+      "learning_rate": 0.0001515764889061892,
+      "loss": 1.0769,
+      "step": 628
+    },
+    {
+      "epoch": 0.24443796755076266,
+      "grad_norm": 0.23328271508216858,
+      "learning_rate": 0.00015149863760217984,
+      "loss": 1.1158,
+      "step": 629
+    },
+    {
+      "epoch": 0.24482658117167005,
+      "grad_norm": 0.2071760892868042,
+      "learning_rate": 0.0001514207862981705,
+      "loss": 1.0133,
+      "step": 630
+    },
+    {
+      "epoch": 0.24521519479257747,
+      "grad_norm": 0.21428920328617096,
+      "learning_rate": 0.00015134293499416117,
+      "loss": 1.0342,
+      "step": 631
+    },
+    {
+      "epoch": 0.2456038084134849,
+      "grad_norm": 0.22225375473499298,
+      "learning_rate": 0.00015126508369015182,
+      "loss": 1.1054,
+      "step": 632
+    },
+    {
+      "epoch": 0.2459924220343923,
+      "grad_norm": 0.2096671611070633,
+      "learning_rate": 0.00015118723238614248,
+      "loss": 1.0229,
+      "step": 633
+    },
+    {
+      "epoch": 0.24638103565529973,
+      "grad_norm": 0.21473252773284912,
+      "learning_rate": 0.00015110938108213313,
+      "loss": 1.0915,
+      "step": 634
+    },
+    {
+      "epoch": 0.24676964927620712,
+      "grad_norm": 0.2071562111377716,
+      "learning_rate": 0.00015103152977812378,
+      "loss": 1.047,
+      "step": 635
+    },
+    {
+      "epoch": 0.24715826289711454,
+      "grad_norm": 0.19868609309196472,
+      "learning_rate": 0.00015095367847411446,
+      "loss": 1.0073,
+      "step": 636
+    },
+    {
+      "epoch": 0.24754687651802196,
+      "grad_norm": 0.20937366783618927,
+      "learning_rate": 0.0001508758271701051,
+      "loss": 1.0155,
+      "step": 637
+    },
+    {
+      "epoch": 0.24793549013892938,
+      "grad_norm": 0.19225911796092987,
+      "learning_rate": 0.00015079797586609576,
+      "loss": 1.0163,
+      "step": 638
+    },
+    {
+      "epoch": 0.2483241037598368,
+      "grad_norm": 0.20427283644676208,
+      "learning_rate": 0.00015072012456208641,
+      "loss": 1.062,
+      "step": 639
+    },
+    {
+      "epoch": 0.24871271738074419,
+      "grad_norm": 0.21640253067016602,
+      "learning_rate": 0.00015064227325807707,
+      "loss": 1.025,
+      "step": 640
+    },
+    {
+      "epoch": 0.2491013310016516,
+      "grad_norm": 0.20416739583015442,
+      "learning_rate": 0.00015056442195406775,
+      "loss": 1.0635,
+      "step": 641
+    },
+    {
+      "epoch": 0.24948994462255902,
+      "grad_norm": 0.1990521252155304,
+      "learning_rate": 0.0001504865706500584,
+      "loss": 1.0757,
+      "step": 642
+    },
+    {
+      "epoch": 0.24987855824346644,
+      "grad_norm": 0.21636444330215454,
+      "learning_rate": 0.00015040871934604905,
+      "loss": 1.0441,
+      "step": 643
+    },
+    {
+      "epoch": 0.25026717186437386,
+      "grad_norm": 0.21253719925880432,
+      "learning_rate": 0.0001503308680420397,
+      "loss": 1.0574,
+      "step": 644
+    },
+    {
+      "epoch": 0.2506557854852813,
+      "grad_norm": 0.2134159356355667,
+      "learning_rate": 0.00015025301673803035,
+      "loss": 1.0396,
+      "step": 645
+    },
+    {
+      "epoch": 0.2510443991061887,
+      "grad_norm": 0.2018527239561081,
+      "learning_rate": 0.00015017516543402103,
+      "loss": 1.0606,
+      "step": 646
+    },
+    {
+      "epoch": 0.25143301272709606,
+      "grad_norm": 0.20320741832256317,
+      "learning_rate": 0.00015009731413001169,
+      "loss": 1.0093,
+      "step": 647
+    },
+    {
+      "epoch": 0.2518216263480035,
+      "grad_norm": 0.21007056534290314,
+      "learning_rate": 0.00015001946282600234,
+      "loss": 1.0284,
+      "step": 648
+    },
+    {
+      "epoch": 0.2522102399689109,
+      "grad_norm": 0.22453372180461884,
+      "learning_rate": 0.000149941611521993,
+      "loss": 1.0271,
+      "step": 649
+    },
+    {
+      "epoch": 0.2525988535898183,
+      "grad_norm": 0.19889335334300995,
+      "learning_rate": 0.00014986376021798364,
+      "loss": 1.0238,
+      "step": 650
+    },
+    {
+      "epoch": 0.25298746721072574,
+      "grad_norm": 0.19339965283870697,
+      "learning_rate": 0.00014978590891397432,
+      "loss": 1.024,
+      "step": 651
+    },
+    {
+      "epoch": 0.25337608083163315,
+      "grad_norm": 0.22362011671066284,
+      "learning_rate": 0.00014970805760996497,
+      "loss": 1.0722,
+      "step": 652
+    },
+    {
+      "epoch": 0.2537646944525406,
+      "grad_norm": 0.2110588103532791,
+      "learning_rate": 0.00014963020630595562,
+      "loss": 1.0541,
+      "step": 653
+    },
+    {
+      "epoch": 0.254153308073448,
+      "grad_norm": 0.203025683760643,
+      "learning_rate": 0.00014955235500194628,
+      "loss": 1.0335,
+      "step": 654
+    },
+    {
+      "epoch": 0.2545419216943554,
+      "grad_norm": 0.20884902775287628,
+      "learning_rate": 0.00014947450369793693,
+      "loss": 1.0507,
+      "step": 655
+    },
+    {
+      "epoch": 0.2549305353152628,
+      "grad_norm": 0.21234256029129028,
+      "learning_rate": 0.0001493966523939276,
+      "loss": 1.0372,
+      "step": 656
+    },
+    {
+      "epoch": 0.2553191489361702,
+      "grad_norm": 0.1984352171421051,
+      "learning_rate": 0.00014931880108991826,
+      "loss": 0.9979,
+      "step": 657
+    },
+    {
+      "epoch": 0.2557077625570776,
+      "grad_norm": 0.18848282098770142,
+      "learning_rate": 0.0001492409497859089,
+      "loss": 0.9973,
+      "step": 658
+    },
+    {
+      "epoch": 0.25609637617798503,
+      "grad_norm": 0.2201709896326065,
+      "learning_rate": 0.00014916309848189956,
+      "loss": 1.0386,
+      "step": 659
+    },
+    {
+      "epoch": 0.25648498979889245,
+      "grad_norm": 0.23094095289707184,
+      "learning_rate": 0.00014908524717789024,
+      "loss": 1.1205,
+      "step": 660
+    },
+    {
+      "epoch": 0.25687360341979987,
+      "grad_norm": 0.21087734401226044,
+      "learning_rate": 0.0001490073958738809,
+      "loss": 1.0231,
+      "step": 661
+    },
+    {
+      "epoch": 0.2572622170407073,
+      "grad_norm": 0.24970979988574982,
+      "learning_rate": 0.00014892954456987155,
+      "loss": 1.0421,
+      "step": 662
+    },
+    {
+      "epoch": 0.2576508306616147,
+      "grad_norm": 0.22024711966514587,
+      "learning_rate": 0.00014885169326586223,
+      "loss": 1.1033,
+      "step": 663
+    },
+    {
+      "epoch": 0.2580394442825221,
+      "grad_norm": 0.2195248156785965,
+      "learning_rate": 0.00014877384196185288,
+      "loss": 1.089,
+      "step": 664
+    },
+    {
+      "epoch": 0.25842805790342954,
+      "grad_norm": 0.20236417651176453,
+      "learning_rate": 0.00014869599065784353,
+      "loss": 1.0196,
+      "step": 665
+    },
+    {
+      "epoch": 0.2588166715243369,
+      "grad_norm": 0.21973329782485962,
+      "learning_rate": 0.00014861813935383418,
+      "loss": 1.0844,
+      "step": 666
+    },
+    {
+      "epoch": 0.2592052851452443,
+      "grad_norm": 0.2069879174232483,
+      "learning_rate": 0.00014854028804982486,
+      "loss": 1.0312,
+      "step": 667
+    },
+    {
+      "epoch": 0.25959389876615174,
+      "grad_norm": 0.2037455290555954,
+      "learning_rate": 0.00014846243674581551,
+      "loss": 1.0018,
+      "step": 668
+    },
+    {
+      "epoch": 0.25998251238705916,
+      "grad_norm": 0.24176378548145294,
+      "learning_rate": 0.00014838458544180617,
+      "loss": 1.0749,
+      "step": 669
+    },
+    {
+      "epoch": 0.2603711260079666,
+      "grad_norm": 0.2007879763841629,
+      "learning_rate": 0.00014830673413779682,
+      "loss": 1.0443,
+      "step": 670
+    },
+    {
+      "epoch": 0.260759739628874,
+      "grad_norm": 0.23503245413303375,
+      "learning_rate": 0.00014822888283378747,
+      "loss": 1.0674,
+      "step": 671
+    },
+    {
+      "epoch": 0.2611483532497814,
+      "grad_norm": 0.2166167050600052,
+      "learning_rate": 0.00014815103152977815,
+      "loss": 1.079,
+      "step": 672
+    },
+    {
+      "epoch": 0.26153696687068884,
+      "grad_norm": 0.2293982058763504,
+      "learning_rate": 0.0001480731802257688,
+      "loss": 1.0517,
+      "step": 673
+    },
+    {
+      "epoch": 0.26192558049159625,
+      "grad_norm": 0.21040330827236176,
+      "learning_rate": 0.00014799532892175945,
+      "loss": 1.0475,
+      "step": 674
+    },
+    {
+      "epoch": 0.2623141941125036,
+      "grad_norm": 0.20750463008880615,
+      "learning_rate": 0.0001479174776177501,
+      "loss": 1.025,
+      "step": 675
+    },
+    {
+      "epoch": 0.26270280773341104,
+      "grad_norm": 0.2748873233795166,
+      "learning_rate": 0.00014783962631374076,
+      "loss": 1.0212,
+      "step": 676
+    },
+    {
+      "epoch": 0.26309142135431846,
+      "grad_norm": 0.19212333858013153,
+      "learning_rate": 0.00014776177500973144,
+      "loss": 1.0049,
+      "step": 677
+    },
+    {
+      "epoch": 0.2634800349752259,
+      "grad_norm": 0.207731693983078,
+      "learning_rate": 0.0001476839237057221,
+      "loss": 1.0062,
+      "step": 678
+    },
+    {
+      "epoch": 0.2638686485961333,
+      "grad_norm": 0.2177981585264206,
+      "learning_rate": 0.00014760607240171274,
+      "loss": 1.0489,
+      "step": 679
+    },
+    {
+      "epoch": 0.2642572622170407,
+      "grad_norm": 0.23239290714263916,
+      "learning_rate": 0.0001475282210977034,
+      "loss": 1.0856,
+      "step": 680
+    },
+    {
+      "epoch": 0.26464587583794813,
+      "grad_norm": 0.2033151388168335,
+      "learning_rate": 0.00014745036979369404,
+      "loss": 1.0389,
+      "step": 681
+    },
+    {
+      "epoch": 0.26503448945885555,
+      "grad_norm": 0.20917408168315887,
+      "learning_rate": 0.00014737251848968472,
+      "loss": 1.1208,
+      "step": 682
+    },
+    {
+      "epoch": 0.26542310307976297,
+      "grad_norm": 0.22075454890727997,
+      "learning_rate": 0.00014729466718567538,
+      "loss": 1.0435,
+      "step": 683
+    },
+    {
+      "epoch": 0.26581171670067033,
+      "grad_norm": 0.23094993829727173,
+      "learning_rate": 0.00014721681588166603,
+      "loss": 1.0649,
+      "step": 684
+    },
+    {
+      "epoch": 0.26620033032157775,
+      "grad_norm": 0.21209536492824554,
+      "learning_rate": 0.00014713896457765668,
+      "loss": 1.0578,
+      "step": 685
+    },
+    {
+      "epoch": 0.26658894394248517,
+      "grad_norm": 0.21412219107151031,
+      "learning_rate": 0.00014706111327364733,
+      "loss": 1.1137,
+      "step": 686
+    },
+    {
+      "epoch": 0.2669775575633926,
+      "grad_norm": 0.21175475418567657,
+      "learning_rate": 0.000146983261969638,
+      "loss": 1.023,
+      "step": 687
+    },
+    {
+      "epoch": 0.2673661711843,
+      "grad_norm": 0.21968993544578552,
+      "learning_rate": 0.00014690541066562866,
+      "loss": 1.1183,
+      "step": 688
+    },
+    {
+      "epoch": 0.2677547848052074,
+      "grad_norm": 0.20414218306541443,
+      "learning_rate": 0.00014682755936161932,
+      "loss": 1.078,
+      "step": 689
+    },
+    {
+      "epoch": 0.26814339842611484,
+      "grad_norm": 0.18986597657203674,
+      "learning_rate": 0.00014674970805760997,
+      "loss": 1.0029,
+      "step": 690
+    },
+    {
+      "epoch": 0.26853201204702226,
+      "grad_norm": 0.21215832233428955,
+      "learning_rate": 0.00014667185675360062,
+      "loss": 1.0759,
+      "step": 691
+    },
+    {
+      "epoch": 0.2689206256679297,
+      "grad_norm": 0.2113744169473648,
+      "learning_rate": 0.0001465940054495913,
+      "loss": 1.1027,
+      "step": 692
+    },
+    {
+      "epoch": 0.2693092392888371,
+      "grad_norm": 0.22010880708694458,
+      "learning_rate": 0.00014651615414558195,
+      "loss": 1.0984,
+      "step": 693
+    },
+    {
+      "epoch": 0.26969785290974446,
+      "grad_norm": 0.203857421875,
+      "learning_rate": 0.0001464383028415726,
+      "loss": 1.0407,
+      "step": 694
+    },
+    {
+      "epoch": 0.2700864665306519,
+      "grad_norm": 0.21120867133140564,
+      "learning_rate": 0.00014636045153756325,
+      "loss": 1.0521,
+      "step": 695
+    },
+    {
+      "epoch": 0.2704750801515593,
+      "grad_norm": 0.20039112865924835,
+      "learning_rate": 0.0001462826002335539,
+      "loss": 1.0897,
+      "step": 696
+    },
+    {
+      "epoch": 0.2708636937724667,
+      "grad_norm": 0.22893202304840088,
+      "learning_rate": 0.00014620474892954456,
+      "loss": 1.0903,
+      "step": 697
+    },
+    {
+      "epoch": 0.27125230739337414,
+      "grad_norm": 0.19886267185211182,
+      "learning_rate": 0.00014612689762553524,
+      "loss": 1.0889,
+      "step": 698
+    },
+    {
+      "epoch": 0.27164092101428156,
+      "grad_norm": 0.18892349302768707,
+      "learning_rate": 0.0001460490463215259,
+      "loss": 0.981,
+      "step": 699
+    },
+    {
+      "epoch": 0.272029534635189,
+      "grad_norm": 0.20602507889270782,
+      "learning_rate": 0.00014597119501751654,
+      "loss": 1.0223,
+      "step": 700
+    },
+    {
+      "epoch": 0.2724181482560964,
+      "grad_norm": 0.21480505168437958,
+      "learning_rate": 0.0001458933437135072,
+      "loss": 1.0355,
+      "step": 701
+    },
+    {
+      "epoch": 0.2728067618770038,
+      "grad_norm": 0.21011753380298615,
+      "learning_rate": 0.00014581549240949785,
+      "loss": 1.0613,
+      "step": 702
+    },
+    {
+      "epoch": 0.2731953754979112,
+      "grad_norm": 0.19350819289684296,
+      "learning_rate": 0.00014573764110548853,
+      "loss": 1.0144,
+      "step": 703
+    },
+    {
+      "epoch": 0.2735839891188186,
+      "grad_norm": 0.207548126578331,
+      "learning_rate": 0.00014565978980147918,
+      "loss": 1.0465,
+      "step": 704
+    },
+    {
+      "epoch": 0.273972602739726,
+      "grad_norm": 0.22220565378665924,
+      "learning_rate": 0.00014558193849746983,
+      "loss": 1.1073,
+      "step": 705
+    },
+    {
+      "epoch": 0.27436121636063343,
+      "grad_norm": 0.193622425198555,
+      "learning_rate": 0.00014550408719346048,
+      "loss": 1.0357,
+      "step": 706
+    },
+    {
+      "epoch": 0.27474982998154085,
+      "grad_norm": 0.2067158818244934,
+      "learning_rate": 0.00014542623588945113,
+      "loss": 1.0502,
+      "step": 707
+    },
+    {
+      "epoch": 0.27513844360244827,
+      "grad_norm": 0.2218742072582245,
+      "learning_rate": 0.0001453483845854418,
+      "loss": 0.9934,
+      "step": 708
+    },
+    {
+      "epoch": 0.2755270572233557,
+      "grad_norm": 0.22316142916679382,
+      "learning_rate": 0.00014527053328143246,
+      "loss": 1.0707,
+      "step": 709
+    },
+    {
+      "epoch": 0.2759156708442631,
+      "grad_norm": 0.21004025638103485,
+      "learning_rate": 0.00014519268197742312,
+      "loss": 1.0543,
+      "step": 710
+    },
+    {
+      "epoch": 0.2763042844651705,
+      "grad_norm": 0.22070440649986267,
+      "learning_rate": 0.00014511483067341377,
+      "loss": 1.0467,
+      "step": 711
+    },
+    {
+      "epoch": 0.27669289808607794,
+      "grad_norm": 0.21463747322559357,
+      "learning_rate": 0.00014503697936940445,
+      "loss": 1.0793,
+      "step": 712
+    },
+    {
+      "epoch": 0.2770815117069853,
+      "grad_norm": 0.23452533781528473,
+      "learning_rate": 0.0001449591280653951,
+      "loss": 1.043,
+      "step": 713
+    },
+    {
+      "epoch": 0.2774701253278927,
+      "grad_norm": 0.2405795156955719,
+      "learning_rate": 0.00014488127676138575,
+      "loss": 1.0752,
+      "step": 714
+    },
+    {
+      "epoch": 0.27785873894880014,
+      "grad_norm": 0.21546585857868195,
+      "learning_rate": 0.00014480342545737643,
+      "loss": 1.0834,
+      "step": 715
+    },
+    {
+      "epoch": 0.27824735256970756,
+      "grad_norm": 0.22675828635692596,
+      "learning_rate": 0.00014472557415336708,
+      "loss": 1.055,
+      "step": 716
+    },
+    {
+      "epoch": 0.278635966190615,
+      "grad_norm": 0.2117871195077896,
+      "learning_rate": 0.00014464772284935774,
+      "loss": 1.03,
+      "step": 717
+    },
+    {
+      "epoch": 0.2790245798115224,
+      "grad_norm": 0.2193155735731125,
+      "learning_rate": 0.00014456987154534841,
+      "loss": 1.0073,
+      "step": 718
+    },
+    {
+      "epoch": 0.2794131934324298,
+      "grad_norm": 0.21447965502738953,
+      "learning_rate": 0.00014449202024133907,
+      "loss": 1.0174,
+      "step": 719
+    },
+    {
+      "epoch": 0.27980180705333724,
+      "grad_norm": 0.22867532074451447,
+      "learning_rate": 0.00014441416893732972,
+      "loss": 1.0948,
+      "step": 720
+    },
+    {
+      "epoch": 0.28019042067424466,
+      "grad_norm": 0.21570557355880737,
+      "learning_rate": 0.00014433631763332037,
+      "loss": 1.0105,
+      "step": 721
+    },
+    {
+      "epoch": 0.280579034295152,
+      "grad_norm": 0.20787014067173004,
+      "learning_rate": 0.00014425846632931102,
+      "loss": 1.0384,
+      "step": 722
+    },
+    {
+      "epoch": 0.28096764791605944,
+      "grad_norm": 0.19924762845039368,
+      "learning_rate": 0.0001441806150253017,
+      "loss": 1.0653,
+      "step": 723
+    },
+    {
+      "epoch": 0.28135626153696686,
+      "grad_norm": 0.1996215283870697,
+      "learning_rate": 0.00014410276372129235,
+      "loss": 1.0439,
+      "step": 724
+    },
+    {
+      "epoch": 0.2817448751578743,
+      "grad_norm": 0.2054813802242279,
+      "learning_rate": 0.000144024912417283,
+      "loss": 0.9895,
+      "step": 725
+    },
+    {
+      "epoch": 0.2821334887787817,
+      "grad_norm": 0.2268310785293579,
+      "learning_rate": 0.00014394706111327366,
+      "loss": 1.0993,
+      "step": 726
+    },
+    {
+      "epoch": 0.2825221023996891,
+      "grad_norm": 0.19867680966854095,
+      "learning_rate": 0.0001438692098092643,
+      "loss": 0.985,
+      "step": 727
+    },
+    {
+      "epoch": 0.28291071602059653,
+      "grad_norm": 0.21099598705768585,
+      "learning_rate": 0.000143791358505255,
+      "loss": 1.0333,
+      "step": 728
+    },
+    {
+      "epoch": 0.28329932964150395,
+      "grad_norm": 0.22479215264320374,
+      "learning_rate": 0.00014371350720124564,
+      "loss": 1.0449,
+      "step": 729
+    },
+    {
+      "epoch": 0.28368794326241137,
+      "grad_norm": 0.22717688977718353,
+      "learning_rate": 0.0001436356558972363,
+      "loss": 1.0482,
+      "step": 730
+    },
+    {
+      "epoch": 0.2840765568833188,
+      "grad_norm": 0.20389345288276672,
+      "learning_rate": 0.00014355780459322695,
+      "loss": 0.956,
+      "step": 731
+    },
+    {
+      "epoch": 0.28446517050422615,
+      "grad_norm": 0.21583619713783264,
+      "learning_rate": 0.0001434799532892176,
+      "loss": 1.0154,
+      "step": 732
+    },
+    {
+      "epoch": 0.28485378412513357,
+      "grad_norm": 0.2219148874282837,
+      "learning_rate": 0.00014340210198520825,
+      "loss": 1.0553,
+      "step": 733
+    },
+    {
+      "epoch": 0.285242397746041,
+      "grad_norm": 0.19920189678668976,
+      "learning_rate": 0.00014332425068119893,
+      "loss": 0.9881,
+      "step": 734
+    },
+    {
+      "epoch": 0.2856310113669484,
+      "grad_norm": 0.2295670360326767,
+      "learning_rate": 0.00014324639937718958,
+      "loss": 1.0529,
+      "step": 735
+    },
+    {
+      "epoch": 0.2860196249878558,
+      "grad_norm": 0.21271567046642303,
+      "learning_rate": 0.00014316854807318023,
+      "loss": 1.037,
+      "step": 736
+    },
+    {
+      "epoch": 0.28640823860876324,
+      "grad_norm": 0.21304361522197723,
+      "learning_rate": 0.00014309069676917088,
+      "loss": 1.048,
+      "step": 737
+    },
+    {
+      "epoch": 0.28679685222967066,
+      "grad_norm": 0.19902732968330383,
+      "learning_rate": 0.00014301284546516154,
+      "loss": 1.0306,
+      "step": 738
+    },
+    {
+      "epoch": 0.2871854658505781,
+      "grad_norm": 0.1995929330587387,
+      "learning_rate": 0.00014293499416115222,
+      "loss": 1.0394,
+      "step": 739
+    },
+    {
+      "epoch": 0.2875740794714855,
+      "grad_norm": 0.20426060259342194,
+      "learning_rate": 0.00014285714285714287,
+      "loss": 1.0052,
+      "step": 740
+    },
+    {
+      "epoch": 0.28796269309239286,
+      "grad_norm": 0.20284566283226013,
+      "learning_rate": 0.00014277929155313352,
+      "loss": 1.0115,
+      "step": 741
+    },
+    {
+      "epoch": 0.2883513067133003,
+      "grad_norm": 0.2041557878255844,
+      "learning_rate": 0.00014270144024912417,
+      "loss": 1.0473,
+      "step": 742
+    },
+    {
+      "epoch": 0.2887399203342077,
+      "grad_norm": 0.2152249962091446,
+      "learning_rate": 0.00014262358894511482,
+      "loss": 1.0802,
+      "step": 743
+    },
+    {
+      "epoch": 0.2891285339551151,
+      "grad_norm": 0.20569871366024017,
+      "learning_rate": 0.0001425457376411055,
+      "loss": 1.0203,
+      "step": 744
+    },
+    {
+      "epoch": 0.28951714757602254,
+      "grad_norm": 0.21128378808498383,
+      "learning_rate": 0.00014246788633709616,
+      "loss": 1.108,
+      "step": 745
+    },
+    {
+      "epoch": 0.28990576119692996,
+      "grad_norm": 0.19587135314941406,
+      "learning_rate": 0.0001423900350330868,
+      "loss": 1.0427,
+      "step": 746
+    },
+    {
+      "epoch": 0.2902943748178374,
+      "grad_norm": 0.22052550315856934,
+      "learning_rate": 0.00014231218372907746,
+      "loss": 1.055,
+      "step": 747
+    },
+    {
+      "epoch": 0.2906829884387448,
+      "grad_norm": 0.21291717886924744,
+      "learning_rate": 0.0001422343324250681,
+      "loss": 1.0591,
+      "step": 748
+    },
+    {
+      "epoch": 0.2910716020596522,
+      "grad_norm": 0.20634084939956665,
+      "learning_rate": 0.0001421564811210588,
+      "loss": 1.0527,
+      "step": 749
+    },
+    {
+      "epoch": 0.29146021568055963,
+      "grad_norm": 0.2075488269329071,
+      "learning_rate": 0.00014207862981704944,
+      "loss": 1.0786,
+      "step": 750
+    },
+    {
+      "epoch": 0.291848829301467,
+      "grad_norm": 0.19780080020427704,
+      "learning_rate": 0.0001420007785130401,
+      "loss": 1.059,
+      "step": 751
+    },
+    {
+      "epoch": 0.2922374429223744,
+      "grad_norm": 0.21212074160575867,
+      "learning_rate": 0.00014192292720903075,
+      "loss": 1.0346,
+      "step": 752
+    },
+    {
+      "epoch": 0.29262605654328183,
+      "grad_norm": 0.2218451350927353,
+      "learning_rate": 0.0001418450759050214,
+      "loss": 1.0908,
+      "step": 753
+    },
+    {
+      "epoch": 0.29301467016418925,
+      "grad_norm": 0.20107759535312653,
+      "learning_rate": 0.00014176722460101208,
+      "loss": 1.0202,
+      "step": 754
+    },
+    {
+      "epoch": 0.29340328378509667,
+      "grad_norm": 0.20933273434638977,
+      "learning_rate": 0.00014168937329700273,
+      "loss": 1.0719,
+      "step": 755
+    },
+    {
+      "epoch": 0.2937918974060041,
+      "grad_norm": 0.22369107604026794,
+      "learning_rate": 0.00014161152199299338,
+      "loss": 1.0433,
+      "step": 756
+    },
+    {
+      "epoch": 0.2941805110269115,
+      "grad_norm": 0.2113707810640335,
+      "learning_rate": 0.00014153367068898403,
+      "loss": 1.0637,
+      "step": 757
+    },
+    {
+      "epoch": 0.2945691246478189,
+      "grad_norm": 0.21105700731277466,
+      "learning_rate": 0.00014145581938497469,
+      "loss": 1.0468,
+      "step": 758
+    },
+    {
+      "epoch": 0.29495773826872634,
+      "grad_norm": 0.20189693570137024,
+      "learning_rate": 0.00014137796808096537,
+      "loss": 1.0281,
+      "step": 759
+    },
+    {
+      "epoch": 0.2953463518896337,
+      "grad_norm": 0.1954152137041092,
+      "learning_rate": 0.00014130011677695602,
+      "loss": 1.0519,
+      "step": 760
+    },
+    {
+      "epoch": 0.2957349655105411,
+      "grad_norm": 0.24295592308044434,
+      "learning_rate": 0.00014122226547294667,
+      "loss": 1.1303,
+      "step": 761
+    },
+    {
+      "epoch": 0.29612357913144854,
+      "grad_norm": 0.20158620178699493,
+      "learning_rate": 0.00014114441416893732,
+      "loss": 1.0367,
+      "step": 762
+    },
+    {
+      "epoch": 0.29651219275235596,
+      "grad_norm": 0.20734666287899017,
+      "learning_rate": 0.00014106656286492797,
+      "loss": 1.0392,
+      "step": 763
+    },
+    {
+      "epoch": 0.2969008063732634,
+      "grad_norm": 0.2177533656358719,
+      "learning_rate": 0.00014098871156091865,
+      "loss": 1.0619,
+      "step": 764
+    },
+    {
+      "epoch": 0.2972894199941708,
+      "grad_norm": 0.1961720883846283,
+      "learning_rate": 0.0001409108602569093,
+      "loss": 0.9872,
+      "step": 765
+    },
+    {
+      "epoch": 0.2976780336150782,
+      "grad_norm": 0.21530941128730774,
+      "learning_rate": 0.00014083300895289996,
+      "loss": 1.1246,
+      "step": 766
+    },
+    {
+      "epoch": 0.29806664723598564,
+      "grad_norm": 0.2039783000946045,
+      "learning_rate": 0.00014075515764889064,
+      "loss": 1.0789,
+      "step": 767
+    },
+    {
+      "epoch": 0.29845526085689306,
+      "grad_norm": 0.20641569793224335,
+      "learning_rate": 0.0001406773063448813,
+      "loss": 1.05,
+      "step": 768
+    },
+    {
+      "epoch": 0.2988438744778004,
+      "grad_norm": 0.2071225494146347,
+      "learning_rate": 0.00014059945504087194,
+      "loss": 1.047,
+      "step": 769
+    },
+    {
+      "epoch": 0.29923248809870784,
+      "grad_norm": 0.20367531478405,
+      "learning_rate": 0.00014052160373686262,
+      "loss": 1.0734,
+      "step": 770
+    },
+    {
+      "epoch": 0.29962110171961526,
+      "grad_norm": 0.21718619763851166,
+      "learning_rate": 0.00014044375243285327,
+      "loss": 1.0613,
+      "step": 771
+    },
+    {
+      "epoch": 0.3000097153405227,
+      "grad_norm": 0.21649087965488434,
+      "learning_rate": 0.00014036590112884392,
+      "loss": 1.0671,
+      "step": 772
+    },
+    {
+      "epoch": 0.3003983289614301,
+      "grad_norm": 0.22223225235939026,
+      "learning_rate": 0.00014028804982483458,
+      "loss": 1.0977,
+      "step": 773
+    },
+    {
+      "epoch": 0.3007869425823375,
+      "grad_norm": 0.23101870715618134,
+      "learning_rate": 0.00014021019852082523,
+      "loss": 1.1236,
+      "step": 774
+    },
+    {
+      "epoch": 0.30117555620324493,
+      "grad_norm": 0.22855506837368011,
+      "learning_rate": 0.0001401323472168159,
+      "loss": 1.0517,
+      "step": 775
+    },
+    {
+      "epoch": 0.30156416982415235,
+      "grad_norm": 0.20862117409706116,
+      "learning_rate": 0.00014005449591280656,
+      "loss": 1.0493,
+      "step": 776
+    },
+    {
+      "epoch": 0.30195278344505977,
+      "grad_norm": 0.21692048013210297,
+      "learning_rate": 0.0001399766446087972,
+      "loss": 1.0681,
+      "step": 777
+    },
+    {
+      "epoch": 0.3023413970659672,
+      "grad_norm": 0.21541331708431244,
+      "learning_rate": 0.00013989879330478786,
+      "loss": 1.0775,
+      "step": 778
+    },
+    {
+      "epoch": 0.30273001068687455,
+      "grad_norm": 0.21221749484539032,
+      "learning_rate": 0.00013982094200077851,
+      "loss": 1.0421,
+      "step": 779
+    },
+    {
+      "epoch": 0.30311862430778197,
+      "grad_norm": 0.22497743368148804,
+      "learning_rate": 0.0001397430906967692,
+      "loss": 1.1115,
+      "step": 780
+    },
+    {
+      "epoch": 0.3035072379286894,
+      "grad_norm": 0.1974119246006012,
+      "learning_rate": 0.00013966523939275985,
+      "loss": 1.0264,
+      "step": 781
+    },
+    {
+      "epoch": 0.3038958515495968,
+      "grad_norm": 0.20349323749542236,
+      "learning_rate": 0.0001395873880887505,
+      "loss": 1.0512,
+      "step": 782
+    },
+    {
+      "epoch": 0.3042844651705042,
+      "grad_norm": 0.21116937696933746,
+      "learning_rate": 0.00013950953678474115,
+      "loss": 1.0135,
+      "step": 783
+    },
+    {
+      "epoch": 0.30467307879141164,
+      "grad_norm": 0.2133677899837494,
+      "learning_rate": 0.0001394316854807318,
+      "loss": 1.0694,
+      "step": 784
+    },
+    {
+      "epoch": 0.30506169241231906,
+      "grad_norm": 0.20406191051006317,
+      "learning_rate": 0.00013935383417672248,
+      "loss": 1.0179,
+      "step": 785
+    },
+    {
+      "epoch": 0.3054503060332265,
+      "grad_norm": 0.21428678929805756,
+      "learning_rate": 0.00013927598287271313,
+      "loss": 1.0577,
+      "step": 786
+    },
+    {
+      "epoch": 0.3058389196541339,
+      "grad_norm": 0.20878921449184418,
+      "learning_rate": 0.00013919813156870379,
+      "loss": 1.0311,
+      "step": 787
+    },
+    {
+      "epoch": 0.30622753327504126,
+      "grad_norm": 0.19033175706863403,
+      "learning_rate": 0.00013912028026469444,
+      "loss": 0.976,
+      "step": 788
+    },
+    {
+      "epoch": 0.3066161468959487,
+      "grad_norm": 0.22138020396232605,
+      "learning_rate": 0.0001390424289606851,
+      "loss": 1.0438,
+      "step": 789
+    },
+    {
+      "epoch": 0.3070047605168561,
+      "grad_norm": 0.20765596628189087,
+      "learning_rate": 0.00013896457765667577,
+      "loss": 1.0865,
+      "step": 790
+    },
+    {
+      "epoch": 0.3073933741377635,
+      "grad_norm": 0.209733247756958,
+      "learning_rate": 0.00013888672635266642,
+      "loss": 1.0648,
+      "step": 791
+    },
+    {
+      "epoch": 0.30778198775867094,
+      "grad_norm": 0.1896686851978302,
+      "learning_rate": 0.00013880887504865707,
+      "loss": 1.0133,
+      "step": 792
+    },
+    {
+      "epoch": 0.30817060137957836,
+      "grad_norm": 0.21651998162269592,
+      "learning_rate": 0.00013873102374464772,
+      "loss": 1.0729,
+      "step": 793
+    },
+    {
+      "epoch": 0.3085592150004858,
+      "grad_norm": 0.21751996874809265,
+      "learning_rate": 0.00013865317244063838,
+      "loss": 1.0444,
+      "step": 794
+    },
+    {
+      "epoch": 0.3089478286213932,
+      "grad_norm": 0.20593520998954773,
+      "learning_rate": 0.00013857532113662906,
+      "loss": 1.0304,
+      "step": 795
+    },
+    {
+      "epoch": 0.3093364422423006,
+      "grad_norm": 0.19937261939048767,
+      "learning_rate": 0.0001384974698326197,
+      "loss": 1.0017,
+      "step": 796
+    },
+    {
+      "epoch": 0.30972505586320803,
+      "grad_norm": 0.18901696801185608,
+      "learning_rate": 0.00013841961852861036,
+      "loss": 1.0362,
+      "step": 797
+    },
+    {
+      "epoch": 0.3101136694841154,
+      "grad_norm": 0.2079760730266571,
+      "learning_rate": 0.000138341767224601,
+      "loss": 1.0784,
+      "step": 798
+    },
+    {
+      "epoch": 0.3105022831050228,
+      "grad_norm": 0.24873265624046326,
+      "learning_rate": 0.00013826391592059166,
+      "loss": 1.1026,
+      "step": 799
+    },
+    {
+      "epoch": 0.31089089672593023,
+      "grad_norm": 0.20185396075248718,
+      "learning_rate": 0.00013818606461658234,
+      "loss": 1.0235,
+      "step": 800
+    },
+    {
+      "epoch": 0.31127951034683765,
+      "grad_norm": 0.211393803358078,
+      "learning_rate": 0.000138108213312573,
+      "loss": 1.0999,
+      "step": 801
+    },
+    {
+      "epoch": 0.31166812396774507,
+      "grad_norm": 0.19948823750019073,
+      "learning_rate": 0.00013803036200856365,
+      "loss": 1.0242,
+      "step": 802
+    },
+    {
+      "epoch": 0.3120567375886525,
+      "grad_norm": 0.21470944583415985,
+      "learning_rate": 0.0001379525107045543,
+      "loss": 1.0736,
+      "step": 803
+    },
+    {
+      "epoch": 0.3124453512095599,
+      "grad_norm": 0.2195902317762375,
+      "learning_rate": 0.00013787465940054495,
+      "loss": 1.0368,
+      "step": 804
+    },
+    {
+      "epoch": 0.3128339648304673,
+      "grad_norm": 0.22142355144023895,
+      "learning_rate": 0.00013779680809653563,
+      "loss": 1.1022,
+      "step": 805
+    },
+    {
+      "epoch": 0.31322257845137474,
+      "grad_norm": 0.20487886667251587,
+      "learning_rate": 0.00013771895679252628,
+      "loss": 1.0478,
+      "step": 806
+    },
+    {
+      "epoch": 0.3136111920722821,
+      "grad_norm": 0.217549130320549,
+      "learning_rate": 0.00013764110548851693,
+      "loss": 1.0526,
+      "step": 807
+    },
+    {
+      "epoch": 0.3139998056931895,
+      "grad_norm": 0.20199982821941376,
+      "learning_rate": 0.0001375632541845076,
+      "loss": 0.9992,
+      "step": 808
+    },
+    {
+      "epoch": 0.31438841931409695,
+      "grad_norm": 0.19496634602546692,
+      "learning_rate": 0.00013748540288049824,
+      "loss": 1.0179,
+      "step": 809
+    },
+    {
+      "epoch": 0.31477703293500436,
+      "grad_norm": 0.21999460458755493,
+      "learning_rate": 0.0001374075515764889,
+      "loss": 1.0547,
+      "step": 810
+    },
+    {
+      "epoch": 0.3151656465559118,
+      "grad_norm": 0.21421074867248535,
+      "learning_rate": 0.00013732970027247957,
+      "loss": 1.0283,
+      "step": 811
+    },
+    {
+      "epoch": 0.3155542601768192,
+      "grad_norm": 0.1913364827632904,
+      "learning_rate": 0.00013725184896847022,
+      "loss": 0.9826,
+      "step": 812
+    },
+    {
+      "epoch": 0.3159428737977266,
+      "grad_norm": 0.20509806275367737,
+      "learning_rate": 0.00013717399766446087,
+      "loss": 1.0303,
+      "step": 813
+    },
+    {
+      "epoch": 0.31633148741863404,
+      "grad_norm": 0.20309868454933167,
+      "learning_rate": 0.00013709614636045153,
+      "loss": 1.0479,
+      "step": 814
+    },
+    {
+      "epoch": 0.31672010103954146,
+      "grad_norm": 0.2274443656206131,
+      "learning_rate": 0.0001370182950564422,
+      "loss": 1.1311,
+      "step": 815
+    },
+    {
+      "epoch": 0.3171087146604489,
+      "grad_norm": 0.22785170376300812,
+      "learning_rate": 0.00013694044375243286,
+      "loss": 1.1009,
+      "step": 816
+    },
+    {
+      "epoch": 0.31749732828135624,
+      "grad_norm": 0.2105439007282257,
+      "learning_rate": 0.0001368625924484235,
+      "loss": 1.0251,
+      "step": 817
+    },
+    {
+      "epoch": 0.31788594190226366,
+      "grad_norm": 0.20583970844745636,
+      "learning_rate": 0.00013678474114441416,
+      "loss": 1.0833,
+      "step": 818
+    },
+    {
+      "epoch": 0.3182745555231711,
+      "grad_norm": 0.21091191470623016,
+      "learning_rate": 0.00013670688984040484,
+      "loss": 1.071,
+      "step": 819
+    },
+    {
+      "epoch": 0.3186631691440785,
+      "grad_norm": 0.20645928382873535,
+      "learning_rate": 0.0001366290385363955,
+      "loss": 1.0605,
+      "step": 820
+    },
+    {
+      "epoch": 0.3190517827649859,
+      "grad_norm": 0.1990513950586319,
+      "learning_rate": 0.00013655118723238614,
+      "loss": 1.0461,
+      "step": 821
+    },
+    {
+      "epoch": 0.31944039638589333,
+      "grad_norm": 0.2192249745130539,
+      "learning_rate": 0.00013647333592837682,
+      "loss": 1.0975,
+      "step": 822
+    },
+    {
+      "epoch": 0.31982901000680075,
+      "grad_norm": 0.2157617211341858,
+      "learning_rate": 0.00013639548462436748,
+      "loss": 1.091,
+      "step": 823
+    },
+    {
+      "epoch": 0.32021762362770817,
+      "grad_norm": 0.21964526176452637,
+      "learning_rate": 0.00013631763332035813,
+      "loss": 1.0286,
+      "step": 824
+    },
+    {
+      "epoch": 0.3206062372486156,
+      "grad_norm": 0.2079797089099884,
+      "learning_rate": 0.00013623978201634878,
+      "loss": 1.0257,
+      "step": 825
+    },
+    {
+      "epoch": 0.32099485086952295,
+      "grad_norm": 0.21220168471336365,
+      "learning_rate": 0.00013616193071233946,
+      "loss": 1.0046,
+      "step": 826
+    },
+    {
+      "epoch": 0.32138346449043037,
+      "grad_norm": 0.2885231673717499,
+      "learning_rate": 0.0001360840794083301,
+      "loss": 1.1442,
+      "step": 827
+    },
+    {
+      "epoch": 0.3217720781113378,
+      "grad_norm": 0.2096511274576187,
+      "learning_rate": 0.00013600622810432076,
+      "loss": 1.0209,
+      "step": 828
+    },
+    {
+      "epoch": 0.3221606917322452,
+      "grad_norm": 0.2179451286792755,
+      "learning_rate": 0.00013592837680031142,
+      "loss": 1.0548,
+      "step": 829
+    },
+    {
+      "epoch": 0.3225493053531526,
+      "grad_norm": 0.2096329927444458,
+      "learning_rate": 0.00013585052549630207,
+      "loss": 1.0279,
+      "step": 830
+    },
+    {
+      "epoch": 0.32293791897406005,
+      "grad_norm": 0.22531811892986298,
+      "learning_rate": 0.00013577267419229275,
+      "loss": 1.0463,
+      "step": 831
+    },
+    {
+      "epoch": 0.32332653259496746,
+      "grad_norm": 0.22516901791095734,
+      "learning_rate": 0.0001356948228882834,
+      "loss": 1.1127,
+      "step": 832
+    },
+    {
+      "epoch": 0.3237151462158749,
+      "grad_norm": 0.22487780451774597,
+      "learning_rate": 0.00013561697158427405,
+      "loss": 1.0707,
+      "step": 833
+    },
+    {
+      "epoch": 0.3241037598367823,
+      "grad_norm": 0.20976543426513672,
+      "learning_rate": 0.0001355391202802647,
+      "loss": 1.0217,
+      "step": 834
+    },
+    {
+      "epoch": 0.32449237345768966,
+      "grad_norm": 0.19849295914173126,
+      "learning_rate": 0.00013546126897625535,
+      "loss": 1.021,
+      "step": 835
+    },
+    {
+      "epoch": 0.3248809870785971,
+      "grad_norm": 0.21772268414497375,
+      "learning_rate": 0.00013538341767224603,
+      "loss": 1.0605,
+      "step": 836
+    },
+    {
+      "epoch": 0.3252696006995045,
+      "grad_norm": 0.19670265913009644,
+      "learning_rate": 0.00013530556636823669,
+      "loss": 1.0165,
+      "step": 837
+    },
+    {
+      "epoch": 0.3256582143204119,
+      "grad_norm": 0.19339734315872192,
+      "learning_rate": 0.00013522771506422734,
+      "loss": 1.0203,
+      "step": 838
+    },
+    {
+      "epoch": 0.32604682794131934,
+      "grad_norm": 0.21289557218551636,
+      "learning_rate": 0.000135149863760218,
+      "loss": 1.0252,
+      "step": 839
+    },
+    {
+      "epoch": 0.32643544156222676,
+      "grad_norm": 0.1964789777994156,
+      "learning_rate": 0.00013507201245620864,
+      "loss": 1.0392,
+      "step": 840
+    },
+    {
+      "epoch": 0.3268240551831342,
+      "grad_norm": 0.20783716440200806,
+      "learning_rate": 0.00013499416115219932,
+      "loss": 1.0569,
+      "step": 841
+    },
+    {
+      "epoch": 0.3272126688040416,
+      "grad_norm": 0.22782161831855774,
+      "learning_rate": 0.00013491630984818997,
+      "loss": 1.0555,
+      "step": 842
+    },
+    {
+      "epoch": 0.327601282424949,
+      "grad_norm": 0.22771142423152924,
+      "learning_rate": 0.00013483845854418063,
+      "loss": 1.085,
+      "step": 843
+    },
+    {
+      "epoch": 0.32798989604585643,
+      "grad_norm": 0.19773711264133453,
+      "learning_rate": 0.00013476060724017128,
+      "loss": 1.008,
+      "step": 844
+    },
+    {
+      "epoch": 0.3283785096667638,
+      "grad_norm": 0.22399166226387024,
+      "learning_rate": 0.00013468275593616193,
+      "loss": 1.0511,
+      "step": 845
+    },
+    {
+      "epoch": 0.3287671232876712,
+      "grad_norm": 0.20488236844539642,
+      "learning_rate": 0.00013460490463215258,
+      "loss": 1.0883,
+      "step": 846
+    },
+    {
+      "epoch": 0.32915573690857863,
+      "grad_norm": 0.21387654542922974,
+      "learning_rate": 0.00013452705332814326,
+      "loss": 1.0808,
+      "step": 847
+    },
+    {
+      "epoch": 0.32954435052948605,
+      "grad_norm": 0.1972568780183792,
+      "learning_rate": 0.0001344492020241339,
+      "loss": 1.0555,
+      "step": 848
+    },
+    {
+      "epoch": 0.32993296415039347,
+      "grad_norm": 0.20835663378238678,
+      "learning_rate": 0.00013437135072012456,
+      "loss": 1.0473,
+      "step": 849
+    },
+    {
+      "epoch": 0.3303215777713009,
+      "grad_norm": 0.19707520306110382,
+      "learning_rate": 0.00013429349941611522,
+      "loss": 0.9585,
+      "step": 850
+    },
+    {
+      "epoch": 0.3307101913922083,
+      "grad_norm": 0.19163411855697632,
+      "learning_rate": 0.00013421564811210587,
+      "loss": 1.0025,
+      "step": 851
+    },
+    {
+      "epoch": 0.3310988050131157,
+      "grad_norm": 0.19730083644390106,
+      "learning_rate": 0.00013413779680809655,
+      "loss": 1.0696,
+      "step": 852
+    },
+    {
+      "epoch": 0.33148741863402315,
+      "grad_norm": 0.19537493586540222,
+      "learning_rate": 0.0001340599455040872,
+      "loss": 1.0466,
+      "step": 853
+    },
+    {
+      "epoch": 0.3318760322549305,
+      "grad_norm": 0.2255164235830307,
+      "learning_rate": 0.00013398209420007785,
+      "loss": 1.0659,
+      "step": 854
+    },
+    {
+      "epoch": 0.3322646458758379,
+      "grad_norm": 0.19774770736694336,
+      "learning_rate": 0.0001339042428960685,
+      "loss": 1.0326,
+      "step": 855
+    },
+    {
+      "epoch": 0.33265325949674535,
+      "grad_norm": 0.2004510909318924,
+      "learning_rate": 0.00013382639159205916,
+      "loss": 1.0327,
+      "step": 856
+    },
+    {
+      "epoch": 0.33304187311765276,
+      "grad_norm": 0.19187591969966888,
+      "learning_rate": 0.00013374854028804984,
+      "loss": 1.0069,
+      "step": 857
+    },
+    {
+      "epoch": 0.3334304867385602,
+      "grad_norm": 0.18775832653045654,
+      "learning_rate": 0.0001336706889840405,
+      "loss": 1.0083,
+      "step": 858
+    },
+    {
+      "epoch": 0.3338191003594676,
+      "grad_norm": 0.2005717158317566,
+      "learning_rate": 0.00013359283768003114,
+      "loss": 1.0398,
+      "step": 859
+    },
+    {
+      "epoch": 0.334207713980375,
+      "grad_norm": 0.19705893099308014,
+      "learning_rate": 0.0001335149863760218,
+      "loss": 1.0031,
+      "step": 860
+    },
+    {
+      "epoch": 0.33459632760128244,
+      "grad_norm": 0.19589562714099884,
+      "learning_rate": 0.00013343713507201244,
+      "loss": 0.9831,
+      "step": 861
+    },
+    {
+      "epoch": 0.33498494122218986,
+      "grad_norm": 0.19302591681480408,
+      "learning_rate": 0.00013335928376800312,
+      "loss": 1.0009,
+      "step": 862
+    },
+    {
+      "epoch": 0.3353735548430973,
+      "grad_norm": 0.20499618351459503,
+      "learning_rate": 0.00013328143246399377,
+      "loss": 1.0205,
+      "step": 863
+    },
+    {
+      "epoch": 0.33576216846400464,
+      "grad_norm": 0.20514456927776337,
+      "learning_rate": 0.00013320358115998443,
+      "loss": 1.0837,
+      "step": 864
+    },
+    {
+      "epoch": 0.33615078208491206,
+      "grad_norm": 0.19285848736763,
+      "learning_rate": 0.00013312572985597508,
+      "loss": 1.0167,
+      "step": 865
+    },
+    {
+      "epoch": 0.3365393957058195,
+      "grad_norm": 0.20891553163528442,
+      "learning_rate": 0.00013304787855196573,
+      "loss": 1.0127,
+      "step": 866
+    },
+    {
+      "epoch": 0.3369280093267269,
+      "grad_norm": 0.20511706173419952,
+      "learning_rate": 0.0001329700272479564,
+      "loss": 0.964,
+      "step": 867
+    },
+    {
+      "epoch": 0.3373166229476343,
+      "grad_norm": 0.1855512261390686,
+      "learning_rate": 0.00013289217594394706,
+      "loss": 0.9721,
+      "step": 868
+    },
+    {
+      "epoch": 0.33770523656854173,
+      "grad_norm": 0.20010098814964294,
+      "learning_rate": 0.00013281432463993771,
+      "loss": 1.0411,
+      "step": 869
+    },
+    {
+      "epoch": 0.33809385018944915,
+      "grad_norm": 0.1991325318813324,
+      "learning_rate": 0.0001327364733359284,
+      "loss": 0.9658,
+      "step": 870
+    },
+    {
+      "epoch": 0.33848246381035657,
+      "grad_norm": 0.19895736873149872,
+      "learning_rate": 0.00013265862203191905,
+      "loss": 1.0744,
+      "step": 871
+    },
+    {
+      "epoch": 0.338871077431264,
+      "grad_norm": 0.2091255635023117,
+      "learning_rate": 0.0001325807707279097,
+      "loss": 1.0375,
+      "step": 872
+    },
+    {
+      "epoch": 0.33925969105217135,
+      "grad_norm": 0.21355532109737396,
+      "learning_rate": 0.00013250291942390035,
+      "loss": 1.09,
+      "step": 873
+    },
+    {
+      "epoch": 0.33964830467307877,
+      "grad_norm": 0.21844851970672607,
+      "learning_rate": 0.00013242506811989103,
+      "loss": 1.0769,
+      "step": 874
+    },
+    {
+      "epoch": 0.3400369182939862,
+      "grad_norm": 0.1877543330192566,
+      "learning_rate": 0.00013234721681588168,
+      "loss": 1.0199,
+      "step": 875
+    },
+    {
+      "epoch": 0.3404255319148936,
+      "grad_norm": 0.2020038366317749,
+      "learning_rate": 0.00013226936551187233,
+      "loss": 1.0218,
+      "step": 876
+    },
+    {
+      "epoch": 0.340814145535801,
+      "grad_norm": 0.20682141184806824,
+      "learning_rate": 0.000132191514207863,
+      "loss": 1.0891,
+      "step": 877
+    },
+    {
+      "epoch": 0.34120275915670845,
+      "grad_norm": 0.21942824125289917,
+      "learning_rate": 0.00013211366290385366,
+      "loss": 0.9877,
+      "step": 878
+    },
+    {
+      "epoch": 0.34159137277761586,
+      "grad_norm": 0.21150313317775726,
+      "learning_rate": 0.00013203581159984432,
+      "loss": 1.0815,
+      "step": 879
+    },
+    {
+      "epoch": 0.3419799863985233,
+      "grad_norm": 0.2073293924331665,
+      "learning_rate": 0.00013195796029583497,
+      "loss": 1.0579,
+      "step": 880
+    },
+    {
+      "epoch": 0.3423686000194307,
+      "grad_norm": 0.221574068069458,
+      "learning_rate": 0.00013188010899182562,
+      "loss": 1.0279,
+      "step": 881
+    },
+    {
+      "epoch": 0.3427572136403381,
+      "grad_norm": 0.22334492206573486,
+      "learning_rate": 0.00013180225768781627,
+      "loss": 1.0837,
+      "step": 882
+    },
+    {
+      "epoch": 0.3431458272612455,
+      "grad_norm": 0.18817654252052307,
+      "learning_rate": 0.00013172440638380695,
+      "loss": 1.0262,
+      "step": 883
+    },
+    {
+      "epoch": 0.3435344408821529,
+      "grad_norm": 0.20126822590827942,
+      "learning_rate": 0.0001316465550797976,
+      "loss": 1.0679,
+      "step": 884
+    },
+    {
+      "epoch": 0.3439230545030603,
+      "grad_norm": 0.2128864973783493,
+      "learning_rate": 0.00013156870377578825,
+      "loss": 1.0316,
+      "step": 885
+    },
+    {
+      "epoch": 0.34431166812396774,
+      "grad_norm": 0.20054499804973602,
+      "learning_rate": 0.0001314908524717789,
+      "loss": 1.0024,
+      "step": 886
+    },
+    {
+      "epoch": 0.34470028174487516,
+      "grad_norm": 0.21358034014701843,
+      "learning_rate": 0.00013141300116776956,
+      "loss": 1.0475,
+      "step": 887
+    },
+    {
+      "epoch": 0.3450888953657826,
+      "grad_norm": 0.21377703547477722,
+      "learning_rate": 0.00013133514986376024,
+      "loss": 1.0957,
+      "step": 888
+    },
+    {
+      "epoch": 0.34547750898669,
+      "grad_norm": 0.20166514813899994,
+      "learning_rate": 0.0001312572985597509,
+      "loss": 1.0189,
+      "step": 889
+    },
+    {
+      "epoch": 0.3458661226075974,
+      "grad_norm": 0.20424878597259521,
+      "learning_rate": 0.00013117944725574154,
+      "loss": 1.0896,
+      "step": 890
+    },
+    {
+      "epoch": 0.34625473622850483,
+      "grad_norm": 0.19028648734092712,
+      "learning_rate": 0.0001311015959517322,
+      "loss": 0.9881,
+      "step": 891
+    },
+    {
+      "epoch": 0.3466433498494122,
+      "grad_norm": 0.20828665792942047,
+      "learning_rate": 0.00013102374464772285,
+      "loss": 0.9932,
+      "step": 892
+    },
+    {
+      "epoch": 0.3470319634703196,
+      "grad_norm": 0.20756572484970093,
+      "learning_rate": 0.00013094589334371353,
+      "loss": 1.0406,
+      "step": 893
+    },
+    {
+      "epoch": 0.34742057709122703,
+      "grad_norm": 0.20768921077251434,
+      "learning_rate": 0.00013086804203970418,
+      "loss": 0.9652,
+      "step": 894
+    },
+    {
+      "epoch": 0.34780919071213445,
+      "grad_norm": 0.20660027861595154,
+      "learning_rate": 0.00013079019073569483,
+      "loss": 1.0728,
+      "step": 895
+    },
+    {
+      "epoch": 0.34819780433304187,
+      "grad_norm": 0.20186837017536163,
+      "learning_rate": 0.00013071233943168548,
+      "loss": 1.0407,
+      "step": 896
+    },
+    {
+      "epoch": 0.3485864179539493,
+      "grad_norm": 0.20880667865276337,
+      "learning_rate": 0.00013063448812767613,
+      "loss": 1.0275,
+      "step": 897
+    },
+    {
+      "epoch": 0.3489750315748567,
+      "grad_norm": 0.22212949395179749,
+      "learning_rate": 0.0001305566368236668,
+      "loss": 1.0293,
+      "step": 898
+    },
+    {
+      "epoch": 0.3493636451957641,
+      "grad_norm": 0.20552745461463928,
+      "learning_rate": 0.00013047878551965746,
+      "loss": 1.0434,
+      "step": 899
+    },
+    {
+      "epoch": 0.34975225881667155,
+      "grad_norm": 0.21239839494228363,
+      "learning_rate": 0.00013040093421564812,
+      "loss": 1.052,
+      "step": 900
+    },
+    {
+      "epoch": 0.3501408724375789,
+      "grad_norm": 0.22420544922351837,
+      "learning_rate": 0.00013032308291163877,
+      "loss": 1.0236,
+      "step": 901
+    },
+    {
+      "epoch": 0.35052948605848633,
+      "grad_norm": 0.23435090482234955,
+      "learning_rate": 0.00013024523160762942,
+      "loss": 1.0876,
+      "step": 902
+    },
+    {
+      "epoch": 0.35091809967939375,
+      "grad_norm": 0.22763386368751526,
+      "learning_rate": 0.0001301673803036201,
+      "loss": 1.0636,
+      "step": 903
+    },
+    {
+      "epoch": 0.35130671330030117,
+      "grad_norm": 0.20948883891105652,
+      "learning_rate": 0.00013008952899961075,
+      "loss": 1.0083,
+      "step": 904
+    },
+    {
+      "epoch": 0.3516953269212086,
+      "grad_norm": 0.20408779382705688,
+      "learning_rate": 0.0001300116776956014,
+      "loss": 1.039,
+      "step": 905
+    },
+    {
+      "epoch": 0.352083940542116,
+      "grad_norm": 0.2126050591468811,
+      "learning_rate": 0.00012993382639159206,
+      "loss": 1.0365,
+      "step": 906
+    },
+    {
+      "epoch": 0.3524725541630234,
+      "grad_norm": 0.20314334332942963,
+      "learning_rate": 0.0001298559750875827,
+      "loss": 1.0474,
+      "step": 907
+    },
+    {
+      "epoch": 0.35286116778393084,
+      "grad_norm": 0.23720984160900116,
+      "learning_rate": 0.0001297781237835734,
+      "loss": 1.0529,
+      "step": 908
+    },
+    {
+      "epoch": 0.35324978140483826,
+      "grad_norm": 0.22642800211906433,
+      "learning_rate": 0.00012970027247956404,
+      "loss": 1.0586,
+      "step": 909
+    },
+    {
+      "epoch": 0.3536383950257457,
+      "grad_norm": 0.20469972491264343,
+      "learning_rate": 0.0001296224211755547,
+      "loss": 1.0267,
+      "step": 910
+    },
+    {
+      "epoch": 0.35402700864665304,
+      "grad_norm": 0.197368785738945,
+      "learning_rate": 0.00012954456987154534,
+      "loss": 1.0348,
+      "step": 911
+    },
+    {
+      "epoch": 0.35441562226756046,
+      "grad_norm": 0.21924498677253723,
+      "learning_rate": 0.000129466718567536,
+      "loss": 1.0861,
+      "step": 912
+    },
+    {
+      "epoch": 0.3548042358884679,
+      "grad_norm": 0.22006285190582275,
+      "learning_rate": 0.00012938886726352667,
+      "loss": 1.0545,
+      "step": 913
+    },
+    {
+      "epoch": 0.3551928495093753,
+      "grad_norm": 0.22419220209121704,
+      "learning_rate": 0.00012931101595951733,
+      "loss": 1.0716,
+      "step": 914
+    },
+    {
+      "epoch": 0.3555814631302827,
+      "grad_norm": 0.215990349650383,
+      "learning_rate": 0.00012923316465550798,
+      "loss": 1.0619,
+      "step": 915
+    },
+    {
+      "epoch": 0.35597007675119013,
+      "grad_norm": 0.20783264935016632,
+      "learning_rate": 0.00012915531335149863,
+      "loss": 1.0412,
+      "step": 916
+    },
+    {
+      "epoch": 0.35635869037209755,
+      "grad_norm": 0.24584618210792542,
+      "learning_rate": 0.00012907746204748928,
+      "loss": 1.1165,
+      "step": 917
+    },
+    {
+      "epoch": 0.35674730399300497,
+      "grad_norm": 0.23146122694015503,
+      "learning_rate": 0.00012899961074347996,
+      "loss": 1.1111,
+      "step": 918
+    },
+    {
+      "epoch": 0.3571359176139124,
+      "grad_norm": 0.19983729720115662,
+      "learning_rate": 0.00012892175943947061,
+      "loss": 1.0674,
+      "step": 919
+    },
+    {
+      "epoch": 0.35752453123481975,
+      "grad_norm": 0.2161000818014145,
+      "learning_rate": 0.00012884390813546127,
+      "loss": 1.076,
+      "step": 920
+    },
+    {
+      "epoch": 0.35791314485572717,
+      "grad_norm": 0.21042793989181519,
+      "learning_rate": 0.00012876605683145192,
+      "loss": 1.0535,
+      "step": 921
+    },
+    {
+      "epoch": 0.3583017584766346,
+      "grad_norm": 0.20135439932346344,
+      "learning_rate": 0.0001286882055274426,
+      "loss": 1.0059,
+      "step": 922
+    },
+    {
+      "epoch": 0.358690372097542,
+      "grad_norm": 0.19394971430301666,
+      "learning_rate": 0.00012861035422343325,
+      "loss": 1.0381,
+      "step": 923
+    },
+    {
+      "epoch": 0.35907898571844943,
+      "grad_norm": 0.21171030402183533,
+      "learning_rate": 0.0001285325029194239,
+      "loss": 1.0513,
+      "step": 924
+    },
+    {
+      "epoch": 0.35946759933935685,
+      "grad_norm": 0.19476690888404846,
+      "learning_rate": 0.00012845465161541458,
+      "loss": 1.0003,
+      "step": 925
+    },
+    {
+      "epoch": 0.35985621296026427,
+      "grad_norm": 0.20468670129776,
+      "learning_rate": 0.00012837680031140523,
+      "loss": 1.0608,
+      "step": 926
+    },
+    {
+      "epoch": 0.3602448265811717,
+      "grad_norm": 0.21159446239471436,
+      "learning_rate": 0.00012829894900739588,
+      "loss": 1.0734,
+      "step": 927
+    },
+    {
+      "epoch": 0.3606334402020791,
+      "grad_norm": 0.21179519593715668,
+      "learning_rate": 0.00012822109770338654,
+      "loss": 1.0957,
+      "step": 928
+    },
+    {
+      "epoch": 0.3610220538229865,
+      "grad_norm": 0.20997527241706848,
+      "learning_rate": 0.00012814324639937722,
+      "loss": 1.0644,
+      "step": 929
+    },
+    {
+      "epoch": 0.3614106674438939,
+      "grad_norm": 0.21178296208381653,
+      "learning_rate": 0.00012806539509536787,
+      "loss": 1.0208,
+      "step": 930
+    },
+    {
+      "epoch": 0.3617992810648013,
+      "grad_norm": 0.20890356600284576,
+      "learning_rate": 0.00012798754379135852,
+      "loss": 1.0888,
+      "step": 931
+    },
+    {
+      "epoch": 0.3621878946857087,
+      "grad_norm": 0.20177409052848816,
+      "learning_rate": 0.00012790969248734917,
+      "loss": 0.9741,
+      "step": 932
+    },
+    {
+      "epoch": 0.36257650830661614,
+      "grad_norm": 0.23504556715488434,
+      "learning_rate": 0.00012783184118333982,
+      "loss": 1.1048,
+      "step": 933
+    },
+    {
+      "epoch": 0.36296512192752356,
+      "grad_norm": 0.22829356789588928,
+      "learning_rate": 0.0001277539898793305,
+      "loss": 1.0798,
+      "step": 934
+    },
+    {
+      "epoch": 0.363353735548431,
+      "grad_norm": 0.2068483531475067,
+      "learning_rate": 0.00012767613857532116,
+      "loss": 1.0452,
+      "step": 935
+    },
+    {
+      "epoch": 0.3637423491693384,
+      "grad_norm": 0.2093171775341034,
+      "learning_rate": 0.0001275982872713118,
+      "loss": 1.0742,
+      "step": 936
+    },
+    {
+      "epoch": 0.3641309627902458,
+      "grad_norm": 0.21478736400604248,
+      "learning_rate": 0.00012752043596730246,
+      "loss": 1.0572,
+      "step": 937
+    },
+    {
+      "epoch": 0.36451957641115323,
+      "grad_norm": 0.1906953752040863,
+      "learning_rate": 0.0001274425846632931,
+      "loss": 1.0107,
+      "step": 938
+    },
+    {
+      "epoch": 0.3649081900320606,
+      "grad_norm": 0.20580604672431946,
+      "learning_rate": 0.0001273647333592838,
+      "loss": 1.0677,
+      "step": 939
+    },
+    {
+      "epoch": 0.365296803652968,
+      "grad_norm": 0.22586850821971893,
+      "learning_rate": 0.00012728688205527444,
+      "loss": 1.0389,
+      "step": 940
+    },
+    {
+      "epoch": 0.36568541727387543,
+      "grad_norm": 0.199899360537529,
+      "learning_rate": 0.0001272090307512651,
+      "loss": 1.0462,
+      "step": 941
+    },
+    {
+      "epoch": 0.36607403089478285,
+      "grad_norm": 0.19881689548492432,
+      "learning_rate": 0.00012713117944725575,
+      "loss": 1.0565,
+      "step": 942
+    },
+    {
+      "epoch": 0.3664626445156903,
+      "grad_norm": 0.21748925745487213,
+      "learning_rate": 0.0001270533281432464,
+      "loss": 1.0659,
+      "step": 943
+    },
+    {
+      "epoch": 0.3668512581365977,
+      "grad_norm": 0.19363689422607422,
+      "learning_rate": 0.00012697547683923708,
+      "loss": 1.0307,
+      "step": 944
+    },
+    {
+      "epoch": 0.3672398717575051,
+      "grad_norm": 0.21701784431934357,
+      "learning_rate": 0.00012689762553522773,
+      "loss": 1.0684,
+      "step": 945
+    },
+    {
+      "epoch": 0.36762848537841253,
+      "grad_norm": 0.21406958997249603,
+      "learning_rate": 0.00012681977423121838,
+      "loss": 1.0703,
+      "step": 946
+    },
+    {
+      "epoch": 0.36801709899931995,
+      "grad_norm": 0.23539729416370392,
+      "learning_rate": 0.00012674192292720903,
+      "loss": 1.1537,
+      "step": 947
+    },
+    {
+      "epoch": 0.36840571262022737,
+      "grad_norm": 0.2177354395389557,
+      "learning_rate": 0.00012666407162319969,
+      "loss": 1.0131,
+      "step": 948
+    },
+    {
+      "epoch": 0.36879432624113473,
+      "grad_norm": 0.255346417427063,
+      "learning_rate": 0.00012658622031919037,
+      "loss": 0.9807,
+      "step": 949
+    },
+    {
+      "epoch": 0.36918293986204215,
+      "grad_norm": 0.2139921486377716,
+      "learning_rate": 0.00012650836901518102,
+      "loss": 1.0392,
+      "step": 950
+    },
+    {
+      "epoch": 0.36957155348294957,
+      "grad_norm": 0.22490833699703217,
+      "learning_rate": 0.00012643051771117167,
+      "loss": 1.0512,
+      "step": 951
+    },
+    {
+      "epoch": 0.369960167103857,
+      "grad_norm": 0.20698820054531097,
+      "learning_rate": 0.00012635266640716232,
+      "loss": 1.0391,
+      "step": 952
+    },
+    {
+      "epoch": 0.3703487807247644,
+      "grad_norm": 0.2276201844215393,
+      "learning_rate": 0.00012627481510315297,
+      "loss": 1.0513,
+      "step": 953
+    },
+    {
+      "epoch": 0.3707373943456718,
+      "grad_norm": 0.2493600994348526,
+      "learning_rate": 0.00012619696379914365,
+      "loss": 1.0136,
+      "step": 954
+    },
+    {
+      "epoch": 0.37112600796657924,
+      "grad_norm": 0.2155001014471054,
+      "learning_rate": 0.0001261191124951343,
+      "loss": 1.0523,
+      "step": 955
+    },
+    {
+      "epoch": 0.37151462158748666,
+      "grad_norm": 0.21571211516857147,
+      "learning_rate": 0.00012604126119112496,
+      "loss": 1.0288,
+      "step": 956
+    },
+    {
+      "epoch": 0.3719032352083941,
+      "grad_norm": 0.23238877952098846,
+      "learning_rate": 0.0001259634098871156,
+      "loss": 1.0638,
+      "step": 957
+    },
+    {
+      "epoch": 0.37229184882930144,
+      "grad_norm": 0.2002813220024109,
+      "learning_rate": 0.00012588555858310626,
+      "loss": 0.9665,
+      "step": 958
+    },
+    {
+      "epoch": 0.37268046245020886,
+      "grad_norm": 0.21712858974933624,
+      "learning_rate": 0.0001258077072790969,
+      "loss": 1.0469,
+      "step": 959
+    },
+    {
+      "epoch": 0.3730690760711163,
+      "grad_norm": 0.2178192287683487,
+      "learning_rate": 0.0001257298559750876,
+      "loss": 1.0267,
+      "step": 960
+    },
+    {
+      "epoch": 0.3734576896920237,
+      "grad_norm": 0.25488024950027466,
+      "learning_rate": 0.00012565200467107824,
+      "loss": 1.0153,
+      "step": 961
+    },
+    {
+      "epoch": 0.3738463033129311,
+      "grad_norm": 0.20070038735866547,
+      "learning_rate": 0.0001255741533670689,
+      "loss": 1.0279,
+      "step": 962
+    },
+    {
+      "epoch": 0.37423491693383854,
+      "grad_norm": 0.21885356307029724,
+      "learning_rate": 0.00012549630206305955,
+      "loss": 1.0395,
+      "step": 963
+    },
+    {
+      "epoch": 0.37462353055474595,
+      "grad_norm": 0.2407921701669693,
+      "learning_rate": 0.0001254184507590502,
+      "loss": 1.0767,
+      "step": 964
+    },
+    {
+      "epoch": 0.3750121441756534,
+      "grad_norm": 0.20645053684711456,
+      "learning_rate": 0.00012534059945504088,
+      "loss": 1.0318,
+      "step": 965
+    },
+    {
+      "epoch": 0.3754007577965608,
+      "grad_norm": 0.21275092661380768,
+      "learning_rate": 0.00012526274815103153,
+      "loss": 1.0546,
+      "step": 966
+    },
+    {
+      "epoch": 0.3757893714174682,
+      "grad_norm": 0.21574917435646057,
+      "learning_rate": 0.00012518489684702218,
+      "loss": 1.032,
+      "step": 967
+    },
+    {
+      "epoch": 0.3761779850383756,
+      "grad_norm": 0.21589480340480804,
+      "learning_rate": 0.00012510704554301284,
+      "loss": 1.0834,
+      "step": 968
+    },
+    {
+      "epoch": 0.376566598659283,
+      "grad_norm": 0.19576796889305115,
+      "learning_rate": 0.0001250291942390035,
+      "loss": 1.0178,
+      "step": 969
+    },
+    {
+      "epoch": 0.3769552122801904,
+      "grad_norm": 0.20941287279129028,
+      "learning_rate": 0.00012495134293499417,
+      "loss": 1.0712,
+      "step": 970
+    },
+    {
+      "epoch": 0.37734382590109783,
+      "grad_norm": 0.22585494816303253,
+      "learning_rate": 0.00012487349163098482,
+      "loss": 1.0401,
+      "step": 971
+    },
+    {
+      "epoch": 0.37773243952200525,
+      "grad_norm": 0.21093420684337616,
+      "learning_rate": 0.00012479564032697547,
+      "loss": 1.0569,
+      "step": 972
+    },
+    {
+      "epoch": 0.37812105314291267,
+      "grad_norm": 0.22375014424324036,
+      "learning_rate": 0.00012471778902296612,
+      "loss": 1.0687,
+      "step": 973
+    },
+    {
+      "epoch": 0.3785096667638201,
+      "grad_norm": 0.19787487387657166,
+      "learning_rate": 0.0001246399377189568,
+      "loss": 1.0266,
+      "step": 974
+    },
+    {
+      "epoch": 0.3788982803847275,
+      "grad_norm": 0.20633013546466827,
+      "learning_rate": 0.00012456208641494745,
+      "loss": 0.9996,
+      "step": 975
+    },
+    {
+      "epoch": 0.3792868940056349,
+      "grad_norm": 0.21559873223304749,
+      "learning_rate": 0.0001244842351109381,
+      "loss": 1.0851,
+      "step": 976
+    },
+    {
+      "epoch": 0.3796755076265423,
+      "grad_norm": 0.2166333943605423,
+      "learning_rate": 0.00012440638380692879,
+      "loss": 1.0859,
+      "step": 977
+    },
+    {
+      "epoch": 0.3800641212474497,
+      "grad_norm": 0.18558773398399353,
+      "learning_rate": 0.00012432853250291944,
+      "loss": 0.9534,
+      "step": 978
+    },
+    {
+      "epoch": 0.3804527348683571,
+      "grad_norm": 0.2086942344903946,
+      "learning_rate": 0.0001242506811989101,
+      "loss": 1.0786,
+      "step": 979
+    },
+    {
+      "epoch": 0.38084134848926454,
+      "grad_norm": 0.2207823544740677,
+      "learning_rate": 0.00012417282989490074,
+      "loss": 1.0626,
+      "step": 980
+    },
+    {
+      "epoch": 0.38122996211017196,
+      "grad_norm": 0.21255749464035034,
+      "learning_rate": 0.00012409497859089142,
+      "loss": 1.063,
+      "step": 981
+    },
+    {
+      "epoch": 0.3816185757310794,
+      "grad_norm": 0.20682042837142944,
+      "learning_rate": 0.00012401712728688207,
+      "loss": 1.034,
+      "step": 982
+    },
+    {
+      "epoch": 0.3820071893519868,
+      "grad_norm": 0.2084134966135025,
+      "learning_rate": 0.00012393927598287272,
+      "loss": 1.0481,
+      "step": 983
+    },
+    {
+      "epoch": 0.3823958029728942,
+      "grad_norm": 0.1922312080860138,
+      "learning_rate": 0.00012386142467886338,
+      "loss": 1.0461,
+      "step": 984
+    },
+    {
+      "epoch": 0.38278441659380164,
+      "grad_norm": 0.20893707871437073,
+      "learning_rate": 0.00012378357337485406,
+      "loss": 1.0797,
+      "step": 985
+    },
+    {
+      "epoch": 0.383173030214709,
+      "grad_norm": 0.19717541337013245,
+      "learning_rate": 0.0001237057220708447,
+      "loss": 1.0028,
+      "step": 986
+    },
+    {
+      "epoch": 0.3835616438356164,
+      "grad_norm": 0.20688053965568542,
+      "learning_rate": 0.00012362787076683536,
+      "loss": 0.989,
+      "step": 987
+    },
+    {
+      "epoch": 0.38395025745652384,
+      "grad_norm": 0.20580583810806274,
+      "learning_rate": 0.000123550019462826,
+      "loss": 1.06,
+      "step": 988
+    },
+    {
+      "epoch": 0.38433887107743125,
+      "grad_norm": 0.2151709794998169,
+      "learning_rate": 0.00012347216815881666,
+      "loss": 1.0685,
+      "step": 989
+    },
+    {
+      "epoch": 0.3847274846983387,
+      "grad_norm": 0.19573980569839478,
+      "learning_rate": 0.00012339431685480734,
+      "loss": 1.0072,
+      "step": 990
+    },
+    {
+      "epoch": 0.3851160983192461,
+      "grad_norm": 0.1949119120836258,
+      "learning_rate": 0.000123316465550798,
+      "loss": 0.9995,
+      "step": 991
+    },
+    {
+      "epoch": 0.3855047119401535,
+      "grad_norm": 0.2062375247478485,
+      "learning_rate": 0.00012323861424678865,
+      "loss": 1.0694,
+      "step": 992
+    },
+    {
+      "epoch": 0.38589332556106093,
+      "grad_norm": 0.2007209211587906,
+      "learning_rate": 0.0001231607629427793,
+      "loss": 1.0397,
+      "step": 993
+    },
+    {
+      "epoch": 0.38628193918196835,
+      "grad_norm": 0.2231544405221939,
+      "learning_rate": 0.00012308291163876995,
+      "loss": 1.0755,
+      "step": 994
+    },
+    {
+      "epoch": 0.38667055280287577,
+      "grad_norm": 0.2103337049484253,
+      "learning_rate": 0.0001230050603347606,
+      "loss": 1.0505,
+      "step": 995
+    },
+    {
+      "epoch": 0.38705916642378313,
+      "grad_norm": 0.20178386569023132,
+      "learning_rate": 0.00012292720903075128,
+      "loss": 1.0696,
+      "step": 996
+    },
+    {
+      "epoch": 0.38744778004469055,
+      "grad_norm": 0.21268007159233093,
+      "learning_rate": 0.00012284935772674193,
+      "loss": 1.0262,
+      "step": 997
+    },
+    {
+      "epoch": 0.38783639366559797,
+      "grad_norm": 0.21439722180366516,
+      "learning_rate": 0.0001227715064227326,
+      "loss": 1.0718,
+      "step": 998
+    },
+    {
+      "epoch": 0.3882250072865054,
+      "grad_norm": 0.19691336154937744,
+      "learning_rate": 0.00012269365511872324,
+      "loss": 0.9663,
+      "step": 999
+    },
+    {
+      "epoch": 0.3886136209074128,
+      "grad_norm": 0.2165926694869995,
+      "learning_rate": 0.0001226158038147139,
+      "loss": 1.0432,
+      "step": 1000
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 2574,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 8.588818692527948e+18,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/outputs/checkpoint-1500/README.md b/outputs/checkpoint-1500/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3abf956c074d00f34a12693c8d6da9738211d7c7
--- /dev/null
+++ b/outputs/checkpoint-1500/README.md
@@ -0,0 +1,209 @@
+---
+base_model: unsloth/gpt-oss-20b-unsloth-bnb-4bit
+library_name: peft
+tags:
+- base_model:adapter:unsloth/gpt-oss-20b-unsloth-bnb-4bit
+- lora
+- sft
+- transformers
+- trl
+- unsloth
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.17.1
\ No newline at end of file
diff --git a/outputs/checkpoint-1500/adapter_config.json b/outputs/checkpoint-1500/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..e285b9b6e018b5b9f23736d6699eb1a4267764e7
--- /dev/null
+++ b/outputs/checkpoint-1500/adapter_config.json
@@ -0,0 +1,45 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": {
+    "base_model_class": "GptOssForCausalLM",
+    "parent_library": "transformers.models.gpt_oss.modeling_gpt_oss"
+  },
+  "base_model_name_or_path": "unsloth/gpt-oss-20b-unsloth-bnb-4bit",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "o_proj",
+    "v_proj",
+    "up_proj",
+    "down_proj",
+    "gate_proj",
+    "k_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": null,
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/outputs/checkpoint-1500/chat_template.jinja b/outputs/checkpoint-1500/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..a3650f886e98b2834c25727759c8e0ab8495f316
--- /dev/null
+++ b/outputs/checkpoint-1500/chat_template.jinja
@@ -0,0 +1,315 @@
+{# Copyright 2025-present Unsloth. Apache 2.0 License. Unsloth chat template fixes. Edited from ggml-org & OpenAI #}
+{#-
+  In addition to the normal inputs of `messages` and `tools`, this template also accepts the
+  following kwargs:
+  - "builtin_tools": A list, can contain "browser" and/or "python".
+  - "model_identity": A string that optionally describes the model identity.
+  - "reasoning_effort": A string that describes the reasoning effort, defaults to "medium".
+ #}
+{#- Tool Definition Rendering ============================================== #}
+{#- render_typescript_type: emits a TypeScript-style type string for one JSON-Schema-like `param_spec`. `required_params` is only forwarded on recursive calls and `is_nullable` is never read in this body. #}
+{%- macro render_typescript_type(param_spec, required_params, is_nullable=false) -%}
+    {%- if param_spec.type == "array" -%}
+        {%- if param_spec['items'] -%}
+            {%- if param_spec['items']['type'] == "string" -%}
+                {{- "string[]" }}
+            {%- elif param_spec['items']['type'] == "number" -%}
+                {{- "number[]" }}
+            {%- elif param_spec['items']['type'] == "integer" -%}
+                {{- "number[]" }}
+            {%- elif param_spec['items']['type'] == "boolean" -%}
+                {{- "boolean[]" }}
+            {%- else -%}
+                {%- set inner_type = render_typescript_type(param_spec['items'], required_params) -%}
+                {%- if inner_type == "object | object" or inner_type|length > 50 -%}
+                    {{- "any[]" }}
+                {%- else -%}
+                    {{- inner_type + "[]" }}
+                {%- endif -%}
+            {%- endif -%}
+            {%- if param_spec.nullable -%}
+                {{- " | null" }}
+            {%- endif -%}
+        {%- else -%}
+            {{- "any[]" }}
+            {%- if param_spec.nullable -%}
+                {{- " | null" }}
+            {%- endif -%}
+        {%- endif -%}
+    {%- elif param_spec.type is defined and param_spec.type is iterable and param_spec.type is not string and param_spec.type is not mapping and param_spec.type[0] is defined -%}
+        {#- Handle array of types like ["object", "object"] from Union[dict, list] #}
+        {%- if param_spec.type | length > 1 -%}
+            {{- param_spec.type | join(" | ") }}
+        {%- else -%}
+            {{- param_spec.type[0] }}
+        {%- endif -%}
+    {%- elif param_spec.oneOf -%}
+        {#- Handle oneOf schemas - check for complex unions and fallback to any. The flag lives in a namespace because a plain set inside the for loop would not be visible after the loop. #}
+        {%- set oneof_state = namespace(has_object_variants=false) -%}
+        {%- for variant in param_spec.oneOf -%}
+            {%- if variant.type == "object" -%}
+                {%- set oneof_state.has_object_variants = true -%}
+            {%- endif -%}
+        {%- endfor -%}
+        {%- if oneof_state.has_object_variants and param_spec.oneOf|length > 1 -%}
+            {{- "any" }}
+        {%- else -%}
+            {%- for variant in param_spec.oneOf -%}
+                {{- render_typescript_type(variant, required_params) -}}
+                {%- if variant.description %}
+                    {{- "// " + variant.description }}
+                {%- endif -%}
+                {%- if variant.default is defined %}
+                    {{ "// default: " + variant.default|tojson }}
+                {%- endif -%}
+                {%- if not loop.last %}
+                    {{- " | " }}
+                {% endif -%}
+            {%- endfor -%}
+        {%- endif -%}
+    {%- elif param_spec.type == "string" -%}
+        {%- if param_spec.enum -%}
+            {{- '"' + param_spec.enum|join('" | "') + '"' -}}
+        {%- else -%}
+            {{- "string" }}
+            {%- if param_spec.nullable %}
+                {{- " | null" }}
+            {%- endif -%}
+        {%- endif -%}
+    {%- elif param_spec.type == "number" -%}
+        {{- "number" }}
+    {%- elif param_spec.type == "integer" -%}
+        {{- "number" }}
+    {%- elif param_spec.type == "boolean" -%}
+        {{- "boolean" }}
+    {#- Objects with declared properties are rendered as a brace-delimited member list; otherwise the literal "object" is emitted. #}
+    {%- elif param_spec.type == "object" -%}
+        {%- if param_spec.properties -%}
+            {{- "{\n" }}
+            {%- for prop_name, prop_spec in param_spec.properties.items() -%}
+                {{- prop_name -}}
+                {%- if prop_name not in (param_spec.required or []) -%}
+                    {{- "?" }}
+                {%- endif -%}
+                {{- ": " }}
+                {{ render_typescript_type(prop_spec, param_spec.required or []) }}
+                {%- if not loop.last -%}
+                    {{-", " }}
+                {%- endif -%}
+            {%- endfor -%}
+            {{- "}" }}
+        {%- else -%}
+            {{- "object" }}
+        {%- endif -%}
+    {%- else -%}
+        {{- "any" }}
+    {%- endif -%}
+{%- endmacro -%}
+{#- render_tool_namespace: renders `tools` (each read through its `.function` entry) as a TypeScript-style namespace called `namespace_name`. A missing or empty description yields an empty comment body, and a non-string default is serialized with tojson before concatenation, so neither case aborts rendering; string defaults are still emitted verbatim. #}
+{%- macro render_tool_namespace(namespace_name, tools) -%}
+    {{- "## " + namespace_name + "\n\n" }}
+    {{- "namespace " + namespace_name + " {\n\n" }}
+    {%- for tool in tools %}
+        {%- set tool = tool.function %}
+        {{- "// " + (tool.description or "") + "\n" }}
+        {{- "type "+ tool.name + " = " }}
+        {%- if tool.parameters and tool.parameters.properties -%}
+            {{- "(_: " }}
+            {{- "{\n" }}
+            {%- for param_name, param_spec in tool.parameters.properties.items() %}
+                {{- "// " + (param_spec.description or "") + "\n" }}
+                {{- param_name }}
+                {%- if param_name not in (tool.parameters.required or []) -%}
+                    {{- "?" }}
+                {%- endif -%}
+                {{- ": " }}
+                {{- render_typescript_type(param_spec, tool.parameters.required or []) }}
+                {%- if param_spec.default is defined -%}
+                    {%- if param_spec.enum %}
+                        {{- ", // default: " + (param_spec.default if param_spec.default is string else param_spec.default|tojson) }}
+                    {%- elif param_spec.oneOf %}
+                        {{- "// default: " + (param_spec.default if param_spec.default is string else param_spec.default|tojson) }}
+                    {%- else %}
+                        {{- ", // default: " + param_spec.default|tojson }}
+                    {%- endif -%}
+                {%- endif -%}
+                {%- if not loop.last %}
+                    {{- ",\n" }}
+                {%- else %}
+                    {{- "\n" }}
+                {%- endif -%}
+            {%- endfor %}
+            {{- "}) => any;\n\n" }}
+        {%- else -%}
+            {{- "() => any;\n\n" }}
+        {%- endif -%}
+    {%- endfor %}
+    {{- "} // namespace " + namespace_name }}
+{%- endmacro -%}
+{#- render_builtin_tools: emits fixed text describing the built-in tools - the browser section when `browser_tool` is truthy and the python section when `python_tool` is truthy. Nothing in this macro reads the conversation. #}
+{%- macro render_builtin_tools(browser_tool, python_tool) -%}
+    {%- if browser_tool %}
+        {{- "## browser\n\n" }}
+        {{- "// Tool for browsing.\n" }}
+        {{- "// The `cursor` appears in brackets before each browsing display: `[{cursor}]`.\n" }}
+        {{- "// Cite information from the tool using the following format:\n" }}
+        {{- "// `【{cursor}†L{line_start}(-L{line_end})?】`, for example: `【6†L9-L11】` or `【8†L3】`.\n" }}
+        {{- "// Do not quote more than 10 words directly from the tool output.\n" }}
+        {{- "// sources=web (default: web)\n" }}
+        {{- "namespace browser {\n\n" }}
+        {{- "// Searches for information related to `query` and displays `topn` results.\n" }}
+        {{- "type search = (_: {\n" }}
+        {{- "query: string,\n" }}
+        {{- "topn?: number, // default: 10\n" }}
+        {{- "source?: string,\n" }}
+        {{- "}) => any;\n\n" }}
+        {{- "// Opens the link `id` from the page indicated by `cursor` starting at line number `loc`, showing `num_lines` lines.\n" }}
+        {{- "// Valid link ids are displayed with the formatting: `【{id}†.*】`.\n" }}
+        {{- "// If `cursor` is not provided, the most recent page is implied.\n" }}
+        {{- "// If `id` is a string, it is treated as a fully qualified URL associated with `source`.\n" }}
+        {{- "// If `loc` is not provided, the viewport will be positioned at the beginning of the document or centered on the most relevant passage, if available.\n" }}
+        {{- "// Use this function without `id` to scroll to a new location of an opened page.\n" }}
+        {{- "type open = (_: {\n" }}
+        {{- "id?: number | string, // default: -1\n" }}
+        {{- "cursor?: number, // default: -1\n" }}
+        {{- "loc?: number, // default: -1\n" }}
+        {{- "num_lines?: number, // default: -1\n" }}
+        {{- "view_source?: boolean, // default: false\n" }}
+        {{- "source?: string,\n" }}
+        {{- "}) => any;\n\n" }}
+        {{- "// Finds exact matches of `pattern` in the current page, or the page given by `cursor`.\n" }}
+        {{- "type find = (_: {\n" }}
+        {{- "pattern: string,\n" }}
+        {{- "cursor?: number, // default: -1\n" }}
+        {{- "}) => any;\n\n" }}
+        {{- "} // namespace browser\n\n" }}
+    {%- endif -%}
+    {#- The python section is prose only; unlike the browser section it declares no namespace or function types. #}
+    {%- if python_tool %}
+        {{- "## python\n\n" }}
+        {{- "Use this tool to execute Python code in your chain of thought. The code will not be shown to the user. This tool should be used for internal reasoning, but not for code that is intended to be visible to the user (e.g. when creating plots, tables, or files).\n\n" }}
+        {{- "When you send a message containing Python code to python, it will be executed in a stateful Jupyter notebook environment. python will respond with the output of the execution or time out after 120.0 seconds. The drive at '/mnt/data' can be used to save and persist user files. Internet access for this session is UNKNOWN. Depends on the cluster.\n\n" }}
+    {%- endif -%}
+{%- endmacro -%}
+{#- System Message Construction ============================================ #}
+{#- build_system_message: emits the body of the system turn - the identity text (`model_identity` verbatim when defined, otherwise the fixed default line), the knowledge-cutoff line, the current date from `strftime_now`, the reasoning effort (defaulting to "medium"), the built-in tool text when `builtin_tools` is defined, and the valid-channels line. NOTE(review): the last check is `tools is defined`, whereas the developer-message code later in this template tests `tools` for truthiness; a caller that passes `tools` explicitly as null or empty would presumably still get the commentary-channel sentence - confirm this is intended. #}
+{%- macro build_system_message() -%}
+    {%- if model_identity is not defined %}
+        {{- "You are ChatGPT, a large language model trained by OpenAI.\n" -}}
+    {%- else %}
+        {{- model_identity }}
+    {%- endif %}
+    {{- "Knowledge cutoff: 2024-06\n" }}
+    {{- "Current date: " + strftime_now("%Y-%m-%d") + "\n\n" }}
+    {%- if reasoning_effort is not defined %}
+        {%- set reasoning_effort = "medium" %}
+    {%- endif %}
+    {{- "Reasoning: " + reasoning_effort + "\n\n" }}
+    {%- if builtin_tools is defined %}
+        {{- "# Tools\n\n" }}
+        {%- set available_builtin_tools = namespace(browser=false, python=false) %}
+        {%- for tool in builtin_tools %}
+            {%- if tool == "browser" %}
+                {%- set available_builtin_tools.browser = true %}
+            {%- elif tool == "python" %}
+                {%- set available_builtin_tools.python = true %}
+            {%- endif %}
+        {%- endfor %}
+        {{- render_builtin_tools(available_builtin_tools.browser, available_builtin_tools.python) }}
+    {%- endif -%}
+    {{- "# Valid channels: analysis, commentary, final. Channel must be included for every message." }}
+    {%- if tools is defined -%}
+        {{- "\nCalls to these tools must go to the commentary channel: 'functions'." }}
+    {%- endif -%}
+{%- endmacro -%}
+
+{#- Main Template Logic ================================================= #}
+{#- No defaults are assigned at this level; `reasoning_effort` is defaulted inside build_system_message #}
+
+{#- Render system message #}
+{{- "<|start|>system<|message|>" }}
+{{- build_system_message() }}
+{{- "<|end|>" }}
+
+{#- Extract developer message: a leading "developer" or "system" message is lifted out and the rest is iterated below. NOTE(review): messages[0] is read unconditionally - presumably callers never pass an empty list; confirm #}
+{%- if messages[0].role == "developer" or messages[0].role == "system" %}
+    {%- set developer_message = messages[0].content %}
+    {%- set loop_messages = messages[1:] %}
+{%- else %}
+    {%- set developer_message = "" %}
+    {%- set loop_messages = messages %}
+{%- endif %}
+
+{#- Render developer message (emitted when there is developer text or a truthy `tools` value) #}
+{%- if developer_message or tools %}
+    {{- "<|start|>developer<|message|>" }}
+    {%- if developer_message %}
+        {{- "# Instructions\n\n" }}
+        {{- developer_message }}
+    {%- endif %}
+    {%- if tools -%}
+        {{- "\n\n" }}
+        {{- "# Tools\n\n" }}
+        {{- render_tool_namespace("functions", tools) }}
+    {%- endif -%}
+    {{- "<|end|>" }}
+{%- endif %}
+
+{#- Render messages; last_tool_call remembers the most recent assistant tool-call name so a following "tool" message can be attributed to it #}
+{%- set last_tool_call = namespace(name=none) %}
+{%- for message in loop_messages -%}
+    {#- At this point only assistant/user/tool messages should remain #}
+    {%- if message.role == 'assistant' -%}
+        {%- if "tool_calls" in message %}
+            {#- We assume max 1 tool call per message, and so we infer the tool call name #}
+            {#- in "tool" messages from the most recent assistant tool call name #}
+            {%- set tool_call = message.tool_calls[0] %}
+            {%- if tool_call.function %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {%- if message.content %}
+                {{- "<|start|>assistant<|channel|>analysis<|message|>" + message.content + "<|end|>" }}
+            {%- endif %}
+            {{- "<|start|>assistant to=" }}
+            {{- "functions." + tool_call.name + "<|channel|>commentary json<|message|>" }}
+            {{- tool_call.arguments|tojson }}
+            {{- "<|call|>" }}
+            {%- set last_tool_call.name = tool_call.name %}
+        {%- elif "thinking" in message and loop.last and not add_generation_prompt %}
+            {#- Only render the CoT if the final turn is an assistant turn and add_generation_prompt is false #}
+            {#- This is a situation that should only occur in training, never in inference. #}
+            {{- "<|start|>assistant<|channel|>analysis<|message|>" + message.thinking + "<|end|>" }}
+            {#- <|return|> indicates the end of generation, but <|end|> does not #}
+            {#- <|return|> should never be an input to the model, but we include it as the final token #}
+            {#- when training, so the model learns to emit it. #}
+            {{- "<|start|>assistant<|channel|>final<|message|>" + message.content + "<|return|>" }}
+            {%- set last_tool_call.name = none %}
+        {%- elif "thinking" in message %}
+            {#- CoT is dropped during all previous turns, so we never render it for inference #}
+            {{- "<|start|>assistant<|channel|>final<|message|>" + message.content + "<|end|>" }}
+            {%- set last_tool_call.name = none %}
+        {%- elif loop.last and not add_generation_prompt %}
+            {#- <|return|> indicates the end of generation, but <|end|> does not #}
+            {#- <|return|> should never be an input to the model, but we include it as the final token #}
+            {#- when training, so the model learns to emit it. #}
+            {{- "<|start|>assistant<|message|>" + message.content + "<|return|>" }}
+        {%- else %}
+            {{- "<|start|>assistant<|message|>" + message.content + "<|end|>" }}
+            {%- set last_tool_call.name = none %}
+        {%- endif %}
+    {%- elif message.role == 'tool' -%}
+        {%- if last_tool_call.name is none %}
+            {{- raise_exception("Message has tool role, but there was no previous assistant message with a tool call!") }}
+        {%- endif %}
+        {{- "<|start|>functions." + last_tool_call.name }}
+        {{- " to=assistant<|channel|>commentary<|message|>" + message.content|tojson + "<|end|>" }}
+    {%- else -%}
+        {{- "<|start|>user<|message|>" + message.content + "<|end|>" }}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Generation prompt #}
+{%- if add_generation_prompt -%}
+<|start|>assistant
+{%- endif -%}
+{# Copyright 2025-present Unsloth. Apache 2.0 License. Unsloth chat template fixes. Edited from ggml-org & OpenAI #}
\ No newline at end of file
diff --git a/outputs/checkpoint-1500/optimizer.pt b/outputs/checkpoint-1500/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..edb671a87ce6447336468309d51b19215040e05a
--- /dev/null
+++ b/outputs/checkpoint-1500/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d7f1f22a5f354441b5f815d259903c11b98274ed999c6581547affb39792f494
+size 16894883
diff --git a/outputs/checkpoint-1500/special_tokens_map.json b/outputs/checkpoint-1500/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..6fba18753f4d09dbb8fcdf1482daff36b963d639
--- /dev/null
+++ b/outputs/checkpoint-1500/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+  "bos_token": {
+    "content": "<|startoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|return|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|reserved_200017|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/outputs/checkpoint-1500/tokenizer.json b/outputs/checkpoint-1500/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..6ec3ef1795cbbda6b7cb7d1f114919cbe3fdd647
--- /dev/null
+++ b/outputs/checkpoint-1500/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0614fe83cadab421296e664e1f48f4261fa8fef6e03e63bb75c20f38e37d07d3
+size 27868174
diff --git a/outputs/checkpoint-1500/tokenizer_config.json b/outputs/checkpoint-1500/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..482ae30d27a74c38d2228e69dd37c529fc485a45
--- /dev/null
+++ b/outputs/checkpoint-1500/tokenizer_config.json
@@ -0,0 +1,185 @@
+{
+  "added_tokens_decoder": {
+    "199998": {
+      "content": "<|startoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "199999": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200000": {
+      "content": "<|reserved_200000|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200001": {
+      "content": "<|reserved_200001|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200002": {
+      "content": "<|return|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200003": {
+      "content": "<|constrain|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200004": {
+      "content": "<|reserved_200004|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200005": {
+      "content": "<|channel|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200006": {
+      "content": "<|start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200007": {
+      "content": "<|end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200008": {
+      "content": "<|message|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200009": {
+      "content": "<|reserved_200009|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200010": {
+      "content": "<|reserved_200010|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200011": {
+      "content": "<|reserved_200011|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200012": {
+      "content": "<|call|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200013": {
+      "content": "<|reserved_200013|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200014": {
+      "content": "<|reserved_200014|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200015": {
+      "content": "<|reserved_200015|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200016": {
+      "content": "<|reserved_200016|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200017": {
+      "content": "<|reserved_200017|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200018": {
+      "content": "<|endofprompt|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<|startoftext|>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|return|>",
+  "extra_special_tokens": {},
+  "model_input_names": [
+    "input_ids",
+    "attention_mask"
+  ],
+  "model_max_length": 131072,
+  "pad_token": "<|reserved_200017|>",
+  "padding_side": "right",
+  "tokenizer_class": "PreTrainedTokenizerFast",
+  "unk_token": null
+}
diff --git a/outputs/checkpoint-1500/trainer_state.json b/outputs/checkpoint-1500/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..4b88f7892cfdf75d50e4740f4e2866b38fd846b3
--- /dev/null
+++ b/outputs/checkpoint-1500/trainer_state.json
@@ -0,0 +1,10534 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.5829204313611193,
+  "eval_steps": 500,
+  "global_step": 1500,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0032,
+      "grad_norm": 13.684800148010254,
+      "learning_rate": 0.0,
+      "loss": 2.3276,
+      "step": 1
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 13.660787582397461,
+      "learning_rate": 4e-05,
+      "loss": 2.2792,
+      "step": 2
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 13.35280704498291,
+      "learning_rate": 8e-05,
+      "loss": 2.4151,
+      "step": 3
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 6.15027379989624,
+      "learning_rate": 0.00012,
+      "loss": 1.7812,
+      "step": 4
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 1.3168226480484009,
+      "learning_rate": 0.00016,
+      "loss": 1.4536,
+      "step": 5
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.9872580170631409,
+      "learning_rate": 0.0002,
+      "loss": 1.4171,
+      "step": 6
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.7496100664138794,
+      "learning_rate": 0.00019935064935064936,
+      "loss": 1.4168,
+      "step": 7
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.7376005053520203,
+      "learning_rate": 0.00019870129870129872,
+      "loss": 1.3659,
+      "step": 8
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.5281137824058533,
+      "learning_rate": 0.00019805194805194807,
+      "loss": 1.2566,
+      "step": 9
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.5485746264457703,
+      "learning_rate": 0.00019740259740259742,
+      "loss": 1.3761,
+      "step": 10
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.5506592392921448,
+      "learning_rate": 0.00019675324675324675,
+      "loss": 1.3327,
+      "step": 11
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.49382686614990234,
+      "learning_rate": 0.00019610389610389613,
+      "loss": 1.3727,
+      "step": 12
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.36203011870384216,
+      "learning_rate": 0.00019545454545454548,
+      "loss": 1.1515,
+      "step": 13
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.3528599739074707,
+      "learning_rate": 0.0001948051948051948,
+      "loss": 1.2636,
+      "step": 14
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.31244418025016785,
+      "learning_rate": 0.00019415584415584416,
+      "loss": 1.1873,
+      "step": 15
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.3379523754119873,
+      "learning_rate": 0.00019350649350649354,
+      "loss": 1.2657,
+      "step": 16
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.3025083839893341,
+      "learning_rate": 0.00019285714285714286,
+      "loss": 1.2846,
+      "step": 17
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.2560190260410309,
+      "learning_rate": 0.00019220779220779222,
+      "loss": 1.1587,
+      "step": 18
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.2554129958152771,
+      "learning_rate": 0.00019155844155844157,
+      "loss": 1.2812,
+      "step": 19
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.22662702202796936,
+      "learning_rate": 0.00019090909090909092,
+      "loss": 1.1664,
+      "step": 20
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.2515714168548584,
+      "learning_rate": 0.00019025974025974027,
+      "loss": 1.2177,
+      "step": 21
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.24396637082099915,
+      "learning_rate": 0.00018961038961038963,
+      "loss": 1.2053,
+      "step": 22
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.24488303065299988,
+      "learning_rate": 0.00018896103896103895,
+      "loss": 1.2074,
+      "step": 23
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.2168620079755783,
+      "learning_rate": 0.00018831168831168833,
+      "loss": 1.1284,
+      "step": 24
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.24021224677562714,
+      "learning_rate": 0.00018766233766233769,
+      "loss": 1.2169,
+      "step": 25
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.20057056844234467,
+      "learning_rate": 0.000187012987012987,
+      "loss": 1.1031,
+      "step": 26
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.19900795817375183,
+      "learning_rate": 0.00018636363636363636,
+      "loss": 1.1004,
+      "step": 27
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.2019268423318863,
+      "learning_rate": 0.00018571428571428572,
+      "loss": 1.1476,
+      "step": 28
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.1996479034423828,
+      "learning_rate": 0.00018506493506493507,
+      "loss": 1.1455,
+      "step": 29
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.25262022018432617,
+      "learning_rate": 0.00018441558441558442,
+      "loss": 1.1025,
+      "step": 30
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.225438192486763,
+      "learning_rate": 0.00018376623376623378,
+      "loss": 1.1954,
+      "step": 31
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.17834505438804626,
+      "learning_rate": 0.00018311688311688313,
+      "loss": 1.0934,
+      "step": 32
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.20071206986904144,
+      "learning_rate": 0.00018246753246753248,
+      "loss": 1.0488,
+      "step": 33
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.1920139640569687,
+      "learning_rate": 0.00018181818181818183,
+      "loss": 1.123,
+      "step": 34
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.18714852631092072,
+      "learning_rate": 0.0001811688311688312,
+      "loss": 1.0798,
+      "step": 35
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.18315713107585907,
+      "learning_rate": 0.00018051948051948054,
+      "loss": 1.1107,
+      "step": 36
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.19156870245933533,
+      "learning_rate": 0.00017987012987012987,
+      "loss": 1.1125,
+      "step": 37
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.21527768671512604,
+      "learning_rate": 0.00017922077922077922,
+      "loss": 1.1346,
+      "step": 38
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.1871163249015808,
+      "learning_rate": 0.0001785714285714286,
+      "loss": 1.0742,
+      "step": 39
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.17750784754753113,
+      "learning_rate": 0.00017792207792207792,
+      "loss": 1.1323,
+      "step": 40
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.177419051527977,
+      "learning_rate": 0.00017727272727272728,
+      "loss": 1.1405,
+      "step": 41
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.16714292764663696,
+      "learning_rate": 0.00017662337662337663,
+      "loss": 1.1084,
+      "step": 42
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.1610356718301773,
+      "learning_rate": 0.00017597402597402598,
+      "loss": 1.1125,
+      "step": 43
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.2548656761646271,
+      "learning_rate": 0.00017532467532467534,
+      "loss": 1.1114,
+      "step": 44
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.1731044203042984,
+      "learning_rate": 0.0001746753246753247,
+      "loss": 1.1197,
+      "step": 45
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.1739533394575119,
+      "learning_rate": 0.00017402597402597401,
+      "loss": 1.1777,
+      "step": 46
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.2178352177143097,
+      "learning_rate": 0.0001733766233766234,
+      "loss": 1.1111,
+      "step": 47
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.17247150838375092,
+      "learning_rate": 0.00017272727272727275,
+      "loss": 1.1253,
+      "step": 48
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.18075324594974518,
+      "learning_rate": 0.00017207792207792207,
+      "loss": 1.1358,
+      "step": 49
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.15898071229457855,
+      "learning_rate": 0.00017142857142857143,
+      "loss": 1.0606,
+      "step": 50
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.16518613696098328,
+      "learning_rate": 0.0001707792207792208,
+      "loss": 1.0944,
+      "step": 51
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.16035063564777374,
+      "learning_rate": 0.00017012987012987013,
+      "loss": 1.0554,
+      "step": 52
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.1686483472585678,
+      "learning_rate": 0.00016948051948051948,
+      "loss": 1.0384,
+      "step": 53
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.16575631499290466,
+      "learning_rate": 0.00016883116883116884,
+      "loss": 1.0243,
+      "step": 54
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.16840039193630219,
+      "learning_rate": 0.0001681818181818182,
+      "loss": 1.117,
+      "step": 55
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.17616064846515656,
+      "learning_rate": 0.00016753246753246754,
+      "loss": 1.0743,
+      "step": 56
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.168218195438385,
+      "learning_rate": 0.0001668831168831169,
+      "loss": 1.0627,
+      "step": 57
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.17026656866073608,
+      "learning_rate": 0.00016623376623376625,
+      "loss": 1.0059,
+      "step": 58
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.16454458236694336,
+      "learning_rate": 0.0001655844155844156,
+      "loss": 0.9943,
+      "step": 59
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.17185136675834656,
+      "learning_rate": 0.00016493506493506495,
+      "loss": 1.1545,
+      "step": 60
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.17822986841201782,
+      "learning_rate": 0.00016428571428571428,
+      "loss": 1.073,
+      "step": 61
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.1676608771085739,
+      "learning_rate": 0.00016363636363636366,
+      "loss": 1.0886,
+      "step": 62
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.1727771908044815,
+      "learning_rate": 0.000162987012987013,
+      "loss": 1.0432,
+      "step": 63
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.17827573418617249,
+      "learning_rate": 0.00016233766233766234,
+      "loss": 1.083,
+      "step": 64
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.19807517528533936,
+      "learning_rate": 0.0001616883116883117,
+      "loss": 1.1208,
+      "step": 65
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.17693684995174408,
+      "learning_rate": 0.00016103896103896104,
+      "loss": 1.089,
+      "step": 66
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.15489234030246735,
+      "learning_rate": 0.0001603896103896104,
+      "loss": 0.9707,
+      "step": 67
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.16443990170955658,
+      "learning_rate": 0.00015974025974025975,
+      "loss": 1.0643,
+      "step": 68
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.2051103413105011,
+      "learning_rate": 0.0001590909090909091,
+      "loss": 1.1246,
+      "step": 69
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.18824075162410736,
+      "learning_rate": 0.00015844155844155845,
+      "loss": 1.0855,
+      "step": 70
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.18659448623657227,
+      "learning_rate": 0.0001577922077922078,
+      "loss": 1.1412,
+      "step": 71
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.1854114979505539,
+      "learning_rate": 0.00015714285714285716,
+      "loss": 1.0249,
+      "step": 72
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.1876193732023239,
+      "learning_rate": 0.00015649350649350649,
+      "loss": 1.1029,
+      "step": 73
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.1888684630393982,
+      "learning_rate": 0.00015584415584415587,
+      "loss": 1.0789,
+      "step": 74
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.20240606367588043,
+      "learning_rate": 0.0001551948051948052,
+      "loss": 1.0495,
+      "step": 75
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.232120081782341,
+      "learning_rate": 0.00015454545454545454,
+      "loss": 1.0735,
+      "step": 76
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.16897843778133392,
+      "learning_rate": 0.0001538961038961039,
+      "loss": 1.0164,
+      "step": 77
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.18796634674072266,
+      "learning_rate": 0.00015324675324675325,
+      "loss": 1.0676,
+      "step": 78
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.19574032723903656,
+      "learning_rate": 0.0001525974025974026,
+      "loss": 1.0456,
+      "step": 79
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.18007811903953552,
+      "learning_rate": 0.00015194805194805196,
+      "loss": 1.0894,
+      "step": 80
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.18932929635047913,
+      "learning_rate": 0.0001512987012987013,
+      "loss": 1.0729,
+      "step": 81
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.20614288747310638,
+      "learning_rate": 0.00015064935064935066,
+      "loss": 1.0854,
+      "step": 82
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.19291089475154877,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 1.1217,
+      "step": 83
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.18916529417037964,
+      "learning_rate": 0.00014935064935064934,
+      "loss": 1.0963,
+      "step": 84
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.20306220650672913,
+      "learning_rate": 0.00014870129870129872,
+      "loss": 1.0898,
+      "step": 85
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.17870067059993744,
+      "learning_rate": 0.00014805194805194807,
+      "loss": 1.0213,
+      "step": 86
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.18411923944950104,
+      "learning_rate": 0.0001474025974025974,
+      "loss": 1.0844,
+      "step": 87
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.18788227438926697,
+      "learning_rate": 0.00014675324675324675,
+      "loss": 1.0338,
+      "step": 88
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.23874884843826294,
+      "learning_rate": 0.00014610389610389613,
+      "loss": 1.1118,
+      "step": 89
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.19380499422550201,
+      "learning_rate": 0.00014545454545454546,
+      "loss": 1.0464,
+      "step": 90
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.18968750536441803,
+      "learning_rate": 0.0001448051948051948,
+      "loss": 1.0569,
+      "step": 91
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.19545753300189972,
+      "learning_rate": 0.00014415584415584416,
+      "loss": 1.1225,
+      "step": 92
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.19170494377613068,
+      "learning_rate": 0.00014350649350649352,
+      "loss": 1.0602,
+      "step": 93
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.17953918874263763,
+      "learning_rate": 0.00014285714285714287,
+      "loss": 1.032,
+      "step": 94
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.1822536289691925,
+      "learning_rate": 0.00014220779220779222,
+      "loss": 1.0559,
+      "step": 95
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.18591298162937164,
+      "learning_rate": 0.00014155844155844155,
+      "loss": 1.031,
+      "step": 96
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.2129002958536148,
+      "learning_rate": 0.00014090909090909093,
+      "loss": 1.1391,
+      "step": 97
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.18386681377887726,
+      "learning_rate": 0.00014025974025974028,
+      "loss": 0.9919,
+      "step": 98
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.18314239382743835,
+      "learning_rate": 0.0001396103896103896,
+      "loss": 1.0445,
+      "step": 99
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.1999066174030304,
+      "learning_rate": 0.00013896103896103896,
+      "loss": 1.0538,
+      "step": 100
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.18741188943386078,
+      "learning_rate": 0.00013831168831168834,
+      "loss": 1.0722,
+      "step": 101
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.19351010024547577,
+      "learning_rate": 0.00013766233766233766,
+      "loss": 1.0491,
+      "step": 102
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.18859203159809113,
+      "learning_rate": 0.00013701298701298702,
+      "loss": 1.0593,
+      "step": 103
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.1962767392396927,
+      "learning_rate": 0.00013636363636363637,
+      "loss": 1.1344,
+      "step": 104
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.20819440484046936,
+      "learning_rate": 0.00013571428571428572,
+      "loss": 1.1137,
+      "step": 105
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.19590184092521667,
+      "learning_rate": 0.00013506493506493507,
+      "loss": 1.0624,
+      "step": 106
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.18631424009799957,
+      "learning_rate": 0.00013441558441558443,
+      "loss": 1.0587,
+      "step": 107
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.19572143256664276,
+      "learning_rate": 0.00013376623376623375,
+      "loss": 1.0494,
+      "step": 108
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.1910988837480545,
+      "learning_rate": 0.00013311688311688313,
+      "loss": 1.0481,
+      "step": 109
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.19455869495868683,
+      "learning_rate": 0.00013246753246753249,
+      "loss": 1.029,
+      "step": 110
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.18669827282428741,
+      "learning_rate": 0.0001318181818181818,
+      "loss": 1.0513,
+      "step": 111
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.17523664236068726,
+      "learning_rate": 0.0001311688311688312,
+      "loss": 1.0126,
+      "step": 112
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.17929129302501678,
+      "learning_rate": 0.00013051948051948052,
+      "loss": 1.0717,
+      "step": 113
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.19380168616771698,
+      "learning_rate": 0.00012987012987012987,
+      "loss": 1.0324,
+      "step": 114
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.18090228736400604,
+      "learning_rate": 0.00012922077922077922,
+      "loss": 1.0515,
+      "step": 115
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.2067340910434723,
+      "learning_rate": 0.00012857142857142858,
+      "loss": 1.0939,
+      "step": 116
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.1880485862493515,
+      "learning_rate": 0.00012792207792207793,
+      "loss": 1.0986,
+      "step": 117
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.182168647646904,
+      "learning_rate": 0.00012727272727272728,
+      "loss": 1.0109,
+      "step": 118
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.20187129080295563,
+      "learning_rate": 0.00012662337662337663,
+      "loss": 1.0668,
+      "step": 119
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.2082669734954834,
+      "learning_rate": 0.000125974025974026,
+      "loss": 1.054,
+      "step": 120
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.18294434249401093,
+      "learning_rate": 0.00012532467532467534,
+      "loss": 1.0397,
+      "step": 121
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.20515067875385284,
+      "learning_rate": 0.00012467532467532467,
+      "loss": 1.1092,
+      "step": 122
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.1758790761232376,
+      "learning_rate": 0.00012402597402597402,
+      "loss": 0.9755,
+      "step": 123
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.2170792669057846,
+      "learning_rate": 0.0001233766233766234,
+      "loss": 1.0434,
+      "step": 124
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.202157124876976,
+      "learning_rate": 0.00012272727272727272,
+      "loss": 1.1129,
+      "step": 125
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.18556398153305054,
+      "learning_rate": 0.00012207792207792208,
+      "loss": 1.0665,
+      "step": 126
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.20196087658405304,
+      "learning_rate": 0.00012142857142857143,
+      "loss": 1.1,
+      "step": 127
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.1921566128730774,
+      "learning_rate": 0.0001207792207792208,
+      "loss": 1.0918,
+      "step": 128
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.18866224586963654,
+      "learning_rate": 0.00012012987012987014,
+      "loss": 1.0014,
+      "step": 129
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.207601398229599,
+      "learning_rate": 0.00011948051948051949,
+      "loss": 1.0726,
+      "step": 130
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.21592366695404053,
+      "learning_rate": 0.00011883116883116883,
+      "loss": 1.1379,
+      "step": 131
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.2016124576330185,
+      "learning_rate": 0.0001181818181818182,
+      "loss": 1.1428,
+      "step": 132
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.20478437840938568,
+      "learning_rate": 0.00011753246753246753,
+      "loss": 1.121,
+      "step": 133
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.22730594873428345,
+      "learning_rate": 0.00011688311688311689,
+      "loss": 1.0319,
+      "step": 134
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.22592711448669434,
+      "learning_rate": 0.00011623376623376625,
+      "loss": 1.1264,
+      "step": 135
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.20035041868686676,
+      "learning_rate": 0.00011558441558441559,
+      "loss": 1.0686,
+      "step": 136
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.20648567378520966,
+      "learning_rate": 0.00011493506493506494,
+      "loss": 1.0817,
+      "step": 137
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.21222743391990662,
+      "learning_rate": 0.00011428571428571428,
+      "loss": 1.0678,
+      "step": 138
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.2075391560792923,
+      "learning_rate": 0.00011363636363636365,
+      "loss": 1.0897,
+      "step": 139
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.1964101791381836,
+      "learning_rate": 0.000112987012987013,
+      "loss": 1.0906,
+      "step": 140
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.22406511008739471,
+      "learning_rate": 0.00011233766233766234,
+      "loss": 1.0594,
+      "step": 141
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.23787978291511536,
+      "learning_rate": 0.00011168831168831168,
+      "loss": 1.1053,
+      "step": 142
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.21196185052394867,
+      "learning_rate": 0.00011103896103896105,
+      "loss": 1.0923,
+      "step": 143
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.21042804419994354,
+      "learning_rate": 0.0001103896103896104,
+      "loss": 1.0381,
+      "step": 144
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.2267436534166336,
+      "learning_rate": 0.00010974025974025974,
+      "loss": 1.0818,
+      "step": 145
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.23742735385894775,
+      "learning_rate": 0.00010909090909090909,
+      "loss": 1.0872,
+      "step": 146
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.17787213623523712,
+      "learning_rate": 0.00010844155844155846,
+      "loss": 1.03,
+      "step": 147
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.22422832250595093,
+      "learning_rate": 0.0001077922077922078,
+      "loss": 1.0738,
+      "step": 148
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.22946301102638245,
+      "learning_rate": 0.00010714285714285715,
+      "loss": 1.0274,
+      "step": 149
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.2137996405363083,
+      "learning_rate": 0.00010649350649350649,
+      "loss": 1.0539,
+      "step": 150
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.1748756766319275,
+      "learning_rate": 0.00010584415584415586,
+      "loss": 1.0355,
+      "step": 151
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.22275175154209137,
+      "learning_rate": 0.0001051948051948052,
+      "loss": 1.1696,
+      "step": 152
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.20996077358722687,
+      "learning_rate": 0.00010454545454545455,
+      "loss": 1.0303,
+      "step": 153
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.1945938766002655,
+      "learning_rate": 0.00010389610389610389,
+      "loss": 0.9747,
+      "step": 154
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.1970377266407013,
+      "learning_rate": 0.00010324675324675325,
+      "loss": 1.0358,
+      "step": 155
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.18814732134342194,
+      "learning_rate": 0.00010259740259740261,
+      "loss": 0.9612,
+      "step": 156
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.2153233289718628,
+      "learning_rate": 0.00010194805194805195,
+      "loss": 1.0749,
+      "step": 157
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.21788008511066437,
+      "learning_rate": 0.0001012987012987013,
+      "loss": 1.0883,
+      "step": 158
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.214650496840477,
+      "learning_rate": 0.00010064935064935067,
+      "loss": 1.0539,
+      "step": 159
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.19312834739685059,
+      "learning_rate": 0.0001,
+      "loss": 1.0657,
+      "step": 160
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.19916598498821259,
+      "learning_rate": 9.935064935064936e-05,
+      "loss": 1.0478,
+      "step": 161
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.2057606726884842,
+      "learning_rate": 9.870129870129871e-05,
+      "loss": 1.0094,
+      "step": 162
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.22159607708454132,
+      "learning_rate": 9.805194805194806e-05,
+      "loss": 1.0952,
+      "step": 163
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.18274275958538055,
+      "learning_rate": 9.74025974025974e-05,
+      "loss": 1.0065,
+      "step": 164
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.19835162162780762,
+      "learning_rate": 9.675324675324677e-05,
+      "loss": 1.0742,
+      "step": 165
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.2114904820919037,
+      "learning_rate": 9.610389610389611e-05,
+      "loss": 1.1109,
+      "step": 166
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.21488523483276367,
+      "learning_rate": 9.545454545454546e-05,
+      "loss": 1.0465,
+      "step": 167
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.19870303571224213,
+      "learning_rate": 9.480519480519481e-05,
+      "loss": 1.0318,
+      "step": 168
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.20413029193878174,
+      "learning_rate": 9.415584415584417e-05,
+      "loss": 1.0817,
+      "step": 169
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.1847231239080429,
+      "learning_rate": 9.35064935064935e-05,
+      "loss": 1.0144,
+      "step": 170
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.2715964913368225,
+      "learning_rate": 9.285714285714286e-05,
+      "loss": 0.9832,
+      "step": 171
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.2225002497434616,
+      "learning_rate": 9.220779220779221e-05,
+      "loss": 1.1051,
+      "step": 172
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.22931510210037231,
+      "learning_rate": 9.155844155844156e-05,
+      "loss": 1.1042,
+      "step": 173
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.21848627924919128,
+      "learning_rate": 9.090909090909092e-05,
+      "loss": 1.1151,
+      "step": 174
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.19852259755134583,
+      "learning_rate": 9.025974025974027e-05,
+      "loss": 1.0889,
+      "step": 175
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.2080363780260086,
+      "learning_rate": 8.961038961038961e-05,
+      "loss": 1.0777,
+      "step": 176
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.22391024231910706,
+      "learning_rate": 8.896103896103896e-05,
+      "loss": 1.1092,
+      "step": 177
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.21793846786022186,
+      "learning_rate": 8.831168831168831e-05,
+      "loss": 1.044,
+      "step": 178
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.2009749859571457,
+      "learning_rate": 8.766233766233767e-05,
+      "loss": 1.0198,
+      "step": 179
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.19432318210601807,
+      "learning_rate": 8.701298701298701e-05,
+      "loss": 1.075,
+      "step": 180
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.18634547293186188,
+      "learning_rate": 8.636363636363637e-05,
+      "loss": 0.9964,
+      "step": 181
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.1947103589773178,
+      "learning_rate": 8.571428571428571e-05,
+      "loss": 1.0025,
+      "step": 182
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.23098671436309814,
+      "learning_rate": 8.506493506493507e-05,
+      "loss": 1.0562,
+      "step": 183
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.19686414301395416,
+      "learning_rate": 8.441558441558442e-05,
+      "loss": 1.0285,
+      "step": 184
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.19852428138256073,
+      "learning_rate": 8.376623376623377e-05,
+      "loss": 1.0054,
+      "step": 185
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.21483510732650757,
+      "learning_rate": 8.311688311688312e-05,
+      "loss": 1.108,
+      "step": 186
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.23313644528388977,
+      "learning_rate": 8.246753246753248e-05,
+      "loss": 1.1383,
+      "step": 187
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.21453145146369934,
+      "learning_rate": 8.181818181818183e-05,
+      "loss": 1.0911,
+      "step": 188
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.20268195867538452,
+      "learning_rate": 8.116883116883117e-05,
+      "loss": 1.0145,
+      "step": 189
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.20576398074626923,
+      "learning_rate": 8.051948051948052e-05,
+      "loss": 1.0829,
+      "step": 190
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.21732626855373383,
+      "learning_rate": 7.987012987012987e-05,
+      "loss": 1.0152,
+      "step": 191
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.22046895325183868,
+      "learning_rate": 7.922077922077923e-05,
+      "loss": 1.1311,
+      "step": 192
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.19727715849876404,
+      "learning_rate": 7.857142857142858e-05,
+      "loss": 1.0364,
+      "step": 193
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.20861488580703735,
+      "learning_rate": 7.792207792207793e-05,
+      "loss": 1.0435,
+      "step": 194
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.18545083701610565,
+      "learning_rate": 7.727272727272727e-05,
+      "loss": 1.0299,
+      "step": 195
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.19965052604675293,
+      "learning_rate": 7.662337662337662e-05,
+      "loss": 1.0511,
+      "step": 196
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.23673909902572632,
+      "learning_rate": 7.597402597402598e-05,
+      "loss": 1.081,
+      "step": 197
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.17583179473876953,
+      "learning_rate": 7.532467532467533e-05,
+      "loss": 0.9808,
+      "step": 198
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.2129366099834442,
+      "learning_rate": 7.467532467532467e-05,
+      "loss": 1.0522,
+      "step": 199
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.21679140627384186,
+      "learning_rate": 7.402597402597404e-05,
+      "loss": 1.0567,
+      "step": 200
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.2032000720500946,
+      "learning_rate": 7.337662337662338e-05,
+      "loss": 1.0466,
+      "step": 201
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.1887970268726349,
+      "learning_rate": 7.272727272727273e-05,
+      "loss": 1.0329,
+      "step": 202
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.21060192584991455,
+      "learning_rate": 7.207792207792208e-05,
+      "loss": 1.1021,
+      "step": 203
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.21191425621509552,
+      "learning_rate": 7.142857142857143e-05,
+      "loss": 0.99,
+      "step": 204
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.1995989829301834,
+      "learning_rate": 7.077922077922077e-05,
+      "loss": 1.0526,
+      "step": 205
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.1849513053894043,
+      "learning_rate": 7.012987012987014e-05,
+      "loss": 0.9998,
+      "step": 206
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.1948779672384262,
+      "learning_rate": 6.948051948051948e-05,
+      "loss": 1.075,
+      "step": 207
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.20374052226543427,
+      "learning_rate": 6.883116883116883e-05,
+      "loss": 1.0933,
+      "step": 208
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.2102465033531189,
+      "learning_rate": 6.818181818181818e-05,
+      "loss": 1.1123,
+      "step": 209
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.21376173198223114,
+      "learning_rate": 6.753246753246754e-05,
+      "loss": 1.1233,
+      "step": 210
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.20934203267097473,
+      "learning_rate": 6.688311688311688e-05,
+      "loss": 1.1374,
+      "step": 211
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.18604128062725067,
+      "learning_rate": 6.623376623376624e-05,
+      "loss": 1.0213,
+      "step": 212
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.19644233584403992,
+      "learning_rate": 6.55844155844156e-05,
+      "loss": 1.0046,
+      "step": 213
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.18479463458061218,
+      "learning_rate": 6.493506493506494e-05,
+      "loss": 0.9792,
+      "step": 214
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.1945149153470993,
+      "learning_rate": 6.428571428571429e-05,
+      "loss": 1.0584,
+      "step": 215
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.2070147544145584,
+      "learning_rate": 6.363636363636364e-05,
+      "loss": 1.071,
+      "step": 216
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.19645985960960388,
+      "learning_rate": 6.2987012987013e-05,
+      "loss": 1.0721,
+      "step": 217
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.1960117667913437,
+      "learning_rate": 6.233766233766233e-05,
+      "loss": 1.071,
+      "step": 218
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.20168261229991913,
+      "learning_rate": 6.16883116883117e-05,
+      "loss": 1.0808,
+      "step": 219
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.21254412829875946,
+      "learning_rate": 6.103896103896104e-05,
+      "loss": 1.0287,
+      "step": 220
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.21271063387393951,
+      "learning_rate": 6.03896103896104e-05,
+      "loss": 1.0605,
+      "step": 221
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.2081408053636551,
+      "learning_rate": 5.9740259740259744e-05,
+      "loss": 1.091,
+      "step": 222
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.21113798022270203,
+      "learning_rate": 5.90909090909091e-05,
+      "loss": 1.1323,
+      "step": 223
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.20670844614505768,
+      "learning_rate": 5.844155844155844e-05,
+      "loss": 1.0955,
+      "step": 224
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.2010120451450348,
+      "learning_rate": 5.7792207792207796e-05,
+      "loss": 1.1068,
+      "step": 225
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.20379121601581573,
+      "learning_rate": 5.714285714285714e-05,
+      "loss": 1.0419,
+      "step": 226
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.22799807786941528,
+      "learning_rate": 5.64935064935065e-05,
+      "loss": 1.0904,
+      "step": 227
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.2005995213985443,
+      "learning_rate": 5.584415584415584e-05,
+      "loss": 1.078,
+      "step": 228
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.20329605042934418,
+      "learning_rate": 5.51948051948052e-05,
+      "loss": 1.0245,
+      "step": 229
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.19283504784107208,
+      "learning_rate": 5.4545454545454546e-05,
+      "loss": 1.0367,
+      "step": 230
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.20624355971813202,
+      "learning_rate": 5.38961038961039e-05,
+      "loss": 1.1046,
+      "step": 231
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.21362991631031036,
+      "learning_rate": 5.3246753246753245e-05,
+      "loss": 1.1104,
+      "step": 232
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.20447863638401031,
+      "learning_rate": 5.25974025974026e-05,
+      "loss": 1.0514,
+      "step": 233
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.1974381059408188,
+      "learning_rate": 5.1948051948051944e-05,
+      "loss": 1.0048,
+      "step": 234
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.21237170696258545,
+      "learning_rate": 5.1298701298701304e-05,
+      "loss": 1.1299,
+      "step": 235
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.21224971115589142,
+      "learning_rate": 5.064935064935065e-05,
+      "loss": 1.05,
+      "step": 236
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.19865018129348755,
+      "learning_rate": 5e-05,
+      "loss": 1.0665,
+      "step": 237
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.19199275970458984,
+      "learning_rate": 4.9350649350649355e-05,
+      "loss": 0.9531,
+      "step": 238
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.19573214650154114,
+      "learning_rate": 4.87012987012987e-05,
+      "loss": 1.0318,
+      "step": 239
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.21338805556297302,
+      "learning_rate": 4.8051948051948054e-05,
+      "loss": 1.0343,
+      "step": 240
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.2254691869020462,
+      "learning_rate": 4.740259740259741e-05,
+      "loss": 1.0472,
+      "step": 241
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.18101665377616882,
+      "learning_rate": 4.675324675324675e-05,
+      "loss": 1.017,
+      "step": 242
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.22090592980384827,
+      "learning_rate": 4.6103896103896106e-05,
+      "loss": 1.0389,
+      "step": 243
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.20865507423877716,
+      "learning_rate": 4.545454545454546e-05,
+      "loss": 1.0369,
+      "step": 244
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.21619610488414764,
+      "learning_rate": 4.4805194805194805e-05,
+      "loss": 1.109,
+      "step": 245
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.21694771945476532,
+      "learning_rate": 4.415584415584416e-05,
+      "loss": 1.0525,
+      "step": 246
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.2182662934064865,
+      "learning_rate": 4.3506493506493503e-05,
+      "loss": 1.0331,
+      "step": 247
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.2026486098766327,
+      "learning_rate": 4.2857142857142856e-05,
+      "loss": 1.027,
+      "step": 248
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.19606547057628632,
+      "learning_rate": 4.220779220779221e-05,
+      "loss": 1.0242,
+      "step": 249
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.22107470035552979,
+      "learning_rate": 4.155844155844156e-05,
+      "loss": 1.0924,
+      "step": 250
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.19960008561611176,
+      "learning_rate": 4.0909090909090915e-05,
+      "loss": 1.0384,
+      "step": 251
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.1945488154888153,
+      "learning_rate": 4.025974025974026e-05,
+      "loss": 1.0673,
+      "step": 252
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.22067414224147797,
+      "learning_rate": 3.9610389610389614e-05,
+      "loss": 1.0426,
+      "step": 253
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.19010980427265167,
+      "learning_rate": 3.8961038961038966e-05,
+      "loss": 1.0617,
+      "step": 254
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.18781176209449768,
+      "learning_rate": 3.831168831168831e-05,
+      "loss": 1.0243,
+      "step": 255
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.20388829708099365,
+      "learning_rate": 3.7662337662337665e-05,
+      "loss": 1.0476,
+      "step": 256
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.19911155104637146,
+      "learning_rate": 3.701298701298702e-05,
+      "loss": 1.0324,
+      "step": 257
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.19884039461612701,
+      "learning_rate": 3.6363636363636364e-05,
+      "loss": 1.0242,
+      "step": 258
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.19036105275154114,
+      "learning_rate": 3.571428571428572e-05,
+      "loss": 1.0323,
+      "step": 259
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.20039844512939453,
+      "learning_rate": 3.506493506493507e-05,
+      "loss": 1.0749,
+      "step": 260
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.1899934560060501,
+      "learning_rate": 3.4415584415584416e-05,
+      "loss": 1.0115,
+      "step": 261
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.20019090175628662,
+      "learning_rate": 3.376623376623377e-05,
+      "loss": 1.0782,
+      "step": 262
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.2020583152770996,
+      "learning_rate": 3.311688311688312e-05,
+      "loss": 1.0687,
+      "step": 263
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.21407337486743927,
+      "learning_rate": 3.246753246753247e-05,
+      "loss": 1.1015,
+      "step": 264
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.1871640682220459,
+      "learning_rate": 3.181818181818182e-05,
+      "loss": 0.9637,
+      "step": 265
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.21622811257839203,
+      "learning_rate": 3.1168831168831166e-05,
+      "loss": 1.1222,
+      "step": 266
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.22504661977291107,
+      "learning_rate": 3.051948051948052e-05,
+      "loss": 1.132,
+      "step": 267
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.19177629053592682,
+      "learning_rate": 2.9870129870129872e-05,
+      "loss": 1.0281,
+      "step": 268
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.1970544159412384,
+      "learning_rate": 2.922077922077922e-05,
+      "loss": 1.0393,
+      "step": 269
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.21554522216320038,
+      "learning_rate": 2.857142857142857e-05,
+      "loss": 1.074,
+      "step": 270
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.21131229400634766,
+      "learning_rate": 2.792207792207792e-05,
+      "loss": 1.054,
+      "step": 271
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.19816523790359497,
+      "learning_rate": 2.7272727272727273e-05,
+      "loss": 1.0456,
+      "step": 272
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.21075209975242615,
+      "learning_rate": 2.6623376623376623e-05,
+      "loss": 1.0758,
+      "step": 273
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.2296527624130249,
+      "learning_rate": 2.5974025974025972e-05,
+      "loss": 1.0917,
+      "step": 274
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.19722610712051392,
+      "learning_rate": 2.5324675324675325e-05,
+      "loss": 1.0704,
+      "step": 275
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.18721099197864532,
+      "learning_rate": 2.4675324675324678e-05,
+      "loss": 0.9919,
+      "step": 276
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.20244193077087402,
+      "learning_rate": 2.4025974025974027e-05,
+      "loss": 1.0368,
+      "step": 277
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.19518914818763733,
+      "learning_rate": 2.3376623376623376e-05,
+      "loss": 1.0436,
+      "step": 278
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.19650357961654663,
+      "learning_rate": 2.272727272727273e-05,
+      "loss": 1.0306,
+      "step": 279
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.20320096611976624,
+      "learning_rate": 2.207792207792208e-05,
+      "loss": 1.0941,
+      "step": 280
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.18296951055526733,
+      "learning_rate": 2.1428571428571428e-05,
+      "loss": 0.9802,
+      "step": 281
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.21357610821723938,
+      "learning_rate": 2.077922077922078e-05,
+      "loss": 1.0449,
+      "step": 282
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.193921759724617,
+      "learning_rate": 2.012987012987013e-05,
+      "loss": 1.0116,
+      "step": 283
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.1953902244567871,
+      "learning_rate": 1.9480519480519483e-05,
+      "loss": 1.0105,
+      "step": 284
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.19440975785255432,
+      "learning_rate": 1.8831168831168833e-05,
+      "loss": 0.9952,
+      "step": 285
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.21054105460643768,
+      "learning_rate": 1.8181818181818182e-05,
+      "loss": 1.0701,
+      "step": 286
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.18844804167747498,
+      "learning_rate": 1.7532467532467535e-05,
+      "loss": 1.0146,
+      "step": 287
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.2067311704158783,
+      "learning_rate": 1.6883116883116884e-05,
+      "loss": 1.0781,
+      "step": 288
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.1941213756799698,
+      "learning_rate": 1.6233766233766234e-05,
+      "loss": 0.9814,
+      "step": 289
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.22726193070411682,
+      "learning_rate": 1.5584415584415583e-05,
+      "loss": 1.1431,
+      "step": 290
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.18025581538677216,
+      "learning_rate": 1.4935064935064936e-05,
+      "loss": 0.9649,
+      "step": 291
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.21535000205039978,
+      "learning_rate": 1.4285714285714285e-05,
+      "loss": 1.0441,
+      "step": 292
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.20014546811580658,
+      "learning_rate": 1.3636363636363637e-05,
+      "loss": 1.0166,
+      "step": 293
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.22738787531852722,
+      "learning_rate": 1.2987012987012986e-05,
+      "loss": 1.0564,
+      "step": 294
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.2020861804485321,
+      "learning_rate": 1.2337662337662339e-05,
+      "loss": 1.1241,
+      "step": 295
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.19888809323310852,
+      "learning_rate": 1.1688311688311688e-05,
+      "loss": 1.1114,
+      "step": 296
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.20912377536296844,
+      "learning_rate": 1.103896103896104e-05,
+      "loss": 1.0971,
+      "step": 297
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.21206621825695038,
+      "learning_rate": 1.038961038961039e-05,
+      "loss": 1.0601,
+      "step": 298
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.18667680025100708,
+      "learning_rate": 9.740259740259742e-06,
+      "loss": 1.0291,
+      "step": 299
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.21125559508800507,
+      "learning_rate": 9.090909090909091e-06,
+      "loss": 1.0483,
+      "step": 300
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.21776145696640015,
+      "learning_rate": 8.441558441558442e-06,
+      "loss": 0.9912,
+      "step": 301
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.20144303143024445,
+      "learning_rate": 7.792207792207792e-06,
+      "loss": 1.0357,
+      "step": 302
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.1984029859304428,
+      "learning_rate": 7.142857142857143e-06,
+      "loss": 1.0648,
+      "step": 303
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.17972829937934875,
+      "learning_rate": 6.493506493506493e-06,
+      "loss": 1.0033,
+      "step": 304
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.1818286031484604,
+      "learning_rate": 5.844155844155844e-06,
+      "loss": 0.997,
+      "step": 305
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.19670912623405457,
+      "learning_rate": 5.194805194805195e-06,
+      "loss": 1.0256,
+      "step": 306
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.20527283847332,
+      "learning_rate": 4.5454545454545455e-06,
+      "loss": 1.0348,
+      "step": 307
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.19025909900665283,
+      "learning_rate": 3.896103896103896e-06,
+      "loss": 1.0682,
+      "step": 308
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.19544818997383118,
+      "learning_rate": 3.2467532467532465e-06,
+      "loss": 0.9872,
+      "step": 309
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.22112183272838593,
+      "learning_rate": 2.5974025974025976e-06,
+      "loss": 1.0661,
+      "step": 310
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.23328153789043427,
+      "learning_rate": 1.948051948051948e-06,
+      "loss": 1.0691,
+      "step": 311
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.20181375741958618,
+      "learning_rate": 1.2987012987012988e-06,
+      "loss": 0.9416,
+      "step": 312
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.29312625527381897,
+      "learning_rate": 6.493506493506494e-07,
+      "loss": 1.1216,
+      "step": 313
+    },
+    {
+      "epoch": 0.12202467696492762,
+      "grad_norm": 0.2231415957212448,
+      "learning_rate": 0.0,
+      "loss": 1.0468,
+      "step": 314
+    },
+    {
+      "epoch": 0.12241329058583503,
+      "grad_norm": 0.22263288497924805,
+      "learning_rate": 0.00017594394706111328,
+      "loss": 1.0399,
+      "step": 315
+    },
+    {
+      "epoch": 0.12280190420674245,
+      "grad_norm": 0.22909891605377197,
+      "learning_rate": 0.00017586609575710393,
+      "loss": 1.1069,
+      "step": 316
+    },
+    {
+      "epoch": 0.12319051782764986,
+      "grad_norm": 0.23951445519924164,
+      "learning_rate": 0.0001757882444530946,
+      "loss": 1.1036,
+      "step": 317
+    },
+    {
+      "epoch": 0.12357913144855727,
+      "grad_norm": 0.2409268021583557,
+      "learning_rate": 0.00017571039314908526,
+      "loss": 1.1114,
+      "step": 318
+    },
+    {
+      "epoch": 0.12396774506946469,
+      "grad_norm": 0.23753899335861206,
+      "learning_rate": 0.00017563254184507592,
+      "loss": 1.1297,
+      "step": 319
+    },
+    {
+      "epoch": 0.12435635869037209,
+      "grad_norm": 0.2823902666568756,
+      "learning_rate": 0.00017555469054106657,
+      "loss": 1.1293,
+      "step": 320
+    },
+    {
+      "epoch": 0.12474497231127951,
+      "grad_norm": 0.24093545973300934,
+      "learning_rate": 0.00017547683923705722,
+      "loss": 1.0678,
+      "step": 321
+    },
+    {
+      "epoch": 0.12513358593218693,
+      "grad_norm": 0.22565563023090363,
+      "learning_rate": 0.0001753989879330479,
+      "loss": 1.1408,
+      "step": 322
+    },
+    {
+      "epoch": 0.12552219955309435,
+      "grad_norm": 0.22569572925567627,
+      "learning_rate": 0.00017532113662903855,
+      "loss": 1.0543,
+      "step": 323
+    },
+    {
+      "epoch": 0.12591081317400174,
+      "grad_norm": 0.24962866306304932,
+      "learning_rate": 0.0001752432853250292,
+      "loss": 1.0818,
+      "step": 324
+    },
+    {
+      "epoch": 0.12629942679490916,
+      "grad_norm": 0.22184576094150543,
+      "learning_rate": 0.00017516543402101986,
+      "loss": 1.0835,
+      "step": 325
+    },
+    {
+      "epoch": 0.12668804041581658,
+      "grad_norm": 0.2572194039821625,
+      "learning_rate": 0.0001750875827170105,
+      "loss": 1.0767,
+      "step": 326
+    },
+    {
+      "epoch": 0.127076654036724,
+      "grad_norm": 0.24131342768669128,
+      "learning_rate": 0.00017500973141300116,
+      "loss": 1.0981,
+      "step": 327
+    },
+    {
+      "epoch": 0.1274652676576314,
+      "grad_norm": 0.2386389970779419,
+      "learning_rate": 0.00017493188010899184,
+      "loss": 1.0828,
+      "step": 328
+    },
+    {
+      "epoch": 0.1278538812785388,
+      "grad_norm": 0.2654125690460205,
+      "learning_rate": 0.0001748540288049825,
+      "loss": 1.1266,
+      "step": 329
+    },
+    {
+      "epoch": 0.12824249489944622,
+      "grad_norm": 0.2925739884376526,
+      "learning_rate": 0.00017477617750097314,
+      "loss": 1.0983,
+      "step": 330
+    },
+    {
+      "epoch": 0.12863110852035364,
+      "grad_norm": 0.26589342951774597,
+      "learning_rate": 0.0001746983261969638,
+      "loss": 1.1029,
+      "step": 331
+    },
+    {
+      "epoch": 0.12901972214126106,
+      "grad_norm": 0.24565957486629486,
+      "learning_rate": 0.00017462047489295445,
+      "loss": 1.0975,
+      "step": 332
+    },
+    {
+      "epoch": 0.12940833576216845,
+      "grad_norm": 0.2459682673215866,
+      "learning_rate": 0.00017454262358894513,
+      "loss": 1.0566,
+      "step": 333
+    },
+    {
+      "epoch": 0.12979694938307587,
+      "grad_norm": 0.23349183797836304,
+      "learning_rate": 0.00017446477228493578,
+      "loss": 1.0833,
+      "step": 334
+    },
+    {
+      "epoch": 0.1301855630039833,
+      "grad_norm": 0.26166337728500366,
+      "learning_rate": 0.00017438692098092643,
+      "loss": 1.1598,
+      "step": 335
+    },
+    {
+      "epoch": 0.1305741766248907,
+      "grad_norm": 0.24188168346881866,
+      "learning_rate": 0.00017430906967691708,
+      "loss": 1.0728,
+      "step": 336
+    },
+    {
+      "epoch": 0.13096279024579813,
+      "grad_norm": 0.22922398149967194,
+      "learning_rate": 0.00017423121837290773,
+      "loss": 1.0311,
+      "step": 337
+    },
+    {
+      "epoch": 0.13135140386670552,
+      "grad_norm": 0.2652754485607147,
+      "learning_rate": 0.00017415336706889841,
+      "loss": 1.1096,
+      "step": 338
+    },
+    {
+      "epoch": 0.13174001748761294,
+      "grad_norm": 0.2355881780385971,
+      "learning_rate": 0.00017407551576488907,
+      "loss": 1.0964,
+      "step": 339
+    },
+    {
+      "epoch": 0.13212863110852036,
+      "grad_norm": 0.244523823261261,
+      "learning_rate": 0.00017399766446087972,
+      "loss": 1.142,
+      "step": 340
+    },
+    {
+      "epoch": 0.13251724472942777,
+      "grad_norm": 0.24705976247787476,
+      "learning_rate": 0.00017391981315687037,
+      "loss": 1.0943,
+      "step": 341
+    },
+    {
+      "epoch": 0.13290585835033517,
+      "grad_norm": 0.22817552089691162,
+      "learning_rate": 0.00017384196185286102,
+      "loss": 1.0621,
+      "step": 342
+    },
+    {
+      "epoch": 0.13329447197124258,
+      "grad_norm": 0.22605225443840027,
+      "learning_rate": 0.0001737641105488517,
+      "loss": 1.0714,
+      "step": 343
+    },
+    {
+      "epoch": 0.13368308559215,
+      "grad_norm": 0.2584545314311981,
+      "learning_rate": 0.00017368625924484235,
+      "loss": 1.1367,
+      "step": 344
+    },
+    {
+      "epoch": 0.13407169921305742,
+      "grad_norm": 0.2248220443725586,
+      "learning_rate": 0.000173608407940833,
+      "loss": 1.0872,
+      "step": 345
+    },
+    {
+      "epoch": 0.13446031283396484,
+      "grad_norm": 0.2141868770122528,
+      "learning_rate": 0.00017353055663682368,
+      "loss": 1.0572,
+      "step": 346
+    },
+    {
+      "epoch": 0.13484892645487223,
+      "grad_norm": 0.2615523934364319,
+      "learning_rate": 0.00017345270533281434,
+      "loss": 1.1048,
+      "step": 347
+    },
+    {
+      "epoch": 0.13523754007577965,
+      "grad_norm": 0.22990448772907257,
+      "learning_rate": 0.000173374854028805,
+      "loss": 1.0528,
+      "step": 348
+    },
+    {
+      "epoch": 0.13562615369668707,
+      "grad_norm": 0.2132262885570526,
+      "learning_rate": 0.00017329700272479564,
+      "loss": 1.0476,
+      "step": 349
+    },
+    {
+      "epoch": 0.1360147673175945,
+      "grad_norm": 0.2578272819519043,
+      "learning_rate": 0.00017321915142078632,
+      "loss": 1.0852,
+      "step": 350
+    },
+    {
+      "epoch": 0.1364033809385019,
+      "grad_norm": 0.22881457209587097,
+      "learning_rate": 0.00017314130011677697,
+      "loss": 1.1017,
+      "step": 351
+    },
+    {
+      "epoch": 0.1367919945594093,
+      "grad_norm": 0.21067696809768677,
+      "learning_rate": 0.00017306344881276762,
+      "loss": 1.0444,
+      "step": 352
+    },
+    {
+      "epoch": 0.13718060818031672,
+      "grad_norm": 0.2304215282201767,
+      "learning_rate": 0.0001729855975087583,
+      "loss": 1.0737,
+      "step": 353
+    },
+    {
+      "epoch": 0.13756922180122413,
+      "grad_norm": 0.2031925916671753,
+      "learning_rate": 0.00017290774620474895,
+      "loss": 1.0036,
+      "step": 354
+    },
+    {
+      "epoch": 0.13795783542213155,
+      "grad_norm": 0.27281051874160767,
+      "learning_rate": 0.0001728298949007396,
+      "loss": 1.148,
+      "step": 355
+    },
+    {
+      "epoch": 0.13834644904303897,
+      "grad_norm": 0.204191654920578,
+      "learning_rate": 0.00017275204359673026,
+      "loss": 0.9607,
+      "step": 356
+    },
+    {
+      "epoch": 0.13873506266394636,
+      "grad_norm": 0.221976637840271,
+      "learning_rate": 0.0001726741922927209,
+      "loss": 1.1068,
+      "step": 357
+    },
+    {
+      "epoch": 0.13912367628485378,
+      "grad_norm": 0.20831729471683502,
+      "learning_rate": 0.0001725963409887116,
+      "loss": 1.034,
+      "step": 358
+    },
+    {
+      "epoch": 0.1395122899057612,
+      "grad_norm": 0.21639779210090637,
+      "learning_rate": 0.00017251848968470224,
+      "loss": 1.0613,
+      "step": 359
+    },
+    {
+      "epoch": 0.13990090352666862,
+      "grad_norm": 0.1959424465894699,
+      "learning_rate": 0.0001724406383806929,
+      "loss": 1.0506,
+      "step": 360
+    },
+    {
+      "epoch": 0.140289517147576,
+      "grad_norm": 0.2044398933649063,
+      "learning_rate": 0.00017236278707668355,
+      "loss": 1.0316,
+      "step": 361
+    },
+    {
+      "epoch": 0.14067813076848343,
+      "grad_norm": 0.21483004093170166,
+      "learning_rate": 0.0001722849357726742,
+      "loss": 1.0361,
+      "step": 362
+    },
+    {
+      "epoch": 0.14106674438939085,
+      "grad_norm": 0.237701416015625,
+      "learning_rate": 0.00017220708446866485,
+      "loss": 1.1264,
+      "step": 363
+    },
+    {
+      "epoch": 0.14145535801029827,
+      "grad_norm": 0.20750795304775238,
+      "learning_rate": 0.00017212923316465553,
+      "loss": 1.0523,
+      "step": 364
+    },
+    {
+      "epoch": 0.14184397163120568,
+      "grad_norm": 0.2252965271472931,
+      "learning_rate": 0.00017205138186064618,
+      "loss": 1.0764,
+      "step": 365
+    },
+    {
+      "epoch": 0.14223258525211308,
+      "grad_norm": 0.2033565789461136,
+      "learning_rate": 0.00017197353055663683,
+      "loss": 1.064,
+      "step": 366
+    },
+    {
+      "epoch": 0.1426211988730205,
+      "grad_norm": 0.21123190224170685,
+      "learning_rate": 0.00017189567925262749,
+      "loss": 1.0515,
+      "step": 367
+    },
+    {
+      "epoch": 0.1430098124939279,
+      "grad_norm": 0.20646221935749054,
+      "learning_rate": 0.00017181782794861814,
+      "loss": 1.0617,
+      "step": 368
+    },
+    {
+      "epoch": 0.14339842611483533,
+      "grad_norm": 0.2079589068889618,
+      "learning_rate": 0.00017173997664460882,
+      "loss": 1.0569,
+      "step": 369
+    },
+    {
+      "epoch": 0.14378703973574275,
+      "grad_norm": 0.216246098279953,
+      "learning_rate": 0.00017166212534059947,
+      "loss": 1.0986,
+      "step": 370
+    },
+    {
+      "epoch": 0.14417565335665014,
+      "grad_norm": 0.20711806416511536,
+      "learning_rate": 0.00017158427403659012,
+      "loss": 1.1342,
+      "step": 371
+    },
+    {
+      "epoch": 0.14456426697755756,
+      "grad_norm": 0.235435351729393,
+      "learning_rate": 0.00017150642273258077,
+      "loss": 1.1082,
+      "step": 372
+    },
+    {
+      "epoch": 0.14495288059846498,
+      "grad_norm": 0.2273191511631012,
+      "learning_rate": 0.00017142857142857143,
+      "loss": 1.1064,
+      "step": 373
+    },
+    {
+      "epoch": 0.1453414942193724,
+      "grad_norm": 0.2075672745704651,
+      "learning_rate": 0.0001713507201245621,
+      "loss": 1.0536,
+      "step": 374
+    },
+    {
+      "epoch": 0.14573010784027982,
+      "grad_norm": 0.20764274895191193,
+      "learning_rate": 0.00017127286882055276,
+      "loss": 1.0673,
+      "step": 375
+    },
+    {
+      "epoch": 0.1461187214611872,
+      "grad_norm": 0.2441243678331375,
+      "learning_rate": 0.0001711950175165434,
+      "loss": 1.1271,
+      "step": 376
+    },
+    {
+      "epoch": 0.14650733508209463,
+      "grad_norm": 0.2383374124765396,
+      "learning_rate": 0.00017111716621253406,
+      "loss": 1.083,
+      "step": 377
+    },
+    {
+      "epoch": 0.14689594870300204,
+      "grad_norm": 0.2172410786151886,
+      "learning_rate": 0.0001710393149085247,
+      "loss": 1.0605,
+      "step": 378
+    },
+    {
+      "epoch": 0.14728456232390946,
+      "grad_norm": 0.22591541707515717,
+      "learning_rate": 0.0001709614636045154,
+      "loss": 1.0931,
+      "step": 379
+    },
+    {
+      "epoch": 0.14767317594481685,
+      "grad_norm": 0.23099495470523834,
+      "learning_rate": 0.00017088361230050604,
+      "loss": 1.1021,
+      "step": 380
+    },
+    {
+      "epoch": 0.14806178956572427,
+      "grad_norm": 0.21461094915866852,
+      "learning_rate": 0.0001708057609964967,
+      "loss": 1.0959,
+      "step": 381
+    },
+    {
+      "epoch": 0.1484504031866317,
+      "grad_norm": 0.21557241678237915,
+      "learning_rate": 0.00017072790969248735,
+      "loss": 1.0155,
+      "step": 382
+    },
+    {
+      "epoch": 0.1488390168075391,
+      "grad_norm": 0.234396293759346,
+      "learning_rate": 0.000170650058388478,
+      "loss": 1.1289,
+      "step": 383
+    },
+    {
+      "epoch": 0.14922763042844653,
+      "grad_norm": 0.22895503044128418,
+      "learning_rate": 0.00017057220708446868,
+      "loss": 0.9919,
+      "step": 384
+    },
+    {
+      "epoch": 0.14961624404935392,
+      "grad_norm": 0.2054683268070221,
+      "learning_rate": 0.00017049435578045933,
+      "loss": 1.0607,
+      "step": 385
+    },
+    {
+      "epoch": 0.15000485767026134,
+      "grad_norm": 0.25569215416908264,
+      "learning_rate": 0.00017041650447644998,
+      "loss": 1.0517,
+      "step": 386
+    },
+    {
+      "epoch": 0.15039347129116876,
+      "grad_norm": 0.2222641259431839,
+      "learning_rate": 0.00017033865317244064,
+      "loss": 1.0404,
+      "step": 387
+    },
+    {
+      "epoch": 0.15078208491207618,
+      "grad_norm": 0.20501169562339783,
+      "learning_rate": 0.0001702608018684313,
+      "loss": 0.9897,
+      "step": 388
+    },
+    {
+      "epoch": 0.1511706985329836,
+      "grad_norm": 0.22080403566360474,
+      "learning_rate": 0.00017018295056442197,
+      "loss": 1.1013,
+      "step": 389
+    },
+    {
+      "epoch": 0.15155931215389098,
+      "grad_norm": 0.21218529343605042,
+      "learning_rate": 0.00017010509926041262,
+      "loss": 1.0541,
+      "step": 390
+    },
+    {
+      "epoch": 0.1519479257747984,
+      "grad_norm": 0.23064807057380676,
+      "learning_rate": 0.00017002724795640327,
+      "loss": 1.037,
+      "step": 391
+    },
+    {
+      "epoch": 0.15233653939570582,
+      "grad_norm": 0.21164493262767792,
+      "learning_rate": 0.00016994939665239392,
+      "loss": 1.0769,
+      "step": 392
+    },
+    {
+      "epoch": 0.15272515301661324,
+      "grad_norm": 0.22565549612045288,
+      "learning_rate": 0.00016987154534838457,
+      "loss": 1.0638,
+      "step": 393
+    },
+    {
+      "epoch": 0.15311376663752063,
+      "grad_norm": 0.22492647171020508,
+      "learning_rate": 0.00016979369404437525,
+      "loss": 1.063,
+      "step": 394
+    },
+    {
+      "epoch": 0.15350238025842805,
+      "grad_norm": 0.22335395216941833,
+      "learning_rate": 0.0001697158427403659,
+      "loss": 1.1032,
+      "step": 395
+    },
+    {
+      "epoch": 0.15389099387933547,
+      "grad_norm": 0.2164154201745987,
+      "learning_rate": 0.00016963799143635656,
+      "loss": 1.1275,
+      "step": 396
+    },
+    {
+      "epoch": 0.1542796075002429,
+      "grad_norm": 0.22547736763954163,
+      "learning_rate": 0.0001695601401323472,
+      "loss": 1.1324,
+      "step": 397
+    },
+    {
+      "epoch": 0.1546682211211503,
+      "grad_norm": 0.2028045952320099,
+      "learning_rate": 0.0001694822888283379,
+      "loss": 1.0057,
+      "step": 398
+    },
+    {
+      "epoch": 0.1550568347420577,
+      "grad_norm": 0.20770573616027832,
+      "learning_rate": 0.00016940443752432854,
+      "loss": 1.0311,
+      "step": 399
+    },
+    {
+      "epoch": 0.15544544836296512,
+      "grad_norm": 0.2231476902961731,
+      "learning_rate": 0.0001693265862203192,
+      "loss": 1.0535,
+      "step": 400
+    },
+    {
+      "epoch": 0.15583406198387253,
+      "grad_norm": 0.21618099510669708,
+      "learning_rate": 0.00016924873491630987,
+      "loss": 1.0616,
+      "step": 401
+    },
+    {
+      "epoch": 0.15622267560477995,
+      "grad_norm": 0.24024419486522675,
+      "learning_rate": 0.00016917088361230052,
+      "loss": 1.1324,
+      "step": 402
+    },
+    {
+      "epoch": 0.15661128922568737,
+      "grad_norm": 0.2002171128988266,
+      "learning_rate": 0.00016909303230829118,
+      "loss": 1.015,
+      "step": 403
+    },
+    {
+      "epoch": 0.15699990284659476,
+      "grad_norm": 0.21771477162837982,
+      "learning_rate": 0.00016901518100428183,
+      "loss": 1.0817,
+      "step": 404
+    },
+    {
+      "epoch": 0.15738851646750218,
+      "grad_norm": 0.22052259743213654,
+      "learning_rate": 0.0001689373297002725,
+      "loss": 1.0836,
+      "step": 405
+    },
+    {
+      "epoch": 0.1577771300884096,
+      "grad_norm": 0.1964062750339508,
+      "learning_rate": 0.00016885947839626316,
+      "loss": 1.0505,
+      "step": 406
+    },
+    {
+      "epoch": 0.15816574370931702,
+      "grad_norm": 0.22714298963546753,
+      "learning_rate": 0.0001687816270922538,
+      "loss": 1.0702,
+      "step": 407
+    },
+    {
+      "epoch": 0.15855435733022444,
+      "grad_norm": 0.20647728443145752,
+      "learning_rate": 0.00016870377578824446,
+      "loss": 1.0349,
+      "step": 408
+    },
+    {
+      "epoch": 0.15894297095113183,
+      "grad_norm": 0.2355160117149353,
+      "learning_rate": 0.00016862592448423512,
+      "loss": 1.0305,
+      "step": 409
+    },
+    {
+      "epoch": 0.15933158457203925,
+      "grad_norm": 0.22890770435333252,
+      "learning_rate": 0.0001685480731802258,
+      "loss": 1.0854,
+      "step": 410
+    },
+    {
+      "epoch": 0.15972019819294667,
+      "grad_norm": 0.21947838366031647,
+      "learning_rate": 0.00016847022187621645,
+      "loss": 1.0948,
+      "step": 411
+    },
+    {
+      "epoch": 0.16010881181385409,
+      "grad_norm": 0.22334899008274078,
+      "learning_rate": 0.0001683923705722071,
+      "loss": 1.006,
+      "step": 412
+    },
+    {
+      "epoch": 0.16049742543476148,
+      "grad_norm": 0.22324936091899872,
+      "learning_rate": 0.00016831451926819775,
+      "loss": 1.0402,
+      "step": 413
+    },
+    {
+      "epoch": 0.1608860390556689,
+      "grad_norm": 0.21462097764015198,
+      "learning_rate": 0.0001682366679641884,
+      "loss": 1.077,
+      "step": 414
+    },
+    {
+      "epoch": 0.1612746526765763,
+      "grad_norm": 0.24567006528377533,
+      "learning_rate": 0.00016815881666017908,
+      "loss": 1.15,
+      "step": 415
+    },
+    {
+      "epoch": 0.16166326629748373,
+      "grad_norm": 0.26437243819236755,
+      "learning_rate": 0.00016808096535616973,
+      "loss": 1.1251,
+      "step": 416
+    },
+    {
+      "epoch": 0.16205187991839115,
+      "grad_norm": 0.2217959761619568,
+      "learning_rate": 0.00016800311405216039,
+      "loss": 1.1103,
+      "step": 417
+    },
+    {
+      "epoch": 0.16244049353929854,
+      "grad_norm": 0.24402475357055664,
+      "learning_rate": 0.00016792526274815104,
+      "loss": 1.0672,
+      "step": 418
+    },
+    {
+      "epoch": 0.16282910716020596,
+      "grad_norm": 0.21609526872634888,
+      "learning_rate": 0.0001678474114441417,
+      "loss": 1.0291,
+      "step": 419
+    },
+    {
+      "epoch": 0.16321772078111338,
+      "grad_norm": 0.20054642856121063,
+      "learning_rate": 0.00016776956014013237,
+      "loss": 1.0704,
+      "step": 420
+    },
+    {
+      "epoch": 0.1636063344020208,
+      "grad_norm": 0.22864869236946106,
+      "learning_rate": 0.00016769170883612302,
+      "loss": 1.0612,
+      "step": 421
+    },
+    {
+      "epoch": 0.16399494802292822,
+      "grad_norm": 0.22651974856853485,
+      "learning_rate": 0.00016761385753211367,
+      "loss": 1.0749,
+      "step": 422
+    },
+    {
+      "epoch": 0.1643835616438356,
+      "grad_norm": 0.21587328612804413,
+      "learning_rate": 0.00016753600622810433,
+      "loss": 1.0398,
+      "step": 423
+    },
+    {
+      "epoch": 0.16477217526474303,
+      "grad_norm": 0.1953774094581604,
+      "learning_rate": 0.00016745815492409498,
+      "loss": 1.0275,
+      "step": 424
+    },
+    {
+      "epoch": 0.16516078888565044,
+      "grad_norm": 0.21803410351276398,
+      "learning_rate": 0.00016738030362008566,
+      "loss": 1.1219,
+      "step": 425
+    },
+    {
+      "epoch": 0.16554940250655786,
+      "grad_norm": 0.2034682035446167,
+      "learning_rate": 0.0001673024523160763,
+      "loss": 1.0342,
+      "step": 426
+    },
+    {
+      "epoch": 0.16593801612746525,
+      "grad_norm": 0.20135951042175293,
+      "learning_rate": 0.00016722460101206696,
+      "loss": 0.9802,
+      "step": 427
+    },
+    {
+      "epoch": 0.16632662974837267,
+      "grad_norm": 0.23310376703739166,
+      "learning_rate": 0.0001671467497080576,
+      "loss": 1.0789,
+      "step": 428
+    },
+    {
+      "epoch": 0.1667152433692801,
+      "grad_norm": 0.21475404500961304,
+      "learning_rate": 0.00016706889840404827,
+      "loss": 1.0416,
+      "step": 429
+    },
+    {
+      "epoch": 0.1671038569901875,
+      "grad_norm": 0.21661072969436646,
+      "learning_rate": 0.00016699104710003894,
+      "loss": 1.0568,
+      "step": 430
+    },
+    {
+      "epoch": 0.16749247061109493,
+      "grad_norm": 0.20310629904270172,
+      "learning_rate": 0.0001669131957960296,
+      "loss": 0.9968,
+      "step": 431
+    },
+    {
+      "epoch": 0.16788108423200232,
+      "grad_norm": 0.2596947252750397,
+      "learning_rate": 0.00016683534449202025,
+      "loss": 1.0478,
+      "step": 432
+    },
+    {
+      "epoch": 0.16826969785290974,
+      "grad_norm": 0.22226987779140472,
+      "learning_rate": 0.0001667574931880109,
+      "loss": 1.0898,
+      "step": 433
+    },
+    {
+      "epoch": 0.16865831147381716,
+      "grad_norm": 0.22499911487102509,
+      "learning_rate": 0.00016667964188400155,
+      "loss": 1.07,
+      "step": 434
+    },
+    {
+      "epoch": 0.16904692509472458,
+      "grad_norm": 0.2717292308807373,
+      "learning_rate": 0.0001666017905799922,
+      "loss": 1.0562,
+      "step": 435
+    },
+    {
+      "epoch": 0.169435538715632,
+      "grad_norm": 0.22052323818206787,
+      "learning_rate": 0.00016652393927598288,
+      "loss": 1.0732,
+      "step": 436
+    },
+    {
+      "epoch": 0.16982415233653939,
+      "grad_norm": 0.21741728484630585,
+      "learning_rate": 0.00016644608797197354,
+      "loss": 1.0409,
+      "step": 437
+    },
+    {
+      "epoch": 0.1702127659574468,
+      "grad_norm": 0.20701193809509277,
+      "learning_rate": 0.0001663682366679642,
+      "loss": 1.0731,
+      "step": 438
+    },
+    {
+      "epoch": 0.17060137957835422,
+      "grad_norm": 0.22071130573749542,
+      "learning_rate": 0.00016629038536395484,
+      "loss": 1.0992,
+      "step": 439
+    },
+    {
+      "epoch": 0.17098999319926164,
+      "grad_norm": 0.20261412858963013,
+      "learning_rate": 0.0001662125340599455,
+      "loss": 1.0051,
+      "step": 440
+    },
+    {
+      "epoch": 0.17137860682016906,
+      "grad_norm": 0.2082947939634323,
+      "learning_rate": 0.00016613468275593617,
+      "loss": 1.0477,
+      "step": 441
+    },
+    {
+      "epoch": 0.17176722044107645,
+      "grad_norm": 0.22534717619419098,
+      "learning_rate": 0.00016605683145192682,
+      "loss": 1.041,
+      "step": 442
+    },
+    {
+      "epoch": 0.17215583406198387,
+      "grad_norm": 0.21547731757164001,
+      "learning_rate": 0.00016597898014791748,
+      "loss": 1.0528,
+      "step": 443
+    },
+    {
+      "epoch": 0.1725444476828913,
+      "grad_norm": 0.24141089618206024,
+      "learning_rate": 0.00016590112884390813,
+      "loss": 1.0928,
+      "step": 444
+    },
+    {
+      "epoch": 0.1729330613037987,
+      "grad_norm": 0.21910884976387024,
+      "learning_rate": 0.00016582327753989878,
+      "loss": 1.063,
+      "step": 445
+    },
+    {
+      "epoch": 0.1733216749247061,
+      "grad_norm": 0.21782316267490387,
+      "learning_rate": 0.00016574542623588946,
+      "loss": 1.0976,
+      "step": 446
+    },
+    {
+      "epoch": 0.17371028854561352,
+      "grad_norm": 0.21771778166294098,
+      "learning_rate": 0.0001656675749318801,
+      "loss": 1.0677,
+      "step": 447
+    },
+    {
+      "epoch": 0.17409890216652094,
+      "grad_norm": 0.22117659449577332,
+      "learning_rate": 0.00016558972362787076,
+      "loss": 1.0669,
+      "step": 448
+    },
+    {
+      "epoch": 0.17448751578742835,
+      "grad_norm": 0.21918092668056488,
+      "learning_rate": 0.00016551187232386141,
+      "loss": 1.0955,
+      "step": 449
+    },
+    {
+      "epoch": 0.17487612940833577,
+      "grad_norm": 0.22027818858623505,
+      "learning_rate": 0.0001654340210198521,
+      "loss": 1.0201,
+      "step": 450
+    },
+    {
+      "epoch": 0.17526474302924316,
+      "grad_norm": 0.2042885720729828,
+      "learning_rate": 0.00016535616971584275,
+      "loss": 1.0881,
+      "step": 451
+    },
+    {
+      "epoch": 0.17565335665015058,
+      "grad_norm": 0.21788261830806732,
+      "learning_rate": 0.0001652783184118334,
+      "loss": 1.0918,
+      "step": 452
+    },
+    {
+      "epoch": 0.176041970271058,
+      "grad_norm": 0.23332571983337402,
+      "learning_rate": 0.00016520046710782408,
+      "loss": 1.091,
+      "step": 453
+    },
+    {
+      "epoch": 0.17643058389196542,
+      "grad_norm": 0.20204192399978638,
+      "learning_rate": 0.00016512261580381473,
+      "loss": 1.0366,
+      "step": 454
+    },
+    {
+      "epoch": 0.17681919751287284,
+      "grad_norm": 0.21761906147003174,
+      "learning_rate": 0.00016504476449980538,
+      "loss": 1.0131,
+      "step": 455
+    },
+    {
+      "epoch": 0.17720781113378023,
+      "grad_norm": 0.2152051478624344,
+      "learning_rate": 0.00016496691319579606,
+      "loss": 1.0868,
+      "step": 456
+    },
+    {
+      "epoch": 0.17759642475468765,
+      "grad_norm": 0.22776494920253754,
+      "learning_rate": 0.0001648890618917867,
+      "loss": 1.0807,
+      "step": 457
+    },
+    {
+      "epoch": 0.17798503837559507,
+      "grad_norm": 0.2171342968940735,
+      "learning_rate": 0.00016481121058777736,
+      "loss": 1.0537,
+      "step": 458
+    },
+    {
+      "epoch": 0.17837365199650249,
+      "grad_norm": 0.2046273946762085,
+      "learning_rate": 0.00016473335928376802,
+      "loss": 1.0097,
+      "step": 459
+    },
+    {
+      "epoch": 0.17876226561740988,
+      "grad_norm": 0.2047681361436844,
+      "learning_rate": 0.00016465550797975867,
+      "loss": 1.0204,
+      "step": 460
+    },
+    {
+      "epoch": 0.1791508792383173,
+      "grad_norm": 0.1876862645149231,
+      "learning_rate": 0.00016457765667574935,
+      "loss": 0.9383,
+      "step": 461
+    },
+    {
+      "epoch": 0.17953949285922471,
+      "grad_norm": 0.218430757522583,
+      "learning_rate": 0.00016449980537174,
+      "loss": 1.0721,
+      "step": 462
+    },
+    {
+      "epoch": 0.17992810648013213,
+      "grad_norm": 0.2245480865240097,
+      "learning_rate": 0.00016442195406773065,
+      "loss": 1.0859,
+      "step": 463
+    },
+    {
+      "epoch": 0.18031672010103955,
+      "grad_norm": 0.22577151656150818,
+      "learning_rate": 0.0001643441027637213,
+      "loss": 1.0825,
+      "step": 464
+    },
+    {
+      "epoch": 0.18070533372194694,
+      "grad_norm": 0.20132745802402496,
+      "learning_rate": 0.00016426625145971196,
+      "loss": 1.0615,
+      "step": 465
+    },
+    {
+      "epoch": 0.18109394734285436,
+      "grad_norm": 0.2277505248785019,
+      "learning_rate": 0.00016418840015570263,
+      "loss": 1.0426,
+      "step": 466
+    },
+    {
+      "epoch": 0.18148256096376178,
+      "grad_norm": 0.22540105879306793,
+      "learning_rate": 0.0001641105488516933,
+      "loss": 1.0481,
+      "step": 467
+    },
+    {
+      "epoch": 0.1818711745846692,
+      "grad_norm": 0.20358088612556458,
+      "learning_rate": 0.00016403269754768394,
+      "loss": 1.0286,
+      "step": 468
+    },
+    {
+      "epoch": 0.18225978820557662,
+      "grad_norm": 0.22534145414829254,
+      "learning_rate": 0.0001639548462436746,
+      "loss": 1.1183,
+      "step": 469
+    },
+    {
+      "epoch": 0.182648401826484,
+      "grad_norm": 0.2188873142004013,
+      "learning_rate": 0.00016387699493966524,
+      "loss": 1.0439,
+      "step": 470
+    },
+    {
+      "epoch": 0.18303701544739143,
+      "grad_norm": 0.2128048539161682,
+      "learning_rate": 0.00016379914363565592,
+      "loss": 1.027,
+      "step": 471
+    },
+    {
+      "epoch": 0.18342562906829885,
+      "grad_norm": 0.2518141567707062,
+      "learning_rate": 0.00016372129233164657,
+      "loss": 1.0468,
+      "step": 472
+    },
+    {
+      "epoch": 0.18381424268920626,
+      "grad_norm": 0.2189142256975174,
+      "learning_rate": 0.00016364344102763723,
+      "loss": 1.0581,
+      "step": 473
+    },
+    {
+      "epoch": 0.18420285631011368,
+      "grad_norm": 0.31266725063323975,
+      "learning_rate": 0.00016356558972362788,
+      "loss": 1.0554,
+      "step": 474
+    },
+    {
+      "epoch": 0.18459146993102107,
+      "grad_norm": 0.21343916654586792,
+      "learning_rate": 0.00016348773841961853,
+      "loss": 1.0795,
+      "step": 475
+    },
+    {
+      "epoch": 0.1849800835519285,
+      "grad_norm": 0.22907280921936035,
+      "learning_rate": 0.00016340988711560918,
+      "loss": 1.0304,
+      "step": 476
+    },
+    {
+      "epoch": 0.1853686971728359,
+      "grad_norm": 0.2105257511138916,
+      "learning_rate": 0.00016333203581159986,
+      "loss": 1.0231,
+      "step": 477
+    },
+    {
+      "epoch": 0.18575731079374333,
+      "grad_norm": 0.19537831842899323,
+      "learning_rate": 0.00016325418450759051,
+      "loss": 1.0103,
+      "step": 478
+    },
+    {
+      "epoch": 0.18614592441465072,
+      "grad_norm": 0.20522372424602509,
+      "learning_rate": 0.00016317633320358117,
+      "loss": 1.0196,
+      "step": 479
+    },
+    {
+      "epoch": 0.18653453803555814,
+      "grad_norm": 0.21646477282047272,
+      "learning_rate": 0.00016309848189957182,
+      "loss": 1.0579,
+      "step": 480
+    },
+    {
+      "epoch": 0.18692315165646556,
+      "grad_norm": 0.21077193319797516,
+      "learning_rate": 0.00016302063059556247,
+      "loss": 1.0638,
+      "step": 481
+    },
+    {
+      "epoch": 0.18731176527737298,
+      "grad_norm": 0.20357473194599152,
+      "learning_rate": 0.00016294277929155315,
+      "loss": 1.0635,
+      "step": 482
+    },
+    {
+      "epoch": 0.1877003788982804,
+      "grad_norm": 0.2188001275062561,
+      "learning_rate": 0.0001628649279875438,
+      "loss": 1.0267,
+      "step": 483
+    },
+    {
+      "epoch": 0.1880889925191878,
+      "grad_norm": 0.2128928154706955,
+      "learning_rate": 0.00016278707668353445,
+      "loss": 0.9706,
+      "step": 484
+    },
+    {
+      "epoch": 0.1884776061400952,
+      "grad_norm": 0.22081372141838074,
+      "learning_rate": 0.0001627092253795251,
+      "loss": 1.08,
+      "step": 485
+    },
+    {
+      "epoch": 0.18886621976100262,
+      "grad_norm": 0.2250615805387497,
+      "learning_rate": 0.00016263137407551576,
+      "loss": 1.1451,
+      "step": 486
+    },
+    {
+      "epoch": 0.18925483338191004,
+      "grad_norm": 0.1984967589378357,
+      "learning_rate": 0.00016255352277150644,
+      "loss": 1.0744,
+      "step": 487
+    },
+    {
+      "epoch": 0.18964344700281746,
+      "grad_norm": 0.20778900384902954,
+      "learning_rate": 0.0001624756714674971,
+      "loss": 1.0623,
+      "step": 488
+    },
+    {
+      "epoch": 0.19003206062372485,
+      "grad_norm": 0.2026563137769699,
+      "learning_rate": 0.00016239782016348774,
+      "loss": 1.0714,
+      "step": 489
+    },
+    {
+      "epoch": 0.19042067424463227,
+      "grad_norm": 0.21598374843597412,
+      "learning_rate": 0.0001623199688594784,
+      "loss": 1.0869,
+      "step": 490
+    },
+    {
+      "epoch": 0.1908092878655397,
+      "grad_norm": 0.18944978713989258,
+      "learning_rate": 0.00016224211755546904,
+      "loss": 1.055,
+      "step": 491
+    },
+    {
+      "epoch": 0.1911979014864471,
+      "grad_norm": 0.20698946714401245,
+      "learning_rate": 0.00016216426625145972,
+      "loss": 1.0392,
+      "step": 492
+    },
+    {
+      "epoch": 0.1915865151073545,
+      "grad_norm": 0.22395353019237518,
+      "learning_rate": 0.00016208641494745038,
+      "loss": 1.0681,
+      "step": 493
+    },
+    {
+      "epoch": 0.19197512872826192,
+      "grad_norm": 0.22372962534427643,
+      "learning_rate": 0.00016200856364344103,
+      "loss": 1.0767,
+      "step": 494
+    },
+    {
+      "epoch": 0.19236374234916934,
+      "grad_norm": 0.2066701054573059,
+      "learning_rate": 0.00016193071233943168,
+      "loss": 1.0061,
+      "step": 495
+    },
+    {
+      "epoch": 0.19275235597007676,
+      "grad_norm": 0.19716408848762512,
+      "learning_rate": 0.00016185286103542233,
+      "loss": 1.039,
+      "step": 496
+    },
+    {
+      "epoch": 0.19314096959098417,
+      "grad_norm": 0.22159601747989655,
+      "learning_rate": 0.000161775009731413,
+      "loss": 1.0832,
+      "step": 497
+    },
+    {
+      "epoch": 0.19352958321189156,
+      "grad_norm": 0.21509626507759094,
+      "learning_rate": 0.00016169715842740366,
+      "loss": 1.0264,
+      "step": 498
+    },
+    {
+      "epoch": 0.19391819683279898,
+      "grad_norm": 0.21598199009895325,
+      "learning_rate": 0.00016161930712339431,
+      "loss": 1.049,
+      "step": 499
+    },
+    {
+      "epoch": 0.1943068104537064,
+      "grad_norm": 0.20279590785503387,
+      "learning_rate": 0.00016154145581938497,
+      "loss": 1.0505,
+      "step": 500
+    },
+    {
+      "epoch": 0.19469542407461382,
+      "grad_norm": 0.21796855330467224,
+      "learning_rate": 0.00016146360451537565,
+      "loss": 1.0885,
+      "step": 501
+    },
+    {
+      "epoch": 0.19508403769552124,
+      "grad_norm": 0.22128933668136597,
+      "learning_rate": 0.0001613857532113663,
+      "loss": 1.0903,
+      "step": 502
+    },
+    {
+      "epoch": 0.19547265131642863,
+      "grad_norm": 0.2032536417245865,
+      "learning_rate": 0.00016130790190735695,
+      "loss": 1.0285,
+      "step": 503
+    },
+    {
+      "epoch": 0.19586126493733605,
+      "grad_norm": 0.23738974332809448,
+      "learning_rate": 0.0001612300506033476,
+      "loss": 1.1188,
+      "step": 504
+    },
+    {
+      "epoch": 0.19624987855824347,
+      "grad_norm": 0.19614790380001068,
+      "learning_rate": 0.00016115219929933828,
+      "loss": 1.04,
+      "step": 505
+    },
+    {
+      "epoch": 0.1966384921791509,
+      "grad_norm": 0.2198178917169571,
+      "learning_rate": 0.00016107434799532893,
+      "loss": 1.0696,
+      "step": 506
+    },
+    {
+      "epoch": 0.1970271058000583,
+      "grad_norm": 0.18814648687839508,
+      "learning_rate": 0.00016099649669131959,
+      "loss": 1.0203,
+      "step": 507
+    },
+    {
+      "epoch": 0.1974157194209657,
+      "grad_norm": 0.20699037611484528,
+      "learning_rate": 0.00016091864538731026,
+      "loss": 1.1074,
+      "step": 508
+    },
+    {
+      "epoch": 0.19780433304187311,
+      "grad_norm": 0.21490445733070374,
+      "learning_rate": 0.00016084079408330092,
+      "loss": 1.0682,
+      "step": 509
+    },
+    {
+      "epoch": 0.19819294666278053,
+      "grad_norm": 0.2363848090171814,
+      "learning_rate": 0.00016076294277929157,
+      "loss": 1.0408,
+      "step": 510
+    },
+    {
+      "epoch": 0.19858156028368795,
+      "grad_norm": 0.20186659693717957,
+      "learning_rate": 0.00016068509147528222,
+      "loss": 1.026,
+      "step": 511
+    },
+    {
+      "epoch": 0.19897017390459534,
+      "grad_norm": 0.21564024686813354,
+      "learning_rate": 0.00016060724017127287,
+      "loss": 1.0418,
+      "step": 512
+    },
+    {
+      "epoch": 0.19935878752550276,
+      "grad_norm": 0.19151560962200165,
+      "learning_rate": 0.00016052938886726355,
+      "loss": 1.0037,
+      "step": 513
+    },
+    {
+      "epoch": 0.19974740114641018,
+      "grad_norm": 0.21038194000720978,
+      "learning_rate": 0.0001604515375632542,
+      "loss": 1.0545,
+      "step": 514
+    },
+    {
+      "epoch": 0.2001360147673176,
+      "grad_norm": 0.20496582984924316,
+      "learning_rate": 0.00016037368625924486,
+      "loss": 1.0543,
+      "step": 515
+    },
+    {
+      "epoch": 0.20052462838822502,
+      "grad_norm": 0.20689113438129425,
+      "learning_rate": 0.0001602958349552355,
+      "loss": 1.0905,
+      "step": 516
+    },
+    {
+      "epoch": 0.2009132420091324,
+      "grad_norm": 0.2284041792154312,
+      "learning_rate": 0.00016021798365122616,
+      "loss": 1.0717,
+      "step": 517
+    },
+    {
+      "epoch": 0.20130185563003983,
+      "grad_norm": 0.23457761108875275,
+      "learning_rate": 0.00016014013234721684,
+      "loss": 1.106,
+      "step": 518
+    },
+    {
+      "epoch": 0.20169046925094725,
+      "grad_norm": 0.2088528722524643,
+      "learning_rate": 0.0001600622810432075,
+      "loss": 1.0428,
+      "step": 519
+    },
+    {
+      "epoch": 0.20207908287185467,
+      "grad_norm": 0.2170068770647049,
+      "learning_rate": 0.00015998442973919814,
+      "loss": 0.9875,
+      "step": 520
+    },
+    {
+      "epoch": 0.20246769649276208,
+      "grad_norm": 0.2270561158657074,
+      "learning_rate": 0.0001599065784351888,
+      "loss": 1.0676,
+      "step": 521
+    },
+    {
+      "epoch": 0.20285631011366947,
+      "grad_norm": 0.2151324599981308,
+      "learning_rate": 0.00015982872713117945,
+      "loss": 1.0675,
+      "step": 522
+    },
+    {
+      "epoch": 0.2032449237345769,
+      "grad_norm": 0.23113249242305756,
+      "learning_rate": 0.00015975087582717013,
+      "loss": 1.0608,
+      "step": 523
+    },
+    {
+      "epoch": 0.2036335373554843,
+      "grad_norm": 0.2587106227874756,
+      "learning_rate": 0.00015967302452316078,
+      "loss": 1.0867,
+      "step": 524
+    },
+    {
+      "epoch": 0.20402215097639173,
+      "grad_norm": 0.21842992305755615,
+      "learning_rate": 0.00015959517321915143,
+      "loss": 1.0726,
+      "step": 525
+    },
+    {
+      "epoch": 0.20441076459729912,
+      "grad_norm": 0.20867805182933807,
+      "learning_rate": 0.00015951732191514208,
+      "loss": 1.0578,
+      "step": 526
+    },
+    {
+      "epoch": 0.20479937821820654,
+      "grad_norm": 0.2396962195634842,
+      "learning_rate": 0.00015943947061113273,
+      "loss": 1.0292,
+      "step": 527
+    },
+    {
+      "epoch": 0.20518799183911396,
+      "grad_norm": 0.221155047416687,
+      "learning_rate": 0.00015936161930712341,
+      "loss": 1.0019,
+      "step": 528
+    },
+    {
+      "epoch": 0.20557660546002138,
+      "grad_norm": 0.20032119750976562,
+      "learning_rate": 0.00015928376800311407,
+      "loss": 1.0435,
+      "step": 529
+    },
+    {
+      "epoch": 0.2059652190809288,
+      "grad_norm": 0.24095888435840607,
+      "learning_rate": 0.00015920591669910472,
+      "loss": 1.0355,
+      "step": 530
+    },
+    {
+      "epoch": 0.2063538327018362,
+      "grad_norm": 0.2286604344844818,
+      "learning_rate": 0.00015912806539509537,
+      "loss": 0.9989,
+      "step": 531
+    },
+    {
+      "epoch": 0.2067424463227436,
+      "grad_norm": 0.21537137031555176,
+      "learning_rate": 0.00015905021409108602,
+      "loss": 1.0642,
+      "step": 532
+    },
+    {
+      "epoch": 0.20713105994365102,
+      "grad_norm": 0.22447925806045532,
+      "learning_rate": 0.0001589723627870767,
+      "loss": 1.1244,
+      "step": 533
+    },
+    {
+      "epoch": 0.20751967356455844,
+      "grad_norm": 0.21077273786067963,
+      "learning_rate": 0.00015889451148306735,
+      "loss": 1.0167,
+      "step": 534
+    },
+    {
+      "epoch": 0.20790828718546586,
+      "grad_norm": 0.22340558469295502,
+      "learning_rate": 0.000158816660179058,
+      "loss": 1.0991,
+      "step": 535
+    },
+    {
+      "epoch": 0.20829690080637325,
+      "grad_norm": 0.223599374294281,
+      "learning_rate": 0.00015873880887504866,
+      "loss": 1.086,
+      "step": 536
+    },
+    {
+      "epoch": 0.20868551442728067,
+      "grad_norm": 0.2615208923816681,
+      "learning_rate": 0.0001586609575710393,
+      "loss": 1.0584,
+      "step": 537
+    },
+    {
+      "epoch": 0.2090741280481881,
+      "grad_norm": 0.2085907757282257,
+      "learning_rate": 0.00015858310626703,
+      "loss": 1.0994,
+      "step": 538
+    },
+    {
+      "epoch": 0.2094627416690955,
+      "grad_norm": 0.2170211672782898,
+      "learning_rate": 0.00015850525496302064,
+      "loss": 1.1105,
+      "step": 539
+    },
+    {
+      "epoch": 0.20985135529000293,
+      "grad_norm": 0.21978625655174255,
+      "learning_rate": 0.0001584274036590113,
+      "loss": 1.002,
+      "step": 540
+    },
+    {
+      "epoch": 0.21023996891091032,
+      "grad_norm": 0.23684021830558777,
+      "learning_rate": 0.00015834955235500194,
+      "loss": 1.1216,
+      "step": 541
+    },
+    {
+      "epoch": 0.21062858253181774,
+      "grad_norm": 0.220269113779068,
+      "learning_rate": 0.0001582717010509926,
+      "loss": 1.0773,
+      "step": 542
+    },
+    {
+      "epoch": 0.21101719615272516,
+      "grad_norm": 0.22447973489761353,
+      "learning_rate": 0.00015819384974698328,
+      "loss": 1.0941,
+      "step": 543
+    },
+    {
+      "epoch": 0.21140580977363257,
+      "grad_norm": 0.22435730695724487,
+      "learning_rate": 0.00015811599844297393,
+      "loss": 1.0138,
+      "step": 544
+    },
+    {
+      "epoch": 0.21179442339453997,
+      "grad_norm": 0.2230793684720993,
+      "learning_rate": 0.00015803814713896458,
+      "loss": 1.0343,
+      "step": 545
+    },
+    {
+      "epoch": 0.21218303701544738,
+      "grad_norm": 0.23491905629634857,
+      "learning_rate": 0.00015796029583495523,
+      "loss": 1.11,
+      "step": 546
+    },
+    {
+      "epoch": 0.2125716506363548,
+      "grad_norm": 0.213560551404953,
+      "learning_rate": 0.00015788244453094588,
+      "loss": 1.0615,
+      "step": 547
+    },
+    {
+      "epoch": 0.21296026425726222,
+      "grad_norm": 0.21392837166786194,
+      "learning_rate": 0.00015780459322693654,
+      "loss": 1.0872,
+      "step": 548
+    },
+    {
+      "epoch": 0.21334887787816964,
+      "grad_norm": 0.20007692277431488,
+      "learning_rate": 0.00015772674192292722,
+      "loss": 1.0394,
+      "step": 549
+    },
+    {
+      "epoch": 0.21373749149907703,
+      "grad_norm": 0.1969841718673706,
+      "learning_rate": 0.00015764889061891787,
+      "loss": 1.0381,
+      "step": 550
+    },
+    {
+      "epoch": 0.21412610511998445,
+      "grad_norm": 0.21874025464057922,
+      "learning_rate": 0.00015757103931490852,
+      "loss": 1.0822,
+      "step": 551
+    },
+    {
+      "epoch": 0.21451471874089187,
+      "grad_norm": 0.21824273467063904,
+      "learning_rate": 0.00015749318801089917,
+      "loss": 1.0802,
+      "step": 552
+    },
+    {
+      "epoch": 0.2149033323617993,
+      "grad_norm": 0.20942047238349915,
+      "learning_rate": 0.00015741533670688985,
+      "loss": 1.0634,
+      "step": 553
+    },
+    {
+      "epoch": 0.2152919459827067,
+      "grad_norm": 0.1940152943134308,
+      "learning_rate": 0.0001573374854028805,
+      "loss": 1.0264,
+      "step": 554
+    },
+    {
+      "epoch": 0.2156805596036141,
+      "grad_norm": 0.19859059154987335,
+      "learning_rate": 0.00015725963409887115,
+      "loss": 0.9701,
+      "step": 555
+    },
+    {
+      "epoch": 0.21606917322452152,
+      "grad_norm": 0.22239404916763306,
+      "learning_rate": 0.0001571817827948618,
+      "loss": 1.1282,
+      "step": 556
+    },
+    {
+      "epoch": 0.21645778684542893,
+      "grad_norm": 0.23820599913597107,
+      "learning_rate": 0.00015710393149085249,
+      "loss": 1.1123,
+      "step": 557
+    },
+    {
+      "epoch": 0.21684640046633635,
+      "grad_norm": 0.21279917657375336,
+      "learning_rate": 0.00015702608018684314,
+      "loss": 1.0542,
+      "step": 558
+    },
+    {
+      "epoch": 0.21723501408724374,
+      "grad_norm": 0.2065514773130417,
+      "learning_rate": 0.0001569482288828338,
+      "loss": 1.0685,
+      "step": 559
+    },
+    {
+      "epoch": 0.21762362770815116,
+      "grad_norm": 0.20130831003189087,
+      "learning_rate": 0.00015687037757882447,
+      "loss": 0.9869,
+      "step": 560
+    },
+    {
+      "epoch": 0.21801224132905858,
+      "grad_norm": 0.2187541127204895,
+      "learning_rate": 0.00015679252627481512,
+      "loss": 1.1095,
+      "step": 561
+    },
+    {
+      "epoch": 0.218400854949966,
+      "grad_norm": 0.21028277277946472,
+      "learning_rate": 0.00015671467497080577,
+      "loss": 1.0804,
+      "step": 562
+    },
+    {
+      "epoch": 0.21878946857087342,
+      "grad_norm": 0.8187636733055115,
+      "learning_rate": 0.00015663682366679643,
+      "loss": 1.0782,
+      "step": 563
+    },
+    {
+      "epoch": 0.2191780821917808,
+      "grad_norm": 0.20059974491596222,
+      "learning_rate": 0.0001565589723627871,
+      "loss": 1.0279,
+      "step": 564
+    },
+    {
+      "epoch": 0.21956669581268823,
+      "grad_norm": 0.20440839231014252,
+      "learning_rate": 0.00015648112105877776,
+      "loss": 0.9863,
+      "step": 565
+    },
+    {
+      "epoch": 0.21995530943359565,
+      "grad_norm": 0.21423624455928802,
+      "learning_rate": 0.0001564032697547684,
+      "loss": 1.0685,
+      "step": 566
+    },
+    {
+      "epoch": 0.22034392305450307,
+      "grad_norm": 0.22430062294006348,
+      "learning_rate": 0.00015632541845075906,
+      "loss": 1.0761,
+      "step": 567
+    },
+    {
+      "epoch": 0.22073253667541048,
+      "grad_norm": 0.22782258689403534,
+      "learning_rate": 0.0001562475671467497,
+      "loss": 1.1024,
+      "step": 568
+    },
+    {
+      "epoch": 0.22112115029631788,
+      "grad_norm": 0.21150320768356323,
+      "learning_rate": 0.0001561697158427404,
+      "loss": 1.0621,
+      "step": 569
+    },
+    {
+      "epoch": 0.2215097639172253,
+      "grad_norm": 0.20342351496219635,
+      "learning_rate": 0.00015609186453873104,
+      "loss": 1.0667,
+      "step": 570
+    },
+    {
+      "epoch": 0.2218983775381327,
+      "grad_norm": 0.22866711020469666,
+      "learning_rate": 0.0001560140132347217,
+      "loss": 1.0631,
+      "step": 571
+    },
+    {
+      "epoch": 0.22228699115904013,
+      "grad_norm": 0.2200063169002533,
+      "learning_rate": 0.00015593616193071235,
+      "loss": 1.0448,
+      "step": 572
+    },
+    {
+      "epoch": 0.22267560477994755,
+      "grad_norm": 0.19440248608589172,
+      "learning_rate": 0.000155858310626703,
+      "loss": 1.037,
+      "step": 573
+    },
+    {
+      "epoch": 0.22306421840085494,
+      "grad_norm": 0.205752432346344,
+      "learning_rate": 0.00015578045932269368,
+      "loss": 1.0465,
+      "step": 574
+    },
+    {
+      "epoch": 0.22345283202176236,
+      "grad_norm": 0.22247998416423798,
+      "learning_rate": 0.00015570260801868433,
+      "loss": 0.997,
+      "step": 575
+    },
+    {
+      "epoch": 0.22384144564266978,
+      "grad_norm": 0.22199274599552155,
+      "learning_rate": 0.00015562475671467498,
+      "loss": 1.0178,
+      "step": 576
+    },
+    {
+      "epoch": 0.2242300592635772,
+      "grad_norm": 0.2114989310503006,
+      "learning_rate": 0.00015554690541066564,
+      "loss": 1.0457,
+      "step": 577
+    },
+    {
+      "epoch": 0.2246186728844846,
+      "grad_norm": 0.24248506128787994,
+      "learning_rate": 0.0001554690541066563,
+      "loss": 1.002,
+      "step": 578
+    },
+    {
+      "epoch": 0.225007286505392,
+      "grad_norm": 0.2565505802631378,
+      "learning_rate": 0.00015539120280264697,
+      "loss": 1.0541,
+      "step": 579
+    },
+    {
+      "epoch": 0.22539590012629943,
+      "grad_norm": 0.22799409925937653,
+      "learning_rate": 0.00015531335149863762,
+      "loss": 1.0788,
+      "step": 580
+    },
+    {
+      "epoch": 0.22578451374720684,
+      "grad_norm": 0.2196080982685089,
+      "learning_rate": 0.00015523550019462827,
+      "loss": 1.0877,
+      "step": 581
+    },
+    {
+      "epoch": 0.22617312736811426,
+      "grad_norm": 0.21992824971675873,
+      "learning_rate": 0.00015515764889061892,
+      "loss": 1.0213,
+      "step": 582
+    },
+    {
+      "epoch": 0.22656174098902165,
+      "grad_norm": 0.22793298959732056,
+      "learning_rate": 0.00015507979758660957,
+      "loss": 1.0633,
+      "step": 583
+    },
+    {
+      "epoch": 0.22695035460992907,
+      "grad_norm": 0.21707972884178162,
+      "learning_rate": 0.00015500194628260023,
+      "loss": 1.081,
+      "step": 584
+    },
+    {
+      "epoch": 0.2273389682308365,
+      "grad_norm": 0.220685675740242,
+      "learning_rate": 0.0001549240949785909,
+      "loss": 1.0658,
+      "step": 585
+    },
+    {
+      "epoch": 0.2277275818517439,
+      "grad_norm": 0.22576668858528137,
+      "learning_rate": 0.00015484624367458156,
+      "loss": 1.0795,
+      "step": 586
+    },
+    {
+      "epoch": 0.22811619547265133,
+      "grad_norm": 0.21778982877731323,
+      "learning_rate": 0.0001547683923705722,
+      "loss": 1.033,
+      "step": 587
+    },
+    {
+      "epoch": 0.22850480909355872,
+      "grad_norm": 0.22748610377311707,
+      "learning_rate": 0.00015469054106656286,
+      "loss": 1.0948,
+      "step": 588
+    },
+    {
+      "epoch": 0.22889342271446614,
+      "grad_norm": 0.21561284363269806,
+      "learning_rate": 0.00015461268976255351,
+      "loss": 1.0022,
+      "step": 589
+    },
+    {
+      "epoch": 0.22928203633537356,
+      "grad_norm": 0.2419756054878235,
+      "learning_rate": 0.0001545348384585442,
+      "loss": 1.0786,
+      "step": 590
+    },
+    {
+      "epoch": 0.22967064995628098,
+      "grad_norm": 0.20479315519332886,
+      "learning_rate": 0.00015445698715453485,
+      "loss": 1.027,
+      "step": 591
+    },
+    {
+      "epoch": 0.2300592635771884,
+      "grad_norm": 0.21365883946418762,
+      "learning_rate": 0.0001543791358505255,
+      "loss": 1.0773,
+      "step": 592
+    },
+    {
+      "epoch": 0.23044787719809579,
+      "grad_norm": 0.23133166134357452,
+      "learning_rate": 0.00015430128454651615,
+      "loss": 1.0877,
+      "step": 593
+    },
+    {
+      "epoch": 0.2308364908190032,
+      "grad_norm": 0.2110515981912613,
+      "learning_rate": 0.0001542234332425068,
+      "loss": 1.0509,
+      "step": 594
+    },
+    {
+      "epoch": 0.23122510443991062,
+      "grad_norm": 0.20658442378044128,
+      "learning_rate": 0.00015414558193849748,
+      "loss": 1.0623,
+      "step": 595
+    },
+    {
+      "epoch": 0.23161371806081804,
+      "grad_norm": 0.21831996738910675,
+      "learning_rate": 0.00015406773063448813,
+      "loss": 1.021,
+      "step": 596
+    },
+    {
+      "epoch": 0.23200233168172543,
+      "grad_norm": 0.23015642166137695,
+      "learning_rate": 0.00015398987933047878,
+      "loss": 1.0358,
+      "step": 597
+    },
+    {
+      "epoch": 0.23239094530263285,
+      "grad_norm": 0.23071645200252533,
+      "learning_rate": 0.00015391202802646944,
+      "loss": 1.1255,
+      "step": 598
+    },
+    {
+      "epoch": 0.23277955892354027,
+      "grad_norm": 0.19513486325740814,
+      "learning_rate": 0.0001538341767224601,
+      "loss": 1.0189,
+      "step": 599
+    },
+    {
+      "epoch": 0.2331681725444477,
+      "grad_norm": 0.20821452140808105,
+      "learning_rate": 0.00015375632541845077,
+      "loss": 1.0843,
+      "step": 600
+    },
+    {
+      "epoch": 0.2335567861653551,
+      "grad_norm": 0.20563223958015442,
+      "learning_rate": 0.00015367847411444142,
+      "loss": 1.0012,
+      "step": 601
+    },
+    {
+      "epoch": 0.2339453997862625,
+      "grad_norm": 0.22674202919006348,
+      "learning_rate": 0.00015360062281043207,
+      "loss": 1.0371,
+      "step": 602
+    },
+    {
+      "epoch": 0.23433401340716992,
+      "grad_norm": 0.20744135975837708,
+      "learning_rate": 0.00015352277150642272,
+      "loss": 1.0466,
+      "step": 603
+    },
+    {
+      "epoch": 0.23472262702807734,
+      "grad_norm": 0.22103577852249146,
+      "learning_rate": 0.00015344492020241338,
+      "loss": 1.0942,
+      "step": 604
+    },
+    {
+      "epoch": 0.23511124064898475,
+      "grad_norm": 0.20643098652362823,
+      "learning_rate": 0.00015336706889840406,
+      "loss": 1.0682,
+      "step": 605
+    },
+    {
+      "epoch": 0.23549985426989217,
+      "grad_norm": 0.23436777293682098,
+      "learning_rate": 0.0001532892175943947,
+      "loss": 1.0613,
+      "step": 606
+    },
+    {
+      "epoch": 0.23588846789079956,
+      "grad_norm": 0.21898899972438812,
+      "learning_rate": 0.00015321136629038536,
+      "loss": 1.0571,
+      "step": 607
+    },
+    {
+      "epoch": 0.23627708151170698,
+      "grad_norm": 0.20569247007369995,
+      "learning_rate": 0.00015313351498637604,
+      "loss": 1.061,
+      "step": 608
+    },
+    {
+      "epoch": 0.2366656951326144,
+      "grad_norm": 0.2099207490682602,
+      "learning_rate": 0.0001530556636823667,
+      "loss": 1.0776,
+      "step": 609
+    },
+    {
+      "epoch": 0.23705430875352182,
+      "grad_norm": 0.20078738033771515,
+      "learning_rate": 0.00015297781237835734,
+      "loss": 1.0341,
+      "step": 610
+    },
+    {
+      "epoch": 0.2374429223744292,
+      "grad_norm": 0.20327065885066986,
+      "learning_rate": 0.000152899961074348,
+      "loss": 1.0168,
+      "step": 611
+    },
+    {
+      "epoch": 0.23783153599533663,
+      "grad_norm": 0.21741214394569397,
+      "learning_rate": 0.00015282210977033867,
+      "loss": 1.0726,
+      "step": 612
+    },
+    {
+      "epoch": 0.23822014961624405,
+      "grad_norm": 0.2065727263689041,
+      "learning_rate": 0.00015274425846632933,
+      "loss": 1.0474,
+      "step": 613
+    },
+    {
+      "epoch": 0.23860876323715147,
+      "grad_norm": 0.21241194009780884,
+      "learning_rate": 0.00015266640716231998,
+      "loss": 1.0666,
+      "step": 614
+    },
+    {
+      "epoch": 0.23899737685805889,
+      "grad_norm": 0.2194201797246933,
+      "learning_rate": 0.00015258855585831066,
+      "loss": 1.1411,
+      "step": 615
+    },
+    {
+      "epoch": 0.23938599047896628,
+      "grad_norm": 0.21537193655967712,
+      "learning_rate": 0.0001525107045543013,
+      "loss": 1.081,
+      "step": 616
+    },
+    {
+      "epoch": 0.2397746040998737,
+      "grad_norm": 0.21125951409339905,
+      "learning_rate": 0.00015243285325029196,
+      "loss": 1.0679,
+      "step": 617
+    },
+    {
+      "epoch": 0.2401632177207811,
+      "grad_norm": 0.21342721581459045,
+      "learning_rate": 0.0001523550019462826,
+      "loss": 1.0564,
+      "step": 618
+    },
+    {
+      "epoch": 0.24055183134168853,
+      "grad_norm": 0.2223503291606903,
+      "learning_rate": 0.00015227715064227327,
+      "loss": 1.1163,
+      "step": 619
+    },
+    {
+      "epoch": 0.24094044496259595,
+      "grad_norm": 0.21626527607440948,
+      "learning_rate": 0.00015219929933826394,
+      "loss": 1.0793,
+      "step": 620
+    },
+    {
+      "epoch": 0.24132905858350334,
+      "grad_norm": 0.21899500489234924,
+      "learning_rate": 0.0001521214480342546,
+      "loss": 1.0864,
+      "step": 621
+    },
+    {
+      "epoch": 0.24171767220441076,
+      "grad_norm": 0.2499915212392807,
+      "learning_rate": 0.00015204359673024525,
+      "loss": 1.1381,
+      "step": 622
+    },
+    {
+      "epoch": 0.24210628582531818,
+      "grad_norm": 0.2108345925807953,
+      "learning_rate": 0.0001519657454262359,
+      "loss": 1.0534,
+      "step": 623
+    },
+    {
+      "epoch": 0.2424948994462256,
+      "grad_norm": 0.2224910855293274,
+      "learning_rate": 0.00015188789412222655,
+      "loss": 1.0235,
+      "step": 624
+    },
+    {
+      "epoch": 0.24288351306713302,
+      "grad_norm": 0.22163094580173492,
+      "learning_rate": 0.0001518100428182172,
+      "loss": 1.0143,
+      "step": 625
+    },
+    {
+      "epoch": 0.2432721266880404,
+      "grad_norm": 0.20709283649921417,
+      "learning_rate": 0.00015173219151420788,
+      "loss": 1.0506,
+      "step": 626
+    },
+    {
+      "epoch": 0.24366074030894783,
+      "grad_norm": 0.2112802267074585,
+      "learning_rate": 0.00015165434021019854,
+      "loss": 1.0692,
+      "step": 627
+    },
+    {
+      "epoch": 0.24404935392985525,
+      "grad_norm": 0.23622830212116241,
+      "learning_rate": 0.0001515764889061892,
+      "loss": 1.0769,
+      "step": 628
+    },
+    {
+      "epoch": 0.24443796755076266,
+      "grad_norm": 0.23328271508216858,
+      "learning_rate": 0.00015149863760217984,
+      "loss": 1.1158,
+      "step": 629
+    },
+    {
+      "epoch": 0.24482658117167005,
+      "grad_norm": 0.2071760892868042,
+      "learning_rate": 0.0001514207862981705,
+      "loss": 1.0133,
+      "step": 630
+    },
+    {
+      "epoch": 0.24521519479257747,
+      "grad_norm": 0.21428920328617096,
+      "learning_rate": 0.00015134293499416117,
+      "loss": 1.0342,
+      "step": 631
+    },
+    {
+      "epoch": 0.2456038084134849,
+      "grad_norm": 0.22225375473499298,
+      "learning_rate": 0.00015126508369015182,
+      "loss": 1.1054,
+      "step": 632
+    },
+    {
+      "epoch": 0.2459924220343923,
+      "grad_norm": 0.2096671611070633,
+      "learning_rate": 0.00015118723238614248,
+      "loss": 1.0229,
+      "step": 633
+    },
+    {
+      "epoch": 0.24638103565529973,
+      "grad_norm": 0.21473252773284912,
+      "learning_rate": 0.00015110938108213313,
+      "loss": 1.0915,
+      "step": 634
+    },
+    {
+      "epoch": 0.24676964927620712,
+      "grad_norm": 0.2071562111377716,
+      "learning_rate": 0.00015103152977812378,
+      "loss": 1.047,
+      "step": 635
+    },
+    {
+      "epoch": 0.24715826289711454,
+      "grad_norm": 0.19868609309196472,
+      "learning_rate": 0.00015095367847411446,
+      "loss": 1.0073,
+      "step": 636
+    },
+    {
+      "epoch": 0.24754687651802196,
+      "grad_norm": 0.20937366783618927,
+      "learning_rate": 0.0001508758271701051,
+      "loss": 1.0155,
+      "step": 637
+    },
+    {
+      "epoch": 0.24793549013892938,
+      "grad_norm": 0.19225911796092987,
+      "learning_rate": 0.00015079797586609576,
+      "loss": 1.0163,
+      "step": 638
+    },
+    {
+      "epoch": 0.2483241037598368,
+      "grad_norm": 0.20427283644676208,
+      "learning_rate": 0.00015072012456208641,
+      "loss": 1.062,
+      "step": 639
+    },
+    {
+      "epoch": 0.24871271738074419,
+      "grad_norm": 0.21640253067016602,
+      "learning_rate": 0.00015064227325807707,
+      "loss": 1.025,
+      "step": 640
+    },
+    {
+      "epoch": 0.2491013310016516,
+      "grad_norm": 0.20416739583015442,
+      "learning_rate": 0.00015056442195406775,
+      "loss": 1.0635,
+      "step": 641
+    },
+    {
+      "epoch": 0.24948994462255902,
+      "grad_norm": 0.1990521252155304,
+      "learning_rate": 0.0001504865706500584,
+      "loss": 1.0757,
+      "step": 642
+    },
+    {
+      "epoch": 0.24987855824346644,
+      "grad_norm": 0.21636444330215454,
+      "learning_rate": 0.00015040871934604905,
+      "loss": 1.0441,
+      "step": 643
+    },
+    {
+      "epoch": 0.25026717186437386,
+      "grad_norm": 0.21253719925880432,
+      "learning_rate": 0.0001503308680420397,
+      "loss": 1.0574,
+      "step": 644
+    },
+    {
+      "epoch": 0.2506557854852813,
+      "grad_norm": 0.2134159356355667,
+      "learning_rate": 0.00015025301673803035,
+      "loss": 1.0396,
+      "step": 645
+    },
+    {
+      "epoch": 0.2510443991061887,
+      "grad_norm": 0.2018527239561081,
+      "learning_rate": 0.00015017516543402103,
+      "loss": 1.0606,
+      "step": 646
+    },
+    {
+      "epoch": 0.25143301272709606,
+      "grad_norm": 0.20320741832256317,
+      "learning_rate": 0.00015009731413001169,
+      "loss": 1.0093,
+      "step": 647
+    },
+    {
+      "epoch": 0.2518216263480035,
+      "grad_norm": 0.21007056534290314,
+      "learning_rate": 0.00015001946282600234,
+      "loss": 1.0284,
+      "step": 648
+    },
+    {
+      "epoch": 0.2522102399689109,
+      "grad_norm": 0.22453372180461884,
+      "learning_rate": 0.000149941611521993,
+      "loss": 1.0271,
+      "step": 649
+    },
+    {
+      "epoch": 0.2525988535898183,
+      "grad_norm": 0.19889335334300995,
+      "learning_rate": 0.00014986376021798364,
+      "loss": 1.0238,
+      "step": 650
+    },
+    {
+      "epoch": 0.25298746721072574,
+      "grad_norm": 0.19339965283870697,
+      "learning_rate": 0.00014978590891397432,
+      "loss": 1.024,
+      "step": 651
+    },
+    {
+      "epoch": 0.25337608083163315,
+      "grad_norm": 0.22362011671066284,
+      "learning_rate": 0.00014970805760996497,
+      "loss": 1.0722,
+      "step": 652
+    },
+    {
+      "epoch": 0.2537646944525406,
+      "grad_norm": 0.2110588103532791,
+      "learning_rate": 0.00014963020630595562,
+      "loss": 1.0541,
+      "step": 653
+    },
+    {
+      "epoch": 0.254153308073448,
+      "grad_norm": 0.203025683760643,
+      "learning_rate": 0.00014955235500194628,
+      "loss": 1.0335,
+      "step": 654
+    },
+    {
+      "epoch": 0.2545419216943554,
+      "grad_norm": 0.20884902775287628,
+      "learning_rate": 0.00014947450369793693,
+      "loss": 1.0507,
+      "step": 655
+    },
+    {
+      "epoch": 0.2549305353152628,
+      "grad_norm": 0.21234256029129028,
+      "learning_rate": 0.0001493966523939276,
+      "loss": 1.0372,
+      "step": 656
+    },
+    {
+      "epoch": 0.2553191489361702,
+      "grad_norm": 0.1984352171421051,
+      "learning_rate": 0.00014931880108991826,
+      "loss": 0.9979,
+      "step": 657
+    },
+    {
+      "epoch": 0.2557077625570776,
+      "grad_norm": 0.18848282098770142,
+      "learning_rate": 0.0001492409497859089,
+      "loss": 0.9973,
+      "step": 658
+    },
+    {
+      "epoch": 0.25609637617798503,
+      "grad_norm": 0.2201709896326065,
+      "learning_rate": 0.00014916309848189956,
+      "loss": 1.0386,
+      "step": 659
+    },
+    {
+      "epoch": 0.25648498979889245,
+      "grad_norm": 0.23094095289707184,
+      "learning_rate": 0.00014908524717789024,
+      "loss": 1.1205,
+      "step": 660
+    },
+    {
+      "epoch": 0.25687360341979987,
+      "grad_norm": 0.21087734401226044,
+      "learning_rate": 0.0001490073958738809,
+      "loss": 1.0231,
+      "step": 661
+    },
+    {
+      "epoch": 0.2572622170407073,
+      "grad_norm": 0.24970979988574982,
+      "learning_rate": 0.00014892954456987155,
+      "loss": 1.0421,
+      "step": 662
+    },
+    {
+      "epoch": 0.2576508306616147,
+      "grad_norm": 0.22024711966514587,
+      "learning_rate": 0.00014885169326586223,
+      "loss": 1.1033,
+      "step": 663
+    },
+    {
+      "epoch": 0.2580394442825221,
+      "grad_norm": 0.2195248156785965,
+      "learning_rate": 0.00014877384196185288,
+      "loss": 1.089,
+      "step": 664
+    },
+    {
+      "epoch": 0.25842805790342954,
+      "grad_norm": 0.20236417651176453,
+      "learning_rate": 0.00014869599065784353,
+      "loss": 1.0196,
+      "step": 665
+    },
+    {
+      "epoch": 0.2588166715243369,
+      "grad_norm": 0.21973329782485962,
+      "learning_rate": 0.00014861813935383418,
+      "loss": 1.0844,
+      "step": 666
+    },
+    {
+      "epoch": 0.2592052851452443,
+      "grad_norm": 0.2069879174232483,
+      "learning_rate": 0.00014854028804982486,
+      "loss": 1.0312,
+      "step": 667
+    },
+    {
+      "epoch": 0.25959389876615174,
+      "grad_norm": 0.2037455290555954,
+      "learning_rate": 0.00014846243674581551,
+      "loss": 1.0018,
+      "step": 668
+    },
+    {
+      "epoch": 0.25998251238705916,
+      "grad_norm": 0.24176378548145294,
+      "learning_rate": 0.00014838458544180617,
+      "loss": 1.0749,
+      "step": 669
+    },
+    {
+      "epoch": 0.2603711260079666,
+      "grad_norm": 0.2007879763841629,
+      "learning_rate": 0.00014830673413779682,
+      "loss": 1.0443,
+      "step": 670
+    },
+    {
+      "epoch": 0.260759739628874,
+      "grad_norm": 0.23503245413303375,
+      "learning_rate": 0.00014822888283378747,
+      "loss": 1.0674,
+      "step": 671
+    },
+    {
+      "epoch": 0.2611483532497814,
+      "grad_norm": 0.2166167050600052,
+      "learning_rate": 0.00014815103152977815,
+      "loss": 1.079,
+      "step": 672
+    },
+    {
+      "epoch": 0.26153696687068884,
+      "grad_norm": 0.2293982058763504,
+      "learning_rate": 0.0001480731802257688,
+      "loss": 1.0517,
+      "step": 673
+    },
+    {
+      "epoch": 0.26192558049159625,
+      "grad_norm": 0.21040330827236176,
+      "learning_rate": 0.00014799532892175945,
+      "loss": 1.0475,
+      "step": 674
+    },
+    {
+      "epoch": 0.2623141941125036,
+      "grad_norm": 0.20750463008880615,
+      "learning_rate": 0.0001479174776177501,
+      "loss": 1.025,
+      "step": 675
+    },
+    {
+      "epoch": 0.26270280773341104,
+      "grad_norm": 0.2748873233795166,
+      "learning_rate": 0.00014783962631374076,
+      "loss": 1.0212,
+      "step": 676
+    },
+    {
+      "epoch": 0.26309142135431846,
+      "grad_norm": 0.19212333858013153,
+      "learning_rate": 0.00014776177500973144,
+      "loss": 1.0049,
+      "step": 677
+    },
+    {
+      "epoch": 0.2634800349752259,
+      "grad_norm": 0.207731693983078,
+      "learning_rate": 0.0001476839237057221,
+      "loss": 1.0062,
+      "step": 678
+    },
+    {
+      "epoch": 0.2638686485961333,
+      "grad_norm": 0.2177981585264206,
+      "learning_rate": 0.00014760607240171274,
+      "loss": 1.0489,
+      "step": 679
+    },
+    {
+      "epoch": 0.2642572622170407,
+      "grad_norm": 0.23239290714263916,
+      "learning_rate": 0.0001475282210977034,
+      "loss": 1.0856,
+      "step": 680
+    },
+    {
+      "epoch": 0.26464587583794813,
+      "grad_norm": 0.2033151388168335,
+      "learning_rate": 0.00014745036979369404,
+      "loss": 1.0389,
+      "step": 681
+    },
+    {
+      "epoch": 0.26503448945885555,
+      "grad_norm": 0.20917408168315887,
+      "learning_rate": 0.00014737251848968472,
+      "loss": 1.1208,
+      "step": 682
+    },
+    {
+      "epoch": 0.26542310307976297,
+      "grad_norm": 0.22075454890727997,
+      "learning_rate": 0.00014729466718567538,
+      "loss": 1.0435,
+      "step": 683
+    },
+    {
+      "epoch": 0.26581171670067033,
+      "grad_norm": 0.23094993829727173,
+      "learning_rate": 0.00014721681588166603,
+      "loss": 1.0649,
+      "step": 684
+    },
+    {
+      "epoch": 0.26620033032157775,
+      "grad_norm": 0.21209536492824554,
+      "learning_rate": 0.00014713896457765668,
+      "loss": 1.0578,
+      "step": 685
+    },
+    {
+      "epoch": 0.26658894394248517,
+      "grad_norm": 0.21412219107151031,
+      "learning_rate": 0.00014706111327364733,
+      "loss": 1.1137,
+      "step": 686
+    },
+    {
+      "epoch": 0.2669775575633926,
+      "grad_norm": 0.21175475418567657,
+      "learning_rate": 0.000146983261969638,
+      "loss": 1.023,
+      "step": 687
+    },
+    {
+      "epoch": 0.2673661711843,
+      "grad_norm": 0.21968993544578552,
+      "learning_rate": 0.00014690541066562866,
+      "loss": 1.1183,
+      "step": 688
+    },
+    {
+      "epoch": 0.2677547848052074,
+      "grad_norm": 0.20414218306541443,
+      "learning_rate": 0.00014682755936161932,
+      "loss": 1.078,
+      "step": 689
+    },
+    {
+      "epoch": 0.26814339842611484,
+      "grad_norm": 0.18986597657203674,
+      "learning_rate": 0.00014674970805760997,
+      "loss": 1.0029,
+      "step": 690
+    },
+    {
+      "epoch": 0.26853201204702226,
+      "grad_norm": 0.21215832233428955,
+      "learning_rate": 0.00014667185675360062,
+      "loss": 1.0759,
+      "step": 691
+    },
+    {
+      "epoch": 0.2689206256679297,
+      "grad_norm": 0.2113744169473648,
+      "learning_rate": 0.0001465940054495913,
+      "loss": 1.1027,
+      "step": 692
+    },
+    {
+      "epoch": 0.2693092392888371,
+      "grad_norm": 0.22010880708694458,
+      "learning_rate": 0.00014651615414558195,
+      "loss": 1.0984,
+      "step": 693
+    },
+    {
+      "epoch": 0.26969785290974446,
+      "grad_norm": 0.203857421875,
+      "learning_rate": 0.0001464383028415726,
+      "loss": 1.0407,
+      "step": 694
+    },
+    {
+      "epoch": 0.2700864665306519,
+      "grad_norm": 0.21120867133140564,
+      "learning_rate": 0.00014636045153756325,
+      "loss": 1.0521,
+      "step": 695
+    },
+    {
+      "epoch": 0.2704750801515593,
+      "grad_norm": 0.20039112865924835,
+      "learning_rate": 0.0001462826002335539,
+      "loss": 1.0897,
+      "step": 696
+    },
+    {
+      "epoch": 0.2708636937724667,
+      "grad_norm": 0.22893202304840088,
+      "learning_rate": 0.00014620474892954456,
+      "loss": 1.0903,
+      "step": 697
+    },
+    {
+      "epoch": 0.27125230739337414,
+      "grad_norm": 0.19886267185211182,
+      "learning_rate": 0.00014612689762553524,
+      "loss": 1.0889,
+      "step": 698
+    },
+    {
+      "epoch": 0.27164092101428156,
+      "grad_norm": 0.18892349302768707,
+      "learning_rate": 0.0001460490463215259,
+      "loss": 0.981,
+      "step": 699
+    },
+    {
+      "epoch": 0.272029534635189,
+      "grad_norm": 0.20602507889270782,
+      "learning_rate": 0.00014597119501751654,
+      "loss": 1.0223,
+      "step": 700
+    },
+    {
+      "epoch": 0.2724181482560964,
+      "grad_norm": 0.21480505168437958,
+      "learning_rate": 0.0001458933437135072,
+      "loss": 1.0355,
+      "step": 701
+    },
+    {
+      "epoch": 0.2728067618770038,
+      "grad_norm": 0.21011753380298615,
+      "learning_rate": 0.00014581549240949785,
+      "loss": 1.0613,
+      "step": 702
+    },
+    {
+      "epoch": 0.2731953754979112,
+      "grad_norm": 0.19350819289684296,
+      "learning_rate": 0.00014573764110548853,
+      "loss": 1.0144,
+      "step": 703
+    },
+    {
+      "epoch": 0.2735839891188186,
+      "grad_norm": 0.207548126578331,
+      "learning_rate": 0.00014565978980147918,
+      "loss": 1.0465,
+      "step": 704
+    },
+    {
+      "epoch": 0.273972602739726,
+      "grad_norm": 0.22220565378665924,
+      "learning_rate": 0.00014558193849746983,
+      "loss": 1.1073,
+      "step": 705
+    },
+    {
+      "epoch": 0.27436121636063343,
+      "grad_norm": 0.193622425198555,
+      "learning_rate": 0.00014550408719346048,
+      "loss": 1.0357,
+      "step": 706
+    },
+    {
+      "epoch": 0.27474982998154085,
+      "grad_norm": 0.2067158818244934,
+      "learning_rate": 0.00014542623588945113,
+      "loss": 1.0502,
+      "step": 707
+    },
+    {
+      "epoch": 0.27513844360244827,
+      "grad_norm": 0.2218742072582245,
+      "learning_rate": 0.0001453483845854418,
+      "loss": 0.9934,
+      "step": 708
+    },
+    {
+      "epoch": 0.2755270572233557,
+      "grad_norm": 0.22316142916679382,
+      "learning_rate": 0.00014527053328143246,
+      "loss": 1.0707,
+      "step": 709
+    },
+    {
+      "epoch": 0.2759156708442631,
+      "grad_norm": 0.21004025638103485,
+      "learning_rate": 0.00014519268197742312,
+      "loss": 1.0543,
+      "step": 710
+    },
+    {
+      "epoch": 0.2763042844651705,
+      "grad_norm": 0.22070440649986267,
+      "learning_rate": 0.00014511483067341377,
+      "loss": 1.0467,
+      "step": 711
+    },
+    {
+      "epoch": 0.27669289808607794,
+      "grad_norm": 0.21463747322559357,
+      "learning_rate": 0.00014503697936940445,
+      "loss": 1.0793,
+      "step": 712
+    },
+    {
+      "epoch": 0.2770815117069853,
+      "grad_norm": 0.23452533781528473,
+      "learning_rate": 0.0001449591280653951,
+      "loss": 1.043,
+      "step": 713
+    },
+    {
+      "epoch": 0.2774701253278927,
+      "grad_norm": 0.2405795156955719,
+      "learning_rate": 0.00014488127676138575,
+      "loss": 1.0752,
+      "step": 714
+    },
+    {
+      "epoch": 0.27785873894880014,
+      "grad_norm": 0.21546585857868195,
+      "learning_rate": 0.00014480342545737643,
+      "loss": 1.0834,
+      "step": 715
+    },
+    {
+      "epoch": 0.27824735256970756,
+      "grad_norm": 0.22675828635692596,
+      "learning_rate": 0.00014472557415336708,
+      "loss": 1.055,
+      "step": 716
+    },
+    {
+      "epoch": 0.278635966190615,
+      "grad_norm": 0.2117871195077896,
+      "learning_rate": 0.00014464772284935774,
+      "loss": 1.03,
+      "step": 717
+    },
+    {
+      "epoch": 0.2790245798115224,
+      "grad_norm": 0.2193155735731125,
+      "learning_rate": 0.00014456987154534841,
+      "loss": 1.0073,
+      "step": 718
+    },
+    {
+      "epoch": 0.2794131934324298,
+      "grad_norm": 0.21447965502738953,
+      "learning_rate": 0.00014449202024133907,
+      "loss": 1.0174,
+      "step": 719
+    },
+    {
+      "epoch": 0.27980180705333724,
+      "grad_norm": 0.22867532074451447,
+      "learning_rate": 0.00014441416893732972,
+      "loss": 1.0948,
+      "step": 720
+    },
+    {
+      "epoch": 0.28019042067424466,
+      "grad_norm": 0.21570557355880737,
+      "learning_rate": 0.00014433631763332037,
+      "loss": 1.0105,
+      "step": 721
+    },
+    {
+      "epoch": 0.280579034295152,
+      "grad_norm": 0.20787014067173004,
+      "learning_rate": 0.00014425846632931102,
+      "loss": 1.0384,
+      "step": 722
+    },
+    {
+      "epoch": 0.28096764791605944,
+      "grad_norm": 0.19924762845039368,
+      "learning_rate": 0.0001441806150253017,
+      "loss": 1.0653,
+      "step": 723
+    },
+    {
+      "epoch": 0.28135626153696686,
+      "grad_norm": 0.1996215283870697,
+      "learning_rate": 0.00014410276372129235,
+      "loss": 1.0439,
+      "step": 724
+    },
+    {
+      "epoch": 0.2817448751578743,
+      "grad_norm": 0.2054813802242279,
+      "learning_rate": 0.000144024912417283,
+      "loss": 0.9895,
+      "step": 725
+    },
+    {
+      "epoch": 0.2821334887787817,
+      "grad_norm": 0.2268310785293579,
+      "learning_rate": 0.00014394706111327366,
+      "loss": 1.0993,
+      "step": 726
+    },
+    {
+      "epoch": 0.2825221023996891,
+      "grad_norm": 0.19867680966854095,
+      "learning_rate": 0.0001438692098092643,
+      "loss": 0.985,
+      "step": 727
+    },
+    {
+      "epoch": 0.28291071602059653,
+      "grad_norm": 0.21099598705768585,
+      "learning_rate": 0.000143791358505255,
+      "loss": 1.0333,
+      "step": 728
+    },
+    {
+      "epoch": 0.28329932964150395,
+      "grad_norm": 0.22479215264320374,
+      "learning_rate": 0.00014371350720124564,
+      "loss": 1.0449,
+      "step": 729
+    },
+    {
+      "epoch": 0.28368794326241137,
+      "grad_norm": 0.22717688977718353,
+      "learning_rate": 0.0001436356558972363,
+      "loss": 1.0482,
+      "step": 730
+    },
+    {
+      "epoch": 0.2840765568833188,
+      "grad_norm": 0.20389345288276672,
+      "learning_rate": 0.00014355780459322695,
+      "loss": 0.956,
+      "step": 731
+    },
+    {
+      "epoch": 0.28446517050422615,
+      "grad_norm": 0.21583619713783264,
+      "learning_rate": 0.0001434799532892176,
+      "loss": 1.0154,
+      "step": 732
+    },
+    {
+      "epoch": 0.28485378412513357,
+      "grad_norm": 0.2219148874282837,
+      "learning_rate": 0.00014340210198520825,
+      "loss": 1.0553,
+      "step": 733
+    },
+    {
+      "epoch": 0.285242397746041,
+      "grad_norm": 0.19920189678668976,
+      "learning_rate": 0.00014332425068119893,
+      "loss": 0.9881,
+      "step": 734
+    },
+    {
+      "epoch": 0.2856310113669484,
+      "grad_norm": 0.2295670360326767,
+      "learning_rate": 0.00014324639937718958,
+      "loss": 1.0529,
+      "step": 735
+    },
+    {
+      "epoch": 0.2860196249878558,
+      "grad_norm": 0.21271567046642303,
+      "learning_rate": 0.00014316854807318023,
+      "loss": 1.037,
+      "step": 736
+    },
+    {
+      "epoch": 0.28640823860876324,
+      "grad_norm": 0.21304361522197723,
+      "learning_rate": 0.00014309069676917088,
+      "loss": 1.048,
+      "step": 737
+    },
+    {
+      "epoch": 0.28679685222967066,
+      "grad_norm": 0.19902732968330383,
+      "learning_rate": 0.00014301284546516154,
+      "loss": 1.0306,
+      "step": 738
+    },
+    {
+      "epoch": 0.2871854658505781,
+      "grad_norm": 0.1995929330587387,
+      "learning_rate": 0.00014293499416115222,
+      "loss": 1.0394,
+      "step": 739
+    },
+    {
+      "epoch": 0.2875740794714855,
+      "grad_norm": 0.20426060259342194,
+      "learning_rate": 0.00014285714285714287,
+      "loss": 1.0052,
+      "step": 740
+    },
+    {
+      "epoch": 0.28796269309239286,
+      "grad_norm": 0.20284566283226013,
+      "learning_rate": 0.00014277929155313352,
+      "loss": 1.0115,
+      "step": 741
+    },
+    {
+      "epoch": 0.2883513067133003,
+      "grad_norm": 0.2041557878255844,
+      "learning_rate": 0.00014270144024912417,
+      "loss": 1.0473,
+      "step": 742
+    },
+    {
+      "epoch": 0.2887399203342077,
+      "grad_norm": 0.2152249962091446,
+      "learning_rate": 0.00014262358894511482,
+      "loss": 1.0802,
+      "step": 743
+    },
+    {
+      "epoch": 0.2891285339551151,
+      "grad_norm": 0.20569871366024017,
+      "learning_rate": 0.0001425457376411055,
+      "loss": 1.0203,
+      "step": 744
+    },
+    {
+      "epoch": 0.28951714757602254,
+      "grad_norm": 0.21128378808498383,
+      "learning_rate": 0.00014246788633709616,
+      "loss": 1.108,
+      "step": 745
+    },
+    {
+      "epoch": 0.28990576119692996,
+      "grad_norm": 0.19587135314941406,
+      "learning_rate": 0.0001423900350330868,
+      "loss": 1.0427,
+      "step": 746
+    },
+    {
+      "epoch": 0.2902943748178374,
+      "grad_norm": 0.22052550315856934,
+      "learning_rate": 0.00014231218372907746,
+      "loss": 1.055,
+      "step": 747
+    },
+    {
+      "epoch": 0.2906829884387448,
+      "grad_norm": 0.21291717886924744,
+      "learning_rate": 0.0001422343324250681,
+      "loss": 1.0591,
+      "step": 748
+    },
+    {
+      "epoch": 0.2910716020596522,
+      "grad_norm": 0.20634084939956665,
+      "learning_rate": 0.0001421564811210588,
+      "loss": 1.0527,
+      "step": 749
+    },
+    {
+      "epoch": 0.29146021568055963,
+      "grad_norm": 0.2075488269329071,
+      "learning_rate": 0.00014207862981704944,
+      "loss": 1.0786,
+      "step": 750
+    },
+    {
+      "epoch": 0.291848829301467,
+      "grad_norm": 0.19780080020427704,
+      "learning_rate": 0.0001420007785130401,
+      "loss": 1.059,
+      "step": 751
+    },
+    {
+      "epoch": 0.2922374429223744,
+      "grad_norm": 0.21212074160575867,
+      "learning_rate": 0.00014192292720903075,
+      "loss": 1.0346,
+      "step": 752
+    },
+    {
+      "epoch": 0.29262605654328183,
+      "grad_norm": 0.2218451350927353,
+      "learning_rate": 0.0001418450759050214,
+      "loss": 1.0908,
+      "step": 753
+    },
+    {
+      "epoch": 0.29301467016418925,
+      "grad_norm": 0.20107759535312653,
+      "learning_rate": 0.00014176722460101208,
+      "loss": 1.0202,
+      "step": 754
+    },
+    {
+      "epoch": 0.29340328378509667,
+      "grad_norm": 0.20933273434638977,
+      "learning_rate": 0.00014168937329700273,
+      "loss": 1.0719,
+      "step": 755
+    },
+    {
+      "epoch": 0.2937918974060041,
+      "grad_norm": 0.22369107604026794,
+      "learning_rate": 0.00014161152199299338,
+      "loss": 1.0433,
+      "step": 756
+    },
+    {
+      "epoch": 0.2941805110269115,
+      "grad_norm": 0.2113707810640335,
+      "learning_rate": 0.00014153367068898403,
+      "loss": 1.0637,
+      "step": 757
+    },
+    {
+      "epoch": 0.2945691246478189,
+      "grad_norm": 0.21105700731277466,
+      "learning_rate": 0.00014145581938497469,
+      "loss": 1.0468,
+      "step": 758
+    },
+    {
+      "epoch": 0.29495773826872634,
+      "grad_norm": 0.20189693570137024,
+      "learning_rate": 0.00014137796808096537,
+      "loss": 1.0281,
+      "step": 759
+    },
+    {
+      "epoch": 0.2953463518896337,
+      "grad_norm": 0.1954152137041092,
+      "learning_rate": 0.00014130011677695602,
+      "loss": 1.0519,
+      "step": 760
+    },
+    {
+      "epoch": 0.2957349655105411,
+      "grad_norm": 0.24295592308044434,
+      "learning_rate": 0.00014122226547294667,
+      "loss": 1.1303,
+      "step": 761
+    },
+    {
+      "epoch": 0.29612357913144854,
+      "grad_norm": 0.20158620178699493,
+      "learning_rate": 0.00014114441416893732,
+      "loss": 1.0367,
+      "step": 762
+    },
+    {
+      "epoch": 0.29651219275235596,
+      "grad_norm": 0.20734666287899017,
+      "learning_rate": 0.00014106656286492797,
+      "loss": 1.0392,
+      "step": 763
+    },
+    {
+      "epoch": 0.2969008063732634,
+      "grad_norm": 0.2177533656358719,
+      "learning_rate": 0.00014098871156091865,
+      "loss": 1.0619,
+      "step": 764
+    },
+    {
+      "epoch": 0.2972894199941708,
+      "grad_norm": 0.1961720883846283,
+      "learning_rate": 0.0001409108602569093,
+      "loss": 0.9872,
+      "step": 765
+    },
+    {
+      "epoch": 0.2976780336150782,
+      "grad_norm": 0.21530941128730774,
+      "learning_rate": 0.00014083300895289996,
+      "loss": 1.1246,
+      "step": 766
+    },
+    {
+      "epoch": 0.29806664723598564,
+      "grad_norm": 0.2039783000946045,
+      "learning_rate": 0.00014075515764889064,
+      "loss": 1.0789,
+      "step": 767
+    },
+    {
+      "epoch": 0.29845526085689306,
+      "grad_norm": 0.20641569793224335,
+      "learning_rate": 0.0001406773063448813,
+      "loss": 1.05,
+      "step": 768
+    },
+    {
+      "epoch": 0.2988438744778004,
+      "grad_norm": 0.2071225494146347,
+      "learning_rate": 0.00014059945504087194,
+      "loss": 1.047,
+      "step": 769
+    },
+    {
+      "epoch": 0.29923248809870784,
+      "grad_norm": 0.20367531478405,
+      "learning_rate": 0.00014052160373686262,
+      "loss": 1.0734,
+      "step": 770
+    },
+    {
+      "epoch": 0.29962110171961526,
+      "grad_norm": 0.21718619763851166,
+      "learning_rate": 0.00014044375243285327,
+      "loss": 1.0613,
+      "step": 771
+    },
+    {
+      "epoch": 0.3000097153405227,
+      "grad_norm": 0.21649087965488434,
+      "learning_rate": 0.00014036590112884392,
+      "loss": 1.0671,
+      "step": 772
+    },
+    {
+      "epoch": 0.3003983289614301,
+      "grad_norm": 0.22223225235939026,
+      "learning_rate": 0.00014028804982483458,
+      "loss": 1.0977,
+      "step": 773
+    },
+    {
+      "epoch": 0.3007869425823375,
+      "grad_norm": 0.23101870715618134,
+      "learning_rate": 0.00014021019852082523,
+      "loss": 1.1236,
+      "step": 774
+    },
+    {
+      "epoch": 0.30117555620324493,
+      "grad_norm": 0.22855506837368011,
+      "learning_rate": 0.0001401323472168159,
+      "loss": 1.0517,
+      "step": 775
+    },
+    {
+      "epoch": 0.30156416982415235,
+      "grad_norm": 0.20862117409706116,
+      "learning_rate": 0.00014005449591280656,
+      "loss": 1.0493,
+      "step": 776
+    },
+    {
+      "epoch": 0.30195278344505977,
+      "grad_norm": 0.21692048013210297,
+      "learning_rate": 0.0001399766446087972,
+      "loss": 1.0681,
+      "step": 777
+    },
+    {
+      "epoch": 0.3023413970659672,
+      "grad_norm": 0.21541331708431244,
+      "learning_rate": 0.00013989879330478786,
+      "loss": 1.0775,
+      "step": 778
+    },
+    {
+      "epoch": 0.30273001068687455,
+      "grad_norm": 0.21221749484539032,
+      "learning_rate": 0.00013982094200077851,
+      "loss": 1.0421,
+      "step": 779
+    },
+    {
+      "epoch": 0.30311862430778197,
+      "grad_norm": 0.22497743368148804,
+      "learning_rate": 0.0001397430906967692,
+      "loss": 1.1115,
+      "step": 780
+    },
+    {
+      "epoch": 0.3035072379286894,
+      "grad_norm": 0.1974119246006012,
+      "learning_rate": 0.00013966523939275985,
+      "loss": 1.0264,
+      "step": 781
+    },
+    {
+      "epoch": 0.3038958515495968,
+      "grad_norm": 0.20349323749542236,
+      "learning_rate": 0.0001395873880887505,
+      "loss": 1.0512,
+      "step": 782
+    },
+    {
+      "epoch": 0.3042844651705042,
+      "grad_norm": 0.21116937696933746,
+      "learning_rate": 0.00013950953678474115,
+      "loss": 1.0135,
+      "step": 783
+    },
+    {
+      "epoch": 0.30467307879141164,
+      "grad_norm": 0.2133677899837494,
+      "learning_rate": 0.0001394316854807318,
+      "loss": 1.0694,
+      "step": 784
+    },
+    {
+      "epoch": 0.30506169241231906,
+      "grad_norm": 0.20406191051006317,
+      "learning_rate": 0.00013935383417672248,
+      "loss": 1.0179,
+      "step": 785
+    },
+    {
+      "epoch": 0.3054503060332265,
+      "grad_norm": 0.21428678929805756,
+      "learning_rate": 0.00013927598287271313,
+      "loss": 1.0577,
+      "step": 786
+    },
+    {
+      "epoch": 0.3058389196541339,
+      "grad_norm": 0.20878921449184418,
+      "learning_rate": 0.00013919813156870379,
+      "loss": 1.0311,
+      "step": 787
+    },
+    {
+      "epoch": 0.30622753327504126,
+      "grad_norm": 0.19033175706863403,
+      "learning_rate": 0.00013912028026469444,
+      "loss": 0.976,
+      "step": 788
+    },
+    {
+      "epoch": 0.3066161468959487,
+      "grad_norm": 0.22138020396232605,
+      "learning_rate": 0.0001390424289606851,
+      "loss": 1.0438,
+      "step": 789
+    },
+    {
+      "epoch": 0.3070047605168561,
+      "grad_norm": 0.20765596628189087,
+      "learning_rate": 0.00013896457765667577,
+      "loss": 1.0865,
+      "step": 790
+    },
+    {
+      "epoch": 0.3073933741377635,
+      "grad_norm": 0.209733247756958,
+      "learning_rate": 0.00013888672635266642,
+      "loss": 1.0648,
+      "step": 791
+    },
+    {
+      "epoch": 0.30778198775867094,
+      "grad_norm": 0.1896686851978302,
+      "learning_rate": 0.00013880887504865707,
+      "loss": 1.0133,
+      "step": 792
+    },
+    {
+      "epoch": 0.30817060137957836,
+      "grad_norm": 0.21651998162269592,
+      "learning_rate": 0.00013873102374464772,
+      "loss": 1.0729,
+      "step": 793
+    },
+    {
+      "epoch": 0.3085592150004858,
+      "grad_norm": 0.21751996874809265,
+      "learning_rate": 0.00013865317244063838,
+      "loss": 1.0444,
+      "step": 794
+    },
+    {
+      "epoch": 0.3089478286213932,
+      "grad_norm": 0.20593520998954773,
+      "learning_rate": 0.00013857532113662906,
+      "loss": 1.0304,
+      "step": 795
+    },
+    {
+      "epoch": 0.3093364422423006,
+      "grad_norm": 0.19937261939048767,
+      "learning_rate": 0.0001384974698326197,
+      "loss": 1.0017,
+      "step": 796
+    },
+    {
+      "epoch": 0.30972505586320803,
+      "grad_norm": 0.18901696801185608,
+      "learning_rate": 0.00013841961852861036,
+      "loss": 1.0362,
+      "step": 797
+    },
+    {
+      "epoch": 0.3101136694841154,
+      "grad_norm": 0.2079760730266571,
+      "learning_rate": 0.000138341767224601,
+      "loss": 1.0784,
+      "step": 798
+    },
+    {
+      "epoch": 0.3105022831050228,
+      "grad_norm": 0.24873265624046326,
+      "learning_rate": 0.00013826391592059166,
+      "loss": 1.1026,
+      "step": 799
+    },
+    {
+      "epoch": 0.31089089672593023,
+      "grad_norm": 0.20185396075248718,
+      "learning_rate": 0.00013818606461658234,
+      "loss": 1.0235,
+      "step": 800
+    },
+    {
+      "epoch": 0.31127951034683765,
+      "grad_norm": 0.211393803358078,
+      "learning_rate": 0.000138108213312573,
+      "loss": 1.0999,
+      "step": 801
+    },
+    {
+      "epoch": 0.31166812396774507,
+      "grad_norm": 0.19948823750019073,
+      "learning_rate": 0.00013803036200856365,
+      "loss": 1.0242,
+      "step": 802
+    },
+    {
+      "epoch": 0.3120567375886525,
+      "grad_norm": 0.21470944583415985,
+      "learning_rate": 0.0001379525107045543,
+      "loss": 1.0736,
+      "step": 803
+    },
+    {
+      "epoch": 0.3124453512095599,
+      "grad_norm": 0.2195902317762375,
+      "learning_rate": 0.00013787465940054495,
+      "loss": 1.0368,
+      "step": 804
+    },
+    {
+      "epoch": 0.3128339648304673,
+      "grad_norm": 0.22142355144023895,
+      "learning_rate": 0.00013779680809653563,
+      "loss": 1.1022,
+      "step": 805
+    },
+    {
+      "epoch": 0.31322257845137474,
+      "grad_norm": 0.20487886667251587,
+      "learning_rate": 0.00013771895679252628,
+      "loss": 1.0478,
+      "step": 806
+    },
+    {
+      "epoch": 0.3136111920722821,
+      "grad_norm": 0.217549130320549,
+      "learning_rate": 0.00013764110548851693,
+      "loss": 1.0526,
+      "step": 807
+    },
+    {
+      "epoch": 0.3139998056931895,
+      "grad_norm": 0.20199982821941376,
+      "learning_rate": 0.0001375632541845076,
+      "loss": 0.9992,
+      "step": 808
+    },
+    {
+      "epoch": 0.31438841931409695,
+      "grad_norm": 0.19496634602546692,
+      "learning_rate": 0.00013748540288049824,
+      "loss": 1.0179,
+      "step": 809
+    },
+    {
+      "epoch": 0.31477703293500436,
+      "grad_norm": 0.21999460458755493,
+      "learning_rate": 0.0001374075515764889,
+      "loss": 1.0547,
+      "step": 810
+    },
+    {
+      "epoch": 0.3151656465559118,
+      "grad_norm": 0.21421074867248535,
+      "learning_rate": 0.00013732970027247957,
+      "loss": 1.0283,
+      "step": 811
+    },
+    {
+      "epoch": 0.3155542601768192,
+      "grad_norm": 0.1913364827632904,
+      "learning_rate": 0.00013725184896847022,
+      "loss": 0.9826,
+      "step": 812
+    },
+    {
+      "epoch": 0.3159428737977266,
+      "grad_norm": 0.20509806275367737,
+      "learning_rate": 0.00013717399766446087,
+      "loss": 1.0303,
+      "step": 813
+    },
+    {
+      "epoch": 0.31633148741863404,
+      "grad_norm": 0.20309868454933167,
+      "learning_rate": 0.00013709614636045153,
+      "loss": 1.0479,
+      "step": 814
+    },
+    {
+      "epoch": 0.31672010103954146,
+      "grad_norm": 0.2274443656206131,
+      "learning_rate": 0.0001370182950564422,
+      "loss": 1.1311,
+      "step": 815
+    },
+    {
+      "epoch": 0.3171087146604489,
+      "grad_norm": 0.22785170376300812,
+      "learning_rate": 0.00013694044375243286,
+      "loss": 1.1009,
+      "step": 816
+    },
+    {
+      "epoch": 0.31749732828135624,
+      "grad_norm": 0.2105439007282257,
+      "learning_rate": 0.0001368625924484235,
+      "loss": 1.0251,
+      "step": 817
+    },
+    {
+      "epoch": 0.31788594190226366,
+      "grad_norm": 0.20583970844745636,
+      "learning_rate": 0.00013678474114441416,
+      "loss": 1.0833,
+      "step": 818
+    },
+    {
+      "epoch": 0.3182745555231711,
+      "grad_norm": 0.21091191470623016,
+      "learning_rate": 0.00013670688984040484,
+      "loss": 1.071,
+      "step": 819
+    },
+    {
+      "epoch": 0.3186631691440785,
+      "grad_norm": 0.20645928382873535,
+      "learning_rate": 0.0001366290385363955,
+      "loss": 1.0605,
+      "step": 820
+    },
+    {
+      "epoch": 0.3190517827649859,
+      "grad_norm": 0.1990513950586319,
+      "learning_rate": 0.00013655118723238614,
+      "loss": 1.0461,
+      "step": 821
+    },
+    {
+      "epoch": 0.31944039638589333,
+      "grad_norm": 0.2192249745130539,
+      "learning_rate": 0.00013647333592837682,
+      "loss": 1.0975,
+      "step": 822
+    },
+    {
+      "epoch": 0.31982901000680075,
+      "grad_norm": 0.2157617211341858,
+      "learning_rate": 0.00013639548462436748,
+      "loss": 1.091,
+      "step": 823
+    },
+    {
+      "epoch": 0.32021762362770817,
+      "grad_norm": 0.21964526176452637,
+      "learning_rate": 0.00013631763332035813,
+      "loss": 1.0286,
+      "step": 824
+    },
+    {
+      "epoch": 0.3206062372486156,
+      "grad_norm": 0.2079797089099884,
+      "learning_rate": 0.00013623978201634878,
+      "loss": 1.0257,
+      "step": 825
+    },
+    {
+      "epoch": 0.32099485086952295,
+      "grad_norm": 0.21220168471336365,
+      "learning_rate": 0.00013616193071233946,
+      "loss": 1.0046,
+      "step": 826
+    },
+    {
+      "epoch": 0.32138346449043037,
+      "grad_norm": 0.2885231673717499,
+      "learning_rate": 0.0001360840794083301,
+      "loss": 1.1442,
+      "step": 827
+    },
+    {
+      "epoch": 0.3217720781113378,
+      "grad_norm": 0.2096511274576187,
+      "learning_rate": 0.00013600622810432076,
+      "loss": 1.0209,
+      "step": 828
+    },
+    {
+      "epoch": 0.3221606917322452,
+      "grad_norm": 0.2179451286792755,
+      "learning_rate": 0.00013592837680031142,
+      "loss": 1.0548,
+      "step": 829
+    },
+    {
+      "epoch": 0.3225493053531526,
+      "grad_norm": 0.2096329927444458,
+      "learning_rate": 0.00013585052549630207,
+      "loss": 1.0279,
+      "step": 830
+    },
+    {
+      "epoch": 0.32293791897406005,
+      "grad_norm": 0.22531811892986298,
+      "learning_rate": 0.00013577267419229275,
+      "loss": 1.0463,
+      "step": 831
+    },
+    {
+      "epoch": 0.32332653259496746,
+      "grad_norm": 0.22516901791095734,
+      "learning_rate": 0.0001356948228882834,
+      "loss": 1.1127,
+      "step": 832
+    },
+    {
+      "epoch": 0.3237151462158749,
+      "grad_norm": 0.22487780451774597,
+      "learning_rate": 0.00013561697158427405,
+      "loss": 1.0707,
+      "step": 833
+    },
+    {
+      "epoch": 0.3241037598367823,
+      "grad_norm": 0.20976543426513672,
+      "learning_rate": 0.0001355391202802647,
+      "loss": 1.0217,
+      "step": 834
+    },
+    {
+      "epoch": 0.32449237345768966,
+      "grad_norm": 0.19849295914173126,
+      "learning_rate": 0.00013546126897625535,
+      "loss": 1.021,
+      "step": 835
+    },
+    {
+      "epoch": 0.3248809870785971,
+      "grad_norm": 0.21772268414497375,
+      "learning_rate": 0.00013538341767224603,
+      "loss": 1.0605,
+      "step": 836
+    },
+    {
+      "epoch": 0.3252696006995045,
+      "grad_norm": 0.19670265913009644,
+      "learning_rate": 0.00013530556636823669,
+      "loss": 1.0165,
+      "step": 837
+    },
+    {
+      "epoch": 0.3256582143204119,
+      "grad_norm": 0.19339734315872192,
+      "learning_rate": 0.00013522771506422734,
+      "loss": 1.0203,
+      "step": 838
+    },
+    {
+      "epoch": 0.32604682794131934,
+      "grad_norm": 0.21289557218551636,
+      "learning_rate": 0.000135149863760218,
+      "loss": 1.0252,
+      "step": 839
+    },
+    {
+      "epoch": 0.32643544156222676,
+      "grad_norm": 0.1964789777994156,
+      "learning_rate": 0.00013507201245620864,
+      "loss": 1.0392,
+      "step": 840
+    },
+    {
+      "epoch": 0.3268240551831342,
+      "grad_norm": 0.20783716440200806,
+      "learning_rate": 0.00013499416115219932,
+      "loss": 1.0569,
+      "step": 841
+    },
+    {
+      "epoch": 0.3272126688040416,
+      "grad_norm": 0.22782161831855774,
+      "learning_rate": 0.00013491630984818997,
+      "loss": 1.0555,
+      "step": 842
+    },
+    {
+      "epoch": 0.327601282424949,
+      "grad_norm": 0.22771142423152924,
+      "learning_rate": 0.00013483845854418063,
+      "loss": 1.085,
+      "step": 843
+    },
+    {
+      "epoch": 0.32798989604585643,
+      "grad_norm": 0.19773711264133453,
+      "learning_rate": 0.00013476060724017128,
+      "loss": 1.008,
+      "step": 844
+    },
+    {
+      "epoch": 0.3283785096667638,
+      "grad_norm": 0.22399166226387024,
+      "learning_rate": 0.00013468275593616193,
+      "loss": 1.0511,
+      "step": 845
+    },
+    {
+      "epoch": 0.3287671232876712,
+      "grad_norm": 0.20488236844539642,
+      "learning_rate": 0.00013460490463215258,
+      "loss": 1.0883,
+      "step": 846
+    },
+    {
+      "epoch": 0.32915573690857863,
+      "grad_norm": 0.21387654542922974,
+      "learning_rate": 0.00013452705332814326,
+      "loss": 1.0808,
+      "step": 847
+    },
+    {
+      "epoch": 0.32954435052948605,
+      "grad_norm": 0.1972568780183792,
+      "learning_rate": 0.0001344492020241339,
+      "loss": 1.0555,
+      "step": 848
+    },
+    {
+      "epoch": 0.32993296415039347,
+      "grad_norm": 0.20835663378238678,
+      "learning_rate": 0.00013437135072012456,
+      "loss": 1.0473,
+      "step": 849
+    },
+    {
+      "epoch": 0.3303215777713009,
+      "grad_norm": 0.19707520306110382,
+      "learning_rate": 0.00013429349941611522,
+      "loss": 0.9585,
+      "step": 850
+    },
+    {
+      "epoch": 0.3307101913922083,
+      "grad_norm": 0.19163411855697632,
+      "learning_rate": 0.00013421564811210587,
+      "loss": 1.0025,
+      "step": 851
+    },
+    {
+      "epoch": 0.3310988050131157,
+      "grad_norm": 0.19730083644390106,
+      "learning_rate": 0.00013413779680809655,
+      "loss": 1.0696,
+      "step": 852
+    },
+    {
+      "epoch": 0.33148741863402315,
+      "grad_norm": 0.19537493586540222,
+      "learning_rate": 0.0001340599455040872,
+      "loss": 1.0466,
+      "step": 853
+    },
+    {
+      "epoch": 0.3318760322549305,
+      "grad_norm": 0.2255164235830307,
+      "learning_rate": 0.00013398209420007785,
+      "loss": 1.0659,
+      "step": 854
+    },
+    {
+      "epoch": 0.3322646458758379,
+      "grad_norm": 0.19774770736694336,
+      "learning_rate": 0.0001339042428960685,
+      "loss": 1.0326,
+      "step": 855
+    },
+    {
+      "epoch": 0.33265325949674535,
+      "grad_norm": 0.2004510909318924,
+      "learning_rate": 0.00013382639159205916,
+      "loss": 1.0327,
+      "step": 856
+    },
+    {
+      "epoch": 0.33304187311765276,
+      "grad_norm": 0.19187591969966888,
+      "learning_rate": 0.00013374854028804984,
+      "loss": 1.0069,
+      "step": 857
+    },
+    {
+      "epoch": 0.3334304867385602,
+      "grad_norm": 0.18775832653045654,
+      "learning_rate": 0.0001336706889840405,
+      "loss": 1.0083,
+      "step": 858
+    },
+    {
+      "epoch": 0.3338191003594676,
+      "grad_norm": 0.2005717158317566,
+      "learning_rate": 0.00013359283768003114,
+      "loss": 1.0398,
+      "step": 859
+    },
+    {
+      "epoch": 0.334207713980375,
+      "grad_norm": 0.19705893099308014,
+      "learning_rate": 0.0001335149863760218,
+      "loss": 1.0031,
+      "step": 860
+    },
+    {
+      "epoch": 0.33459632760128244,
+      "grad_norm": 0.19589562714099884,
+      "learning_rate": 0.00013343713507201244,
+      "loss": 0.9831,
+      "step": 861
+    },
+    {
+      "epoch": 0.33498494122218986,
+      "grad_norm": 0.19302591681480408,
+      "learning_rate": 0.00013335928376800312,
+      "loss": 1.0009,
+      "step": 862
+    },
+    {
+      "epoch": 0.3353735548430973,
+      "grad_norm": 0.20499618351459503,
+      "learning_rate": 0.00013328143246399377,
+      "loss": 1.0205,
+      "step": 863
+    },
+    {
+      "epoch": 0.33576216846400464,
+      "grad_norm": 0.20514456927776337,
+      "learning_rate": 0.00013320358115998443,
+      "loss": 1.0837,
+      "step": 864
+    },
+    {
+      "epoch": 0.33615078208491206,
+      "grad_norm": 0.19285848736763,
+      "learning_rate": 0.00013312572985597508,
+      "loss": 1.0167,
+      "step": 865
+    },
+    {
+      "epoch": 0.3365393957058195,
+      "grad_norm": 0.20891553163528442,
+      "learning_rate": 0.00013304787855196573,
+      "loss": 1.0127,
+      "step": 866
+    },
+    {
+      "epoch": 0.3369280093267269,
+      "grad_norm": 0.20511706173419952,
+      "learning_rate": 0.0001329700272479564,
+      "loss": 0.964,
+      "step": 867
+    },
+    {
+      "epoch": 0.3373166229476343,
+      "grad_norm": 0.1855512261390686,
+      "learning_rate": 0.00013289217594394706,
+      "loss": 0.9721,
+      "step": 868
+    },
+    {
+      "epoch": 0.33770523656854173,
+      "grad_norm": 0.20010098814964294,
+      "learning_rate": 0.00013281432463993771,
+      "loss": 1.0411,
+      "step": 869
+    },
+    {
+      "epoch": 0.33809385018944915,
+      "grad_norm": 0.1991325318813324,
+      "learning_rate": 0.0001327364733359284,
+      "loss": 0.9658,
+      "step": 870
+    },
+    {
+      "epoch": 0.33848246381035657,
+      "grad_norm": 0.19895736873149872,
+      "learning_rate": 0.00013265862203191905,
+      "loss": 1.0744,
+      "step": 871
+    },
+    {
+      "epoch": 0.338871077431264,
+      "grad_norm": 0.2091255635023117,
+      "learning_rate": 0.0001325807707279097,
+      "loss": 1.0375,
+      "step": 872
+    },
+    {
+      "epoch": 0.33925969105217135,
+      "grad_norm": 0.21355532109737396,
+      "learning_rate": 0.00013250291942390035,
+      "loss": 1.09,
+      "step": 873
+    },
+    {
+      "epoch": 0.33964830467307877,
+      "grad_norm": 0.21844851970672607,
+      "learning_rate": 0.00013242506811989103,
+      "loss": 1.0769,
+      "step": 874
+    },
+    {
+      "epoch": 0.3400369182939862,
+      "grad_norm": 0.1877543330192566,
+      "learning_rate": 0.00013234721681588168,
+      "loss": 1.0199,
+      "step": 875
+    },
+    {
+      "epoch": 0.3404255319148936,
+      "grad_norm": 0.2020038366317749,
+      "learning_rate": 0.00013226936551187233,
+      "loss": 1.0218,
+      "step": 876
+    },
+    {
+      "epoch": 0.340814145535801,
+      "grad_norm": 0.20682141184806824,
+      "learning_rate": 0.000132191514207863,
+      "loss": 1.0891,
+      "step": 877
+    },
+    {
+      "epoch": 0.34120275915670845,
+      "grad_norm": 0.21942824125289917,
+      "learning_rate": 0.00013211366290385366,
+      "loss": 0.9877,
+      "step": 878
+    },
+    {
+      "epoch": 0.34159137277761586,
+      "grad_norm": 0.21150313317775726,
+      "learning_rate": 0.00013203581159984432,
+      "loss": 1.0815,
+      "step": 879
+    },
+    {
+      "epoch": 0.3419799863985233,
+      "grad_norm": 0.2073293924331665,
+      "learning_rate": 0.00013195796029583497,
+      "loss": 1.0579,
+      "step": 880
+    },
+    {
+      "epoch": 0.3423686000194307,
+      "grad_norm": 0.221574068069458,
+      "learning_rate": 0.00013188010899182562,
+      "loss": 1.0279,
+      "step": 881
+    },
+    {
+      "epoch": 0.3427572136403381,
+      "grad_norm": 0.22334492206573486,
+      "learning_rate": 0.00013180225768781627,
+      "loss": 1.0837,
+      "step": 882
+    },
+    {
+      "epoch": 0.3431458272612455,
+      "grad_norm": 0.18817654252052307,
+      "learning_rate": 0.00013172440638380695,
+      "loss": 1.0262,
+      "step": 883
+    },
+    {
+      "epoch": 0.3435344408821529,
+      "grad_norm": 0.20126822590827942,
+      "learning_rate": 0.0001316465550797976,
+      "loss": 1.0679,
+      "step": 884
+    },
+    {
+      "epoch": 0.3439230545030603,
+      "grad_norm": 0.2128864973783493,
+      "learning_rate": 0.00013156870377578825,
+      "loss": 1.0316,
+      "step": 885
+    },
+    {
+      "epoch": 0.34431166812396774,
+      "grad_norm": 0.20054499804973602,
+      "learning_rate": 0.0001314908524717789,
+      "loss": 1.0024,
+      "step": 886
+    },
+    {
+      "epoch": 0.34470028174487516,
+      "grad_norm": 0.21358034014701843,
+      "learning_rate": 0.00013141300116776956,
+      "loss": 1.0475,
+      "step": 887
+    },
+    {
+      "epoch": 0.3450888953657826,
+      "grad_norm": 0.21377703547477722,
+      "learning_rate": 0.00013133514986376024,
+      "loss": 1.0957,
+      "step": 888
+    },
+    {
+      "epoch": 0.34547750898669,
+      "grad_norm": 0.20166514813899994,
+      "learning_rate": 0.0001312572985597509,
+      "loss": 1.0189,
+      "step": 889
+    },
+    {
+      "epoch": 0.3458661226075974,
+      "grad_norm": 0.20424878597259521,
+      "learning_rate": 0.00013117944725574154,
+      "loss": 1.0896,
+      "step": 890
+    },
+    {
+      "epoch": 0.34625473622850483,
+      "grad_norm": 0.19028648734092712,
+      "learning_rate": 0.0001311015959517322,
+      "loss": 0.9881,
+      "step": 891
+    },
+    {
+      "epoch": 0.3466433498494122,
+      "grad_norm": 0.20828665792942047,
+      "learning_rate": 0.00013102374464772285,
+      "loss": 0.9932,
+      "step": 892
+    },
+    {
+      "epoch": 0.3470319634703196,
+      "grad_norm": 0.20756572484970093,
+      "learning_rate": 0.00013094589334371353,
+      "loss": 1.0406,
+      "step": 893
+    },
+    {
+      "epoch": 0.34742057709122703,
+      "grad_norm": 0.20768921077251434,
+      "learning_rate": 0.00013086804203970418,
+      "loss": 0.9652,
+      "step": 894
+    },
+    {
+      "epoch": 0.34780919071213445,
+      "grad_norm": 0.20660027861595154,
+      "learning_rate": 0.00013079019073569483,
+      "loss": 1.0728,
+      "step": 895
+    },
+    {
+      "epoch": 0.34819780433304187,
+      "grad_norm": 0.20186837017536163,
+      "learning_rate": 0.00013071233943168548,
+      "loss": 1.0407,
+      "step": 896
+    },
+    {
+      "epoch": 0.3485864179539493,
+      "grad_norm": 0.20880667865276337,
+      "learning_rate": 0.00013063448812767613,
+      "loss": 1.0275,
+      "step": 897
+    },
+    {
+      "epoch": 0.3489750315748567,
+      "grad_norm": 0.22212949395179749,
+      "learning_rate": 0.0001305566368236668,
+      "loss": 1.0293,
+      "step": 898
+    },
+    {
+      "epoch": 0.3493636451957641,
+      "grad_norm": 0.20552745461463928,
+      "learning_rate": 0.00013047878551965746,
+      "loss": 1.0434,
+      "step": 899
+    },
+    {
+      "epoch": 0.34975225881667155,
+      "grad_norm": 0.21239839494228363,
+      "learning_rate": 0.00013040093421564812,
+      "loss": 1.052,
+      "step": 900
+    },
+    {
+      "epoch": 0.3501408724375789,
+      "grad_norm": 0.22420544922351837,
+      "learning_rate": 0.00013032308291163877,
+      "loss": 1.0236,
+      "step": 901
+    },
+    {
+      "epoch": 0.35052948605848633,
+      "grad_norm": 0.23435090482234955,
+      "learning_rate": 0.00013024523160762942,
+      "loss": 1.0876,
+      "step": 902
+    },
+    {
+      "epoch": 0.35091809967939375,
+      "grad_norm": 0.22763386368751526,
+      "learning_rate": 0.0001301673803036201,
+      "loss": 1.0636,
+      "step": 903
+    },
+    {
+      "epoch": 0.35130671330030117,
+      "grad_norm": 0.20948883891105652,
+      "learning_rate": 0.00013008952899961075,
+      "loss": 1.0083,
+      "step": 904
+    },
+    {
+      "epoch": 0.3516953269212086,
+      "grad_norm": 0.20408779382705688,
+      "learning_rate": 0.0001300116776956014,
+      "loss": 1.039,
+      "step": 905
+    },
+    {
+      "epoch": 0.352083940542116,
+      "grad_norm": 0.2126050591468811,
+      "learning_rate": 0.00012993382639159206,
+      "loss": 1.0365,
+      "step": 906
+    },
+    {
+      "epoch": 0.3524725541630234,
+      "grad_norm": 0.20314334332942963,
+      "learning_rate": 0.0001298559750875827,
+      "loss": 1.0474,
+      "step": 907
+    },
+    {
+      "epoch": 0.35286116778393084,
+      "grad_norm": 0.23720984160900116,
+      "learning_rate": 0.0001297781237835734,
+      "loss": 1.0529,
+      "step": 908
+    },
+    {
+      "epoch": 0.35324978140483826,
+      "grad_norm": 0.22642800211906433,
+      "learning_rate": 0.00012970027247956404,
+      "loss": 1.0586,
+      "step": 909
+    },
+    {
+      "epoch": 0.3536383950257457,
+      "grad_norm": 0.20469972491264343,
+      "learning_rate": 0.0001296224211755547,
+      "loss": 1.0267,
+      "step": 910
+    },
+    {
+      "epoch": 0.35402700864665304,
+      "grad_norm": 0.197368785738945,
+      "learning_rate": 0.00012954456987154534,
+      "loss": 1.0348,
+      "step": 911
+    },
+    {
+      "epoch": 0.35441562226756046,
+      "grad_norm": 0.21924498677253723,
+      "learning_rate": 0.000129466718567536,
+      "loss": 1.0861,
+      "step": 912
+    },
+    {
+      "epoch": 0.3548042358884679,
+      "grad_norm": 0.22006285190582275,
+      "learning_rate": 0.00012938886726352667,
+      "loss": 1.0545,
+      "step": 913
+    },
+    {
+      "epoch": 0.3551928495093753,
+      "grad_norm": 0.22419220209121704,
+      "learning_rate": 0.00012931101595951733,
+      "loss": 1.0716,
+      "step": 914
+    },
+    {
+      "epoch": 0.3555814631302827,
+      "grad_norm": 0.215990349650383,
+      "learning_rate": 0.00012923316465550798,
+      "loss": 1.0619,
+      "step": 915
+    },
+    {
+      "epoch": 0.35597007675119013,
+      "grad_norm": 0.20783264935016632,
+      "learning_rate": 0.00012915531335149863,
+      "loss": 1.0412,
+      "step": 916
+    },
+    {
+      "epoch": 0.35635869037209755,
+      "grad_norm": 0.24584618210792542,
+      "learning_rate": 0.00012907746204748928,
+      "loss": 1.1165,
+      "step": 917
+    },
+    {
+      "epoch": 0.35674730399300497,
+      "grad_norm": 0.23146122694015503,
+      "learning_rate": 0.00012899961074347996,
+      "loss": 1.1111,
+      "step": 918
+    },
+    {
+      "epoch": 0.3571359176139124,
+      "grad_norm": 0.19983729720115662,
+      "learning_rate": 0.00012892175943947061,
+      "loss": 1.0674,
+      "step": 919
+    },
+    {
+      "epoch": 0.35752453123481975,
+      "grad_norm": 0.2161000818014145,
+      "learning_rate": 0.00012884390813546127,
+      "loss": 1.076,
+      "step": 920
+    },
+    {
+      "epoch": 0.35791314485572717,
+      "grad_norm": 0.21042793989181519,
+      "learning_rate": 0.00012876605683145192,
+      "loss": 1.0535,
+      "step": 921
+    },
+    {
+      "epoch": 0.3583017584766346,
+      "grad_norm": 0.20135439932346344,
+      "learning_rate": 0.0001286882055274426,
+      "loss": 1.0059,
+      "step": 922
+    },
+    {
+      "epoch": 0.358690372097542,
+      "grad_norm": 0.19394971430301666,
+      "learning_rate": 0.00012861035422343325,
+      "loss": 1.0381,
+      "step": 923
+    },
+    {
+      "epoch": 0.35907898571844943,
+      "grad_norm": 0.21171030402183533,
+      "learning_rate": 0.0001285325029194239,
+      "loss": 1.0513,
+      "step": 924
+    },
+    {
+      "epoch": 0.35946759933935685,
+      "grad_norm": 0.19476690888404846,
+      "learning_rate": 0.00012845465161541458,
+      "loss": 1.0003,
+      "step": 925
+    },
+    {
+      "epoch": 0.35985621296026427,
+      "grad_norm": 0.20468670129776,
+      "learning_rate": 0.00012837680031140523,
+      "loss": 1.0608,
+      "step": 926
+    },
+    {
+      "epoch": 0.3602448265811717,
+      "grad_norm": 0.21159446239471436,
+      "learning_rate": 0.00012829894900739588,
+      "loss": 1.0734,
+      "step": 927
+    },
+    {
+      "epoch": 0.3606334402020791,
+      "grad_norm": 0.21179519593715668,
+      "learning_rate": 0.00012822109770338654,
+      "loss": 1.0957,
+      "step": 928
+    },
+    {
+      "epoch": 0.3610220538229865,
+      "grad_norm": 0.20997527241706848,
+      "learning_rate": 0.00012814324639937722,
+      "loss": 1.0644,
+      "step": 929
+    },
+    {
+      "epoch": 0.3614106674438939,
+      "grad_norm": 0.21178296208381653,
+      "learning_rate": 0.00012806539509536787,
+      "loss": 1.0208,
+      "step": 930
+    },
+    {
+      "epoch": 0.3617992810648013,
+      "grad_norm": 0.20890356600284576,
+      "learning_rate": 0.00012798754379135852,
+      "loss": 1.0888,
+      "step": 931
+    },
+    {
+      "epoch": 0.3621878946857087,
+      "grad_norm": 0.20177409052848816,
+      "learning_rate": 0.00012790969248734917,
+      "loss": 0.9741,
+      "step": 932
+    },
+    {
+      "epoch": 0.36257650830661614,
+      "grad_norm": 0.23504556715488434,
+      "learning_rate": 0.00012783184118333982,
+      "loss": 1.1048,
+      "step": 933
+    },
+    {
+      "epoch": 0.36296512192752356,
+      "grad_norm": 0.22829356789588928,
+      "learning_rate": 0.0001277539898793305,
+      "loss": 1.0798,
+      "step": 934
+    },
+    {
+      "epoch": 0.363353735548431,
+      "grad_norm": 0.2068483531475067,
+      "learning_rate": 0.00012767613857532116,
+      "loss": 1.0452,
+      "step": 935
+    },
+    {
+      "epoch": 0.3637423491693384,
+      "grad_norm": 0.2093171775341034,
+      "learning_rate": 0.0001275982872713118,
+      "loss": 1.0742,
+      "step": 936
+    },
+    {
+      "epoch": 0.3641309627902458,
+      "grad_norm": 0.21478736400604248,
+      "learning_rate": 0.00012752043596730246,
+      "loss": 1.0572,
+      "step": 937
+    },
+    {
+      "epoch": 0.36451957641115323,
+      "grad_norm": 0.1906953752040863,
+      "learning_rate": 0.0001274425846632931,
+      "loss": 1.0107,
+      "step": 938
+    },
+    {
+      "epoch": 0.3649081900320606,
+      "grad_norm": 0.20580604672431946,
+      "learning_rate": 0.0001273647333592838,
+      "loss": 1.0677,
+      "step": 939
+    },
+    {
+      "epoch": 0.365296803652968,
+      "grad_norm": 0.22586850821971893,
+      "learning_rate": 0.00012728688205527444,
+      "loss": 1.0389,
+      "step": 940
+    },
+    {
+      "epoch": 0.36568541727387543,
+      "grad_norm": 0.199899360537529,
+      "learning_rate": 0.0001272090307512651,
+      "loss": 1.0462,
+      "step": 941
+    },
+    {
+      "epoch": 0.36607403089478285,
+      "grad_norm": 0.19881689548492432,
+      "learning_rate": 0.00012713117944725575,
+      "loss": 1.0565,
+      "step": 942
+    },
+    {
+      "epoch": 0.3664626445156903,
+      "grad_norm": 0.21748925745487213,
+      "learning_rate": 0.0001270533281432464,
+      "loss": 1.0659,
+      "step": 943
+    },
+    {
+      "epoch": 0.3668512581365977,
+      "grad_norm": 0.19363689422607422,
+      "learning_rate": 0.00012697547683923708,
+      "loss": 1.0307,
+      "step": 944
+    },
+    {
+      "epoch": 0.3672398717575051,
+      "grad_norm": 0.21701784431934357,
+      "learning_rate": 0.00012689762553522773,
+      "loss": 1.0684,
+      "step": 945
+    },
+    {
+      "epoch": 0.36762848537841253,
+      "grad_norm": 0.21406958997249603,
+      "learning_rate": 0.00012681977423121838,
+      "loss": 1.0703,
+      "step": 946
+    },
+    {
+      "epoch": 0.36801709899931995,
+      "grad_norm": 0.23539729416370392,
+      "learning_rate": 0.00012674192292720903,
+      "loss": 1.1537,
+      "step": 947
+    },
+    {
+      "epoch": 0.36840571262022737,
+      "grad_norm": 0.2177354395389557,
+      "learning_rate": 0.00012666407162319969,
+      "loss": 1.0131,
+      "step": 948
+    },
+    {
+      "epoch": 0.36879432624113473,
+      "grad_norm": 0.255346417427063,
+      "learning_rate": 0.00012658622031919037,
+      "loss": 0.9807,
+      "step": 949
+    },
+    {
+      "epoch": 0.36918293986204215,
+      "grad_norm": 0.2139921486377716,
+      "learning_rate": 0.00012650836901518102,
+      "loss": 1.0392,
+      "step": 950
+    },
+    {
+      "epoch": 0.36957155348294957,
+      "grad_norm": 0.22490833699703217,
+      "learning_rate": 0.00012643051771117167,
+      "loss": 1.0512,
+      "step": 951
+    },
+    {
+      "epoch": 0.369960167103857,
+      "grad_norm": 0.20698820054531097,
+      "learning_rate": 0.00012635266640716232,
+      "loss": 1.0391,
+      "step": 952
+    },
+    {
+      "epoch": 0.3703487807247644,
+      "grad_norm": 0.2276201844215393,
+      "learning_rate": 0.00012627481510315297,
+      "loss": 1.0513,
+      "step": 953
+    },
+    {
+      "epoch": 0.3707373943456718,
+      "grad_norm": 0.2493600994348526,
+      "learning_rate": 0.00012619696379914365,
+      "loss": 1.0136,
+      "step": 954
+    },
+    {
+      "epoch": 0.37112600796657924,
+      "grad_norm": 0.2155001014471054,
+      "learning_rate": 0.0001261191124951343,
+      "loss": 1.0523,
+      "step": 955
+    },
+    {
+      "epoch": 0.37151462158748666,
+      "grad_norm": 0.21571211516857147,
+      "learning_rate": 0.00012604126119112496,
+      "loss": 1.0288,
+      "step": 956
+    },
+    {
+      "epoch": 0.3719032352083941,
+      "grad_norm": 0.23238877952098846,
+      "learning_rate": 0.0001259634098871156,
+      "loss": 1.0638,
+      "step": 957
+    },
+    {
+      "epoch": 0.37229184882930144,
+      "grad_norm": 0.2002813220024109,
+      "learning_rate": 0.00012588555858310626,
+      "loss": 0.9665,
+      "step": 958
+    },
+    {
+      "epoch": 0.37268046245020886,
+      "grad_norm": 0.21712858974933624,
+      "learning_rate": 0.0001258077072790969,
+      "loss": 1.0469,
+      "step": 959
+    },
+    {
+      "epoch": 0.3730690760711163,
+      "grad_norm": 0.2178192287683487,
+      "learning_rate": 0.0001257298559750876,
+      "loss": 1.0267,
+      "step": 960
+    },
+    {
+      "epoch": 0.3734576896920237,
+      "grad_norm": 0.25488024950027466,
+      "learning_rate": 0.00012565200467107824,
+      "loss": 1.0153,
+      "step": 961
+    },
+    {
+      "epoch": 0.3738463033129311,
+      "grad_norm": 0.20070038735866547,
+      "learning_rate": 0.0001255741533670689,
+      "loss": 1.0279,
+      "step": 962
+    },
+    {
+      "epoch": 0.37423491693383854,
+      "grad_norm": 0.21885356307029724,
+      "learning_rate": 0.00012549630206305955,
+      "loss": 1.0395,
+      "step": 963
+    },
+    {
+      "epoch": 0.37462353055474595,
+      "grad_norm": 0.2407921701669693,
+      "learning_rate": 0.0001254184507590502,
+      "loss": 1.0767,
+      "step": 964
+    },
+    {
+      "epoch": 0.3750121441756534,
+      "grad_norm": 0.20645053684711456,
+      "learning_rate": 0.00012534059945504088,
+      "loss": 1.0318,
+      "step": 965
+    },
+    {
+      "epoch": 0.3754007577965608,
+      "grad_norm": 0.21275092661380768,
+      "learning_rate": 0.00012526274815103153,
+      "loss": 1.0546,
+      "step": 966
+    },
+    {
+      "epoch": 0.3757893714174682,
+      "grad_norm": 0.21574917435646057,
+      "learning_rate": 0.00012518489684702218,
+      "loss": 1.032,
+      "step": 967
+    },
+    {
+      "epoch": 0.3761779850383756,
+      "grad_norm": 0.21589480340480804,
+      "learning_rate": 0.00012510704554301284,
+      "loss": 1.0834,
+      "step": 968
+    },
+    {
+      "epoch": 0.376566598659283,
+      "grad_norm": 0.19576796889305115,
+      "learning_rate": 0.0001250291942390035,
+      "loss": 1.0178,
+      "step": 969
+    },
+    {
+      "epoch": 0.3769552122801904,
+      "grad_norm": 0.20941287279129028,
+      "learning_rate": 0.00012495134293499417,
+      "loss": 1.0712,
+      "step": 970
+    },
+    {
+      "epoch": 0.37734382590109783,
+      "grad_norm": 0.22585494816303253,
+      "learning_rate": 0.00012487349163098482,
+      "loss": 1.0401,
+      "step": 971
+    },
+    {
+      "epoch": 0.37773243952200525,
+      "grad_norm": 0.21093420684337616,
+      "learning_rate": 0.00012479564032697547,
+      "loss": 1.0569,
+      "step": 972
+    },
+    {
+      "epoch": 0.37812105314291267,
+      "grad_norm": 0.22375014424324036,
+      "learning_rate": 0.00012471778902296612,
+      "loss": 1.0687,
+      "step": 973
+    },
+    {
+      "epoch": 0.3785096667638201,
+      "grad_norm": 0.19787487387657166,
+      "learning_rate": 0.0001246399377189568,
+      "loss": 1.0266,
+      "step": 974
+    },
+    {
+      "epoch": 0.3788982803847275,
+      "grad_norm": 0.20633013546466827,
+      "learning_rate": 0.00012456208641494745,
+      "loss": 0.9996,
+      "step": 975
+    },
+    {
+      "epoch": 0.3792868940056349,
+      "grad_norm": 0.21559873223304749,
+      "learning_rate": 0.0001244842351109381,
+      "loss": 1.0851,
+      "step": 976
+    },
+    {
+      "epoch": 0.3796755076265423,
+      "grad_norm": 0.2166333943605423,
+      "learning_rate": 0.00012440638380692879,
+      "loss": 1.0859,
+      "step": 977
+    },
+    {
+      "epoch": 0.3800641212474497,
+      "grad_norm": 0.18558773398399353,
+      "learning_rate": 0.00012432853250291944,
+      "loss": 0.9534,
+      "step": 978
+    },
+    {
+      "epoch": 0.3804527348683571,
+      "grad_norm": 0.2086942344903946,
+      "learning_rate": 0.0001242506811989101,
+      "loss": 1.0786,
+      "step": 979
+    },
+    {
+      "epoch": 0.38084134848926454,
+      "grad_norm": 0.2207823544740677,
+      "learning_rate": 0.00012417282989490074,
+      "loss": 1.0626,
+      "step": 980
+    },
+    {
+      "epoch": 0.38122996211017196,
+      "grad_norm": 0.21255749464035034,
+      "learning_rate": 0.00012409497859089142,
+      "loss": 1.063,
+      "step": 981
+    },
+    {
+      "epoch": 0.3816185757310794,
+      "grad_norm": 0.20682042837142944,
+      "learning_rate": 0.00012401712728688207,
+      "loss": 1.034,
+      "step": 982
+    },
+    {
+      "epoch": 0.3820071893519868,
+      "grad_norm": 0.2084134966135025,
+      "learning_rate": 0.00012393927598287272,
+      "loss": 1.0481,
+      "step": 983
+    },
+    {
+      "epoch": 0.3823958029728942,
+      "grad_norm": 0.1922312080860138,
+      "learning_rate": 0.00012386142467886338,
+      "loss": 1.0461,
+      "step": 984
+    },
+    {
+      "epoch": 0.38278441659380164,
+      "grad_norm": 0.20893707871437073,
+      "learning_rate": 0.00012378357337485406,
+      "loss": 1.0797,
+      "step": 985
+    },
+    {
+      "epoch": 0.383173030214709,
+      "grad_norm": 0.19717541337013245,
+      "learning_rate": 0.0001237057220708447,
+      "loss": 1.0028,
+      "step": 986
+    },
+    {
+      "epoch": 0.3835616438356164,
+      "grad_norm": 0.20688053965568542,
+      "learning_rate": 0.00012362787076683536,
+      "loss": 0.989,
+      "step": 987
+    },
+    {
+      "epoch": 0.38395025745652384,
+      "grad_norm": 0.20580583810806274,
+      "learning_rate": 0.000123550019462826,
+      "loss": 1.06,
+      "step": 988
+    },
+    {
+      "epoch": 0.38433887107743125,
+      "grad_norm": 0.2151709794998169,
+      "learning_rate": 0.00012347216815881666,
+      "loss": 1.0685,
+      "step": 989
+    },
+    {
+      "epoch": 0.3847274846983387,
+      "grad_norm": 0.19573980569839478,
+      "learning_rate": 0.00012339431685480734,
+      "loss": 1.0072,
+      "step": 990
+    },
+    {
+      "epoch": 0.3851160983192461,
+      "grad_norm": 0.1949119120836258,
+      "learning_rate": 0.000123316465550798,
+      "loss": 0.9995,
+      "step": 991
+    },
+    {
+      "epoch": 0.3855047119401535,
+      "grad_norm": 0.2062375247478485,
+      "learning_rate": 0.00012323861424678865,
+      "loss": 1.0694,
+      "step": 992
+    },
+    {
+      "epoch": 0.38589332556106093,
+      "grad_norm": 0.2007209211587906,
+      "learning_rate": 0.0001231607629427793,
+      "loss": 1.0397,
+      "step": 993
+    },
+    {
+      "epoch": 0.38628193918196835,
+      "grad_norm": 0.2231544405221939,
+      "learning_rate": 0.00012308291163876995,
+      "loss": 1.0755,
+      "step": 994
+    },
+    {
+      "epoch": 0.38667055280287577,
+      "grad_norm": 0.2103337049484253,
+      "learning_rate": 0.0001230050603347606,
+      "loss": 1.0505,
+      "step": 995
+    },
+    {
+      "epoch": 0.38705916642378313,
+      "grad_norm": 0.20178386569023132,
+      "learning_rate": 0.00012292720903075128,
+      "loss": 1.0696,
+      "step": 996
+    },
+    {
+      "epoch": 0.38744778004469055,
+      "grad_norm": 0.21268007159233093,
+      "learning_rate": 0.00012284935772674193,
+      "loss": 1.0262,
+      "step": 997
+    },
+    {
+      "epoch": 0.38783639366559797,
+      "grad_norm": 0.21439722180366516,
+      "learning_rate": 0.0001227715064227326,
+      "loss": 1.0718,
+      "step": 998
+    },
+    {
+      "epoch": 0.3882250072865054,
+      "grad_norm": 0.19691336154937744,
+      "learning_rate": 0.00012269365511872324,
+      "loss": 0.9663,
+      "step": 999
+    },
+    {
+      "epoch": 0.3886136209074128,
+      "grad_norm": 0.2165926694869995,
+      "learning_rate": 0.0001226158038147139,
+      "loss": 1.0432,
+      "step": 1000
+    },
+    {
+      "epoch": 0.3890022345283202,
+      "grad_norm": 0.20730604231357574,
+      "learning_rate": 0.00012253795251070457,
+      "loss": 1.0386,
+      "step": 1001
+    },
+    {
+      "epoch": 0.38939084814922764,
+      "grad_norm": 0.2138068974018097,
+      "learning_rate": 0.00012246010120669522,
+      "loss": 1.0683,
+      "step": 1002
+    },
+    {
+      "epoch": 0.38977946177013506,
+      "grad_norm": 0.2118951678276062,
+      "learning_rate": 0.00012238224990268587,
+      "loss": 1.0393,
+      "step": 1003
+    },
+    {
+      "epoch": 0.3901680753910425,
+      "grad_norm": 0.20879961550235748,
+      "learning_rate": 0.00012230439859867653,
+      "loss": 1.0349,
+      "step": 1004
+    },
+    {
+      "epoch": 0.39055668901194984,
+      "grad_norm": 0.19588464498519897,
+      "learning_rate": 0.00012222654729466718,
+      "loss": 1.0226,
+      "step": 1005
+    },
+    {
+      "epoch": 0.39094530263285726,
+      "grad_norm": 0.2059485912322998,
+      "learning_rate": 0.00012214869599065786,
+      "loss": 1.052,
+      "step": 1006
+    },
+    {
+      "epoch": 0.3913339162537647,
+      "grad_norm": 0.2299761176109314,
+      "learning_rate": 0.0001220708446866485,
+      "loss": 1.1055,
+      "step": 1007
+    },
+    {
+      "epoch": 0.3917225298746721,
+      "grad_norm": 0.20196737349033356,
+      "learning_rate": 0.00012199299338263916,
+      "loss": 1.0497,
+      "step": 1008
+    },
+    {
+      "epoch": 0.3921111434955795,
+      "grad_norm": 0.20615293085575104,
+      "learning_rate": 0.00012191514207862981,
+      "loss": 1.047,
+      "step": 1009
+    },
+    {
+      "epoch": 0.39249975711648694,
+      "grad_norm": 0.20265278220176697,
+      "learning_rate": 0.00012183729077462047,
+      "loss": 1.0035,
+      "step": 1010
+    },
+    {
+      "epoch": 0.39288837073739435,
+      "grad_norm": 0.20197926461696625,
+      "learning_rate": 0.00012175943947061114,
+      "loss": 0.9847,
+      "step": 1011
+    },
+    {
+      "epoch": 0.3932769843583018,
+      "grad_norm": 0.19974152743816376,
+      "learning_rate": 0.0001216815881666018,
+      "loss": 1.0669,
+      "step": 1012
+    },
+    {
+      "epoch": 0.3936655979792092,
+      "grad_norm": 0.21684005856513977,
+      "learning_rate": 0.00012160373686259245,
+      "loss": 1.0562,
+      "step": 1013
+    },
+    {
+      "epoch": 0.3940542116001166,
+      "grad_norm": 0.2030404955148697,
+      "learning_rate": 0.00012152588555858311,
+      "loss": 1.0159,
+      "step": 1014
+    },
+    {
+      "epoch": 0.394442825221024,
+      "grad_norm": 0.2123572677373886,
+      "learning_rate": 0.00012144803425457377,
+      "loss": 1.0757,
+      "step": 1015
+    },
+    {
+      "epoch": 0.3948314388419314,
+      "grad_norm": 0.20320011675357819,
+      "learning_rate": 0.00012137018295056443,
+      "loss": 1.038,
+      "step": 1016
+    },
+    {
+      "epoch": 0.3952200524628388,
+      "grad_norm": 0.20120739936828613,
+      "learning_rate": 0.00012129233164655508,
+      "loss": 1.1015,
+      "step": 1017
+    },
+    {
+      "epoch": 0.39560866608374623,
+      "grad_norm": 0.19862449169158936,
+      "learning_rate": 0.00012121448034254575,
+      "loss": 1.0328,
+      "step": 1018
+    },
+    {
+      "epoch": 0.39599727970465365,
+      "grad_norm": 0.19761312007904053,
+      "learning_rate": 0.0001211366290385364,
+      "loss": 0.997,
+      "step": 1019
+    },
+    {
+      "epoch": 0.39638589332556107,
+      "grad_norm": 0.1943569928407669,
+      "learning_rate": 0.00012105877773452705,
+      "loss": 1.0099,
+      "step": 1020
+    },
+    {
+      "epoch": 0.3967745069464685,
+      "grad_norm": 0.2109062373638153,
+      "learning_rate": 0.00012098092643051773,
+      "loss": 1.1039,
+      "step": 1021
+    },
+    {
+      "epoch": 0.3971631205673759,
+      "grad_norm": 0.20966266095638275,
+      "learning_rate": 0.00012090307512650839,
+      "loss": 1.1208,
+      "step": 1022
+    },
+    {
+      "epoch": 0.3975517341882833,
+      "grad_norm": 0.19208088517189026,
+      "learning_rate": 0.00012082522382249904,
+      "loss": 1.0147,
+      "step": 1023
+    },
+    {
+      "epoch": 0.3979403478091907,
+      "grad_norm": 0.21821236610412598,
+      "learning_rate": 0.00012074737251848969,
+      "loss": 1.0615,
+      "step": 1024
+    },
+    {
+      "epoch": 0.3983289614300981,
+      "grad_norm": 0.20031368732452393,
+      "learning_rate": 0.00012066952121448034,
+      "loss": 1.0303,
+      "step": 1025
+    },
+    {
+      "epoch": 0.3987175750510055,
+      "grad_norm": 0.22910597920417786,
+      "learning_rate": 0.00012059166991047102,
+      "loss": 1.0182,
+      "step": 1026
+    },
+    {
+      "epoch": 0.39910618867191294,
+      "grad_norm": 0.20816978812217712,
+      "learning_rate": 0.00012051381860646167,
+      "loss": 1.0142,
+      "step": 1027
+    },
+    {
+      "epoch": 0.39949480229282036,
+      "grad_norm": 0.20989780128002167,
+      "learning_rate": 0.00012043596730245232,
+      "loss": 1.0676,
+      "step": 1028
+    },
+    {
+      "epoch": 0.3998834159137278,
+      "grad_norm": 0.21894055604934692,
+      "learning_rate": 0.00012035811599844298,
+      "loss": 1.0222,
+      "step": 1029
+    },
+    {
+      "epoch": 0.4002720295346352,
+      "grad_norm": 0.2170870155096054,
+      "learning_rate": 0.00012028026469443363,
+      "loss": 1.0319,
+      "step": 1030
+    },
+    {
+      "epoch": 0.4006606431555426,
+      "grad_norm": 0.20869679749011993,
+      "learning_rate": 0.00012020241339042428,
+      "loss": 1.055,
+      "step": 1031
+    },
+    {
+      "epoch": 0.40104925677645004,
+      "grad_norm": 0.18850640952587128,
+      "learning_rate": 0.00012012456208641496,
+      "loss": 0.9993,
+      "step": 1032
+    },
+    {
+      "epoch": 0.40143787039735745,
+      "grad_norm": 0.21462580561637878,
+      "learning_rate": 0.00012004671078240561,
+      "loss": 1.0115,
+      "step": 1033
+    },
+    {
+      "epoch": 0.4018264840182648,
+      "grad_norm": 0.2008499950170517,
+      "learning_rate": 0.00011996885947839626,
+      "loss": 1.0229,
+      "step": 1034
+    },
+    {
+      "epoch": 0.40221509763917224,
+      "grad_norm": 0.20063354074954987,
+      "learning_rate": 0.00011989100817438692,
+      "loss": 1.0295,
+      "step": 1035
+    },
+    {
+      "epoch": 0.40260371126007966,
+      "grad_norm": 0.20655786991119385,
+      "learning_rate": 0.00011981315687037757,
+      "loss": 1.0044,
+      "step": 1036
+    },
+    {
+      "epoch": 0.4029923248809871,
+      "grad_norm": 0.1985999196767807,
+      "learning_rate": 0.00011973530556636825,
+      "loss": 1.0063,
+      "step": 1037
+    },
+    {
+      "epoch": 0.4033809385018945,
+      "grad_norm": 0.2039060890674591,
+      "learning_rate": 0.0001196574542623589,
+      "loss": 1.044,
+      "step": 1038
+    },
+    {
+      "epoch": 0.4037695521228019,
+      "grad_norm": 0.21838189661502838,
+      "learning_rate": 0.00011957960295834955,
+      "loss": 1.1101,
+      "step": 1039
+    },
+    {
+      "epoch": 0.40415816574370933,
+      "grad_norm": 0.21508415043354034,
+      "learning_rate": 0.00011950175165434022,
+      "loss": 1.0764,
+      "step": 1040
+    },
+    {
+      "epoch": 0.40454677936461675,
+      "grad_norm": 0.2089119255542755,
+      "learning_rate": 0.00011942390035033087,
+      "loss": 0.9986,
+      "step": 1041
+    },
+    {
+      "epoch": 0.40493539298552417,
+      "grad_norm": 0.19859452545642853,
+      "learning_rate": 0.00011934604904632153,
+      "loss": 1.0122,
+      "step": 1042
+    },
+    {
+      "epoch": 0.40532400660643153,
+      "grad_norm": 0.2018653154373169,
+      "learning_rate": 0.00011926819774231219,
+      "loss": 1.0187,
+      "step": 1043
+    },
+    {
+      "epoch": 0.40571262022733895,
+      "grad_norm": 0.19892063736915588,
+      "learning_rate": 0.00011919034643830285,
+      "loss": 1.0029,
+      "step": 1044
+    },
+    {
+      "epoch": 0.40610123384824637,
+      "grad_norm": 0.20355650782585144,
+      "learning_rate": 0.0001191124951342935,
+      "loss": 1.0484,
+      "step": 1045
+    },
+    {
+      "epoch": 0.4064898474691538,
+      "grad_norm": 0.2033994495868683,
+      "learning_rate": 0.00011903464383028416,
+      "loss": 1.087,
+      "step": 1046
+    },
+    {
+      "epoch": 0.4068784610900612,
+      "grad_norm": 0.2047330141067505,
+      "learning_rate": 0.00011895679252627484,
+      "loss": 1.0774,
+      "step": 1047
+    },
+    {
+      "epoch": 0.4072670747109686,
+      "grad_norm": 0.21420112252235413,
+      "learning_rate": 0.00011887894122226549,
+      "loss": 1.0252,
+      "step": 1048
+    },
+    {
+      "epoch": 0.40765568833187604,
+      "grad_norm": 0.2030097395181656,
+      "learning_rate": 0.00011880108991825614,
+      "loss": 1.0501,
+      "step": 1049
+    },
+    {
+      "epoch": 0.40804430195278346,
+      "grad_norm": 0.2128026783466339,
+      "learning_rate": 0.00011872323861424679,
+      "loss": 1.1031,
+      "step": 1050
+    },
+    {
+      "epoch": 0.4084329155736909,
+      "grad_norm": 0.20724938809871674,
+      "learning_rate": 0.00011864538731023744,
+      "loss": 1.0327,
+      "step": 1051
+    },
+    {
+      "epoch": 0.40882152919459824,
+      "grad_norm": 0.20344072580337524,
+      "learning_rate": 0.00011856753600622812,
+      "loss": 1.0719,
+      "step": 1052
+    },
+    {
+      "epoch": 0.40921014281550566,
+      "grad_norm": 0.2145012468099594,
+      "learning_rate": 0.00011848968470221877,
+      "loss": 1.0582,
+      "step": 1053
+    },
+    {
+      "epoch": 0.4095987564364131,
+      "grad_norm": 0.220048725605011,
+      "learning_rate": 0.00011841183339820943,
+      "loss": 1.0825,
+      "step": 1054
+    },
+    {
+      "epoch": 0.4099873700573205,
+      "grad_norm": 0.19074465334415436,
+      "learning_rate": 0.00011833398209420008,
+      "loss": 0.9657,
+      "step": 1055
+    },
+    {
+      "epoch": 0.4103759836782279,
+      "grad_norm": 0.1958267241716385,
+      "learning_rate": 0.00011825613079019073,
+      "loss": 0.9864,
+      "step": 1056
+    },
+    {
+      "epoch": 0.41076459729913534,
+      "grad_norm": 0.21768233180046082,
+      "learning_rate": 0.00011817827948618141,
+      "loss": 0.9997,
+      "step": 1057
+    },
+    {
+      "epoch": 0.41115321092004276,
+      "grad_norm": 0.20218704640865326,
+      "learning_rate": 0.00011810042818217206,
+      "loss": 1.072,
+      "step": 1058
+    },
+    {
+      "epoch": 0.4115418245409502,
+      "grad_norm": 0.2035023719072342,
+      "learning_rate": 0.00011802257687816271,
+      "loss": 1.0415,
+      "step": 1059
+    },
+    {
+      "epoch": 0.4119304381618576,
+      "grad_norm": 0.22603970766067505,
+      "learning_rate": 0.00011794472557415337,
+      "loss": 1.0751,
+      "step": 1060
+    },
+    {
+      "epoch": 0.412319051782765,
+      "grad_norm": 0.2125842273235321,
+      "learning_rate": 0.00011786687427014402,
+      "loss": 1.0727,
+      "step": 1061
+    },
+    {
+      "epoch": 0.4127076654036724,
+      "grad_norm": 0.2005981206893921,
+      "learning_rate": 0.0001177890229661347,
+      "loss": 1.0191,
+      "step": 1062
+    },
+    {
+      "epoch": 0.4130962790245798,
+      "grad_norm": 0.22252701222896576,
+      "learning_rate": 0.00011771117166212535,
+      "loss": 1.0591,
+      "step": 1063
+    },
+    {
+      "epoch": 0.4134848926454872,
+      "grad_norm": 0.22205251455307007,
+      "learning_rate": 0.000117633320358116,
+      "loss": 1.1198,
+      "step": 1064
+    },
+    {
+      "epoch": 0.41387350626639463,
+      "grad_norm": 0.20037783682346344,
+      "learning_rate": 0.00011755546905410665,
+      "loss": 1.0548,
+      "step": 1065
+    },
+    {
+      "epoch": 0.41426211988730205,
+      "grad_norm": 0.21737834811210632,
+      "learning_rate": 0.00011747761775009732,
+      "loss": 1.0922,
+      "step": 1066
+    },
+    {
+      "epoch": 0.41465073350820947,
+      "grad_norm": 0.19312533736228943,
+      "learning_rate": 0.00011739976644608798,
+      "loss": 0.9836,
+      "step": 1067
+    },
+    {
+      "epoch": 0.4150393471291169,
+      "grad_norm": 0.22055000066757202,
+      "learning_rate": 0.00011732191514207864,
+      "loss": 1.0383,
+      "step": 1068
+    },
+    {
+      "epoch": 0.4154279607500243,
+      "grad_norm": 0.22623857855796814,
+      "learning_rate": 0.0001172440638380693,
+      "loss": 1.0704,
+      "step": 1069
+    },
+    {
+      "epoch": 0.4158165743709317,
+      "grad_norm": 0.21481367945671082,
+      "learning_rate": 0.00011716621253405995,
+      "loss": 1.052,
+      "step": 1070
+    },
+    {
+      "epoch": 0.4162051879918391,
+      "grad_norm": 0.21022087335586548,
+      "learning_rate": 0.0001170883612300506,
+      "loss": 1.1021,
+      "step": 1071
+    },
+    {
+      "epoch": 0.4165938016127465,
+      "grad_norm": 0.2154620885848999,
+      "learning_rate": 0.00011701050992604126,
+      "loss": 1.0128,
+      "step": 1072
+    },
+    {
+      "epoch": 0.4169824152336539,
+      "grad_norm": 0.20545578002929688,
+      "learning_rate": 0.00011693265862203194,
+      "loss": 1.0058,
+      "step": 1073
+    },
+    {
+      "epoch": 0.41737102885456134,
+      "grad_norm": 0.21726195514202118,
+      "learning_rate": 0.00011685480731802259,
+      "loss": 1.0753,
+      "step": 1074
+    },
+    {
+      "epoch": 0.41775964247546876,
+      "grad_norm": 0.2067115604877472,
+      "learning_rate": 0.00011677695601401324,
+      "loss": 1.0594,
+      "step": 1075
+    },
+    {
+      "epoch": 0.4181482560963762,
+      "grad_norm": 0.23024648427963257,
+      "learning_rate": 0.0001166991047100039,
+      "loss": 1.1039,
+      "step": 1076
+    },
+    {
+      "epoch": 0.4185368697172836,
+      "grad_norm": 0.20692144334316254,
+      "learning_rate": 0.00011662125340599455,
+      "loss": 1.0598,
+      "step": 1077
+    },
+    {
+      "epoch": 0.418925483338191,
+      "grad_norm": 0.19839999079704285,
+      "learning_rate": 0.00011654340210198522,
+      "loss": 1.054,
+      "step": 1078
+    },
+    {
+      "epoch": 0.41931409695909844,
+      "grad_norm": 0.19227825105190277,
+      "learning_rate": 0.00011646555079797588,
+      "loss": 0.9453,
+      "step": 1079
+    },
+    {
+      "epoch": 0.41970271058000586,
+      "grad_norm": 0.2112567275762558,
+      "learning_rate": 0.00011638769949396653,
+      "loss": 1.023,
+      "step": 1080
+    },
+    {
+      "epoch": 0.4200913242009132,
+      "grad_norm": 0.185299351811409,
+      "learning_rate": 0.00011630984818995718,
+      "loss": 0.9752,
+      "step": 1081
+    },
+    {
+      "epoch": 0.42047993782182064,
+      "grad_norm": 0.20148858428001404,
+      "learning_rate": 0.00011623199688594783,
+      "loss": 1.0659,
+      "step": 1082
+    },
+    {
+      "epoch": 0.42086855144272806,
+      "grad_norm": 0.1935974359512329,
+      "learning_rate": 0.00011615414558193851,
+      "loss": 1.0116,
+      "step": 1083
+    },
+    {
+      "epoch": 0.4212571650636355,
+      "grad_norm": 0.20433953404426575,
+      "learning_rate": 0.00011607629427792916,
+      "loss": 1.0671,
+      "step": 1084
+    },
+    {
+      "epoch": 0.4216457786845429,
+      "grad_norm": 0.20729799568653107,
+      "learning_rate": 0.00011599844297391982,
+      "loss": 1.0341,
+      "step": 1085
+    },
+    {
+      "epoch": 0.4220343923054503,
+      "grad_norm": 0.2126002460718155,
+      "learning_rate": 0.00011592059166991047,
+      "loss": 1.0188,
+      "step": 1086
+    },
+    {
+      "epoch": 0.42242300592635773,
+      "grad_norm": 0.19453707337379456,
+      "learning_rate": 0.00011584274036590112,
+      "loss": 1.0331,
+      "step": 1087
+    },
+    {
+      "epoch": 0.42281161954726515,
+      "grad_norm": 0.20909856259822845,
+      "learning_rate": 0.0001157648890618918,
+      "loss": 0.9984,
+      "step": 1088
+    },
+    {
+      "epoch": 0.42320023316817257,
+      "grad_norm": 0.19596272706985474,
+      "learning_rate": 0.00011568703775788245,
+      "loss": 1.0121,
+      "step": 1089
+    },
+    {
+      "epoch": 0.42358884678907993,
+      "grad_norm": 0.22045716643333435,
+      "learning_rate": 0.0001156091864538731,
+      "loss": 1.0591,
+      "step": 1090
+    },
+    {
+      "epoch": 0.42397746040998735,
+      "grad_norm": 0.22624897956848145,
+      "learning_rate": 0.00011553133514986376,
+      "loss": 1.0565,
+      "step": 1091
+    },
+    {
+      "epoch": 0.42436607403089477,
+      "grad_norm": 0.20263417065143585,
+      "learning_rate": 0.00011545348384585442,
+      "loss": 1.024,
+      "step": 1092
+    },
+    {
+      "epoch": 0.4247546876518022,
+      "grad_norm": 0.20179417729377747,
+      "learning_rate": 0.00011537563254184509,
+      "loss": 0.9806,
+      "step": 1093
+    },
+    {
+      "epoch": 0.4251433012727096,
+      "grad_norm": 0.30221593379974365,
+      "learning_rate": 0.00011529778123783574,
+      "loss": 1.0683,
+      "step": 1094
+    },
+    {
+      "epoch": 0.425531914893617,
+      "grad_norm": 0.21195146441459656,
+      "learning_rate": 0.0001152199299338264,
+      "loss": 1.1283,
+      "step": 1095
+    },
+    {
+      "epoch": 0.42592052851452444,
+      "grad_norm": 0.21860192716121674,
+      "learning_rate": 0.00011514207862981706,
+      "loss": 1.0046,
+      "step": 1096
+    },
+    {
+      "epoch": 0.42630914213543186,
+      "grad_norm": 0.2234150469303131,
+      "learning_rate": 0.00011506422732580771,
+      "loss": 1.0461,
+      "step": 1097
+    },
+    {
+      "epoch": 0.4266977557563393,
+      "grad_norm": 0.21535125374794006,
+      "learning_rate": 0.00011498637602179837,
+      "loss": 1.0593,
+      "step": 1098
+    },
+    {
+      "epoch": 0.4270863693772467,
+      "grad_norm": 0.19313789904117584,
+      "learning_rate": 0.00011490852471778904,
+      "loss": 1.0357,
+      "step": 1099
+    },
+    {
+      "epoch": 0.42747498299815406,
+      "grad_norm": 0.19886989891529083,
+      "learning_rate": 0.00011483067341377969,
+      "loss": 0.9946,
+      "step": 1100
+    },
+    {
+      "epoch": 0.4278635966190615,
+      "grad_norm": 0.21028490364551544,
+      "learning_rate": 0.00011475282210977034,
+      "loss": 1.0765,
+      "step": 1101
+    },
+    {
+      "epoch": 0.4282522102399689,
+      "grad_norm": 0.2066621333360672,
+      "learning_rate": 0.000114674970805761,
+      "loss": 1.0405,
+      "step": 1102
+    },
+    {
+      "epoch": 0.4286408238608763,
+      "grad_norm": 0.18400220572948456,
+      "learning_rate": 0.00011459711950175168,
+      "loss": 0.9404,
+      "step": 1103
+    },
+    {
+      "epoch": 0.42902943748178374,
+      "grad_norm": 0.2058599591255188,
+      "learning_rate": 0.00011451926819774233,
+      "loss": 1.0505,
+      "step": 1104
+    },
+    {
+      "epoch": 0.42941805110269116,
+      "grad_norm": 0.19696786999702454,
+      "learning_rate": 0.00011444141689373298,
+      "loss": 1.032,
+      "step": 1105
+    },
+    {
+      "epoch": 0.4298066647235986,
+      "grad_norm": 0.2082854062318802,
+      "learning_rate": 0.00011436356558972363,
+      "loss": 1.0914,
+      "step": 1106
+    },
+    {
+      "epoch": 0.430195278344506,
+      "grad_norm": 0.20155015587806702,
+      "learning_rate": 0.00011428571428571428,
+      "loss": 1.0541,
+      "step": 1107
+    },
+    {
+      "epoch": 0.4305838919654134,
+      "grad_norm": 0.23419982194900513,
+      "learning_rate": 0.00011420786298170494,
+      "loss": 1.0684,
+      "step": 1108
+    },
+    {
+      "epoch": 0.4309725055863208,
+      "grad_norm": 0.23493975400924683,
+      "learning_rate": 0.00011413001167769561,
+      "loss": 1.0509,
+      "step": 1109
+    },
+    {
+      "epoch": 0.4313611192072282,
+      "grad_norm": 0.2089843600988388,
+      "learning_rate": 0.00011405216037368627,
+      "loss": 1.0479,
+      "step": 1110
+    },
+    {
+      "epoch": 0.4317497328281356,
+      "grad_norm": 0.21076850593090057,
+      "learning_rate": 0.00011397430906967692,
+      "loss": 1.064,
+      "step": 1111
+    },
+    {
+      "epoch": 0.43213834644904303,
+      "grad_norm": 0.20307987928390503,
+      "learning_rate": 0.00011389645776566757,
+      "loss": 1.0416,
+      "step": 1112
+    },
+    {
+      "epoch": 0.43252696006995045,
+      "grad_norm": 0.20955562591552734,
+      "learning_rate": 0.00011381860646165822,
+      "loss": 1.0158,
+      "step": 1113
+    },
+    {
+      "epoch": 0.43291557369085787,
+      "grad_norm": 0.2074531465768814,
+      "learning_rate": 0.0001137407551576489,
+      "loss": 1.0486,
+      "step": 1114
+    },
+    {
+      "epoch": 0.4333041873117653,
+      "grad_norm": 0.20907235145568848,
+      "learning_rate": 0.00011366290385363955,
+      "loss": 1.0352,
+      "step": 1115
+    },
+    {
+      "epoch": 0.4336928009326727,
+      "grad_norm": 0.21726477146148682,
+      "learning_rate": 0.0001135850525496302,
+      "loss": 1.0068,
+      "step": 1116
+    },
+    {
+      "epoch": 0.4340814145535801,
+      "grad_norm": 0.20231984555721283,
+      "learning_rate": 0.00011350720124562086,
+      "loss": 0.9757,
+      "step": 1117
+    },
+    {
+      "epoch": 0.4344700281744875,
+      "grad_norm": 0.23485834896564484,
+      "learning_rate": 0.00011342934994161152,
+      "loss": 1.0681,
+      "step": 1118
+    },
+    {
+      "epoch": 0.4348586417953949,
+      "grad_norm": 0.21286556124687195,
+      "learning_rate": 0.00011335149863760219,
+      "loss": 1.0399,
+      "step": 1119
+    },
+    {
+      "epoch": 0.4352472554163023,
+      "grad_norm": 0.2097872495651245,
+      "learning_rate": 0.00011327364733359284,
+      "loss": 1.0435,
+      "step": 1120
+    },
+    {
+      "epoch": 0.43563586903720974,
+      "grad_norm": 0.2224377542734146,
+      "learning_rate": 0.00011319579602958351,
+      "loss": 1.1664,
+      "step": 1121
+    },
+    {
+      "epoch": 0.43602448265811716,
+      "grad_norm": 0.19213411211967468,
+      "learning_rate": 0.00011311794472557416,
+      "loss": 1.0424,
+      "step": 1122
+    },
+    {
+      "epoch": 0.4364130962790246,
+      "grad_norm": 0.20974959433078766,
+      "learning_rate": 0.00011304009342156481,
+      "loss": 1.0943,
+      "step": 1123
+    },
+    {
+      "epoch": 0.436801709899932,
+      "grad_norm": 0.19943708181381226,
+      "learning_rate": 0.00011296224211755549,
+      "loss": 1.0652,
+      "step": 1124
+    },
+    {
+      "epoch": 0.4371903235208394,
+      "grad_norm": 0.1832750141620636,
+      "learning_rate": 0.00011288439081354614,
+      "loss": 0.9883,
+      "step": 1125
+    },
+    {
+      "epoch": 0.43757893714174684,
+      "grad_norm": 0.2205052226781845,
+      "learning_rate": 0.0001128065395095368,
+      "loss": 1.0733,
+      "step": 1126
+    },
+    {
+      "epoch": 0.43796755076265426,
+      "grad_norm": 0.2082854062318802,
+      "learning_rate": 0.00011272868820552745,
+      "loss": 1.0141,
+      "step": 1127
+    },
+    {
+      "epoch": 0.4383561643835616,
+      "grad_norm": 0.22755026817321777,
+      "learning_rate": 0.0001126508369015181,
+      "loss": 1.0942,
+      "step": 1128
+    },
+    {
+      "epoch": 0.43874477800446904,
+      "grad_norm": 0.2098863571882248,
+      "learning_rate": 0.00011257298559750878,
+      "loss": 0.9987,
+      "step": 1129
+    },
+    {
+      "epoch": 0.43913339162537646,
+      "grad_norm": 0.20559263229370117,
+      "learning_rate": 0.00011249513429349943,
+      "loss": 1.0345,
+      "step": 1130
+    },
+    {
+      "epoch": 0.4395220052462839,
+      "grad_norm": 0.21955084800720215,
+      "learning_rate": 0.00011241728298949008,
+      "loss": 1.1068,
+      "step": 1131
+    },
+    {
+      "epoch": 0.4399106188671913,
+      "grad_norm": 0.21353478729724884,
+      "learning_rate": 0.00011233943168548073,
+      "loss": 1.0094,
+      "step": 1132
+    },
+    {
+      "epoch": 0.4402992324880987,
+      "grad_norm": 0.19822491705417633,
+      "learning_rate": 0.00011226158038147139,
+      "loss": 0.9758,
+      "step": 1133
+    },
+    {
+      "epoch": 0.44068784610900613,
+      "grad_norm": 0.20079441368579865,
+      "learning_rate": 0.00011218372907746206,
+      "loss": 1.0202,
+      "step": 1134
+    },
+    {
+      "epoch": 0.44107645972991355,
+      "grad_norm": 0.2261926829814911,
+      "learning_rate": 0.00011210587777345272,
+      "loss": 0.9877,
+      "step": 1135
+    },
+    {
+      "epoch": 0.44146507335082097,
+      "grad_norm": 0.2264915257692337,
+      "learning_rate": 0.00011202802646944337,
+      "loss": 0.9887,
+      "step": 1136
+    },
+    {
+      "epoch": 0.44185368697172833,
+      "grad_norm": 0.21853779256343842,
+      "learning_rate": 0.00011195017516543402,
+      "loss": 1.0535,
+      "step": 1137
+    },
+    {
+      "epoch": 0.44224230059263575,
+      "grad_norm": 0.21332694590091705,
+      "learning_rate": 0.00011187232386142467,
+      "loss": 1.0824,
+      "step": 1138
+    },
+    {
+      "epoch": 0.44263091421354317,
+      "grad_norm": 0.21350236237049103,
+      "learning_rate": 0.00011179447255741535,
+      "loss": 1.0758,
+      "step": 1139
+    },
+    {
+      "epoch": 0.4430195278344506,
+      "grad_norm": 0.21305765211582184,
+      "learning_rate": 0.000111716621253406,
+      "loss": 1.035,
+      "step": 1140
+    },
+    {
+      "epoch": 0.443408141455358,
+      "grad_norm": 0.20486389100551605,
+      "learning_rate": 0.00011163876994939666,
+      "loss": 1.0413,
+      "step": 1141
+    },
+    {
+      "epoch": 0.4437967550762654,
+      "grad_norm": 0.19255472719669342,
+      "learning_rate": 0.00011156091864538731,
+      "loss": 0.9583,
+      "step": 1142
+    },
+    {
+      "epoch": 0.44418536869717284,
+      "grad_norm": 0.19824008643627167,
+      "learning_rate": 0.00011148306734137796,
+      "loss": 1.0331,
+      "step": 1143
+    },
+    {
+      "epoch": 0.44457398231808026,
+      "grad_norm": 0.20308080315589905,
+      "learning_rate": 0.00011140521603736863,
+      "loss": 1.0399,
+      "step": 1144
+    },
+    {
+      "epoch": 0.4449625959389877,
+      "grad_norm": 0.2193964123725891,
+      "learning_rate": 0.00011132736473335929,
+      "loss": 1.063,
+      "step": 1145
+    },
+    {
+      "epoch": 0.4453512095598951,
+      "grad_norm": 0.2151576578617096,
+      "learning_rate": 0.00011124951342934994,
+      "loss": 1.0795,
+      "step": 1146
+    },
+    {
+      "epoch": 0.44573982318080246,
+      "grad_norm": 0.23056697845458984,
+      "learning_rate": 0.00011117166212534061,
+      "loss": 1.0351,
+      "step": 1147
+    },
+    {
+      "epoch": 0.4461284368017099,
+      "grad_norm": 0.1973094493150711,
+      "learning_rate": 0.00011109381082133126,
+      "loss": 0.9866,
+      "step": 1148
+    },
+    {
+      "epoch": 0.4465170504226173,
+      "grad_norm": 0.2119562178850174,
+      "learning_rate": 0.00011101595951732191,
+      "loss": 1.0591,
+      "step": 1149
+    },
+    {
+      "epoch": 0.4469056640435247,
+      "grad_norm": 0.20407763123512268,
+      "learning_rate": 0.00011093810821331259,
+      "loss": 0.988,
+      "step": 1150
+    },
+    {
+      "epoch": 0.44729427766443214,
+      "grad_norm": 0.19474107027053833,
+      "learning_rate": 0.00011086025690930324,
+      "loss": 0.9729,
+      "step": 1151
+    },
+    {
+      "epoch": 0.44768289128533956,
+      "grad_norm": 0.2179928421974182,
+      "learning_rate": 0.0001107824056052939,
+      "loss": 1.0558,
+      "step": 1152
+    },
+    {
+      "epoch": 0.448071504906247,
+      "grad_norm": 0.44306451082229614,
+      "learning_rate": 0.00011070455430128455,
+      "loss": 1.0901,
+      "step": 1153
+    },
+    {
+      "epoch": 0.4484601185271544,
+      "grad_norm": 0.22060540318489075,
+      "learning_rate": 0.0001106267029972752,
+      "loss": 1.0009,
+      "step": 1154
+    },
+    {
+      "epoch": 0.4488487321480618,
+      "grad_norm": 0.20534972846508026,
+      "learning_rate": 0.00011054885169326588,
+      "loss": 0.9741,
+      "step": 1155
+    },
+    {
+      "epoch": 0.4492373457689692,
+      "grad_norm": 0.19488993287086487,
+      "learning_rate": 0.00011047100038925653,
+      "loss": 1.0,
+      "step": 1156
+    },
+    {
+      "epoch": 0.4496259593898766,
+      "grad_norm": 0.20462395250797272,
+      "learning_rate": 0.00011039314908524718,
+      "loss": 1.0309,
+      "step": 1157
+    },
+    {
+      "epoch": 0.450014573010784,
+      "grad_norm": 0.2170749306678772,
+      "learning_rate": 0.00011031529778123784,
+      "loss": 1.0726,
+      "step": 1158
+    },
+    {
+      "epoch": 0.45040318663169143,
+      "grad_norm": 0.2066730111837387,
+      "learning_rate": 0.00011023744647722849,
+      "loss": 1.0227,
+      "step": 1159
+    },
+    {
+      "epoch": 0.45079180025259885,
+      "grad_norm": 0.20625676214694977,
+      "learning_rate": 0.00011015959517321917,
+      "loss": 1.0287,
+      "step": 1160
+    },
+    {
+      "epoch": 0.45118041387350627,
+      "grad_norm": 0.19483047723770142,
+      "learning_rate": 0.00011008174386920982,
+      "loss": 0.9639,
+      "step": 1161
+    },
+    {
+      "epoch": 0.4515690274944137,
+      "grad_norm": 0.24705417454242706,
+      "learning_rate": 0.00011000389256520047,
+      "loss": 0.9903,
+      "step": 1162
+    },
+    {
+      "epoch": 0.4519576411153211,
+      "grad_norm": 0.2109205424785614,
+      "learning_rate": 0.00010992604126119112,
+      "loss": 1.054,
+      "step": 1163
+    },
+    {
+      "epoch": 0.4523462547362285,
+      "grad_norm": 0.20904991030693054,
+      "learning_rate": 0.00010984818995718178,
+      "loss": 1.0416,
+      "step": 1164
+    },
+    {
+      "epoch": 0.45273486835713594,
+      "grad_norm": 0.19841328263282776,
+      "learning_rate": 0.00010977033865317245,
+      "loss": 0.9986,
+      "step": 1165
+    },
+    {
+      "epoch": 0.4531234819780433,
+      "grad_norm": 0.20545506477355957,
+      "learning_rate": 0.0001096924873491631,
+      "loss": 1.0337,
+      "step": 1166
+    },
+    {
+      "epoch": 0.4535120955989507,
+      "grad_norm": 0.208644837141037,
+      "learning_rate": 0.00010961463604515376,
+      "loss": 1.0304,
+      "step": 1167
+    },
+    {
+      "epoch": 0.45390070921985815,
+      "grad_norm": 0.2111911028623581,
+      "learning_rate": 0.00010953678474114441,
+      "loss": 1.0398,
+      "step": 1168
+    },
+    {
+      "epoch": 0.45428932284076556,
+      "grad_norm": 0.2600184381008148,
+      "learning_rate": 0.00010945893343713506,
+      "loss": 1.0509,
+      "step": 1169
+    },
+    {
+      "epoch": 0.454677936461673,
+      "grad_norm": 0.2059030532836914,
+      "learning_rate": 0.00010938108213312574,
+      "loss": 0.9347,
+      "step": 1170
+    },
+    {
+      "epoch": 0.4550665500825804,
+      "grad_norm": 0.19232551753520966,
+      "learning_rate": 0.0001093032308291164,
+      "loss": 1.0162,
+      "step": 1171
+    },
+    {
+      "epoch": 0.4554551637034878,
+      "grad_norm": 0.19147330522537231,
+      "learning_rate": 0.00010922537952510705,
+      "loss": 0.9872,
+      "step": 1172
+    },
+    {
+      "epoch": 0.45584377732439524,
+      "grad_norm": 0.2599676251411438,
+      "learning_rate": 0.00010914752822109771,
+      "loss": 1.0402,
+      "step": 1173
+    },
+    {
+      "epoch": 0.45623239094530266,
+      "grad_norm": 0.2159397304058075,
+      "learning_rate": 0.00010906967691708836,
+      "loss": 1.0411,
+      "step": 1174
+    },
+    {
+      "epoch": 0.45662100456621,
+      "grad_norm": 0.23864266276359558,
+      "learning_rate": 0.00010899182561307903,
+      "loss": 1.054,
+      "step": 1175
+    },
+    {
+      "epoch": 0.45700961818711744,
+      "grad_norm": 0.2027217596769333,
+      "learning_rate": 0.0001089139743090697,
+      "loss": 0.9713,
+      "step": 1176
+    },
+    {
+      "epoch": 0.45739823180802486,
+      "grad_norm": 0.1837588995695114,
+      "learning_rate": 0.00010883612300506035,
+      "loss": 0.9698,
+      "step": 1177
+    },
+    {
+      "epoch": 0.4577868454289323,
+      "grad_norm": 0.20038527250289917,
+      "learning_rate": 0.000108758271701051,
+      "loss": 1.0456,
+      "step": 1178
+    },
+    {
+      "epoch": 0.4581754590498397,
+      "grad_norm": 0.21525044739246368,
+      "learning_rate": 0.00010868042039704165,
+      "loss": 1.021,
+      "step": 1179
+    },
+    {
+      "epoch": 0.4585640726707471,
+      "grad_norm": 0.18813730776309967,
+      "learning_rate": 0.0001086025690930323,
+      "loss": 0.9673,
+      "step": 1180
+    },
+    {
+      "epoch": 0.45895268629165453,
+      "grad_norm": 0.2056179642677307,
+      "learning_rate": 0.00010852471778902298,
+      "loss": 1.0119,
+      "step": 1181
+    },
+    {
+      "epoch": 0.45934129991256195,
+      "grad_norm": 0.21599683165550232,
+      "learning_rate": 0.00010844686648501363,
+      "loss": 1.0537,
+      "step": 1182
+    },
+    {
+      "epoch": 0.45972991353346937,
+      "grad_norm": 0.19750265777111053,
+      "learning_rate": 0.00010836901518100429,
+      "loss": 1.0203,
+      "step": 1183
+    },
+    {
+      "epoch": 0.4601185271543768,
+      "grad_norm": 0.22186161577701569,
+      "learning_rate": 0.00010829116387699494,
+      "loss": 1.0583,
+      "step": 1184
+    },
+    {
+      "epoch": 0.46050714077528415,
+      "grad_norm": 0.2109905481338501,
+      "learning_rate": 0.00010821331257298559,
+      "loss": 1.0022,
+      "step": 1185
+    },
+    {
+      "epoch": 0.46089575439619157,
+      "grad_norm": 0.2032858431339264,
+      "learning_rate": 0.00010813546126897627,
+      "loss": 0.9774,
+      "step": 1186
+    },
+    {
+      "epoch": 0.461284368017099,
+      "grad_norm": 0.20381197333335876,
+      "learning_rate": 0.00010805760996496692,
+      "loss": 0.9768,
+      "step": 1187
+    },
+    {
+      "epoch": 0.4616729816380064,
+      "grad_norm": 0.20488987863063812,
+      "learning_rate": 0.00010797975866095757,
+      "loss": 1.0448,
+      "step": 1188
+    },
+    {
+      "epoch": 0.4620615952589138,
+      "grad_norm": 0.20257477462291718,
+      "learning_rate": 0.00010790190735694823,
+      "loss": 1.0157,
+      "step": 1189
+    },
+    {
+      "epoch": 0.46245020887982125,
+      "grad_norm": 0.20761239528656006,
+      "learning_rate": 0.00010782405605293888,
+      "loss": 1.0328,
+      "step": 1190
+    },
+    {
+      "epoch": 0.46283882250072866,
+      "grad_norm": 0.22062581777572632,
+      "learning_rate": 0.00010774620474892956,
+      "loss": 1.0362,
+      "step": 1191
+    },
+    {
+      "epoch": 0.4632274361216361,
+      "grad_norm": 0.19970272481441498,
+      "learning_rate": 0.00010766835344492021,
+      "loss": 1.0783,
+      "step": 1192
+    },
+    {
+      "epoch": 0.4636160497425435,
+      "grad_norm": 0.2221893072128296,
+      "learning_rate": 0.00010759050214091086,
+      "loss": 1.0136,
+      "step": 1193
+    },
+    {
+      "epoch": 0.46400466336345086,
+      "grad_norm": 0.2124665081501007,
+      "learning_rate": 0.00010751265083690151,
+      "loss": 1.0528,
+      "step": 1194
+    },
+    {
+      "epoch": 0.4643932769843583,
+      "grad_norm": 0.2001204937696457,
+      "learning_rate": 0.00010743479953289218,
+      "loss": 1.0495,
+      "step": 1195
+    },
+    {
+      "epoch": 0.4647818906052657,
+      "grad_norm": 0.20979635417461395,
+      "learning_rate": 0.00010735694822888284,
+      "loss": 1.0664,
+      "step": 1196
+    },
+    {
+      "epoch": 0.4651705042261731,
+      "grad_norm": 0.190982848405838,
+      "learning_rate": 0.0001072790969248735,
+      "loss": 1.0256,
+      "step": 1197
+    },
+    {
+      "epoch": 0.46555911784708054,
+      "grad_norm": 0.19910745322704315,
+      "learning_rate": 0.00010720124562086415,
+      "loss": 1.0263,
+      "step": 1198
+    },
+    {
+      "epoch": 0.46594773146798796,
+      "grad_norm": 0.21624085307121277,
+      "learning_rate": 0.00010712339431685481,
+      "loss": 1.0768,
+      "step": 1199
+    },
+    {
+      "epoch": 0.4663363450888954,
+      "grad_norm": 0.20857703685760498,
+      "learning_rate": 0.00010704554301284547,
+      "loss": 1.0892,
+      "step": 1200
+    },
+    {
+      "epoch": 0.4667249587098028,
+      "grad_norm": 0.21897061169147491,
+      "learning_rate": 0.00010696769170883613,
+      "loss": 1.0873,
+      "step": 1201
+    },
+    {
+      "epoch": 0.4671135723307102,
+      "grad_norm": 0.1943386346101761,
+      "learning_rate": 0.0001068898404048268,
+      "loss": 1.0116,
+      "step": 1202
+    },
+    {
+      "epoch": 0.4675021859516176,
+      "grad_norm": 0.22607874870300293,
+      "learning_rate": 0.00010681198910081745,
+      "loss": 1.0328,
+      "step": 1203
+    },
+    {
+      "epoch": 0.467890799572525,
+      "grad_norm": 0.1898999959230423,
+      "learning_rate": 0.0001067341377968081,
+      "loss": 0.9791,
+      "step": 1204
+    },
+    {
+      "epoch": 0.4682794131934324,
+      "grad_norm": 0.2193334400653839,
+      "learning_rate": 0.00010665628649279875,
+      "loss": 1.0742,
+      "step": 1205
+    },
+    {
+      "epoch": 0.46866802681433983,
+      "grad_norm": 0.2096349149942398,
+      "learning_rate": 0.00010657843518878943,
+      "loss": 1.0683,
+      "step": 1206
+    },
+    {
+      "epoch": 0.46905664043524725,
+      "grad_norm": 0.2040576934814453,
+      "learning_rate": 0.00010650058388478008,
+      "loss": 1.0516,
+      "step": 1207
+    },
+    {
+      "epoch": 0.46944525405615467,
+      "grad_norm": 0.20619645714759827,
+      "learning_rate": 0.00010642273258077074,
+      "loss": 1.0429,
+      "step": 1208
+    },
+    {
+      "epoch": 0.4698338676770621,
+      "grad_norm": 0.19753660261631012,
+      "learning_rate": 0.00010634488127676139,
+      "loss": 1.0268,
+      "step": 1209
+    },
+    {
+      "epoch": 0.4702224812979695,
+      "grad_norm": 0.2201426476240158,
+      "learning_rate": 0.00010626702997275204,
+      "loss": 1.0879,
+      "step": 1210
+    },
+    {
+      "epoch": 0.4706110949188769,
+      "grad_norm": 0.21307805180549622,
+      "learning_rate": 0.00010618917866874272,
+      "loss": 1.0186,
+      "step": 1211
+    },
+    {
+      "epoch": 0.47099970853978435,
+      "grad_norm": 0.21142373979091644,
+      "learning_rate": 0.00010611132736473337,
+      "loss": 1.0417,
+      "step": 1212
+    },
+    {
+      "epoch": 0.4713883221606917,
+      "grad_norm": 0.20523706078529358,
+      "learning_rate": 0.00010603347606072402,
+      "loss": 1.0372,
+      "step": 1213
+    },
+    {
+      "epoch": 0.4717769357815991,
+      "grad_norm": 0.19843094050884247,
+      "learning_rate": 0.00010595562475671468,
+      "loss": 1.0062,
+      "step": 1214
+    },
+    {
+      "epoch": 0.47216554940250655,
+      "grad_norm": 0.2146739959716797,
+      "learning_rate": 0.00010587777345270533,
+      "loss": 1.0528,
+      "step": 1215
+    },
+    {
+      "epoch": 0.47255416302341396,
+      "grad_norm": 0.2136303037405014,
+      "learning_rate": 0.00010579992214869601,
+      "loss": 1.0521,
+      "step": 1216
+    },
+    {
+      "epoch": 0.4729427766443214,
+      "grad_norm": 0.21379397809505463,
+      "learning_rate": 0.00010572207084468666,
+      "loss": 1.0362,
+      "step": 1217
+    },
+    {
+      "epoch": 0.4733313902652288,
+      "grad_norm": 0.20459088683128357,
+      "learning_rate": 0.00010564421954067731,
+      "loss": 1.0455,
+      "step": 1218
+    },
+    {
+      "epoch": 0.4737200038861362,
+      "grad_norm": 0.20667988061904907,
+      "learning_rate": 0.00010556636823666796,
+      "loss": 1.0284,
+      "step": 1219
+    },
+    {
+      "epoch": 0.47410861750704364,
+      "grad_norm": 0.21820449829101562,
+      "learning_rate": 0.00010548851693265862,
+      "loss": 1.0584,
+      "step": 1220
+    },
+    {
+      "epoch": 0.47449723112795106,
+      "grad_norm": 0.19705156981945038,
+      "learning_rate": 0.00010541066562864928,
+      "loss": 1.004,
+      "step": 1221
+    },
+    {
+      "epoch": 0.4748858447488584,
+      "grad_norm": 0.19806528091430664,
+      "learning_rate": 0.00010533281432463995,
+      "loss": 1.0519,
+      "step": 1222
+    },
+    {
+      "epoch": 0.47527445836976584,
+      "grad_norm": 0.2006833702325821,
+      "learning_rate": 0.0001052549630206306,
+      "loss": 1.0119,
+      "step": 1223
+    },
+    {
+      "epoch": 0.47566307199067326,
+      "grad_norm": 0.21757058799266815,
+      "learning_rate": 0.00010517711171662125,
+      "loss": 1.0961,
+      "step": 1224
+    },
+    {
+      "epoch": 0.4760516856115807,
+      "grad_norm": 0.2015775889158249,
+      "learning_rate": 0.00010509926041261192,
+      "loss": 1.0419,
+      "step": 1225
+    },
+    {
+      "epoch": 0.4764402992324881,
+      "grad_norm": 0.19691923260688782,
+      "learning_rate": 0.00010502140910860257,
+      "loss": 1.0555,
+      "step": 1226
+    },
+    {
+      "epoch": 0.4768289128533955,
+      "grad_norm": 0.19924800097942352,
+      "learning_rate": 0.00010494355780459323,
+      "loss": 1.0106,
+      "step": 1227
+    },
+    {
+      "epoch": 0.47721752647430293,
+      "grad_norm": 0.21416346728801727,
+      "learning_rate": 0.0001048657065005839,
+      "loss": 1.0741,
+      "step": 1228
+    },
+    {
+      "epoch": 0.47760614009521035,
+      "grad_norm": 0.21823547780513763,
+      "learning_rate": 0.00010478785519657455,
+      "loss": 1.023,
+      "step": 1229
+    },
+    {
+      "epoch": 0.47799475371611777,
+      "grad_norm": 0.2083735466003418,
+      "learning_rate": 0.0001047100038925652,
+      "loss": 1.0424,
+      "step": 1230
+    },
+    {
+      "epoch": 0.4783833673370252,
+      "grad_norm": 0.2219141572713852,
+      "learning_rate": 0.00010463215258855586,
+      "loss": 1.0839,
+      "step": 1231
+    },
+    {
+      "epoch": 0.47877198095793255,
+      "grad_norm": 0.21334600448608398,
+      "learning_rate": 0.00010455430128454653,
+      "loss": 0.9888,
+      "step": 1232
+    },
+    {
+      "epoch": 0.47916059457883997,
+      "grad_norm": 0.2140086442232132,
+      "learning_rate": 0.00010447644998053719,
+      "loss": 1.0119,
+      "step": 1233
+    },
+    {
+      "epoch": 0.4795492081997474,
+      "grad_norm": 0.25360551476478577,
+      "learning_rate": 0.00010439859867652784,
+      "loss": 1.0026,
+      "step": 1234
+    },
+    {
+      "epoch": 0.4799378218206548,
+      "grad_norm": 0.20200380682945251,
+      "learning_rate": 0.00010432074737251849,
+      "loss": 1.0,
+      "step": 1235
+    },
+    {
+      "epoch": 0.4803264354415622,
+      "grad_norm": 0.22641289234161377,
+      "learning_rate": 0.00010424289606850914,
+      "loss": 1.1022,
+      "step": 1236
+    },
+    {
+      "epoch": 0.48071504906246965,
+      "grad_norm": 0.20538561046123505,
+      "learning_rate": 0.00010416504476449982,
+      "loss": 0.9847,
+      "step": 1237
+    },
+    {
+      "epoch": 0.48110366268337706,
+      "grad_norm": 0.206883504986763,
+      "learning_rate": 0.00010408719346049047,
+      "loss": 1.0152,
+      "step": 1238
+    },
+    {
+      "epoch": 0.4814922763042845,
+      "grad_norm": 0.21584320068359375,
+      "learning_rate": 0.00010400934215648113,
+      "loss": 1.0361,
+      "step": 1239
+    },
+    {
+      "epoch": 0.4818808899251919,
+      "grad_norm": 0.20963703095912933,
+      "learning_rate": 0.00010393149085247178,
+      "loss": 1.0814,
+      "step": 1240
+    },
+    {
+      "epoch": 0.48226950354609927,
+      "grad_norm": 0.1965872198343277,
+      "learning_rate": 0.00010385363954846243,
+      "loss": 1.0365,
+      "step": 1241
+    },
+    {
+      "epoch": 0.4826581171670067,
+      "grad_norm": 0.2030191719532013,
+      "learning_rate": 0.00010377578824445311,
+      "loss": 1.0374,
+      "step": 1242
+    },
+    {
+      "epoch": 0.4830467307879141,
+      "grad_norm": 0.21448804438114166,
+      "learning_rate": 0.00010369793694044376,
+      "loss": 0.9686,
+      "step": 1243
+    },
+    {
+      "epoch": 0.4834353444088215,
+      "grad_norm": 0.2181752622127533,
+      "learning_rate": 0.00010362008563643441,
+      "loss": 1.0812,
+      "step": 1244
+    },
+    {
+      "epoch": 0.48382395802972894,
+      "grad_norm": 0.19887101650238037,
+      "learning_rate": 0.00010354223433242507,
+      "loss": 1.036,
+      "step": 1245
+    },
+    {
+      "epoch": 0.48421257165063636,
+      "grad_norm": 0.19007287919521332,
+      "learning_rate": 0.00010346438302841572,
+      "loss": 1.0292,
+      "step": 1246
+    },
+    {
+      "epoch": 0.4846011852715438,
+      "grad_norm": 0.21390347182750702,
+      "learning_rate": 0.0001033865317244064,
+      "loss": 1.0284,
+      "step": 1247
+    },
+    {
+      "epoch": 0.4849897988924512,
+      "grad_norm": 0.23822663724422455,
+      "learning_rate": 0.00010330868042039705,
+      "loss": 1.1044,
+      "step": 1248
+    },
+    {
+      "epoch": 0.4853784125133586,
+      "grad_norm": 0.20779070258140564,
+      "learning_rate": 0.0001032308291163877,
+      "loss": 1.0475,
+      "step": 1249
+    },
+    {
+      "epoch": 0.48576702613426603,
+      "grad_norm": 0.19232134521007538,
+      "learning_rate": 0.00010315297781237835,
+      "loss": 0.9945,
+      "step": 1250
+    },
+    {
+      "epoch": 0.4861556397551734,
+      "grad_norm": 0.22378556430339813,
+      "learning_rate": 0.00010307512650836902,
+      "loss": 1.0462,
+      "step": 1251
+    },
+    {
+      "epoch": 0.4865442533760808,
+      "grad_norm": 0.22156798839569092,
+      "learning_rate": 0.00010299727520435968,
+      "loss": 1.051,
+      "step": 1252
+    },
+    {
+      "epoch": 0.48693286699698823,
+      "grad_norm": 0.19885733723640442,
+      "learning_rate": 0.00010291942390035034,
+      "loss": 1.0593,
+      "step": 1253
+    },
+    {
+      "epoch": 0.48732148061789565,
+      "grad_norm": 0.2172418236732483,
+      "learning_rate": 0.000102841572596341,
+      "loss": 1.0513,
+      "step": 1254
+    },
+    {
+      "epoch": 0.48771009423880307,
+      "grad_norm": 0.22136956453323364,
+      "learning_rate": 0.00010276372129233165,
+      "loss": 1.0438,
+      "step": 1255
+    },
+    {
+      "epoch": 0.4880987078597105,
+      "grad_norm": 0.21337302029132843,
+      "learning_rate": 0.0001026858699883223,
+      "loss": 1.0551,
+      "step": 1256
+    },
+    {
+      "epoch": 0.4884873214806179,
+      "grad_norm": 0.21376267075538635,
+      "learning_rate": 0.00010260801868431296,
+      "loss": 1.054,
+      "step": 1257
+    },
+    {
+      "epoch": 0.4888759351015253,
+      "grad_norm": 0.19498860836029053,
+      "learning_rate": 0.00010253016738030364,
+      "loss": 1.0045,
+      "step": 1258
+    },
+    {
+      "epoch": 0.48926454872243275,
+      "grad_norm": 0.22354961931705475,
+      "learning_rate": 0.00010245231607629429,
+      "loss": 1.096,
+      "step": 1259
+    },
+    {
+      "epoch": 0.4896531623433401,
+      "grad_norm": 0.2078939527273178,
+      "learning_rate": 0.00010237446477228494,
+      "loss": 1.0102,
+      "step": 1260
+    },
+    {
+      "epoch": 0.49004177596424753,
+      "grad_norm": 0.20992495119571686,
+      "learning_rate": 0.00010229661346827559,
+      "loss": 0.9814,
+      "step": 1261
+    },
+    {
+      "epoch": 0.49043038958515495,
+      "grad_norm": 0.2178875207901001,
+      "learning_rate": 0.00010221876216426625,
+      "loss": 1.0489,
+      "step": 1262
+    },
+    {
+      "epoch": 0.49081900320606237,
+      "grad_norm": 0.22152946889400482,
+      "learning_rate": 0.00010214091086025692,
+      "loss": 1.0808,
+      "step": 1263
+    },
+    {
+      "epoch": 0.4912076168269698,
+      "grad_norm": 0.21179009974002838,
+      "learning_rate": 0.00010206305955624758,
+      "loss": 1.0323,
+      "step": 1264
+    },
+    {
+      "epoch": 0.4915962304478772,
+      "grad_norm": 0.2126997411251068,
+      "learning_rate": 0.00010198520825223823,
+      "loss": 1.0093,
+      "step": 1265
+    },
+    {
+      "epoch": 0.4919848440687846,
+      "grad_norm": 0.20912809669971466,
+      "learning_rate": 0.00010190735694822888,
+      "loss": 1.0343,
+      "step": 1266
+    },
+    {
+      "epoch": 0.49237345768969204,
+      "grad_norm": 0.2231636494398117,
+      "learning_rate": 0.00010182950564421953,
+      "loss": 1.0587,
+      "step": 1267
+    },
+    {
+      "epoch": 0.49276207131059946,
+      "grad_norm": 0.1954583376646042,
+      "learning_rate": 0.00010175165434021021,
+      "loss": 0.9566,
+      "step": 1268
+    },
+    {
+      "epoch": 0.4931506849315068,
+      "grad_norm": 0.20520909130573273,
+      "learning_rate": 0.00010167380303620086,
+      "loss": 1.024,
+      "step": 1269
+    },
+    {
+      "epoch": 0.49353929855241424,
+      "grad_norm": 0.21736180782318115,
+      "learning_rate": 0.00010159595173219152,
+      "loss": 1.0434,
+      "step": 1270
+    },
+    {
+      "epoch": 0.49392791217332166,
+      "grad_norm": 0.2360561490058899,
+      "learning_rate": 0.00010151810042818217,
+      "loss": 1.114,
+      "step": 1271
+    },
+    {
+      "epoch": 0.4943165257942291,
+      "grad_norm": 0.20595967769622803,
+      "learning_rate": 0.00010144024912417282,
+      "loss": 0.9909,
+      "step": 1272
+    },
+    {
+      "epoch": 0.4947051394151365,
+      "grad_norm": 0.2161860466003418,
+      "learning_rate": 0.0001013623978201635,
+      "loss": 1.0536,
+      "step": 1273
+    },
+    {
+      "epoch": 0.4950937530360439,
+      "grad_norm": 0.19852355122566223,
+      "learning_rate": 0.00010128454651615415,
+      "loss": 1.0001,
+      "step": 1274
+    },
+    {
+      "epoch": 0.49548236665695133,
+      "grad_norm": 0.21081402897834778,
+      "learning_rate": 0.0001012066952121448,
+      "loss": 1.0151,
+      "step": 1275
+    },
+    {
+      "epoch": 0.49587098027785875,
+      "grad_norm": 0.2053362876176834,
+      "learning_rate": 0.00010112884390813547,
+      "loss": 1.018,
+      "step": 1276
+    },
+    {
+      "epoch": 0.49625959389876617,
+      "grad_norm": 0.21205593645572662,
+      "learning_rate": 0.00010105099260412612,
+      "loss": 0.9912,
+      "step": 1277
+    },
+    {
+      "epoch": 0.4966482075196736,
+      "grad_norm": 0.2005016952753067,
+      "learning_rate": 0.00010097314130011679,
+      "loss": 1.0069,
+      "step": 1278
+    },
+    {
+      "epoch": 0.49703682114058095,
+      "grad_norm": 0.21688181161880493,
+      "learning_rate": 0.00010089528999610744,
+      "loss": 1.0364,
+      "step": 1279
+    },
+    {
+      "epoch": 0.49742543476148837,
+      "grad_norm": 0.20582237839698792,
+      "learning_rate": 0.0001008174386920981,
+      "loss": 1.0138,
+      "step": 1280
+    },
+    {
+      "epoch": 0.4978140483823958,
+      "grad_norm": 0.20824448764324188,
+      "learning_rate": 0.00010073958738808876,
+      "loss": 0.9941,
+      "step": 1281
+    },
+    {
+      "epoch": 0.4982026620033032,
+      "grad_norm": 0.20749075710773468,
+      "learning_rate": 0.00010066173608407941,
+      "loss": 1.0478,
+      "step": 1282
+    },
+    {
+      "epoch": 0.49859127562421063,
+      "grad_norm": 0.20012183487415314,
+      "learning_rate": 0.00010058388478007009,
+      "loss": 0.995,
+      "step": 1283
+    },
+    {
+      "epoch": 0.49897988924511805,
+      "grad_norm": 0.20275959372520447,
+      "learning_rate": 0.00010050603347606074,
+      "loss": 1.097,
+      "step": 1284
+    },
+    {
+      "epoch": 0.49936850286602547,
+      "grad_norm": 0.19588243961334229,
+      "learning_rate": 0.00010042818217205139,
+      "loss": 1.0,
+      "step": 1285
+    },
+    {
+      "epoch": 0.4997571164869329,
+      "grad_norm": 0.20693185925483704,
+      "learning_rate": 0.00010035033086804204,
+      "loss": 1.0527,
+      "step": 1286
+    },
+    {
+      "epoch": 0.5001457301078402,
+      "grad_norm": 0.20330573618412018,
+      "learning_rate": 0.0001002724795640327,
+      "loss": 1.0137,
+      "step": 1287
+    },
+    {
+      "epoch": 0.5005343437287477,
+      "grad_norm": 0.19123876094818115,
+      "learning_rate": 0.00010019462826002337,
+      "loss": 0.9688,
+      "step": 1288
+    },
+    {
+      "epoch": 0.5009229573496551,
+      "grad_norm": 0.2184276431798935,
+      "learning_rate": 0.00010011677695601403,
+      "loss": 1.0367,
+      "step": 1289
+    },
+    {
+      "epoch": 0.5013115709705626,
+      "grad_norm": 0.21642108261585236,
+      "learning_rate": 0.00010003892565200468,
+      "loss": 1.102,
+      "step": 1290
+    },
+    {
+      "epoch": 0.5017001845914699,
+      "grad_norm": 0.20351074635982513,
+      "learning_rate": 9.996107434799533e-05,
+      "loss": 1.0327,
+      "step": 1291
+    },
+    {
+      "epoch": 0.5020887982123774,
+      "grad_norm": 0.22771553695201874,
+      "learning_rate": 9.9883223043986e-05,
+      "loss": 1.104,
+      "step": 1292
+    },
+    {
+      "epoch": 0.5024774118332848,
+      "grad_norm": 0.2271403968334198,
+      "learning_rate": 9.980537173997665e-05,
+      "loss": 1.1313,
+      "step": 1293
+    },
+    {
+      "epoch": 0.5028660254541921,
+      "grad_norm": 0.2157830148935318,
+      "learning_rate": 9.97275204359673e-05,
+      "loss": 1.0203,
+      "step": 1294
+    },
+    {
+      "epoch": 0.5032546390750996,
+      "grad_norm": 0.19555307924747467,
+      "learning_rate": 9.964966913195797e-05,
+      "loss": 1.0194,
+      "step": 1295
+    },
+    {
+      "epoch": 0.503643252696007,
+      "grad_norm": 0.1898549199104309,
+      "learning_rate": 9.957181782794862e-05,
+      "loss": 1.0034,
+      "step": 1296
+    },
+    {
+      "epoch": 0.5040318663169144,
+      "grad_norm": 0.23555906116962433,
+      "learning_rate": 9.949396652393928e-05,
+      "loss": 1.0298,
+      "step": 1297
+    },
+    {
+      "epoch": 0.5044204799378218,
+      "grad_norm": 0.20434850454330444,
+      "learning_rate": 9.941611521992994e-05,
+      "loss": 0.9999,
+      "step": 1298
+    },
+    {
+      "epoch": 0.5048090935587293,
+      "grad_norm": 0.21015289425849915,
+      "learning_rate": 9.933826391592059e-05,
+      "loss": 1.006,
+      "step": 1299
+    },
+    {
+      "epoch": 0.5051977071796366,
+      "grad_norm": 0.21147851645946503,
+      "learning_rate": 9.926041261191125e-05,
+      "loss": 1.0854,
+      "step": 1300
+    },
+    {
+      "epoch": 0.5055863208005441,
+      "grad_norm": 0.19666944444179535,
+      "learning_rate": 9.91825613079019e-05,
+      "loss": 1.0057,
+      "step": 1301
+    },
+    {
+      "epoch": 0.5059749344214515,
+      "grad_norm": 0.21233728528022766,
+      "learning_rate": 9.910471000389257e-05,
+      "loss": 1.0675,
+      "step": 1302
+    },
+    {
+      "epoch": 0.5063635480423588,
+      "grad_norm": 0.21905581653118134,
+      "learning_rate": 9.902685869988322e-05,
+      "loss": 1.0054,
+      "step": 1303
+    },
+    {
+      "epoch": 0.5067521616632663,
+      "grad_norm": 0.23434993624687195,
+      "learning_rate": 9.894900739587389e-05,
+      "loss": 0.9915,
+      "step": 1304
+    },
+    {
+      "epoch": 0.5071407752841737,
+      "grad_norm": 0.21684227883815765,
+      "learning_rate": 9.887115609186454e-05,
+      "loss": 1.1131,
+      "step": 1305
+    },
+    {
+      "epoch": 0.5075293889050811,
+      "grad_norm": 0.21699552237987518,
+      "learning_rate": 9.87933047878552e-05,
+      "loss": 1.0782,
+      "step": 1306
+    },
+    {
+      "epoch": 0.5079180025259885,
+      "grad_norm": 0.2218221127986908,
+      "learning_rate": 9.871545348384586e-05,
+      "loss": 1.0388,
+      "step": 1307
+    },
+    {
+      "epoch": 0.508306616146896,
+      "grad_norm": 0.20104359090328217,
+      "learning_rate": 9.863760217983652e-05,
+      "loss": 1.0336,
+      "step": 1308
+    },
+    {
+      "epoch": 0.5086952297678033,
+      "grad_norm": 0.21907050907611847,
+      "learning_rate": 9.855975087582718e-05,
+      "loss": 1.0587,
+      "step": 1309
+    },
+    {
+      "epoch": 0.5090838433887108,
+      "grad_norm": 0.2140391767024994,
+      "learning_rate": 9.848189957181784e-05,
+      "loss": 1.0351,
+      "step": 1310
+    },
+    {
+      "epoch": 0.5094724570096182,
+      "grad_norm": 0.33287563920021057,
+      "learning_rate": 9.84040482678085e-05,
+      "loss": 0.9908,
+      "step": 1311
+    },
+    {
+      "epoch": 0.5098610706305255,
+      "grad_norm": 0.2706705927848816,
+      "learning_rate": 9.832619696379915e-05,
+      "loss": 1.0078,
+      "step": 1312
+    },
+    {
+      "epoch": 0.510249684251433,
+      "grad_norm": 0.20216278731822968,
+      "learning_rate": 9.824834565978981e-05,
+      "loss": 1.0253,
+      "step": 1313
+    },
+    {
+      "epoch": 0.5106382978723404,
+      "grad_norm": 0.20736576616764069,
+      "learning_rate": 9.817049435578046e-05,
+      "loss": 1.0217,
+      "step": 1314
+    },
+    {
+      "epoch": 0.5110269114932479,
+      "grad_norm": 0.2275344580411911,
+      "learning_rate": 9.809264305177113e-05,
+      "loss": 1.0139,
+      "step": 1315
+    },
+    {
+      "epoch": 0.5114155251141552,
+      "grad_norm": 0.22243620455265045,
+      "learning_rate": 9.801479174776178e-05,
+      "loss": 1.0427,
+      "step": 1316
+    },
+    {
+      "epoch": 0.5118041387350627,
+      "grad_norm": 0.198841854929924,
+      "learning_rate": 9.793694044375243e-05,
+      "loss": 1.0231,
+      "step": 1317
+    },
+    {
+      "epoch": 0.5121927523559701,
+      "grad_norm": 0.2031068503856659,
+      "learning_rate": 9.78590891397431e-05,
+      "loss": 1.0184,
+      "step": 1318
+    },
+    {
+      "epoch": 0.5125813659768775,
+      "grad_norm": 0.21712587773799896,
+      "learning_rate": 9.778123783573375e-05,
+      "loss": 1.0205,
+      "step": 1319
+    },
+    {
+      "epoch": 0.5129699795977849,
+      "grad_norm": 0.19366060197353363,
+      "learning_rate": 9.77033865317244e-05,
+      "loss": 0.9623,
+      "step": 1320
+    },
+    {
+      "epoch": 0.5133585932186923,
+      "grad_norm": 0.19845952093601227,
+      "learning_rate": 9.762553522771507e-05,
+      "loss": 1.0209,
+      "step": 1321
+    },
+    {
+      "epoch": 0.5137472068395997,
+      "grad_norm": 0.19700276851654053,
+      "learning_rate": 9.754768392370572e-05,
+      "loss": 0.9506,
+      "step": 1322
+    },
+    {
+      "epoch": 0.5141358204605071,
+      "grad_norm": 0.19797460734844208,
+      "learning_rate": 9.746983261969639e-05,
+      "loss": 1.0928,
+      "step": 1323
+    },
+    {
+      "epoch": 0.5145244340814146,
+      "grad_norm": 0.20470699667930603,
+      "learning_rate": 9.739198131568704e-05,
+      "loss": 1.0835,
+      "step": 1324
+    },
+    {
+      "epoch": 0.5149130477023219,
+      "grad_norm": 0.19121742248535156,
+      "learning_rate": 9.731413001167769e-05,
+      "loss": 0.9877,
+      "step": 1325
+    },
+    {
+      "epoch": 0.5153016613232294,
+      "grad_norm": 0.20026616752147675,
+      "learning_rate": 9.723627870766836e-05,
+      "loss": 1.0094,
+      "step": 1326
+    },
+    {
+      "epoch": 0.5156902749441368,
+      "grad_norm": 0.2214539796113968,
+      "learning_rate": 9.715842740365901e-05,
+      "loss": 0.9867,
+      "step": 1327
+    },
+    {
+      "epoch": 0.5160788885650442,
+      "grad_norm": 0.22674603760242462,
+      "learning_rate": 9.708057609964967e-05,
+      "loss": 1.0738,
+      "step": 1328
+    },
+    {
+      "epoch": 0.5164675021859516,
+      "grad_norm": 0.21274834871292114,
+      "learning_rate": 9.700272479564033e-05,
+      "loss": 1.0458,
+      "step": 1329
+    },
+    {
+      "epoch": 0.5168561158068591,
+      "grad_norm": 0.20305052399635315,
+      "learning_rate": 9.692487349163099e-05,
+      "loss": 1.0041,
+      "step": 1330
+    },
+    {
+      "epoch": 0.5172447294277664,
+      "grad_norm": 0.1840772181749344,
+      "learning_rate": 9.684702218762166e-05,
+      "loss": 0.9498,
+      "step": 1331
+    },
+    {
+      "epoch": 0.5176333430486738,
+      "grad_norm": 0.2055782824754715,
+      "learning_rate": 9.676917088361231e-05,
+      "loss": 1.0223,
+      "step": 1332
+    },
+    {
+      "epoch": 0.5180219566695813,
+      "grad_norm": 0.21826402842998505,
+      "learning_rate": 9.669131957960297e-05,
+      "loss": 1.1068,
+      "step": 1333
+    },
+    {
+      "epoch": 0.5184105702904886,
+      "grad_norm": 0.22516922652721405,
+      "learning_rate": 9.661346827559363e-05,
+      "loss": 1.0957,
+      "step": 1334
+    },
+    {
+      "epoch": 0.5187991839113961,
+      "grad_norm": 0.21044284105300903,
+      "learning_rate": 9.653561697158428e-05,
+      "loss": 1.0384,
+      "step": 1335
+    },
+    {
+      "epoch": 0.5191877975323035,
+      "grad_norm": 0.20275571942329407,
+      "learning_rate": 9.645776566757494e-05,
+      "loss": 0.9978,
+      "step": 1336
+    },
+    {
+      "epoch": 0.519576411153211,
+      "grad_norm": 0.2077122926712036,
+      "learning_rate": 9.63799143635656e-05,
+      "loss": 1.0418,
+      "step": 1337
+    },
+    {
+      "epoch": 0.5199650247741183,
+      "grad_norm": 0.19158867001533508,
+      "learning_rate": 9.630206305955625e-05,
+      "loss": 1.0527,
+      "step": 1338
+    },
+    {
+      "epoch": 0.5203536383950258,
+      "grad_norm": 0.1932496577501297,
+      "learning_rate": 9.622421175554691e-05,
+      "loss": 1.0039,
+      "step": 1339
+    },
+    {
+      "epoch": 0.5207422520159332,
+      "grad_norm": 0.21937766671180725,
+      "learning_rate": 9.614636045153757e-05,
+      "loss": 1.0373,
+      "step": 1340
+    },
+    {
+      "epoch": 0.5211308656368405,
+      "grad_norm": 0.2268432229757309,
+      "learning_rate": 9.606850914752823e-05,
+      "loss": 1.0815,
+      "step": 1341
+    },
+    {
+      "epoch": 0.521519479257748,
+      "grad_norm": 0.2147454470396042,
+      "learning_rate": 9.599065784351888e-05,
+      "loss": 1.0331,
+      "step": 1342
+    },
+    {
+      "epoch": 0.5219080928786554,
+      "grad_norm": 0.19899709522724152,
+      "learning_rate": 9.591280653950954e-05,
+      "loss": 1.032,
+      "step": 1343
+    },
+    {
+      "epoch": 0.5222967064995628,
+      "grad_norm": 0.19646069407463074,
+      "learning_rate": 9.58349552355002e-05,
+      "loss": 0.9788,
+      "step": 1344
+    },
+    {
+      "epoch": 0.5226853201204702,
+      "grad_norm": 0.2146075963973999,
+      "learning_rate": 9.575710393149085e-05,
+      "loss": 1.0201,
+      "step": 1345
+    },
+    {
+      "epoch": 0.5230739337413777,
+      "grad_norm": 0.1968650370836258,
+      "learning_rate": 9.567925262748152e-05,
+      "loss": 0.9894,
+      "step": 1346
+    },
+    {
+      "epoch": 0.523462547362285,
+      "grad_norm": 0.21111296117305756,
+      "learning_rate": 9.560140132347217e-05,
+      "loss": 1.0961,
+      "step": 1347
+    },
+    {
+      "epoch": 0.5238511609831925,
+      "grad_norm": 0.20917272567749023,
+      "learning_rate": 9.552355001946282e-05,
+      "loss": 1.0435,
+      "step": 1348
+    },
+    {
+      "epoch": 0.5242397746040999,
+      "grad_norm": 0.2029752880334854,
+      "learning_rate": 9.544569871545349e-05,
+      "loss": 1.0328,
+      "step": 1349
+    },
+    {
+      "epoch": 0.5246283882250072,
+      "grad_norm": 0.20726613700389862,
+      "learning_rate": 9.536784741144414e-05,
+      "loss": 1.0465,
+      "step": 1350
+    },
+    {
+      "epoch": 0.5250170018459147,
+      "grad_norm": 0.19778740406036377,
+      "learning_rate": 9.52899961074348e-05,
+      "loss": 1.0058,
+      "step": 1351
+    },
+    {
+      "epoch": 0.5254056154668221,
+      "grad_norm": 0.19958540797233582,
+      "learning_rate": 9.521214480342546e-05,
+      "loss": 1.0164,
+      "step": 1352
+    },
+    {
+      "epoch": 0.5257942290877295,
+      "grad_norm": 0.2151395082473755,
+      "learning_rate": 9.513429349941611e-05,
+      "loss": 1.0703,
+      "step": 1353
+    },
+    {
+      "epoch": 0.5261828427086369,
+      "grad_norm": 0.2366979569196701,
+      "learning_rate": 9.505644219540678e-05,
+      "loss": 0.9832,
+      "step": 1354
+    },
+    {
+      "epoch": 0.5265714563295444,
+      "grad_norm": 0.22064165771007538,
+      "learning_rate": 9.497859089139743e-05,
+      "loss": 1.0181,
+      "step": 1355
+    },
+    {
+      "epoch": 0.5269600699504517,
+      "grad_norm": 0.20221936702728271,
+      "learning_rate": 9.49007395873881e-05,
+      "loss": 1.0424,
+      "step": 1356
+    },
+    {
+      "epoch": 0.5273486835713592,
+      "grad_norm": 0.19608759880065918,
+      "learning_rate": 9.482288828337876e-05,
+      "loss": 1.0074,
+      "step": 1357
+    },
+    {
+      "epoch": 0.5277372971922666,
+      "grad_norm": 0.20686689019203186,
+      "learning_rate": 9.474503697936941e-05,
+      "loss": 1.0213,
+      "step": 1358
+    },
+    {
+      "epoch": 0.528125910813174,
+      "grad_norm": 0.223610520362854,
+      "learning_rate": 9.466718567536008e-05,
+      "loss": 1.05,
+      "step": 1359
+    },
+    {
+      "epoch": 0.5285145244340814,
+      "grad_norm": 0.2135966569185257,
+      "learning_rate": 9.458933437135073e-05,
+      "loss": 1.034,
+      "step": 1360
+    },
+    {
+      "epoch": 0.5289031380549888,
+      "grad_norm": 0.1933239996433258,
+      "learning_rate": 9.451148306734138e-05,
+      "loss": 0.9883,
+      "step": 1361
+    },
+    {
+      "epoch": 0.5292917516758963,
+      "grad_norm": 0.20794694125652313,
+      "learning_rate": 9.443363176333205e-05,
+      "loss": 1.0103,
+      "step": 1362
+    },
+    {
+      "epoch": 0.5296803652968036,
+      "grad_norm": 0.20128493010997772,
+      "learning_rate": 9.43557804593227e-05,
+      "loss": 1.015,
+      "step": 1363
+    },
+    {
+      "epoch": 0.5300689789177111,
+      "grad_norm": 0.2128933072090149,
+      "learning_rate": 9.427792915531336e-05,
+      "loss": 1.0038,
+      "step": 1364
+    },
+    {
+      "epoch": 0.5304575925386185,
+      "grad_norm": 0.2046983689069748,
+      "learning_rate": 9.420007785130402e-05,
+      "loss": 0.9948,
+      "step": 1365
+    },
+    {
+      "epoch": 0.5308462061595259,
+      "grad_norm": 0.20909680426120758,
+      "learning_rate": 9.412222654729467e-05,
+      "loss": 1.0308,
+      "step": 1366
+    },
+    {
+      "epoch": 0.5312348197804333,
+      "grad_norm": 0.2182164192199707,
+      "learning_rate": 9.404437524328533e-05,
+      "loss": 1.0018,
+      "step": 1367
+    },
+    {
+      "epoch": 0.5316234334013407,
+      "grad_norm": 0.2107028216123581,
+      "learning_rate": 9.396652393927599e-05,
+      "loss": 1.0419,
+      "step": 1368
+    },
+    {
+      "epoch": 0.5320120470222481,
+      "grad_norm": 0.24631445109844208,
+      "learning_rate": 9.388867263526665e-05,
+      "loss": 1.0171,
+      "step": 1369
+    },
+    {
+      "epoch": 0.5324006606431555,
+      "grad_norm": 0.20331013202667236,
+      "learning_rate": 9.38108213312573e-05,
+      "loss": 1.0592,
+      "step": 1370
+    },
+    {
+      "epoch": 0.532789274264063,
+      "grad_norm": 0.19266058504581451,
+      "learning_rate": 9.373297002724796e-05,
+      "loss": 0.9912,
+      "step": 1371
+    },
+    {
+      "epoch": 0.5331778878849703,
+      "grad_norm": 0.22874227166175842,
+      "learning_rate": 9.365511872323862e-05,
+      "loss": 1.0533,
+      "step": 1372
+    },
+    {
+      "epoch": 0.5335665015058778,
+      "grad_norm": 0.2088235765695572,
+      "learning_rate": 9.357726741922927e-05,
+      "loss": 1.0464,
+      "step": 1373
+    },
+    {
+      "epoch": 0.5339551151267852,
+      "grad_norm": 0.2112397700548172,
+      "learning_rate": 9.349941611521994e-05,
+      "loss": 1.0503,
+      "step": 1374
+    },
+    {
+      "epoch": 0.5343437287476926,
+      "grad_norm": 0.20712170004844666,
+      "learning_rate": 9.342156481121059e-05,
+      "loss": 1.0237,
+      "step": 1375
+    },
+    {
+      "epoch": 0.5347323423686,
+      "grad_norm": 0.20077116787433624,
+      "learning_rate": 9.334371350720124e-05,
+      "loss": 1.0467,
+      "step": 1376
+    },
+    {
+      "epoch": 0.5351209559895075,
+      "grad_norm": 0.20394501090049744,
+      "learning_rate": 9.326586220319191e-05,
+      "loss": 1.0054,
+      "step": 1377
+    },
+    {
+      "epoch": 0.5355095696104148,
+      "grad_norm": 0.19459395110607147,
+      "learning_rate": 9.318801089918256e-05,
+      "loss": 0.9792,
+      "step": 1378
+    },
+    {
+      "epoch": 0.5358981832313222,
+      "grad_norm": 0.2116049826145172,
+      "learning_rate": 9.311015959517321e-05,
+      "loss": 1.0345,
+      "step": 1379
+    },
+    {
+      "epoch": 0.5362867968522297,
+      "grad_norm": 0.21672269701957703,
+      "learning_rate": 9.303230829116388e-05,
+      "loss": 1.0709,
+      "step": 1380
+    },
+    {
+      "epoch": 0.536675410473137,
+      "grad_norm": 0.20358407497406006,
+      "learning_rate": 9.295445698715453e-05,
+      "loss": 1.0534,
+      "step": 1381
+    },
+    {
+      "epoch": 0.5370640240940445,
+      "grad_norm": 0.19512853026390076,
+      "learning_rate": 9.28766056831452e-05,
+      "loss": 0.9397,
+      "step": 1382
+    },
+    {
+      "epoch": 0.5374526377149519,
+      "grad_norm": 0.2140122503042221,
+      "learning_rate": 9.279875437913586e-05,
+      "loss": 1.0164,
+      "step": 1383
+    },
+    {
+      "epoch": 0.5378412513358594,
+      "grad_norm": 0.20486049354076385,
+      "learning_rate": 9.272090307512651e-05,
+      "loss": 0.9892,
+      "step": 1384
+    },
+    {
+      "epoch": 0.5382298649567667,
+      "grad_norm": 0.20023222267627716,
+      "learning_rate": 9.264305177111718e-05,
+      "loss": 1.0019,
+      "step": 1385
+    },
+    {
+      "epoch": 0.5386184785776742,
+      "grad_norm": 0.20024439692497253,
+      "learning_rate": 9.256520046710783e-05,
+      "loss": 0.9717,
+      "step": 1386
+    },
+    {
+      "epoch": 0.5390070921985816,
+      "grad_norm": 0.21021386981010437,
+      "learning_rate": 9.24873491630985e-05,
+      "loss": 1.028,
+      "step": 1387
+    },
+    {
+      "epoch": 0.5393957058194889,
+      "grad_norm": 0.18508704006671906,
+      "learning_rate": 9.240949785908915e-05,
+      "loss": 1.0008,
+      "step": 1388
+    },
+    {
+      "epoch": 0.5397843194403964,
+      "grad_norm": 0.19351208209991455,
+      "learning_rate": 9.23316465550798e-05,
+      "loss": 0.9898,
+      "step": 1389
+    },
+    {
+      "epoch": 0.5401729330613038,
+      "grad_norm": 0.20341919362545013,
+      "learning_rate": 9.225379525107047e-05,
+      "loss": 1.0203,
+      "step": 1390
+    },
+    {
+      "epoch": 0.5405615466822112,
+      "grad_norm": 0.1942797303199768,
+      "learning_rate": 9.217594394706112e-05,
+      "loss": 1.003,
+      "step": 1391
+    },
+    {
+      "epoch": 0.5409501603031186,
+      "grad_norm": 0.2056138813495636,
+      "learning_rate": 9.209809264305178e-05,
+      "loss": 1.0149,
+      "step": 1392
+    },
+    {
+      "epoch": 0.5413387739240261,
+      "grad_norm": 0.21572062373161316,
+      "learning_rate": 9.202024133904244e-05,
+      "loss": 0.9808,
+      "step": 1393
+    },
+    {
+      "epoch": 0.5417273875449334,
+      "grad_norm": 0.19841499626636505,
+      "learning_rate": 9.194239003503309e-05,
+      "loss": 1.0467,
+      "step": 1394
+    },
+    {
+      "epoch": 0.5421160011658409,
+      "grad_norm": 0.20452147722244263,
+      "learning_rate": 9.186453873102375e-05,
+      "loss": 1.0378,
+      "step": 1395
+    },
+    {
+      "epoch": 0.5425046147867483,
+      "grad_norm": 0.2090451419353485,
+      "learning_rate": 9.17866874270144e-05,
+      "loss": 1.0823,
+      "step": 1396
+    },
+    {
+      "epoch": 0.5428932284076556,
+      "grad_norm": 0.215814009308815,
+      "learning_rate": 9.170883612300506e-05,
+      "loss": 1.0994,
+      "step": 1397
+    },
+    {
+      "epoch": 0.5432818420285631,
+      "grad_norm": 0.19924724102020264,
+      "learning_rate": 9.163098481899572e-05,
+      "loss": 1.0099,
+      "step": 1398
+    },
+    {
+      "epoch": 0.5436704556494705,
+      "grad_norm": 0.20074865221977234,
+      "learning_rate": 9.155313351498638e-05,
+      "loss": 1.0163,
+      "step": 1399
+    },
+    {
+      "epoch": 0.544059069270378,
+      "grad_norm": 0.21737203001976013,
+      "learning_rate": 9.147528221097704e-05,
+      "loss": 1.0527,
+      "step": 1400
+    },
+    {
+      "epoch": 0.5444476828912853,
+      "grad_norm": 0.2036885768175125,
+      "learning_rate": 9.139743090696769e-05,
+      "loss": 1.0208,
+      "step": 1401
+    },
+    {
+      "epoch": 0.5448362965121928,
+      "grad_norm": 0.20861585438251495,
+      "learning_rate": 9.131957960295835e-05,
+      "loss": 1.0175,
+      "step": 1402
+    },
+    {
+      "epoch": 0.5452249101331001,
+      "grad_norm": 0.23425570130348206,
+      "learning_rate": 9.124172829894901e-05,
+      "loss": 1.053,
+      "step": 1403
+    },
+    {
+      "epoch": 0.5456135237540076,
+      "grad_norm": 0.20389291644096375,
+      "learning_rate": 9.116387699493966e-05,
+      "loss": 1.0479,
+      "step": 1404
+    },
+    {
+      "epoch": 0.546002137374915,
+      "grad_norm": 0.20166678726673126,
+      "learning_rate": 9.108602569093033e-05,
+      "loss": 1.0064,
+      "step": 1405
+    },
+    {
+      "epoch": 0.5463907509958223,
+      "grad_norm": 0.21419203281402588,
+      "learning_rate": 9.100817438692098e-05,
+      "loss": 1.0122,
+      "step": 1406
+    },
+    {
+      "epoch": 0.5467793646167298,
+      "grad_norm": 0.20541758835315704,
+      "learning_rate": 9.093032308291165e-05,
+      "loss": 1.0355,
+      "step": 1407
+    },
+    {
+      "epoch": 0.5471679782376372,
+      "grad_norm": 0.21865367889404297,
+      "learning_rate": 9.08524717789023e-05,
+      "loss": 1.0201,
+      "step": 1408
+    },
+    {
+      "epoch": 0.5475565918585447,
+      "grad_norm": 0.21181468665599823,
+      "learning_rate": 9.077462047489296e-05,
+      "loss": 1.0501,
+      "step": 1409
+    },
+    {
+      "epoch": 0.547945205479452,
+      "grad_norm": 0.21016767621040344,
+      "learning_rate": 9.069676917088362e-05,
+      "loss": 1.0452,
+      "step": 1410
+    },
+    {
+      "epoch": 0.5483338191003595,
+      "grad_norm": 0.21119755506515503,
+      "learning_rate": 9.061891786687428e-05,
+      "loss": 1.0935,
+      "step": 1411
+    },
+    {
+      "epoch": 0.5487224327212669,
+      "grad_norm": 0.20688095688819885,
+      "learning_rate": 9.054106656286493e-05,
+      "loss": 1.0526,
+      "step": 1412
+    },
+    {
+      "epoch": 0.5491110463421743,
+      "grad_norm": 0.21857528388500214,
+      "learning_rate": 9.04632152588556e-05,
+      "loss": 1.0067,
+      "step": 1413
+    },
+    {
+      "epoch": 0.5494996599630817,
+      "grad_norm": 0.2196548581123352,
+      "learning_rate": 9.038536395484625e-05,
+      "loss": 1.0263,
+      "step": 1414
+    },
+    {
+      "epoch": 0.5498882735839892,
+      "grad_norm": 0.21952040493488312,
+      "learning_rate": 9.03075126508369e-05,
+      "loss": 1.0009,
+      "step": 1415
+    },
+    {
+      "epoch": 0.5502768872048965,
+      "grad_norm": 0.20059294998645782,
+      "learning_rate": 9.022966134682757e-05,
+      "loss": 1.0481,
+      "step": 1416
+    },
+    {
+      "epoch": 0.5506655008258039,
+      "grad_norm": 0.1960824728012085,
+      "learning_rate": 9.015181004281822e-05,
+      "loss": 1.0003,
+      "step": 1417
+    },
+    {
+      "epoch": 0.5510541144467114,
+      "grad_norm": 0.19051724672317505,
+      "learning_rate": 9.007395873880889e-05,
+      "loss": 0.9556,
+      "step": 1418
+    },
+    {
+      "epoch": 0.5514427280676187,
+      "grad_norm": 0.21008028090000153,
+      "learning_rate": 8.999610743479954e-05,
+      "loss": 1.0457,
+      "step": 1419
+    },
+    {
+      "epoch": 0.5518313416885262,
+      "grad_norm": 0.21465444564819336,
+      "learning_rate": 8.991825613079019e-05,
+      "loss": 1.0196,
+      "step": 1420
+    },
+    {
+      "epoch": 0.5522199553094336,
+      "grad_norm": 0.2062770277261734,
+      "learning_rate": 8.984040482678086e-05,
+      "loss": 1.0501,
+      "step": 1421
+    },
+    {
+      "epoch": 0.552608568930341,
+      "grad_norm": 0.21400012075901031,
+      "learning_rate": 8.976255352277151e-05,
+      "loss": 1.0711,
+      "step": 1422
+    },
+    {
+      "epoch": 0.5529971825512484,
+      "grad_norm": 0.19617624580860138,
+      "learning_rate": 8.968470221876217e-05,
+      "loss": 0.9858,
+      "step": 1423
+    },
+    {
+      "epoch": 0.5533857961721559,
+      "grad_norm": 0.20835624635219574,
+      "learning_rate": 8.960685091475283e-05,
+      "loss": 1.0122,
+      "step": 1424
+    },
+    {
+      "epoch": 0.5537744097930632,
+      "grad_norm": 0.21708111464977264,
+      "learning_rate": 8.952899961074348e-05,
+      "loss": 1.0108,
+      "step": 1425
+    },
+    {
+      "epoch": 0.5541630234139706,
+      "grad_norm": 0.20877864956855774,
+      "learning_rate": 8.945114830673414e-05,
+      "loss": 1.0389,
+      "step": 1426
+    },
+    {
+      "epoch": 0.5545516370348781,
+      "grad_norm": 0.1924441158771515,
+      "learning_rate": 8.93732970027248e-05,
+      "loss": 1.0088,
+      "step": 1427
+    },
+    {
+      "epoch": 0.5549402506557854,
+      "grad_norm": 0.20288826525211334,
+      "learning_rate": 8.929544569871546e-05,
+      "loss": 1.0296,
+      "step": 1428
+    },
+    {
+      "epoch": 0.5553288642766929,
+      "grad_norm": 0.2008143663406372,
+      "learning_rate": 8.921759439470611e-05,
+      "loss": 1.0521,
+      "step": 1429
+    },
+    {
+      "epoch": 0.5557174778976003,
+      "grad_norm": 0.24407047033309937,
+      "learning_rate": 8.913974309069677e-05,
+      "loss": 1.1038,
+      "step": 1430
+    },
+    {
+      "epoch": 0.5561060915185078,
+      "grad_norm": 0.2172536998987198,
+      "learning_rate": 8.906189178668743e-05,
+      "loss": 1.0811,
+      "step": 1431
+    },
+    {
+      "epoch": 0.5564947051394151,
+      "grad_norm": 0.21712054312229156,
+      "learning_rate": 8.898404048267808e-05,
+      "loss": 1.0642,
+      "step": 1432
+    },
+    {
+      "epoch": 0.5568833187603226,
+      "grad_norm": 0.22482797503471375,
+      "learning_rate": 8.890618917866875e-05,
+      "loss": 1.0742,
+      "step": 1433
+    },
+    {
+      "epoch": 0.55727193238123,
+      "grad_norm": 0.1974876970052719,
+      "learning_rate": 8.88283378746594e-05,
+      "loss": 0.9954,
+      "step": 1434
+    },
+    {
+      "epoch": 0.5576605460021373,
+      "grad_norm": 0.19162166118621826,
+      "learning_rate": 8.875048657065007e-05,
+      "loss": 1.0074,
+      "step": 1435
+    },
+    {
+      "epoch": 0.5580491596230448,
+      "grad_norm": 0.20439045131206512,
+      "learning_rate": 8.867263526664072e-05,
+      "loss": 1.026,
+      "step": 1436
+    },
+    {
+      "epoch": 0.5584377732439522,
+      "grad_norm": 0.1947651207447052,
+      "learning_rate": 8.859478396263138e-05,
+      "loss": 0.9848,
+      "step": 1437
+    },
+    {
+      "epoch": 0.5588263868648596,
+      "grad_norm": 0.21434316039085388,
+      "learning_rate": 8.851693265862204e-05,
+      "loss": 1.0843,
+      "step": 1438
+    },
+    {
+      "epoch": 0.559215000485767,
+      "grad_norm": 1.3314417600631714,
+      "learning_rate": 8.84390813546127e-05,
+      "loss": 1.0356,
+      "step": 1439
+    },
+    {
+      "epoch": 0.5596036141066745,
+      "grad_norm": 0.20131289958953857,
+      "learning_rate": 8.836123005060335e-05,
+      "loss": 1.0214,
+      "step": 1440
+    },
+    {
+      "epoch": 0.5599922277275818,
+      "grad_norm": 0.21596461534500122,
+      "learning_rate": 8.828337874659402e-05,
+      "loss": 1.0962,
+      "step": 1441
+    },
+    {
+      "epoch": 0.5603808413484893,
+      "grad_norm": 0.20477193593978882,
+      "learning_rate": 8.820552744258467e-05,
+      "loss": 1.0643,
+      "step": 1442
+    },
+    {
+      "epoch": 0.5607694549693967,
+      "grad_norm": 0.1978107988834381,
+      "learning_rate": 8.812767613857532e-05,
+      "loss": 1.0054,
+      "step": 1443
+    },
+    {
+      "epoch": 0.561158068590304,
+      "grad_norm": 0.219422847032547,
+      "learning_rate": 8.804982483456599e-05,
+      "loss": 1.0009,
+      "step": 1444
+    },
+    {
+      "epoch": 0.5615466822112115,
+      "grad_norm": 0.21489015221595764,
+      "learning_rate": 8.797197353055664e-05,
+      "loss": 1.052,
+      "step": 1445
+    },
+    {
+      "epoch": 0.5619352958321189,
+      "grad_norm": 0.2235930860042572,
+      "learning_rate": 8.78941222265473e-05,
+      "loss": 1.037,
+      "step": 1446
+    },
+    {
+      "epoch": 0.5623239094530263,
+      "grad_norm": 0.19922038912773132,
+      "learning_rate": 8.781627092253796e-05,
+      "loss": 1.0006,
+      "step": 1447
+    },
+    {
+      "epoch": 0.5627125230739337,
+      "grad_norm": 0.24740247428417206,
+      "learning_rate": 8.773841961852861e-05,
+      "loss": 1.0753,
+      "step": 1448
+    },
+    {
+      "epoch": 0.5631011366948412,
+      "grad_norm": 0.2148803174495697,
+      "learning_rate": 8.766056831451928e-05,
+      "loss": 1.0712,
+      "step": 1449
+    },
+    {
+      "epoch": 0.5634897503157485,
+      "grad_norm": 0.19838745892047882,
+      "learning_rate": 8.758271701050993e-05,
+      "loss": 1.027,
+      "step": 1450
+    },
+    {
+      "epoch": 0.563878363936656,
+      "grad_norm": 0.20328201353549957,
+      "learning_rate": 8.750486570650058e-05,
+      "loss": 1.0117,
+      "step": 1451
+    },
+    {
+      "epoch": 0.5642669775575634,
+      "grad_norm": 0.21230114996433258,
+      "learning_rate": 8.742701440249125e-05,
+      "loss": 1.0658,
+      "step": 1452
+    },
+    {
+      "epoch": 0.5646555911784708,
+      "grad_norm": 0.2030259519815445,
+      "learning_rate": 8.73491630984819e-05,
+      "loss": 1.0002,
+      "step": 1453
+    },
+    {
+      "epoch": 0.5650442047993782,
+      "grad_norm": 0.21404659748077393,
+      "learning_rate": 8.727131179447256e-05,
+      "loss": 1.0572,
+      "step": 1454
+    },
+    {
+      "epoch": 0.5654328184202856,
+      "grad_norm": 0.2148464322090149,
+      "learning_rate": 8.719346049046322e-05,
+      "loss": 1.0164,
+      "step": 1455
+    },
+    {
+      "epoch": 0.5658214320411931,
+      "grad_norm": 0.22083118557929993,
+      "learning_rate": 8.711560918645387e-05,
+      "loss": 0.9704,
+      "step": 1456
+    },
+    {
+      "epoch": 0.5662100456621004,
+      "grad_norm": 0.19305935502052307,
+      "learning_rate": 8.703775788244453e-05,
+      "loss": 1.0034,
+      "step": 1457
+    },
+    {
+      "epoch": 0.5665986592830079,
+      "grad_norm": 0.2100098729133606,
+      "learning_rate": 8.695990657843518e-05,
+      "loss": 1.0907,
+      "step": 1458
+    },
+    {
+      "epoch": 0.5669872729039153,
+      "grad_norm": 0.18947799503803253,
+      "learning_rate": 8.688205527442585e-05,
+      "loss": 0.9664,
+      "step": 1459
+    },
+    {
+      "epoch": 0.5673758865248227,
+      "grad_norm": 0.22341710329055786,
+      "learning_rate": 8.68042039704165e-05,
+      "loss": 1.0551,
+      "step": 1460
+    },
+    {
+      "epoch": 0.5677645001457301,
+      "grad_norm": 0.219679057598114,
+      "learning_rate": 8.672635266640717e-05,
+      "loss": 1.0398,
+      "step": 1461
+    },
+    {
+      "epoch": 0.5681531137666376,
+      "grad_norm": 0.22389841079711914,
+      "learning_rate": 8.664850136239782e-05,
+      "loss": 1.0472,
+      "step": 1462
+    },
+    {
+      "epoch": 0.5685417273875449,
+      "grad_norm": 0.21402975916862488,
+      "learning_rate": 8.657065005838849e-05,
+      "loss": 1.0224,
+      "step": 1463
+    },
+    {
+      "epoch": 0.5689303410084523,
+      "grad_norm": 0.20917154848575592,
+      "learning_rate": 8.649279875437915e-05,
+      "loss": 1.0526,
+      "step": 1464
+    },
+    {
+      "epoch": 0.5693189546293598,
+      "grad_norm": 0.2252056896686554,
+      "learning_rate": 8.64149474503698e-05,
+      "loss": 1.1064,
+      "step": 1465
+    },
+    {
+      "epoch": 0.5697075682502671,
+      "grad_norm": 0.21834802627563477,
+      "learning_rate": 8.633709614636046e-05,
+      "loss": 1.0318,
+      "step": 1466
+    },
+    {
+      "epoch": 0.5700961818711746,
+      "grad_norm": 0.21882353723049164,
+      "learning_rate": 8.625924484235112e-05,
+      "loss": 1.0285,
+      "step": 1467
+    },
+    {
+      "epoch": 0.570484795492082,
+      "grad_norm": 0.2028426229953766,
+      "learning_rate": 8.618139353834177e-05,
+      "loss": 1.0356,
+      "step": 1468
+    },
+    {
+      "epoch": 0.5708734091129894,
+      "grad_norm": 0.22297166287899017,
+      "learning_rate": 8.610354223433243e-05,
+      "loss": 1.0804,
+      "step": 1469
+    },
+    {
+      "epoch": 0.5712620227338968,
+      "grad_norm": 0.21775268018245697,
+      "learning_rate": 8.602569093032309e-05,
+      "loss": 0.9978,
+      "step": 1470
+    },
+    {
+      "epoch": 0.5716506363548043,
+      "grad_norm": 0.20362353324890137,
+      "learning_rate": 8.594783962631374e-05,
+      "loss": 0.9982,
+      "step": 1471
+    },
+    {
+      "epoch": 0.5720392499757117,
+      "grad_norm": 0.21854591369628906,
+      "learning_rate": 8.586998832230441e-05,
+      "loss": 1.0465,
+      "step": 1472
+    },
+    {
+      "epoch": 0.572427863596619,
+      "grad_norm": 0.20501428842544556,
+      "learning_rate": 8.579213701829506e-05,
+      "loss": 1.0468,
+      "step": 1473
+    },
+    {
+      "epoch": 0.5728164772175265,
+      "grad_norm": 0.21606214344501495,
+      "learning_rate": 8.571428571428571e-05,
+      "loss": 1.0477,
+      "step": 1474
+    },
+    {
+      "epoch": 0.5732050908384339,
+      "grad_norm": 0.2100660502910614,
+      "learning_rate": 8.563643441027638e-05,
+      "loss": 1.0071,
+      "step": 1475
+    },
+    {
+      "epoch": 0.5735937044593413,
+      "grad_norm": 0.21008896827697754,
+      "learning_rate": 8.555858310626703e-05,
+      "loss": 0.9914,
+      "step": 1476
+    },
+    {
+      "epoch": 0.5739823180802487,
+      "grad_norm": 0.22192159295082092,
+      "learning_rate": 8.54807318022577e-05,
+      "loss": 1.0385,
+      "step": 1477
+    },
+    {
+      "epoch": 0.5743709317011562,
+      "grad_norm": 0.20123356580734253,
+      "learning_rate": 8.540288049824835e-05,
+      "loss": 1.0062,
+      "step": 1478
+    },
+    {
+      "epoch": 0.5747595453220635,
+      "grad_norm": 0.201947420835495,
+      "learning_rate": 8.5325029194239e-05,
+      "loss": 1.0218,
+      "step": 1479
+    },
+    {
+      "epoch": 0.575148158942971,
+      "grad_norm": 0.22804415225982666,
+      "learning_rate": 8.524717789022967e-05,
+      "loss": 1.0445,
+      "step": 1480
+    },
+    {
+      "epoch": 0.5755367725638784,
+      "grad_norm": 0.20527036488056183,
+      "learning_rate": 8.516932658622032e-05,
+      "loss": 0.9972,
+      "step": 1481
+    },
+    {
+      "epoch": 0.5759253861847857,
+      "grad_norm": 0.20298773050308228,
+      "learning_rate": 8.509147528221098e-05,
+      "loss": 1.0272,
+      "step": 1482
+    },
+    {
+      "epoch": 0.5763139998056932,
+      "grad_norm": 0.22500957548618317,
+      "learning_rate": 8.501362397820164e-05,
+      "loss": 1.0982,
+      "step": 1483
+    },
+    {
+      "epoch": 0.5767026134266006,
+      "grad_norm": 0.1950521320104599,
+      "learning_rate": 8.493577267419229e-05,
+      "loss": 0.9848,
+      "step": 1484
+    },
+    {
+      "epoch": 0.577091227047508,
+      "grad_norm": 0.21087585389614105,
+      "learning_rate": 8.485792137018295e-05,
+      "loss": 1.0125,
+      "step": 1485
+    },
+    {
+      "epoch": 0.5774798406684154,
+      "grad_norm": 0.20122238993644714,
+      "learning_rate": 8.47800700661736e-05,
+      "loss": 1.0533,
+      "step": 1486
+    },
+    {
+      "epoch": 0.5778684542893229,
+      "grad_norm": 0.20149008929729462,
+      "learning_rate": 8.470221876216427e-05,
+      "loss": 1.0719,
+      "step": 1487
+    },
+    {
+      "epoch": 0.5782570679102302,
+      "grad_norm": 0.21307213604450226,
+      "learning_rate": 8.462436745815494e-05,
+      "loss": 1.0522,
+      "step": 1488
+    },
+    {
+      "epoch": 0.5786456815311377,
+      "grad_norm": 0.21828554570674896,
+      "learning_rate": 8.454651615414559e-05,
+      "loss": 1.0184,
+      "step": 1489
+    },
+    {
+      "epoch": 0.5790342951520451,
+      "grad_norm": 0.22002705931663513,
+      "learning_rate": 8.446866485013625e-05,
+      "loss": 1.0101,
+      "step": 1490
+    },
+    {
+      "epoch": 0.5794229087729524,
+      "grad_norm": 0.19479142129421234,
+      "learning_rate": 8.43908135461269e-05,
+      "loss": 0.9889,
+      "step": 1491
+    },
+    {
+      "epoch": 0.5798115223938599,
+      "grad_norm": 0.21346086263656616,
+      "learning_rate": 8.431296224211756e-05,
+      "loss": 1.0373,
+      "step": 1492
+    },
+    {
+      "epoch": 0.5802001360147673,
+      "grad_norm": 0.20177558064460754,
+      "learning_rate": 8.423511093810822e-05,
+      "loss": 1.0215,
+      "step": 1493
+    },
+    {
+      "epoch": 0.5805887496356748,
+      "grad_norm": 0.2117915153503418,
+      "learning_rate": 8.415725963409888e-05,
+      "loss": 1.0321,
+      "step": 1494
+    },
+    {
+      "epoch": 0.5809773632565821,
+      "grad_norm": 0.21304374933242798,
+      "learning_rate": 8.407940833008954e-05,
+      "loss": 1.0123,
+      "step": 1495
+    },
+    {
+      "epoch": 0.5813659768774896,
+      "grad_norm": 0.21173715591430664,
+      "learning_rate": 8.400155702608019e-05,
+      "loss": 1.0696,
+      "step": 1496
+    },
+    {
+      "epoch": 0.581754590498397,
+      "grad_norm": 0.20407019555568695,
+      "learning_rate": 8.392370572207085e-05,
+      "loss": 1.0086,
+      "step": 1497
+    },
+    {
+      "epoch": 0.5821432041193044,
+      "grad_norm": 0.209481880068779,
+      "learning_rate": 8.384585441806151e-05,
+      "loss": 0.9975,
+      "step": 1498
+    },
+    {
+      "epoch": 0.5825318177402118,
+      "grad_norm": 0.22184531390666962,
+      "learning_rate": 8.376800311405216e-05,
+      "loss": 1.0956,
+      "step": 1499
+    },
+    {
+      "epoch": 0.5829204313611193,
+      "grad_norm": 0.21344684064388275,
+      "learning_rate": 8.369015181004283e-05,
+      "loss": 1.0685,
+      "step": 1500
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 2574,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.2833762852661166e+19,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/outputs/checkpoint-2000/README.md b/outputs/checkpoint-2000/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3abf956c074d00f34a12693c8d6da9738211d7c7
--- /dev/null
+++ b/outputs/checkpoint-2000/README.md
@@ -0,0 +1,209 @@
+---
+base_model: unsloth/gpt-oss-20b-unsloth-bnb-4bit
+library_name: peft
+tags:
+- base_model:adapter:unsloth/gpt-oss-20b-unsloth-bnb-4bit
+- lora
+- sft
+- transformers
+- trl
+- unsloth
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.17.1
\ No newline at end of file
diff --git a/outputs/checkpoint-2000/adapter_config.json b/outputs/checkpoint-2000/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..e285b9b6e018b5b9f23736d6699eb1a4267764e7
--- /dev/null
+++ b/outputs/checkpoint-2000/adapter_config.json
@@ -0,0 +1,45 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": {
+    "base_model_class": "GptOssForCausalLM",
+    "parent_library": "transformers.models.gpt_oss.modeling_gpt_oss"
+  },
+  "base_model_name_or_path": "unsloth/gpt-oss-20b-unsloth-bnb-4bit",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "o_proj",
+    "v_proj",
+    "up_proj",
+    "down_proj",
+    "gate_proj",
+    "k_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": null,
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/outputs/checkpoint-2000/chat_template.jinja b/outputs/checkpoint-2000/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..a3650f886e98b2834c25727759c8e0ab8495f316
--- /dev/null
+++ b/outputs/checkpoint-2000/chat_template.jinja
@@ -0,0 +1,315 @@
+{# Copyright 2025-present Unsloth. Apache 2.0 License. Unsloth chat template fixes. Edited from ggml-org & OpenAI #}
+{#-
+  In addition to the normal inputs of `messages` and `tools`, this template also accepts the
+  following kwargs:
+  - "builtin_tools": A list, can contain "browser" and/or "python".
+  - "model_identity": A string that optionally describes the model identity.
+  - "reasoning_effort": A string that describes the reasoning effort, defaults to "medium".
+ #}
+
+{#- Tool Definition Rendering ============================================== #}
+{%- macro render_typescript_type(param_spec, required_params, is_nullable=false) -%}
+    {%- if param_spec.type == "array" -%}
+        {%- if param_spec['items'] -%}
+            {%- if param_spec['items']['type'] == "string" -%}
+                {{- "string[]" }}
+            {%- elif param_spec['items']['type'] == "number" -%}
+                {{- "number[]" }}
+            {%- elif param_spec['items']['type'] == "integer" -%}
+                {{- "number[]" }}
+            {%- elif param_spec['items']['type'] == "boolean" -%}
+                {{- "boolean[]" }}
+            {%- else -%}
+                {%- set inner_type = render_typescript_type(param_spec['items'], required_params) -%}
+                {%- if inner_type == "object | object" or inner_type|length > 50 -%}
+                    {{- "any[]" }}
+                {%- else -%}
+                    {{- inner_type + "[]" }}
+                {%- endif -%}
+            {%- endif -%}
+            {%- if param_spec.nullable -%}
+                {{- " | null" }}
+            {%- endif -%}
+        {%- else -%}
+            {{- "any[]" }}
+            {%- if param_spec.nullable -%}
+                {{- " | null" }}
+            {%- endif -%}
+        {%- endif -%}
+    {%- elif param_spec.type is defined and param_spec.type is iterable and param_spec.type is not string and param_spec.type is not mapping and param_spec.type[0] is defined -%}
+        {#- Handle array of types like ["object", "object"] from Union[dict, list] #}
+        {%- if param_spec.type | length > 1 -%}
+            {{- param_spec.type | join(" | ") }}
+        {%- else -%}
+            {{- param_spec.type[0] }}
+        {%- endif -%}
+    {%- elif param_spec.oneOf -%}
+        {#- Handle oneOf schemas - check for complex unions and fallback to any #}
+        {%- set has_object_variants = false -%}
+        {%- for variant in param_spec.oneOf -%}
+            {%- if variant.type == "object" -%}
+                {%- set has_object_variants = true -%}
+            {%- endif -%}
+        {%- endfor -%}
+        {%- if has_object_variants and param_spec.oneOf|length > 1 -%}
+            {{- "any" }}
+        {%- else -%}
+            {%- for variant in param_spec.oneOf -%}
+                {{- render_typescript_type(variant, required_params) -}}
+                {%- if variant.description %}
+                    {{- "// " + variant.description }}
+                {%- endif -%}
+                {%- if variant.default is defined %}
+                    {{ "// default: " + variant.default|tojson }}
+                {%- endif -%}
+                {%- if not loop.last %}
+                    {{- " | " }}
+                {% endif -%}
+            {%- endfor -%}
+        {%- endif -%}
+    {%- elif param_spec.type == "string" -%}
+        {%- if param_spec.enum -%}
+            {{- '"' + param_spec.enum|join('" | "') + '"' -}}
+        {%- else -%}
+            {{- "string" }}
+            {%- if param_spec.nullable %}
+                {{- " | null" }}
+            {%- endif -%}
+        {%- endif -%}
+    {%- elif param_spec.type == "number" -%}
+        {{- "number" }}
+    {%- elif param_spec.type == "integer" -%}
+        {{- "number" }}
+    {%- elif param_spec.type == "boolean" -%}
+        {{- "boolean" }}
+
+    {%- elif param_spec.type == "object" -%}
+        {%- if param_spec.properties -%}
+            {{- "{\n" }}
+            {%- for prop_name, prop_spec in param_spec.properties.items() -%}
+                {{- prop_name -}}
+                {%- if prop_name not in (param_spec.required or []) -%}
+                    {{- "?" }}
+                {%- endif -%}
+                {{- ": " }}
+                {{ render_typescript_type(prop_spec, param_spec.required or []) }}
+                {%- if not loop.last -%}
+                    {{-", " }}
+                {%- endif -%}
+            {%- endfor -%}
+            {{- "}" }}
+        {%- else -%}
+            {{- "object" }}
+        {%- endif -%}
+    {%- else -%}
+        {{- "any" }}
+    {%- endif -%}
+{%- endmacro -%}
+
+{%- macro render_tool_namespace(namespace_name, tools) -%}
+    {{- "## " + namespace_name + "\n\n" }}
+    {{- "namespace " + namespace_name + " {\n\n" }}
+    {%- for tool in tools %}
+        {%- set tool = tool.function %}
+        {{- "// " + tool.description + "\n" }}
+        {{- "type "+ tool.name + " = " }}
+        {%- if tool.parameters and tool.parameters.properties -%}
+            {{- "(_: " }}
+            {{- "{\n" }}
+            {%- for param_name, param_spec in tool.parameters.properties.items() %}
+                {{- "// " + param_spec.description + "\n" }}
+                {{- param_name }}
+                {%- if param_name not in (tool.parameters.required or []) -%}
+                    {{- "?" }}
+                {%- endif -%}
+                {{- ": " }}
+                {{- render_typescript_type(param_spec, tool.parameters.required or []) }}
+                {%- if param_spec.default is defined -%}
+                    {%- if param_spec.enum %}
+                        {{- ", // default: " + param_spec.default }}
+                    {%- elif param_spec.oneOf %}
+                        {{- "// default: " + param_spec.default }}
+                    {%- else %}
+                        {{- ", // default: " + param_spec.default|tojson }}
+                    {%- endif -%}
+                {%- endif -%}
+                {%- if not loop.last %}
+                    {{- ",\n" }}
+                {%- else %}
+                    {{- "\n" }}
+                {%- endif -%}
+            {%- endfor %}
+            {{- "}) => any;\n\n" }}
+        {%- else -%}
+            {{- "() => any;\n\n" }}
+        {%- endif -%}
+    {%- endfor %}
+    {{- "} // namespace " + namespace_name }}
+{%- endmacro -%}
+
+{%- macro render_builtin_tools(browser_tool, python_tool) -%}
+    {%- if browser_tool %}
+        {{- "## browser\n\n" }}
+        {{- "// Tool for browsing.\n" }}
+        {{- "// The `cursor` appears in brackets before each browsing display: `[{cursor}]`.\n" }}
+        {{- "// Cite information from the tool using the following format:\n" }}
+        {{- "// `【{cursor}†L{line_start}(-L{line_end})?】`, for example: `【6†L9-L11】` or `【8†L3】`.\n" }}
+        {{- "// Do not quote more than 10 words directly from the tool output.\n" }}
+        {{- "// sources=web (default: web)\n" }}
+        {{- "namespace browser {\n\n" }}
+        {{- "// Searches for information related to `query` and displays `topn` results.\n" }}
+        {{- "type search = (_: {\n" }}
+        {{- "query: string,\n" }}
+        {{- "topn?: number, // default: 10\n" }}
+        {{- "source?: string,\n" }}
+        {{- "}) => any;\n\n" }}
+        {{- "// Opens the link `id` from the page indicated by `cursor` starting at line number `loc`, showing `num_lines` lines.\n" }}
+        {{- "// Valid link ids are displayed with the formatting: `【{id}†.*】`.\n" }}
+        {{- "// If `cursor` is not provided, the most recent page is implied.\n" }}
+        {{- "// If `id` is a string, it is treated as a fully qualified URL associated with `source`.\n" }}
+        {{- "// If `loc` is not provided, the viewport will be positioned at the beginning of the document or centered on the most relevant passage, if available.\n" }}
+        {{- "// Use this function without `id` to scroll to a new location of an opened page.\n" }}
+        {{- "type open = (_: {\n" }}
+        {{- "id?: number | string, // default: -1\n" }}
+        {{- "cursor?: number, // default: -1\n" }}
+        {{- "loc?: number, // default: -1\n" }}
+        {{- "num_lines?: number, // default: -1\n" }}
+        {{- "view_source?: boolean, // default: false\n" }}
+        {{- "source?: string,\n" }}
+        {{- "}) => any;\n\n" }}
+        {{- "// Finds exact matches of `pattern` in the current page, or the page given by `cursor`.\n" }}
+        {{- "type find = (_: {\n" }}
+        {{- "pattern: string,\n" }}
+        {{- "cursor?: number, // default: -1\n" }}
+        {{- "}) => any;\n\n" }}
+        {{- "} // namespace browser\n\n" }}
+    {%- endif -%}
+
+    {%- if python_tool %}
+        {{- "## python\n\n" }}
+        {{- "Use this tool to execute Python code in your chain of thought. The code will not be shown to the user. This tool should be used for internal reasoning, but not for code that is intended to be visible to the user (e.g. when creating plots, tables, or files).\n\n" }}
+        {{- "When you send a message containing Python code to python, it will be executed in a stateful Jupyter notebook environment. python will respond with the output of the execution or time out after 120.0 seconds. The drive at '/mnt/data' can be used to save and persist user files. Internet access for this session is UNKNOWN. Depends on the cluster.\n\n" }}
+    {%- endif -%}
+{%- endmacro -%}
+
+{#- System Message Construction ============================================ #}
+{%- macro build_system_message() -%}
+    {%- if model_identity is not defined %}
+        {{- "You are ChatGPT, a large language model trained by OpenAI.\n" -}}
+    {%- else %}
+        {{- model_identity }}
+    {%- endif %}
+    {{- "Knowledge cutoff: 2024-06\n" }}
+    {{- "Current date: " + strftime_now("%Y-%m-%d") + "\n\n" }}
+    {%- if reasoning_effort is not defined %}
+        {%- set reasoning_effort = "medium" %}
+    {%- endif %}
+    {{- "Reasoning: " + reasoning_effort + "\n\n" }}
+    {%- if builtin_tools is defined %}
+        {{- "# Tools\n\n" }}
+        {%- set available_builtin_tools = namespace(browser=false, python=false) %}
+        {%- for tool in builtin_tools %}
+            {%- if tool == "browser" %}
+                {%- set available_builtin_tools.browser = true %}
+            {%- elif tool == "python" %}
+                {%- set available_builtin_tools.python = true %}
+            {%- endif %}
+        {%- endfor %}
+        {{- render_builtin_tools(available_builtin_tools.browser, available_builtin_tools.python) }}
+    {%- endif -%}
+    {{- "# Valid channels: analysis, commentary, final. Channel must be included for every message." }}
+    {%- if tools is defined -%}
+        {{- "\nCalls to these tools must go to the commentary channel: 'functions'." }}
+    {%- endif -%}
+{%- endmacro -%}
+
+{#- Main Template Logic ================================================= #}
+{#- Set defaults #}
+
+{#- Render system message #}
+{{- "<|start|>system<|message|>" }}
+{{- build_system_message() }}
+{{- "<|end|>" }}
+
+{#- Extract developer message #}
+{%- if messages[0].role == "developer" or messages[0].role == "system" %}
+    {%- set developer_message = messages[0].content %}
+    {%- set loop_messages = messages[1:] %}
+{%- else %}
+    {%- set developer_message = "" %}
+    {%- set loop_messages = messages %}
+{%- endif %}
+
+{#- Render developer message #}
+{%- if developer_message or tools %}
+    {{- "<|start|>developer<|message|>" }}
+    {%- if developer_message %}
+        {{- "# Instructions\n\n" }}
+        {{- developer_message }}
+    {%- endif %}
+    {%- if tools -%}
+        {{- "\n\n" }}
+        {{- "# Tools\n\n" }}
+        {{- render_tool_namespace("functions", tools) }}
+    {%- endif -%}
+    {{- "<|end|>" }}
+{%- endif %}
+
+{#- Render messages #}
+{%- set last_tool_call = namespace(name=none) %}
+{%- for message in loop_messages -%}
+    {#- At this point only assistant/user/tool messages should remain #}
+    {%- if message.role == 'assistant' -%}
+        {%- if "tool_calls" in message %}
+            {#- We assume max 1 tool call per message, and so we infer the tool call name #}
+            {#- in "tool" messages from the most recent assistant tool call name #}
+            {%- set tool_call = message.tool_calls[0] %}
+            {%- if tool_call.function %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {%- if message.content %}
+                {{- "<|start|>assistant<|channel|>analysis<|message|>" + message.content + "<|end|>" }}
+            {%- endif %}
+            {{- "<|start|>assistant to=" }}
+            {{- "functions." + tool_call.name + "<|channel|>commentary json<|message|>" }}
+            {{- tool_call.arguments|tojson }}
+            {{- "<|call|>" }}
+            {%- set last_tool_call.name = tool_call.name %}
+        {%- elif "thinking" in message and loop.last and not add_generation_prompt %}
+            {#- Only render the CoT if the final turn is an assistant turn and add_generation_prompt is false #}
+            {#- This is a situation that should only occur in training, never in inference. #}
+            {{- "<|start|>assistant<|channel|>analysis<|message|>" + message.thinking + "<|end|>" }}
+            {#- <|return|> indicates the end of generation, but <|end|> does not #}
+            {#- <|return|> should never be an input to the model, but we include it as the final token #}
+            {#- when training, so the model learns to emit it. #}
+            {{- "<|start|>assistant<|channel|>final<|message|>" + message.content + "<|return|>" }}
+            {%- set last_tool_call.name = none %}
+        {%- elif "thinking" in message %}
+            {#- CoT is dropped during all previous turns, so we never render it for inference #}
+            {{- "<|start|>assistant<|channel|>final<|message|>" + message.content + "<|end|>" }}
+            {%- set last_tool_call.name = none %}
+        {%- elif loop.last and not add_generation_prompt %}
+            {#- <|return|> indicates the end of generation, but <|end|> does not #}
+            {#- <|return|> should never be an input to the model, but we include it as the final token #}
+            {#- when training, so the model learns to emit it. #}
+            {{- "<|start|>assistant<|message|>" + message.content + "<|return|>" }}
+        {%- else %}
+            {{- "<|start|>assistant<|message|>" + message.content + "<|end|>" }}
+            {%- set last_tool_call.name = none %}
+        {%- endif %}
+    {%- elif message.role == 'tool' -%}
+        {%- if last_tool_call.name is none %}
+            {{- raise_exception("Message has tool role, but there was no previous assistant message with a tool call!") }}
+        {%- endif %}
+        {{- "<|start|>functions." + last_tool_call.name }}
+        {{- " to=assistant<|channel|>commentary<|message|>" + message.content|tojson + "<|end|>" }}
+    {%- else -%}
+        {{- "<|start|>user<|message|>" + message.content + "<|end|>" }}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Generation prompt #}
+{%- if add_generation_prompt -%}
+<|start|>assistant
+{%- endif -%}
+{# Copyright 2025-present Unsloth. Apache 2.0 License. Unsloth chat template fixes. Edited from ggml-org & OpenAI #}
\ No newline at end of file
diff --git a/outputs/checkpoint-2000/special_tokens_map.json b/outputs/checkpoint-2000/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..6fba18753f4d09dbb8fcdf1482daff36b963d639
--- /dev/null
+++ b/outputs/checkpoint-2000/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+  "bos_token": {
+    "content": "<|startoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|return|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|reserved_200017|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/outputs/checkpoint-2000/tokenizer.json b/outputs/checkpoint-2000/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..6ec3ef1795cbbda6b7cb7d1f114919cbe3fdd647
--- /dev/null
+++ b/outputs/checkpoint-2000/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0614fe83cadab421296e664e1f48f4261fa8fef6e03e63bb75c20f38e37d07d3
+size 27868174
diff --git a/outputs/checkpoint-2000/tokenizer_config.json b/outputs/checkpoint-2000/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..482ae30d27a74c38d2228e69dd37c529fc485a45
--- /dev/null
+++ b/outputs/checkpoint-2000/tokenizer_config.json
@@ -0,0 +1,185 @@
+{
+  "added_tokens_decoder": {
+    "199998": {
+      "content": "<|startoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "199999": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200000": {
+      "content": "<|reserved_200000|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200001": {
+      "content": "<|reserved_200001|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200002": {
+      "content": "<|return|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200003": {
+      "content": "<|constrain|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200004": {
+      "content": "<|reserved_200004|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200005": {
+      "content": "<|channel|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200006": {
+      "content": "<|start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200007": {
+      "content": "<|end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200008": {
+      "content": "<|message|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200009": {
+      "content": "<|reserved_200009|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200010": {
+      "content": "<|reserved_200010|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200011": {
+      "content": "<|reserved_200011|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200012": {
+      "content": "<|call|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200013": {
+      "content": "<|reserved_200013|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200014": {
+      "content": "<|reserved_200014|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200015": {
+      "content": "<|reserved_200015|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200016": {
+      "content": "<|reserved_200016|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200017": {
+      "content": "<|reserved_200017|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200018": {
+      "content": "<|endofprompt|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<|startoftext|>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|return|>",
+  "extra_special_tokens": {},
+  "model_input_names": [
+    "input_ids",
+    "attention_mask"
+  ],
+  "model_max_length": 131072,
+  "pad_token": "<|reserved_200017|>",
+  "padding_side": "right",
+  "tokenizer_class": "PreTrainedTokenizerFast",
+  "unk_token": null
+}
diff --git a/outputs/checkpoint-2000/trainer_state.json b/outputs/checkpoint-2000/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..d64370de502ee9836f14148d772ef056a5aed5d9
--- /dev/null
+++ b/outputs/checkpoint-2000/trainer_state.json
@@ -0,0 +1,14034 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.7772272418148256,
+  "eval_steps": 500,
+  "global_step": 2000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0032,
+      "grad_norm": 13.684800148010254,
+      "learning_rate": 0.0,
+      "loss": 2.3276,
+      "step": 1
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 13.660787582397461,
+      "learning_rate": 4e-05,
+      "loss": 2.2792,
+      "step": 2
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 13.35280704498291,
+      "learning_rate": 8e-05,
+      "loss": 2.4151,
+      "step": 3
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 6.15027379989624,
+      "learning_rate": 0.00012,
+      "loss": 1.7812,
+      "step": 4
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 1.3168226480484009,
+      "learning_rate": 0.00016,
+      "loss": 1.4536,
+      "step": 5
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.9872580170631409,
+      "learning_rate": 0.0002,
+      "loss": 1.4171,
+      "step": 6
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.7496100664138794,
+      "learning_rate": 0.00019935064935064936,
+      "loss": 1.4168,
+      "step": 7
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.7376005053520203,
+      "learning_rate": 0.00019870129870129872,
+      "loss": 1.3659,
+      "step": 8
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.5281137824058533,
+      "learning_rate": 0.00019805194805194807,
+      "loss": 1.2566,
+      "step": 9
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.5485746264457703,
+      "learning_rate": 0.00019740259740259742,
+      "loss": 1.3761,
+      "step": 10
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.5506592392921448,
+      "learning_rate": 0.00019675324675324675,
+      "loss": 1.3327,
+      "step": 11
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.49382686614990234,
+      "learning_rate": 0.00019610389610389613,
+      "loss": 1.3727,
+      "step": 12
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.36203011870384216,
+      "learning_rate": 0.00019545454545454548,
+      "loss": 1.1515,
+      "step": 13
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.3528599739074707,
+      "learning_rate": 0.0001948051948051948,
+      "loss": 1.2636,
+      "step": 14
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.31244418025016785,
+      "learning_rate": 0.00019415584415584416,
+      "loss": 1.1873,
+      "step": 15
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.3379523754119873,
+      "learning_rate": 0.00019350649350649354,
+      "loss": 1.2657,
+      "step": 16
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.3025083839893341,
+      "learning_rate": 0.00019285714285714286,
+      "loss": 1.2846,
+      "step": 17
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.2560190260410309,
+      "learning_rate": 0.00019220779220779222,
+      "loss": 1.1587,
+      "step": 18
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.2554129958152771,
+      "learning_rate": 0.00019155844155844157,
+      "loss": 1.2812,
+      "step": 19
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.22662702202796936,
+      "learning_rate": 0.00019090909090909092,
+      "loss": 1.1664,
+      "step": 20
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.2515714168548584,
+      "learning_rate": 0.00019025974025974027,
+      "loss": 1.2177,
+      "step": 21
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.24396637082099915,
+      "learning_rate": 0.00018961038961038963,
+      "loss": 1.2053,
+      "step": 22
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.24488303065299988,
+      "learning_rate": 0.00018896103896103895,
+      "loss": 1.2074,
+      "step": 23
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.2168620079755783,
+      "learning_rate": 0.00018831168831168833,
+      "loss": 1.1284,
+      "step": 24
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.24021224677562714,
+      "learning_rate": 0.00018766233766233769,
+      "loss": 1.2169,
+      "step": 25
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.20057056844234467,
+      "learning_rate": 0.000187012987012987,
+      "loss": 1.1031,
+      "step": 26
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.19900795817375183,
+      "learning_rate": 0.00018636363636363636,
+      "loss": 1.1004,
+      "step": 27
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.2019268423318863,
+      "learning_rate": 0.00018571428571428572,
+      "loss": 1.1476,
+      "step": 28
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.1996479034423828,
+      "learning_rate": 0.00018506493506493507,
+      "loss": 1.1455,
+      "step": 29
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.25262022018432617,
+      "learning_rate": 0.00018441558441558442,
+      "loss": 1.1025,
+      "step": 30
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.225438192486763,
+      "learning_rate": 0.00018376623376623378,
+      "loss": 1.1954,
+      "step": 31
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.17834505438804626,
+      "learning_rate": 0.00018311688311688313,
+      "loss": 1.0934,
+      "step": 32
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.20071206986904144,
+      "learning_rate": 0.00018246753246753248,
+      "loss": 1.0488,
+      "step": 33
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.1920139640569687,
+      "learning_rate": 0.00018181818181818183,
+      "loss": 1.123,
+      "step": 34
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.18714852631092072,
+      "learning_rate": 0.0001811688311688312,
+      "loss": 1.0798,
+      "step": 35
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.18315713107585907,
+      "learning_rate": 0.00018051948051948054,
+      "loss": 1.1107,
+      "step": 36
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.19156870245933533,
+      "learning_rate": 0.00017987012987012987,
+      "loss": 1.1125,
+      "step": 37
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.21527768671512604,
+      "learning_rate": 0.00017922077922077922,
+      "loss": 1.1346,
+      "step": 38
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.1871163249015808,
+      "learning_rate": 0.0001785714285714286,
+      "loss": 1.0742,
+      "step": 39
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.17750784754753113,
+      "learning_rate": 0.00017792207792207792,
+      "loss": 1.1323,
+      "step": 40
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.177419051527977,
+      "learning_rate": 0.00017727272727272728,
+      "loss": 1.1405,
+      "step": 41
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.16714292764663696,
+      "learning_rate": 0.00017662337662337663,
+      "loss": 1.1084,
+      "step": 42
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.1610356718301773,
+      "learning_rate": 0.00017597402597402598,
+      "loss": 1.1125,
+      "step": 43
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.2548656761646271,
+      "learning_rate": 0.00017532467532467534,
+      "loss": 1.1114,
+      "step": 44
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.1731044203042984,
+      "learning_rate": 0.0001746753246753247,
+      "loss": 1.1197,
+      "step": 45
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.1739533394575119,
+      "learning_rate": 0.00017402597402597401,
+      "loss": 1.1777,
+      "step": 46
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.2178352177143097,
+      "learning_rate": 0.0001733766233766234,
+      "loss": 1.1111,
+      "step": 47
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.17247150838375092,
+      "learning_rate": 0.00017272727272727275,
+      "loss": 1.1253,
+      "step": 48
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.18075324594974518,
+      "learning_rate": 0.00017207792207792207,
+      "loss": 1.1358,
+      "step": 49
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.15898071229457855,
+      "learning_rate": 0.00017142857142857143,
+      "loss": 1.0606,
+      "step": 50
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.16518613696098328,
+      "learning_rate": 0.0001707792207792208,
+      "loss": 1.0944,
+      "step": 51
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.16035063564777374,
+      "learning_rate": 0.00017012987012987013,
+      "loss": 1.0554,
+      "step": 52
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.1686483472585678,
+      "learning_rate": 0.00016948051948051948,
+      "loss": 1.0384,
+      "step": 53
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.16575631499290466,
+      "learning_rate": 0.00016883116883116884,
+      "loss": 1.0243,
+      "step": 54
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.16840039193630219,
+      "learning_rate": 0.0001681818181818182,
+      "loss": 1.117,
+      "step": 55
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.17616064846515656,
+      "learning_rate": 0.00016753246753246754,
+      "loss": 1.0743,
+      "step": 56
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.168218195438385,
+      "learning_rate": 0.0001668831168831169,
+      "loss": 1.0627,
+      "step": 57
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.17026656866073608,
+      "learning_rate": 0.00016623376623376625,
+      "loss": 1.0059,
+      "step": 58
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.16454458236694336,
+      "learning_rate": 0.0001655844155844156,
+      "loss": 0.9943,
+      "step": 59
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.17185136675834656,
+      "learning_rate": 0.00016493506493506495,
+      "loss": 1.1545,
+      "step": 60
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.17822986841201782,
+      "learning_rate": 0.00016428571428571428,
+      "loss": 1.073,
+      "step": 61
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.1676608771085739,
+      "learning_rate": 0.00016363636363636366,
+      "loss": 1.0886,
+      "step": 62
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.1727771908044815,
+      "learning_rate": 0.000162987012987013,
+      "loss": 1.0432,
+      "step": 63
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.17827573418617249,
+      "learning_rate": 0.00016233766233766234,
+      "loss": 1.083,
+      "step": 64
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.19807517528533936,
+      "learning_rate": 0.0001616883116883117,
+      "loss": 1.1208,
+      "step": 65
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.17693684995174408,
+      "learning_rate": 0.00016103896103896104,
+      "loss": 1.089,
+      "step": 66
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.15489234030246735,
+      "learning_rate": 0.0001603896103896104,
+      "loss": 0.9707,
+      "step": 67
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.16443990170955658,
+      "learning_rate": 0.00015974025974025975,
+      "loss": 1.0643,
+      "step": 68
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.2051103413105011,
+      "learning_rate": 0.0001590909090909091,
+      "loss": 1.1246,
+      "step": 69
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.18824075162410736,
+      "learning_rate": 0.00015844155844155845,
+      "loss": 1.0855,
+      "step": 70
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.18659448623657227,
+      "learning_rate": 0.0001577922077922078,
+      "loss": 1.1412,
+      "step": 71
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.1854114979505539,
+      "learning_rate": 0.00015714285714285716,
+      "loss": 1.0249,
+      "step": 72
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.1876193732023239,
+      "learning_rate": 0.00015649350649350649,
+      "loss": 1.1029,
+      "step": 73
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.1888684630393982,
+      "learning_rate": 0.00015584415584415587,
+      "loss": 1.0789,
+      "step": 74
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.20240606367588043,
+      "learning_rate": 0.0001551948051948052,
+      "loss": 1.0495,
+      "step": 75
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.232120081782341,
+      "learning_rate": 0.00015454545454545454,
+      "loss": 1.0735,
+      "step": 76
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.16897843778133392,
+      "learning_rate": 0.0001538961038961039,
+      "loss": 1.0164,
+      "step": 77
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.18796634674072266,
+      "learning_rate": 0.00015324675324675325,
+      "loss": 1.0676,
+      "step": 78
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.19574032723903656,
+      "learning_rate": 0.0001525974025974026,
+      "loss": 1.0456,
+      "step": 79
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.18007811903953552,
+      "learning_rate": 0.00015194805194805196,
+      "loss": 1.0894,
+      "step": 80
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.18932929635047913,
+      "learning_rate": 0.0001512987012987013,
+      "loss": 1.0729,
+      "step": 81
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.20614288747310638,
+      "learning_rate": 0.00015064935064935066,
+      "loss": 1.0854,
+      "step": 82
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.19291089475154877,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 1.1217,
+      "step": 83
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.18916529417037964,
+      "learning_rate": 0.00014935064935064934,
+      "loss": 1.0963,
+      "step": 84
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.20306220650672913,
+      "learning_rate": 0.00014870129870129872,
+      "loss": 1.0898,
+      "step": 85
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.17870067059993744,
+      "learning_rate": 0.00014805194805194807,
+      "loss": 1.0213,
+      "step": 86
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.18411923944950104,
+      "learning_rate": 0.0001474025974025974,
+      "loss": 1.0844,
+      "step": 87
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.18788227438926697,
+      "learning_rate": 0.00014675324675324675,
+      "loss": 1.0338,
+      "step": 88
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.23874884843826294,
+      "learning_rate": 0.00014610389610389613,
+      "loss": 1.1118,
+      "step": 89
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.19380499422550201,
+      "learning_rate": 0.00014545454545454546,
+      "loss": 1.0464,
+      "step": 90
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.18968750536441803,
+      "learning_rate": 0.0001448051948051948,
+      "loss": 1.0569,
+      "step": 91
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.19545753300189972,
+      "learning_rate": 0.00014415584415584416,
+      "loss": 1.1225,
+      "step": 92
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.19170494377613068,
+      "learning_rate": 0.00014350649350649352,
+      "loss": 1.0602,
+      "step": 93
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.17953918874263763,
+      "learning_rate": 0.00014285714285714287,
+      "loss": 1.032,
+      "step": 94
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.1822536289691925,
+      "learning_rate": 0.00014220779220779222,
+      "loss": 1.0559,
+      "step": 95
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.18591298162937164,
+      "learning_rate": 0.00014155844155844155,
+      "loss": 1.031,
+      "step": 96
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.2129002958536148,
+      "learning_rate": 0.00014090909090909093,
+      "loss": 1.1391,
+      "step": 97
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.18386681377887726,
+      "learning_rate": 0.00014025974025974028,
+      "loss": 0.9919,
+      "step": 98
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.18314239382743835,
+      "learning_rate": 0.0001396103896103896,
+      "loss": 1.0445,
+      "step": 99
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.1999066174030304,
+      "learning_rate": 0.00013896103896103896,
+      "loss": 1.0538,
+      "step": 100
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.18741188943386078,
+      "learning_rate": 0.00013831168831168834,
+      "loss": 1.0722,
+      "step": 101
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.19351010024547577,
+      "learning_rate": 0.00013766233766233766,
+      "loss": 1.0491,
+      "step": 102
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.18859203159809113,
+      "learning_rate": 0.00013701298701298702,
+      "loss": 1.0593,
+      "step": 103
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.1962767392396927,
+      "learning_rate": 0.00013636363636363637,
+      "loss": 1.1344,
+      "step": 104
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.20819440484046936,
+      "learning_rate": 0.00013571428571428572,
+      "loss": 1.1137,
+      "step": 105
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.19590184092521667,
+      "learning_rate": 0.00013506493506493507,
+      "loss": 1.0624,
+      "step": 106
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.18631424009799957,
+      "learning_rate": 0.00013441558441558443,
+      "loss": 1.0587,
+      "step": 107
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.19572143256664276,
+      "learning_rate": 0.00013376623376623375,
+      "loss": 1.0494,
+      "step": 108
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.1910988837480545,
+      "learning_rate": 0.00013311688311688313,
+      "loss": 1.0481,
+      "step": 109
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.19455869495868683,
+      "learning_rate": 0.00013246753246753249,
+      "loss": 1.029,
+      "step": 110
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.18669827282428741,
+      "learning_rate": 0.0001318181818181818,
+      "loss": 1.0513,
+      "step": 111
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.17523664236068726,
+      "learning_rate": 0.0001311688311688312,
+      "loss": 1.0126,
+      "step": 112
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.17929129302501678,
+      "learning_rate": 0.00013051948051948052,
+      "loss": 1.0717,
+      "step": 113
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.19380168616771698,
+      "learning_rate": 0.00012987012987012987,
+      "loss": 1.0324,
+      "step": 114
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.18090228736400604,
+      "learning_rate": 0.00012922077922077922,
+      "loss": 1.0515,
+      "step": 115
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.2067340910434723,
+      "learning_rate": 0.00012857142857142858,
+      "loss": 1.0939,
+      "step": 116
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.1880485862493515,
+      "learning_rate": 0.00012792207792207793,
+      "loss": 1.0986,
+      "step": 117
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.182168647646904,
+      "learning_rate": 0.00012727272727272728,
+      "loss": 1.0109,
+      "step": 118
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.20187129080295563,
+      "learning_rate": 0.00012662337662337663,
+      "loss": 1.0668,
+      "step": 119
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.2082669734954834,
+      "learning_rate": 0.000125974025974026,
+      "loss": 1.054,
+      "step": 120
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.18294434249401093,
+      "learning_rate": 0.00012532467532467534,
+      "loss": 1.0397,
+      "step": 121
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.20515067875385284,
+      "learning_rate": 0.00012467532467532467,
+      "loss": 1.1092,
+      "step": 122
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.1758790761232376,
+      "learning_rate": 0.00012402597402597402,
+      "loss": 0.9755,
+      "step": 123
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.2170792669057846,
+      "learning_rate": 0.0001233766233766234,
+      "loss": 1.0434,
+      "step": 124
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.202157124876976,
+      "learning_rate": 0.00012272727272727272,
+      "loss": 1.1129,
+      "step": 125
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.18556398153305054,
+      "learning_rate": 0.00012207792207792208,
+      "loss": 1.0665,
+      "step": 126
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.20196087658405304,
+      "learning_rate": 0.00012142857142857143,
+      "loss": 1.1,
+      "step": 127
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.1921566128730774,
+      "learning_rate": 0.0001207792207792208,
+      "loss": 1.0918,
+      "step": 128
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.18866224586963654,
+      "learning_rate": 0.00012012987012987014,
+      "loss": 1.0014,
+      "step": 129
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.207601398229599,
+      "learning_rate": 0.00011948051948051949,
+      "loss": 1.0726,
+      "step": 130
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.21592366695404053,
+      "learning_rate": 0.00011883116883116883,
+      "loss": 1.1379,
+      "step": 131
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.2016124576330185,
+      "learning_rate": 0.0001181818181818182,
+      "loss": 1.1428,
+      "step": 132
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.20478437840938568,
+      "learning_rate": 0.00011753246753246753,
+      "loss": 1.121,
+      "step": 133
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.22730594873428345,
+      "learning_rate": 0.00011688311688311689,
+      "loss": 1.0319,
+      "step": 134
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.22592711448669434,
+      "learning_rate": 0.00011623376623376625,
+      "loss": 1.1264,
+      "step": 135
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.20035041868686676,
+      "learning_rate": 0.00011558441558441559,
+      "loss": 1.0686,
+      "step": 136
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.20648567378520966,
+      "learning_rate": 0.00011493506493506494,
+      "loss": 1.0817,
+      "step": 137
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.21222743391990662,
+      "learning_rate": 0.00011428571428571428,
+      "loss": 1.0678,
+      "step": 138
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.2075391560792923,
+      "learning_rate": 0.00011363636363636365,
+      "loss": 1.0897,
+      "step": 139
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.1964101791381836,
+      "learning_rate": 0.000112987012987013,
+      "loss": 1.0906,
+      "step": 140
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.22406511008739471,
+      "learning_rate": 0.00011233766233766234,
+      "loss": 1.0594,
+      "step": 141
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.23787978291511536,
+      "learning_rate": 0.00011168831168831168,
+      "loss": 1.1053,
+      "step": 142
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.21196185052394867,
+      "learning_rate": 0.00011103896103896105,
+      "loss": 1.0923,
+      "step": 143
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.21042804419994354,
+      "learning_rate": 0.0001103896103896104,
+      "loss": 1.0381,
+      "step": 144
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.2267436534166336,
+      "learning_rate": 0.00010974025974025974,
+      "loss": 1.0818,
+      "step": 145
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.23742735385894775,
+      "learning_rate": 0.00010909090909090909,
+      "loss": 1.0872,
+      "step": 146
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.17787213623523712,
+      "learning_rate": 0.00010844155844155846,
+      "loss": 1.03,
+      "step": 147
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.22422832250595093,
+      "learning_rate": 0.0001077922077922078,
+      "loss": 1.0738,
+      "step": 148
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.22946301102638245,
+      "learning_rate": 0.00010714285714285715,
+      "loss": 1.0274,
+      "step": 149
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.2137996405363083,
+      "learning_rate": 0.00010649350649350649,
+      "loss": 1.0539,
+      "step": 150
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.1748756766319275,
+      "learning_rate": 0.00010584415584415586,
+      "loss": 1.0355,
+      "step": 151
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.22275175154209137,
+      "learning_rate": 0.0001051948051948052,
+      "loss": 1.1696,
+      "step": 152
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.20996077358722687,
+      "learning_rate": 0.00010454545454545455,
+      "loss": 1.0303,
+      "step": 153
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.1945938766002655,
+      "learning_rate": 0.00010389610389610389,
+      "loss": 0.9747,
+      "step": 154
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.1970377266407013,
+      "learning_rate": 0.00010324675324675325,
+      "loss": 1.0358,
+      "step": 155
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.18814732134342194,
+      "learning_rate": 0.00010259740259740261,
+      "loss": 0.9612,
+      "step": 156
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.2153233289718628,
+      "learning_rate": 0.00010194805194805195,
+      "loss": 1.0749,
+      "step": 157
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.21788008511066437,
+      "learning_rate": 0.0001012987012987013,
+      "loss": 1.0883,
+      "step": 158
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.214650496840477,
+      "learning_rate": 0.00010064935064935067,
+      "loss": 1.0539,
+      "step": 159
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.19312834739685059,
+      "learning_rate": 0.0001,
+      "loss": 1.0657,
+      "step": 160
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.19916598498821259,
+      "learning_rate": 9.935064935064936e-05,
+      "loss": 1.0478,
+      "step": 161
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.2057606726884842,
+      "learning_rate": 9.870129870129871e-05,
+      "loss": 1.0094,
+      "step": 162
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.22159607708454132,
+      "learning_rate": 9.805194805194806e-05,
+      "loss": 1.0952,
+      "step": 163
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.18274275958538055,
+      "learning_rate": 9.74025974025974e-05,
+      "loss": 1.0065,
+      "step": 164
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.19835162162780762,
+      "learning_rate": 9.675324675324677e-05,
+      "loss": 1.0742,
+      "step": 165
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.2114904820919037,
+      "learning_rate": 9.610389610389611e-05,
+      "loss": 1.1109,
+      "step": 166
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.21488523483276367,
+      "learning_rate": 9.545454545454546e-05,
+      "loss": 1.0465,
+      "step": 167
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.19870303571224213,
+      "learning_rate": 9.480519480519481e-05,
+      "loss": 1.0318,
+      "step": 168
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.20413029193878174,
+      "learning_rate": 9.415584415584417e-05,
+      "loss": 1.0817,
+      "step": 169
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.1847231239080429,
+      "learning_rate": 9.35064935064935e-05,
+      "loss": 1.0144,
+      "step": 170
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.2715964913368225,
+      "learning_rate": 9.285714285714286e-05,
+      "loss": 0.9832,
+      "step": 171
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.2225002497434616,
+      "learning_rate": 9.220779220779221e-05,
+      "loss": 1.1051,
+      "step": 172
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.22931510210037231,
+      "learning_rate": 9.155844155844156e-05,
+      "loss": 1.1042,
+      "step": 173
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.21848627924919128,
+      "learning_rate": 9.090909090909092e-05,
+      "loss": 1.1151,
+      "step": 174
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.19852259755134583,
+      "learning_rate": 9.025974025974027e-05,
+      "loss": 1.0889,
+      "step": 175
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.2080363780260086,
+      "learning_rate": 8.961038961038961e-05,
+      "loss": 1.0777,
+      "step": 176
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.22391024231910706,
+      "learning_rate": 8.896103896103896e-05,
+      "loss": 1.1092,
+      "step": 177
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.21793846786022186,
+      "learning_rate": 8.831168831168831e-05,
+      "loss": 1.044,
+      "step": 178
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.2009749859571457,
+      "learning_rate": 8.766233766233767e-05,
+      "loss": 1.0198,
+      "step": 179
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.19432318210601807,
+      "learning_rate": 8.701298701298701e-05,
+      "loss": 1.075,
+      "step": 180
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.18634547293186188,
+      "learning_rate": 8.636363636363637e-05,
+      "loss": 0.9964,
+      "step": 181
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.1947103589773178,
+      "learning_rate": 8.571428571428571e-05,
+      "loss": 1.0025,
+      "step": 182
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.23098671436309814,
+      "learning_rate": 8.506493506493507e-05,
+      "loss": 1.0562,
+      "step": 183
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.19686414301395416,
+      "learning_rate": 8.441558441558442e-05,
+      "loss": 1.0285,
+      "step": 184
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.19852428138256073,
+      "learning_rate": 8.376623376623377e-05,
+      "loss": 1.0054,
+      "step": 185
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.21483510732650757,
+      "learning_rate": 8.311688311688312e-05,
+      "loss": 1.108,
+      "step": 186
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.23313644528388977,
+      "learning_rate": 8.246753246753248e-05,
+      "loss": 1.1383,
+      "step": 187
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.21453145146369934,
+      "learning_rate": 8.181818181818183e-05,
+      "loss": 1.0911,
+      "step": 188
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.20268195867538452,
+      "learning_rate": 8.116883116883117e-05,
+      "loss": 1.0145,
+      "step": 189
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.20576398074626923,
+      "learning_rate": 8.051948051948052e-05,
+      "loss": 1.0829,
+      "step": 190
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.21732626855373383,
+      "learning_rate": 7.987012987012987e-05,
+      "loss": 1.0152,
+      "step": 191
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.22046895325183868,
+      "learning_rate": 7.922077922077923e-05,
+      "loss": 1.1311,
+      "step": 192
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.19727715849876404,
+      "learning_rate": 7.857142857142858e-05,
+      "loss": 1.0364,
+      "step": 193
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.20861488580703735,
+      "learning_rate": 7.792207792207793e-05,
+      "loss": 1.0435,
+      "step": 194
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.18545083701610565,
+      "learning_rate": 7.727272727272727e-05,
+      "loss": 1.0299,
+      "step": 195
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.19965052604675293,
+      "learning_rate": 7.662337662337662e-05,
+      "loss": 1.0511,
+      "step": 196
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.23673909902572632,
+      "learning_rate": 7.597402597402598e-05,
+      "loss": 1.081,
+      "step": 197
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.17583179473876953,
+      "learning_rate": 7.532467532467533e-05,
+      "loss": 0.9808,
+      "step": 198
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.2129366099834442,
+      "learning_rate": 7.467532467532467e-05,
+      "loss": 1.0522,
+      "step": 199
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.21679140627384186,
+      "learning_rate": 7.402597402597404e-05,
+      "loss": 1.0567,
+      "step": 200
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.2032000720500946,
+      "learning_rate": 7.337662337662338e-05,
+      "loss": 1.0466,
+      "step": 201
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.1887970268726349,
+      "learning_rate": 7.272727272727273e-05,
+      "loss": 1.0329,
+      "step": 202
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.21060192584991455,
+      "learning_rate": 7.207792207792208e-05,
+      "loss": 1.1021,
+      "step": 203
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.21191425621509552,
+      "learning_rate": 7.142857142857143e-05,
+      "loss": 0.99,
+      "step": 204
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.1995989829301834,
+      "learning_rate": 7.077922077922077e-05,
+      "loss": 1.0526,
+      "step": 205
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.1849513053894043,
+      "learning_rate": 7.012987012987014e-05,
+      "loss": 0.9998,
+      "step": 206
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.1948779672384262,
+      "learning_rate": 6.948051948051948e-05,
+      "loss": 1.075,
+      "step": 207
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.20374052226543427,
+      "learning_rate": 6.883116883116883e-05,
+      "loss": 1.0933,
+      "step": 208
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.2102465033531189,
+      "learning_rate": 6.818181818181818e-05,
+      "loss": 1.1123,
+      "step": 209
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.21376173198223114,
+      "learning_rate": 6.753246753246754e-05,
+      "loss": 1.1233,
+      "step": 210
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.20934203267097473,
+      "learning_rate": 6.688311688311688e-05,
+      "loss": 1.1374,
+      "step": 211
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.18604128062725067,
+      "learning_rate": 6.623376623376624e-05,
+      "loss": 1.0213,
+      "step": 212
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.19644233584403992,
+      "learning_rate": 6.55844155844156e-05,
+      "loss": 1.0046,
+      "step": 213
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.18479463458061218,
+      "learning_rate": 6.493506493506494e-05,
+      "loss": 0.9792,
+      "step": 214
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.1945149153470993,
+      "learning_rate": 6.428571428571429e-05,
+      "loss": 1.0584,
+      "step": 215
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.2070147544145584,
+      "learning_rate": 6.363636363636364e-05,
+      "loss": 1.071,
+      "step": 216
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.19645985960960388,
+      "learning_rate": 6.2987012987013e-05,
+      "loss": 1.0721,
+      "step": 217
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.1960117667913437,
+      "learning_rate": 6.233766233766233e-05,
+      "loss": 1.071,
+      "step": 218
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.20168261229991913,
+      "learning_rate": 6.16883116883117e-05,
+      "loss": 1.0808,
+      "step": 219
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.21254412829875946,
+      "learning_rate": 6.103896103896104e-05,
+      "loss": 1.0287,
+      "step": 220
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.21271063387393951,
+      "learning_rate": 6.03896103896104e-05,
+      "loss": 1.0605,
+      "step": 221
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.2081408053636551,
+      "learning_rate": 5.9740259740259744e-05,
+      "loss": 1.091,
+      "step": 222
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.21113798022270203,
+      "learning_rate": 5.90909090909091e-05,
+      "loss": 1.1323,
+      "step": 223
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.20670844614505768,
+      "learning_rate": 5.844155844155844e-05,
+      "loss": 1.0955,
+      "step": 224
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.2010120451450348,
+      "learning_rate": 5.7792207792207796e-05,
+      "loss": 1.1068,
+      "step": 225
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.20379121601581573,
+      "learning_rate": 5.714285714285714e-05,
+      "loss": 1.0419,
+      "step": 226
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.22799807786941528,
+      "learning_rate": 5.64935064935065e-05,
+      "loss": 1.0904,
+      "step": 227
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.2005995213985443,
+      "learning_rate": 5.584415584415584e-05,
+      "loss": 1.078,
+      "step": 228
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.20329605042934418,
+      "learning_rate": 5.51948051948052e-05,
+      "loss": 1.0245,
+      "step": 229
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.19283504784107208,
+      "learning_rate": 5.4545454545454546e-05,
+      "loss": 1.0367,
+      "step": 230
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.20624355971813202,
+      "learning_rate": 5.38961038961039e-05,
+      "loss": 1.1046,
+      "step": 231
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.21362991631031036,
+      "learning_rate": 5.3246753246753245e-05,
+      "loss": 1.1104,
+      "step": 232
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.20447863638401031,
+      "learning_rate": 5.25974025974026e-05,
+      "loss": 1.0514,
+      "step": 233
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.1974381059408188,
+      "learning_rate": 5.1948051948051944e-05,
+      "loss": 1.0048,
+      "step": 234
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.21237170696258545,
+      "learning_rate": 5.1298701298701304e-05,
+      "loss": 1.1299,
+      "step": 235
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.21224971115589142,
+      "learning_rate": 5.064935064935065e-05,
+      "loss": 1.05,
+      "step": 236
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.19865018129348755,
+      "learning_rate": 5e-05,
+      "loss": 1.0665,
+      "step": 237
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.19199275970458984,
+      "learning_rate": 4.9350649350649355e-05,
+      "loss": 0.9531,
+      "step": 238
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.19573214650154114,
+      "learning_rate": 4.87012987012987e-05,
+      "loss": 1.0318,
+      "step": 239
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.21338805556297302,
+      "learning_rate": 4.8051948051948054e-05,
+      "loss": 1.0343,
+      "step": 240
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.2254691869020462,
+      "learning_rate": 4.740259740259741e-05,
+      "loss": 1.0472,
+      "step": 241
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.18101665377616882,
+      "learning_rate": 4.675324675324675e-05,
+      "loss": 1.017,
+      "step": 242
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.22090592980384827,
+      "learning_rate": 4.6103896103896106e-05,
+      "loss": 1.0389,
+      "step": 243
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.20865507423877716,
+      "learning_rate": 4.545454545454546e-05,
+      "loss": 1.0369,
+      "step": 244
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.21619610488414764,
+      "learning_rate": 4.4805194805194805e-05,
+      "loss": 1.109,
+      "step": 245
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.21694771945476532,
+      "learning_rate": 4.415584415584416e-05,
+      "loss": 1.0525,
+      "step": 246
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.2182662934064865,
+      "learning_rate": 4.3506493506493503e-05,
+      "loss": 1.0331,
+      "step": 247
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.2026486098766327,
+      "learning_rate": 4.2857142857142856e-05,
+      "loss": 1.027,
+      "step": 248
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.19606547057628632,
+      "learning_rate": 4.220779220779221e-05,
+      "loss": 1.0242,
+      "step": 249
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.22107470035552979,
+      "learning_rate": 4.155844155844156e-05,
+      "loss": 1.0924,
+      "step": 250
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.19960008561611176,
+      "learning_rate": 4.0909090909090915e-05,
+      "loss": 1.0384,
+      "step": 251
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.1945488154888153,
+      "learning_rate": 4.025974025974026e-05,
+      "loss": 1.0673,
+      "step": 252
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.22067414224147797,
+      "learning_rate": 3.9610389610389614e-05,
+      "loss": 1.0426,
+      "step": 253
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.19010980427265167,
+      "learning_rate": 3.8961038961038966e-05,
+      "loss": 1.0617,
+      "step": 254
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.18781176209449768,
+      "learning_rate": 3.831168831168831e-05,
+      "loss": 1.0243,
+      "step": 255
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.20388829708099365,
+      "learning_rate": 3.7662337662337665e-05,
+      "loss": 1.0476,
+      "step": 256
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.19911155104637146,
+      "learning_rate": 3.701298701298702e-05,
+      "loss": 1.0324,
+      "step": 257
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.19884039461612701,
+      "learning_rate": 3.6363636363636364e-05,
+      "loss": 1.0242,
+      "step": 258
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.19036105275154114,
+      "learning_rate": 3.571428571428572e-05,
+      "loss": 1.0323,
+      "step": 259
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.20039844512939453,
+      "learning_rate": 3.506493506493507e-05,
+      "loss": 1.0749,
+      "step": 260
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.1899934560060501,
+      "learning_rate": 3.4415584415584416e-05,
+      "loss": 1.0115,
+      "step": 261
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.20019090175628662,
+      "learning_rate": 3.376623376623377e-05,
+      "loss": 1.0782,
+      "step": 262
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.2020583152770996,
+      "learning_rate": 3.311688311688312e-05,
+      "loss": 1.0687,
+      "step": 263
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.21407337486743927,
+      "learning_rate": 3.246753246753247e-05,
+      "loss": 1.1015,
+      "step": 264
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.1871640682220459,
+      "learning_rate": 3.181818181818182e-05,
+      "loss": 0.9637,
+      "step": 265
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.21622811257839203,
+      "learning_rate": 3.1168831168831166e-05,
+      "loss": 1.1222,
+      "step": 266
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.22504661977291107,
+      "learning_rate": 3.051948051948052e-05,
+      "loss": 1.132,
+      "step": 267
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.19177629053592682,
+      "learning_rate": 2.9870129870129872e-05,
+      "loss": 1.0281,
+      "step": 268
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.1970544159412384,
+      "learning_rate": 2.922077922077922e-05,
+      "loss": 1.0393,
+      "step": 269
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.21554522216320038,
+      "learning_rate": 2.857142857142857e-05,
+      "loss": 1.074,
+      "step": 270
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.21131229400634766,
+      "learning_rate": 2.792207792207792e-05,
+      "loss": 1.054,
+      "step": 271
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.19816523790359497,
+      "learning_rate": 2.7272727272727273e-05,
+      "loss": 1.0456,
+      "step": 272
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.21075209975242615,
+      "learning_rate": 2.6623376623376623e-05,
+      "loss": 1.0758,
+      "step": 273
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.2296527624130249,
+      "learning_rate": 2.5974025974025972e-05,
+      "loss": 1.0917,
+      "step": 274
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.19722610712051392,
+      "learning_rate": 2.5324675324675325e-05,
+      "loss": 1.0704,
+      "step": 275
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.18721099197864532,
+      "learning_rate": 2.4675324675324678e-05,
+      "loss": 0.9919,
+      "step": 276
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.20244193077087402,
+      "learning_rate": 2.4025974025974027e-05,
+      "loss": 1.0368,
+      "step": 277
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.19518914818763733,
+      "learning_rate": 2.3376623376623376e-05,
+      "loss": 1.0436,
+      "step": 278
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.19650357961654663,
+      "learning_rate": 2.272727272727273e-05,
+      "loss": 1.0306,
+      "step": 279
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.20320096611976624,
+      "learning_rate": 2.207792207792208e-05,
+      "loss": 1.0941,
+      "step": 280
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.18296951055526733,
+      "learning_rate": 2.1428571428571428e-05,
+      "loss": 0.9802,
+      "step": 281
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.21357610821723938,
+      "learning_rate": 2.077922077922078e-05,
+      "loss": 1.0449,
+      "step": 282
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.193921759724617,
+      "learning_rate": 2.012987012987013e-05,
+      "loss": 1.0116,
+      "step": 283
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.1953902244567871,
+      "learning_rate": 1.9480519480519483e-05,
+      "loss": 1.0105,
+      "step": 284
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.19440975785255432,
+      "learning_rate": 1.8831168831168833e-05,
+      "loss": 0.9952,
+      "step": 285
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.21054105460643768,
+      "learning_rate": 1.8181818181818182e-05,
+      "loss": 1.0701,
+      "step": 286
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.18844804167747498,
+      "learning_rate": 1.7532467532467535e-05,
+      "loss": 1.0146,
+      "step": 287
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.2067311704158783,
+      "learning_rate": 1.6883116883116884e-05,
+      "loss": 1.0781,
+      "step": 288
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.1941213756799698,
+      "learning_rate": 1.6233766233766234e-05,
+      "loss": 0.9814,
+      "step": 289
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.22726193070411682,
+      "learning_rate": 1.5584415584415583e-05,
+      "loss": 1.1431,
+      "step": 290
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.18025581538677216,
+      "learning_rate": 1.4935064935064936e-05,
+      "loss": 0.9649,
+      "step": 291
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.21535000205039978,
+      "learning_rate": 1.4285714285714285e-05,
+      "loss": 1.0441,
+      "step": 292
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.20014546811580658,
+      "learning_rate": 1.3636363636363637e-05,
+      "loss": 1.0166,
+      "step": 293
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.22738787531852722,
+      "learning_rate": 1.2987012987012986e-05,
+      "loss": 1.0564,
+      "step": 294
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.2020861804485321,
+      "learning_rate": 1.2337662337662339e-05,
+      "loss": 1.1241,
+      "step": 295
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.19888809323310852,
+      "learning_rate": 1.1688311688311688e-05,
+      "loss": 1.1114,
+      "step": 296
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.20912377536296844,
+      "learning_rate": 1.103896103896104e-05,
+      "loss": 1.0971,
+      "step": 297
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.21206621825695038,
+      "learning_rate": 1.038961038961039e-05,
+      "loss": 1.0601,
+      "step": 298
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.18667680025100708,
+      "learning_rate": 9.740259740259742e-06,
+      "loss": 1.0291,
+      "step": 299
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.21125559508800507,
+      "learning_rate": 9.090909090909091e-06,
+      "loss": 1.0483,
+      "step": 300
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.21776145696640015,
+      "learning_rate": 8.441558441558442e-06,
+      "loss": 0.9912,
+      "step": 301
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.20144303143024445,
+      "learning_rate": 7.792207792207792e-06,
+      "loss": 1.0357,
+      "step": 302
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.1984029859304428,
+      "learning_rate": 7.142857142857143e-06,
+      "loss": 1.0648,
+      "step": 303
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.17972829937934875,
+      "learning_rate": 6.493506493506493e-06,
+      "loss": 1.0033,
+      "step": 304
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.1818286031484604,
+      "learning_rate": 5.844155844155844e-06,
+      "loss": 0.997,
+      "step": 305
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.19670912623405457,
+      "learning_rate": 5.194805194805195e-06,
+      "loss": 1.0256,
+      "step": 306
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.20527283847332,
+      "learning_rate": 4.5454545454545455e-06,
+      "loss": 1.0348,
+      "step": 307
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.19025909900665283,
+      "learning_rate": 3.896103896103896e-06,
+      "loss": 1.0682,
+      "step": 308
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.19544818997383118,
+      "learning_rate": 3.2467532467532465e-06,
+      "loss": 0.9872,
+      "step": 309
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.22112183272838593,
+      "learning_rate": 2.5974025974025976e-06,
+      "loss": 1.0661,
+      "step": 310
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.23328153789043427,
+      "learning_rate": 1.948051948051948e-06,
+      "loss": 1.0691,
+      "step": 311
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.20181375741958618,
+      "learning_rate": 1.2987012987012988e-06,
+      "loss": 0.9416,
+      "step": 312
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.29312625527381897,
+      "learning_rate": 6.493506493506494e-07,
+      "loss": 1.1216,
+      "step": 313
+    },
+    {
+      "epoch": 0.12202467696492762,
+      "grad_norm": 0.2231415957212448,
+      "learning_rate": 0.0,
+      "loss": 1.0468,
+      "step": 314
+    },
+    {
+      "epoch": 0.12241329058583503,
+      "grad_norm": 0.22263288497924805,
+      "learning_rate": 0.00017594394706111328,
+      "loss": 1.0399,
+      "step": 315
+    },
+    {
+      "epoch": 0.12280190420674245,
+      "grad_norm": 0.22909891605377197,
+      "learning_rate": 0.00017586609575710393,
+      "loss": 1.1069,
+      "step": 316
+    },
+    {
+      "epoch": 0.12319051782764986,
+      "grad_norm": 0.23951445519924164,
+      "learning_rate": 0.0001757882444530946,
+      "loss": 1.1036,
+      "step": 317
+    },
+    {
+      "epoch": 0.12357913144855727,
+      "grad_norm": 0.2409268021583557,
+      "learning_rate": 0.00017571039314908526,
+      "loss": 1.1114,
+      "step": 318
+    },
+    {
+      "epoch": 0.12396774506946469,
+      "grad_norm": 0.23753899335861206,
+      "learning_rate": 0.00017563254184507592,
+      "loss": 1.1297,
+      "step": 319
+    },
+    {
+      "epoch": 0.12435635869037209,
+      "grad_norm": 0.2823902666568756,
+      "learning_rate": 0.00017555469054106657,
+      "loss": 1.1293,
+      "step": 320
+    },
+    {
+      "epoch": 0.12474497231127951,
+      "grad_norm": 0.24093545973300934,
+      "learning_rate": 0.00017547683923705722,
+      "loss": 1.0678,
+      "step": 321
+    },
+    {
+      "epoch": 0.12513358593218693,
+      "grad_norm": 0.22565563023090363,
+      "learning_rate": 0.0001753989879330479,
+      "loss": 1.1408,
+      "step": 322
+    },
+    {
+      "epoch": 0.12552219955309435,
+      "grad_norm": 0.22569572925567627,
+      "learning_rate": 0.00017532113662903855,
+      "loss": 1.0543,
+      "step": 323
+    },
+    {
+      "epoch": 0.12591081317400174,
+      "grad_norm": 0.24962866306304932,
+      "learning_rate": 0.0001752432853250292,
+      "loss": 1.0818,
+      "step": 324
+    },
+    {
+      "epoch": 0.12629942679490916,
+      "grad_norm": 0.22184576094150543,
+      "learning_rate": 0.00017516543402101986,
+      "loss": 1.0835,
+      "step": 325
+    },
+    {
+      "epoch": 0.12668804041581658,
+      "grad_norm": 0.2572194039821625,
+      "learning_rate": 0.0001750875827170105,
+      "loss": 1.0767,
+      "step": 326
+    },
+    {
+      "epoch": 0.127076654036724,
+      "grad_norm": 0.24131342768669128,
+      "learning_rate": 0.00017500973141300116,
+      "loss": 1.0981,
+      "step": 327
+    },
+    {
+      "epoch": 0.1274652676576314,
+      "grad_norm": 0.2386389970779419,
+      "learning_rate": 0.00017493188010899184,
+      "loss": 1.0828,
+      "step": 328
+    },
+    {
+      "epoch": 0.1278538812785388,
+      "grad_norm": 0.2654125690460205,
+      "learning_rate": 0.0001748540288049825,
+      "loss": 1.1266,
+      "step": 329
+    },
+    {
+      "epoch": 0.12824249489944622,
+      "grad_norm": 0.2925739884376526,
+      "learning_rate": 0.00017477617750097314,
+      "loss": 1.0983,
+      "step": 330
+    },
+    {
+      "epoch": 0.12863110852035364,
+      "grad_norm": 0.26589342951774597,
+      "learning_rate": 0.0001746983261969638,
+      "loss": 1.1029,
+      "step": 331
+    },
+    {
+      "epoch": 0.12901972214126106,
+      "grad_norm": 0.24565957486629486,
+      "learning_rate": 0.00017462047489295445,
+      "loss": 1.0975,
+      "step": 332
+    },
+    {
+      "epoch": 0.12940833576216845,
+      "grad_norm": 0.2459682673215866,
+      "learning_rate": 0.00017454262358894513,
+      "loss": 1.0566,
+      "step": 333
+    },
+    {
+      "epoch": 0.12979694938307587,
+      "grad_norm": 0.23349183797836304,
+      "learning_rate": 0.00017446477228493578,
+      "loss": 1.0833,
+      "step": 334
+    },
+    {
+      "epoch": 0.1301855630039833,
+      "grad_norm": 0.26166337728500366,
+      "learning_rate": 0.00017438692098092643,
+      "loss": 1.1598,
+      "step": 335
+    },
+    {
+      "epoch": 0.1305741766248907,
+      "grad_norm": 0.24188168346881866,
+      "learning_rate": 0.00017430906967691708,
+      "loss": 1.0728,
+      "step": 336
+    },
+    {
+      "epoch": 0.13096279024579813,
+      "grad_norm": 0.22922398149967194,
+      "learning_rate": 0.00017423121837290773,
+      "loss": 1.0311,
+      "step": 337
+    },
+    {
+      "epoch": 0.13135140386670552,
+      "grad_norm": 0.2652754485607147,
+      "learning_rate": 0.00017415336706889841,
+      "loss": 1.1096,
+      "step": 338
+    },
+    {
+      "epoch": 0.13174001748761294,
+      "grad_norm": 0.2355881780385971,
+      "learning_rate": 0.00017407551576488907,
+      "loss": 1.0964,
+      "step": 339
+    },
+    {
+      "epoch": 0.13212863110852036,
+      "grad_norm": 0.244523823261261,
+      "learning_rate": 0.00017399766446087972,
+      "loss": 1.142,
+      "step": 340
+    },
+    {
+      "epoch": 0.13251724472942777,
+      "grad_norm": 0.24705976247787476,
+      "learning_rate": 0.00017391981315687037,
+      "loss": 1.0943,
+      "step": 341
+    },
+    {
+      "epoch": 0.13290585835033517,
+      "grad_norm": 0.22817552089691162,
+      "learning_rate": 0.00017384196185286102,
+      "loss": 1.0621,
+      "step": 342
+    },
+    {
+      "epoch": 0.13329447197124258,
+      "grad_norm": 0.22605225443840027,
+      "learning_rate": 0.0001737641105488517,
+      "loss": 1.0714,
+      "step": 343
+    },
+    {
+      "epoch": 0.13368308559215,
+      "grad_norm": 0.2584545314311981,
+      "learning_rate": 0.00017368625924484235,
+      "loss": 1.1367,
+      "step": 344
+    },
+    {
+      "epoch": 0.13407169921305742,
+      "grad_norm": 0.2248220443725586,
+      "learning_rate": 0.000173608407940833,
+      "loss": 1.0872,
+      "step": 345
+    },
+    {
+      "epoch": 0.13446031283396484,
+      "grad_norm": 0.2141868770122528,
+      "learning_rate": 0.00017353055663682368,
+      "loss": 1.0572,
+      "step": 346
+    },
+    {
+      "epoch": 0.13484892645487223,
+      "grad_norm": 0.2615523934364319,
+      "learning_rate": 0.00017345270533281434,
+      "loss": 1.1048,
+      "step": 347
+    },
+    {
+      "epoch": 0.13523754007577965,
+      "grad_norm": 0.22990448772907257,
+      "learning_rate": 0.000173374854028805,
+      "loss": 1.0528,
+      "step": 348
+    },
+    {
+      "epoch": 0.13562615369668707,
+      "grad_norm": 0.2132262885570526,
+      "learning_rate": 0.00017329700272479564,
+      "loss": 1.0476,
+      "step": 349
+    },
+    {
+      "epoch": 0.1360147673175945,
+      "grad_norm": 0.2578272819519043,
+      "learning_rate": 0.00017321915142078632,
+      "loss": 1.0852,
+      "step": 350
+    },
+    {
+      "epoch": 0.1364033809385019,
+      "grad_norm": 0.22881457209587097,
+      "learning_rate": 0.00017314130011677697,
+      "loss": 1.1017,
+      "step": 351
+    },
+    {
+      "epoch": 0.1367919945594093,
+      "grad_norm": 0.21067696809768677,
+      "learning_rate": 0.00017306344881276762,
+      "loss": 1.0444,
+      "step": 352
+    },
+    {
+      "epoch": 0.13718060818031672,
+      "grad_norm": 0.2304215282201767,
+      "learning_rate": 0.0001729855975087583,
+      "loss": 1.0737,
+      "step": 353
+    },
+    {
+      "epoch": 0.13756922180122413,
+      "grad_norm": 0.2031925916671753,
+      "learning_rate": 0.00017290774620474895,
+      "loss": 1.0036,
+      "step": 354
+    },
+    {
+      "epoch": 0.13795783542213155,
+      "grad_norm": 0.27281051874160767,
+      "learning_rate": 0.0001728298949007396,
+      "loss": 1.148,
+      "step": 355
+    },
+    {
+      "epoch": 0.13834644904303897,
+      "grad_norm": 0.204191654920578,
+      "learning_rate": 0.00017275204359673026,
+      "loss": 0.9607,
+      "step": 356
+    },
+    {
+      "epoch": 0.13873506266394636,
+      "grad_norm": 0.221976637840271,
+      "learning_rate": 0.0001726741922927209,
+      "loss": 1.1068,
+      "step": 357
+    },
+    {
+      "epoch": 0.13912367628485378,
+      "grad_norm": 0.20831729471683502,
+      "learning_rate": 0.0001725963409887116,
+      "loss": 1.034,
+      "step": 358
+    },
+    {
+      "epoch": 0.1395122899057612,
+      "grad_norm": 0.21639779210090637,
+      "learning_rate": 0.00017251848968470224,
+      "loss": 1.0613,
+      "step": 359
+    },
+    {
+      "epoch": 0.13990090352666862,
+      "grad_norm": 0.1959424465894699,
+      "learning_rate": 0.0001724406383806929,
+      "loss": 1.0506,
+      "step": 360
+    },
+    {
+      "epoch": 0.140289517147576,
+      "grad_norm": 0.2044398933649063,
+      "learning_rate": 0.00017236278707668355,
+      "loss": 1.0316,
+      "step": 361
+    },
+    {
+      "epoch": 0.14067813076848343,
+      "grad_norm": 0.21483004093170166,
+      "learning_rate": 0.0001722849357726742,
+      "loss": 1.0361,
+      "step": 362
+    },
+    {
+      "epoch": 0.14106674438939085,
+      "grad_norm": 0.237701416015625,
+      "learning_rate": 0.00017220708446866485,
+      "loss": 1.1264,
+      "step": 363
+    },
+    {
+      "epoch": 0.14145535801029827,
+      "grad_norm": 0.20750795304775238,
+      "learning_rate": 0.00017212923316465553,
+      "loss": 1.0523,
+      "step": 364
+    },
+    {
+      "epoch": 0.14184397163120568,
+      "grad_norm": 0.2252965271472931,
+      "learning_rate": 0.00017205138186064618,
+      "loss": 1.0764,
+      "step": 365
+    },
+    {
+      "epoch": 0.14223258525211308,
+      "grad_norm": 0.2033565789461136,
+      "learning_rate": 0.00017197353055663683,
+      "loss": 1.064,
+      "step": 366
+    },
+    {
+      "epoch": 0.1426211988730205,
+      "grad_norm": 0.21123190224170685,
+      "learning_rate": 0.00017189567925262749,
+      "loss": 1.0515,
+      "step": 367
+    },
+    {
+      "epoch": 0.1430098124939279,
+      "grad_norm": 0.20646221935749054,
+      "learning_rate": 0.00017181782794861814,
+      "loss": 1.0617,
+      "step": 368
+    },
+    {
+      "epoch": 0.14339842611483533,
+      "grad_norm": 0.2079589068889618,
+      "learning_rate": 0.00017173997664460882,
+      "loss": 1.0569,
+      "step": 369
+    },
+    {
+      "epoch": 0.14378703973574275,
+      "grad_norm": 0.216246098279953,
+      "learning_rate": 0.00017166212534059947,
+      "loss": 1.0986,
+      "step": 370
+    },
+    {
+      "epoch": 0.14417565335665014,
+      "grad_norm": 0.20711806416511536,
+      "learning_rate": 0.00017158427403659012,
+      "loss": 1.1342,
+      "step": 371
+    },
+    {
+      "epoch": 0.14456426697755756,
+      "grad_norm": 0.235435351729393,
+      "learning_rate": 0.00017150642273258077,
+      "loss": 1.1082,
+      "step": 372
+    },
+    {
+      "epoch": 0.14495288059846498,
+      "grad_norm": 0.2273191511631012,
+      "learning_rate": 0.00017142857142857143,
+      "loss": 1.1064,
+      "step": 373
+    },
+    {
+      "epoch": 0.1453414942193724,
+      "grad_norm": 0.2075672745704651,
+      "learning_rate": 0.0001713507201245621,
+      "loss": 1.0536,
+      "step": 374
+    },
+    {
+      "epoch": 0.14573010784027982,
+      "grad_norm": 0.20764274895191193,
+      "learning_rate": 0.00017127286882055276,
+      "loss": 1.0673,
+      "step": 375
+    },
+    {
+      "epoch": 0.1461187214611872,
+      "grad_norm": 0.2441243678331375,
+      "learning_rate": 0.0001711950175165434,
+      "loss": 1.1271,
+      "step": 376
+    },
+    {
+      "epoch": 0.14650733508209463,
+      "grad_norm": 0.2383374124765396,
+      "learning_rate": 0.00017111716621253406,
+      "loss": 1.083,
+      "step": 377
+    },
+    {
+      "epoch": 0.14689594870300204,
+      "grad_norm": 0.2172410786151886,
+      "learning_rate": 0.0001710393149085247,
+      "loss": 1.0605,
+      "step": 378
+    },
+    {
+      "epoch": 0.14728456232390946,
+      "grad_norm": 0.22591541707515717,
+      "learning_rate": 0.0001709614636045154,
+      "loss": 1.0931,
+      "step": 379
+    },
+    {
+      "epoch": 0.14767317594481685,
+      "grad_norm": 0.23099495470523834,
+      "learning_rate": 0.00017088361230050604,
+      "loss": 1.1021,
+      "step": 380
+    },
+    {
+      "epoch": 0.14806178956572427,
+      "grad_norm": 0.21461094915866852,
+      "learning_rate": 0.0001708057609964967,
+      "loss": 1.0959,
+      "step": 381
+    },
+    {
+      "epoch": 0.1484504031866317,
+      "grad_norm": 0.21557241678237915,
+      "learning_rate": 0.00017072790969248735,
+      "loss": 1.0155,
+      "step": 382
+    },
+    {
+      "epoch": 0.1488390168075391,
+      "grad_norm": 0.234396293759346,
+      "learning_rate": 0.000170650058388478,
+      "loss": 1.1289,
+      "step": 383
+    },
+    {
+      "epoch": 0.14922763042844653,
+      "grad_norm": 0.22895503044128418,
+      "learning_rate": 0.00017057220708446868,
+      "loss": 0.9919,
+      "step": 384
+    },
+    {
+      "epoch": 0.14961624404935392,
+      "grad_norm": 0.2054683268070221,
+      "learning_rate": 0.00017049435578045933,
+      "loss": 1.0607,
+      "step": 385
+    },
+    {
+      "epoch": 0.15000485767026134,
+      "grad_norm": 0.25569215416908264,
+      "learning_rate": 0.00017041650447644998,
+      "loss": 1.0517,
+      "step": 386
+    },
+    {
+      "epoch": 0.15039347129116876,
+      "grad_norm": 0.2222641259431839,
+      "learning_rate": 0.00017033865317244064,
+      "loss": 1.0404,
+      "step": 387
+    },
+    {
+      "epoch": 0.15078208491207618,
+      "grad_norm": 0.20501169562339783,
+      "learning_rate": 0.0001702608018684313,
+      "loss": 0.9897,
+      "step": 388
+    },
+    {
+      "epoch": 0.1511706985329836,
+      "grad_norm": 0.22080403566360474,
+      "learning_rate": 0.00017018295056442197,
+      "loss": 1.1013,
+      "step": 389
+    },
+    {
+      "epoch": 0.15155931215389098,
+      "grad_norm": 0.21218529343605042,
+      "learning_rate": 0.00017010509926041262,
+      "loss": 1.0541,
+      "step": 390
+    },
+    {
+      "epoch": 0.1519479257747984,
+      "grad_norm": 0.23064807057380676,
+      "learning_rate": 0.00017002724795640327,
+      "loss": 1.037,
+      "step": 391
+    },
+    {
+      "epoch": 0.15233653939570582,
+      "grad_norm": 0.21164493262767792,
+      "learning_rate": 0.00016994939665239392,
+      "loss": 1.0769,
+      "step": 392
+    },
+    {
+      "epoch": 0.15272515301661324,
+      "grad_norm": 0.22565549612045288,
+      "learning_rate": 0.00016987154534838457,
+      "loss": 1.0638,
+      "step": 393
+    },
+    {
+      "epoch": 0.15311376663752063,
+      "grad_norm": 0.22492647171020508,
+      "learning_rate": 0.00016979369404437525,
+      "loss": 1.063,
+      "step": 394
+    },
+    {
+      "epoch": 0.15350238025842805,
+      "grad_norm": 0.22335395216941833,
+      "learning_rate": 0.0001697158427403659,
+      "loss": 1.1032,
+      "step": 395
+    },
+    {
+      "epoch": 0.15389099387933547,
+      "grad_norm": 0.2164154201745987,
+      "learning_rate": 0.00016963799143635656,
+      "loss": 1.1275,
+      "step": 396
+    },
+    {
+      "epoch": 0.1542796075002429,
+      "grad_norm": 0.22547736763954163,
+      "learning_rate": 0.0001695601401323472,
+      "loss": 1.1324,
+      "step": 397
+    },
+    {
+      "epoch": 0.1546682211211503,
+      "grad_norm": 0.2028045952320099,
+      "learning_rate": 0.0001694822888283379,
+      "loss": 1.0057,
+      "step": 398
+    },
+    {
+      "epoch": 0.1550568347420577,
+      "grad_norm": 0.20770573616027832,
+      "learning_rate": 0.00016940443752432854,
+      "loss": 1.0311,
+      "step": 399
+    },
+    {
+      "epoch": 0.15544544836296512,
+      "grad_norm": 0.2231476902961731,
+      "learning_rate": 0.0001693265862203192,
+      "loss": 1.0535,
+      "step": 400
+    },
+    {
+      "epoch": 0.15583406198387253,
+      "grad_norm": 0.21618099510669708,
+      "learning_rate": 0.00016924873491630987,
+      "loss": 1.0616,
+      "step": 401
+    },
+    {
+      "epoch": 0.15622267560477995,
+      "grad_norm": 0.24024419486522675,
+      "learning_rate": 0.00016917088361230052,
+      "loss": 1.1324,
+      "step": 402
+    },
+    {
+      "epoch": 0.15661128922568737,
+      "grad_norm": 0.2002171128988266,
+      "learning_rate": 0.00016909303230829118,
+      "loss": 1.015,
+      "step": 403
+    },
+    {
+      "epoch": 0.15699990284659476,
+      "grad_norm": 0.21771477162837982,
+      "learning_rate": 0.00016901518100428183,
+      "loss": 1.0817,
+      "step": 404
+    },
+    {
+      "epoch": 0.15738851646750218,
+      "grad_norm": 0.22052259743213654,
+      "learning_rate": 0.0001689373297002725,
+      "loss": 1.0836,
+      "step": 405
+    },
+    {
+      "epoch": 0.1577771300884096,
+      "grad_norm": 0.1964062750339508,
+      "learning_rate": 0.00016885947839626316,
+      "loss": 1.0505,
+      "step": 406
+    },
+    {
+      "epoch": 0.15816574370931702,
+      "grad_norm": 0.22714298963546753,
+      "learning_rate": 0.0001687816270922538,
+      "loss": 1.0702,
+      "step": 407
+    },
+    {
+      "epoch": 0.15855435733022444,
+      "grad_norm": 0.20647728443145752,
+      "learning_rate": 0.00016870377578824446,
+      "loss": 1.0349,
+      "step": 408
+    },
+    {
+      "epoch": 0.15894297095113183,
+      "grad_norm": 0.2355160117149353,
+      "learning_rate": 0.00016862592448423512,
+      "loss": 1.0305,
+      "step": 409
+    },
+    {
+      "epoch": 0.15933158457203925,
+      "grad_norm": 0.22890770435333252,
+      "learning_rate": 0.0001685480731802258,
+      "loss": 1.0854,
+      "step": 410
+    },
+    {
+      "epoch": 0.15972019819294667,
+      "grad_norm": 0.21947838366031647,
+      "learning_rate": 0.00016847022187621645,
+      "loss": 1.0948,
+      "step": 411
+    },
+    {
+      "epoch": 0.16010881181385409,
+      "grad_norm": 0.22334899008274078,
+      "learning_rate": 0.0001683923705722071,
+      "loss": 1.006,
+      "step": 412
+    },
+    {
+      "epoch": 0.16049742543476148,
+      "grad_norm": 0.22324936091899872,
+      "learning_rate": 0.00016831451926819775,
+      "loss": 1.0402,
+      "step": 413
+    },
+    {
+      "epoch": 0.1608860390556689,
+      "grad_norm": 0.21462097764015198,
+      "learning_rate": 0.0001682366679641884,
+      "loss": 1.077,
+      "step": 414
+    },
+    {
+      "epoch": 0.1612746526765763,
+      "grad_norm": 0.24567006528377533,
+      "learning_rate": 0.00016815881666017908,
+      "loss": 1.15,
+      "step": 415
+    },
+    {
+      "epoch": 0.16166326629748373,
+      "grad_norm": 0.26437243819236755,
+      "learning_rate": 0.00016808096535616973,
+      "loss": 1.1251,
+      "step": 416
+    },
+    {
+      "epoch": 0.16205187991839115,
+      "grad_norm": 0.2217959761619568,
+      "learning_rate": 0.00016800311405216039,
+      "loss": 1.1103,
+      "step": 417
+    },
+    {
+      "epoch": 0.16244049353929854,
+      "grad_norm": 0.24402475357055664,
+      "learning_rate": 0.00016792526274815104,
+      "loss": 1.0672,
+      "step": 418
+    },
+    {
+      "epoch": 0.16282910716020596,
+      "grad_norm": 0.21609526872634888,
+      "learning_rate": 0.0001678474114441417,
+      "loss": 1.0291,
+      "step": 419
+    },
+    {
+      "epoch": 0.16321772078111338,
+      "grad_norm": 0.20054642856121063,
+      "learning_rate": 0.00016776956014013237,
+      "loss": 1.0704,
+      "step": 420
+    },
+    {
+      "epoch": 0.1636063344020208,
+      "grad_norm": 0.22864869236946106,
+      "learning_rate": 0.00016769170883612302,
+      "loss": 1.0612,
+      "step": 421
+    },
+    {
+      "epoch": 0.16399494802292822,
+      "grad_norm": 0.22651974856853485,
+      "learning_rate": 0.00016761385753211367,
+      "loss": 1.0749,
+      "step": 422
+    },
+    {
+      "epoch": 0.1643835616438356,
+      "grad_norm": 0.21587328612804413,
+      "learning_rate": 0.00016753600622810433,
+      "loss": 1.0398,
+      "step": 423
+    },
+    {
+      "epoch": 0.16477217526474303,
+      "grad_norm": 0.1953774094581604,
+      "learning_rate": 0.00016745815492409498,
+      "loss": 1.0275,
+      "step": 424
+    },
+    {
+      "epoch": 0.16516078888565044,
+      "grad_norm": 0.21803410351276398,
+      "learning_rate": 0.00016738030362008566,
+      "loss": 1.1219,
+      "step": 425
+    },
+    {
+      "epoch": 0.16554940250655786,
+      "grad_norm": 0.2034682035446167,
+      "learning_rate": 0.0001673024523160763,
+      "loss": 1.0342,
+      "step": 426
+    },
+    {
+      "epoch": 0.16593801612746525,
+      "grad_norm": 0.20135951042175293,
+      "learning_rate": 0.00016722460101206696,
+      "loss": 0.9802,
+      "step": 427
+    },
+    {
+      "epoch": 0.16632662974837267,
+      "grad_norm": 0.23310376703739166,
+      "learning_rate": 0.0001671467497080576,
+      "loss": 1.0789,
+      "step": 428
+    },
+    {
+      "epoch": 0.1667152433692801,
+      "grad_norm": 0.21475404500961304,
+      "learning_rate": 0.00016706889840404827,
+      "loss": 1.0416,
+      "step": 429
+    },
+    {
+      "epoch": 0.1671038569901875,
+      "grad_norm": 0.21661072969436646,
+      "learning_rate": 0.00016699104710003894,
+      "loss": 1.0568,
+      "step": 430
+    },
+    {
+      "epoch": 0.16749247061109493,
+      "grad_norm": 0.20310629904270172,
+      "learning_rate": 0.0001669131957960296,
+      "loss": 0.9968,
+      "step": 431
+    },
+    {
+      "epoch": 0.16788108423200232,
+      "grad_norm": 0.2596947252750397,
+      "learning_rate": 0.00016683534449202025,
+      "loss": 1.0478,
+      "step": 432
+    },
+    {
+      "epoch": 0.16826969785290974,
+      "grad_norm": 0.22226987779140472,
+      "learning_rate": 0.0001667574931880109,
+      "loss": 1.0898,
+      "step": 433
+    },
+    {
+      "epoch": 0.16865831147381716,
+      "grad_norm": 0.22499911487102509,
+      "learning_rate": 0.00016667964188400155,
+      "loss": 1.07,
+      "step": 434
+    },
+    {
+      "epoch": 0.16904692509472458,
+      "grad_norm": 0.2717292308807373,
+      "learning_rate": 0.0001666017905799922,
+      "loss": 1.0562,
+      "step": 435
+    },
+    {
+      "epoch": 0.169435538715632,
+      "grad_norm": 0.22052323818206787,
+      "learning_rate": 0.00016652393927598288,
+      "loss": 1.0732,
+      "step": 436
+    },
+    {
+      "epoch": 0.16982415233653939,
+      "grad_norm": 0.21741728484630585,
+      "learning_rate": 0.00016644608797197354,
+      "loss": 1.0409,
+      "step": 437
+    },
+    {
+      "epoch": 0.1702127659574468,
+      "grad_norm": 0.20701193809509277,
+      "learning_rate": 0.0001663682366679642,
+      "loss": 1.0731,
+      "step": 438
+    },
+    {
+      "epoch": 0.17060137957835422,
+      "grad_norm": 0.22071130573749542,
+      "learning_rate": 0.00016629038536395484,
+      "loss": 1.0992,
+      "step": 439
+    },
+    {
+      "epoch": 0.17098999319926164,
+      "grad_norm": 0.20261412858963013,
+      "learning_rate": 0.0001662125340599455,
+      "loss": 1.0051,
+      "step": 440
+    },
+    {
+      "epoch": 0.17137860682016906,
+      "grad_norm": 0.2082947939634323,
+      "learning_rate": 0.00016613468275593617,
+      "loss": 1.0477,
+      "step": 441
+    },
+    {
+      "epoch": 0.17176722044107645,
+      "grad_norm": 0.22534717619419098,
+      "learning_rate": 0.00016605683145192682,
+      "loss": 1.041,
+      "step": 442
+    },
+    {
+      "epoch": 0.17215583406198387,
+      "grad_norm": 0.21547731757164001,
+      "learning_rate": 0.00016597898014791748,
+      "loss": 1.0528,
+      "step": 443
+    },
+    {
+      "epoch": 0.1725444476828913,
+      "grad_norm": 0.24141089618206024,
+      "learning_rate": 0.00016590112884390813,
+      "loss": 1.0928,
+      "step": 444
+    },
+    {
+      "epoch": 0.1729330613037987,
+      "grad_norm": 0.21910884976387024,
+      "learning_rate": 0.00016582327753989878,
+      "loss": 1.063,
+      "step": 445
+    },
+    {
+      "epoch": 0.1733216749247061,
+      "grad_norm": 0.21782316267490387,
+      "learning_rate": 0.00016574542623588946,
+      "loss": 1.0976,
+      "step": 446
+    },
+    {
+      "epoch": 0.17371028854561352,
+      "grad_norm": 0.21771778166294098,
+      "learning_rate": 0.0001656675749318801,
+      "loss": 1.0677,
+      "step": 447
+    },
+    {
+      "epoch": 0.17409890216652094,
+      "grad_norm": 0.22117659449577332,
+      "learning_rate": 0.00016558972362787076,
+      "loss": 1.0669,
+      "step": 448
+    },
+    {
+      "epoch": 0.17448751578742835,
+      "grad_norm": 0.21918092668056488,
+      "learning_rate": 0.00016551187232386141,
+      "loss": 1.0955,
+      "step": 449
+    },
+    {
+      "epoch": 0.17487612940833577,
+      "grad_norm": 0.22027818858623505,
+      "learning_rate": 0.0001654340210198521,
+      "loss": 1.0201,
+      "step": 450
+    },
+    {
+      "epoch": 0.17526474302924316,
+      "grad_norm": 0.2042885720729828,
+      "learning_rate": 0.00016535616971584275,
+      "loss": 1.0881,
+      "step": 451
+    },
+    {
+      "epoch": 0.17565335665015058,
+      "grad_norm": 0.21788261830806732,
+      "learning_rate": 0.0001652783184118334,
+      "loss": 1.0918,
+      "step": 452
+    },
+    {
+      "epoch": 0.176041970271058,
+      "grad_norm": 0.23332571983337402,
+      "learning_rate": 0.00016520046710782408,
+      "loss": 1.091,
+      "step": 453
+    },
+    {
+      "epoch": 0.17643058389196542,
+      "grad_norm": 0.20204192399978638,
+      "learning_rate": 0.00016512261580381473,
+      "loss": 1.0366,
+      "step": 454
+    },
+    {
+      "epoch": 0.17681919751287284,
+      "grad_norm": 0.21761906147003174,
+      "learning_rate": 0.00016504476449980538,
+      "loss": 1.0131,
+      "step": 455
+    },
+    {
+      "epoch": 0.17720781113378023,
+      "grad_norm": 0.2152051478624344,
+      "learning_rate": 0.00016496691319579606,
+      "loss": 1.0868,
+      "step": 456
+    },
+    {
+      "epoch": 0.17759642475468765,
+      "grad_norm": 0.22776494920253754,
+      "learning_rate": 0.0001648890618917867,
+      "loss": 1.0807,
+      "step": 457
+    },
+    {
+      "epoch": 0.17798503837559507,
+      "grad_norm": 0.2171342968940735,
+      "learning_rate": 0.00016481121058777736,
+      "loss": 1.0537,
+      "step": 458
+    },
+    {
+      "epoch": 0.17837365199650249,
+      "grad_norm": 0.2046273946762085,
+      "learning_rate": 0.00016473335928376802,
+      "loss": 1.0097,
+      "step": 459
+    },
+    {
+      "epoch": 0.17876226561740988,
+      "grad_norm": 0.2047681361436844,
+      "learning_rate": 0.00016465550797975867,
+      "loss": 1.0204,
+      "step": 460
+    },
+    {
+      "epoch": 0.1791508792383173,
+      "grad_norm": 0.1876862645149231,
+      "learning_rate": 0.00016457765667574935,
+      "loss": 0.9383,
+      "step": 461
+    },
+    {
+      "epoch": 0.17953949285922471,
+      "grad_norm": 0.218430757522583,
+      "learning_rate": 0.00016449980537174,
+      "loss": 1.0721,
+      "step": 462
+    },
+    {
+      "epoch": 0.17992810648013213,
+      "grad_norm": 0.2245480865240097,
+      "learning_rate": 0.00016442195406773065,
+      "loss": 1.0859,
+      "step": 463
+    },
+    {
+      "epoch": 0.18031672010103955,
+      "grad_norm": 0.22577151656150818,
+      "learning_rate": 0.0001643441027637213,
+      "loss": 1.0825,
+      "step": 464
+    },
+    {
+      "epoch": 0.18070533372194694,
+      "grad_norm": 0.20132745802402496,
+      "learning_rate": 0.00016426625145971196,
+      "loss": 1.0615,
+      "step": 465
+    },
+    {
+      "epoch": 0.18109394734285436,
+      "grad_norm": 0.2277505248785019,
+      "learning_rate": 0.00016418840015570263,
+      "loss": 1.0426,
+      "step": 466
+    },
+    {
+      "epoch": 0.18148256096376178,
+      "grad_norm": 0.22540105879306793,
+      "learning_rate": 0.0001641105488516933,
+      "loss": 1.0481,
+      "step": 467
+    },
+    {
+      "epoch": 0.1818711745846692,
+      "grad_norm": 0.20358088612556458,
+      "learning_rate": 0.00016403269754768394,
+      "loss": 1.0286,
+      "step": 468
+    },
+    {
+      "epoch": 0.18225978820557662,
+      "grad_norm": 0.22534145414829254,
+      "learning_rate": 0.0001639548462436746,
+      "loss": 1.1183,
+      "step": 469
+    },
+    {
+      "epoch": 0.182648401826484,
+      "grad_norm": 0.2188873142004013,
+      "learning_rate": 0.00016387699493966524,
+      "loss": 1.0439,
+      "step": 470
+    },
+    {
+      "epoch": 0.18303701544739143,
+      "grad_norm": 0.2128048539161682,
+      "learning_rate": 0.00016379914363565592,
+      "loss": 1.027,
+      "step": 471
+    },
+    {
+      "epoch": 0.18342562906829885,
+      "grad_norm": 0.2518141567707062,
+      "learning_rate": 0.00016372129233164657,
+      "loss": 1.0468,
+      "step": 472
+    },
+    {
+      "epoch": 0.18381424268920626,
+      "grad_norm": 0.2189142256975174,
+      "learning_rate": 0.00016364344102763723,
+      "loss": 1.0581,
+      "step": 473
+    },
+    {
+      "epoch": 0.18420285631011368,
+      "grad_norm": 0.31266725063323975,
+      "learning_rate": 0.00016356558972362788,
+      "loss": 1.0554,
+      "step": 474
+    },
+    {
+      "epoch": 0.18459146993102107,
+      "grad_norm": 0.21343916654586792,
+      "learning_rate": 0.00016348773841961853,
+      "loss": 1.0795,
+      "step": 475
+    },
+    {
+      "epoch": 0.1849800835519285,
+      "grad_norm": 0.22907280921936035,
+      "learning_rate": 0.00016340988711560918,
+      "loss": 1.0304,
+      "step": 476
+    },
+    {
+      "epoch": 0.1853686971728359,
+      "grad_norm": 0.2105257511138916,
+      "learning_rate": 0.00016333203581159986,
+      "loss": 1.0231,
+      "step": 477
+    },
+    {
+      "epoch": 0.18575731079374333,
+      "grad_norm": 0.19537831842899323,
+      "learning_rate": 0.00016325418450759051,
+      "loss": 1.0103,
+      "step": 478
+    },
+    {
+      "epoch": 0.18614592441465072,
+      "grad_norm": 0.20522372424602509,
+      "learning_rate": 0.00016317633320358117,
+      "loss": 1.0196,
+      "step": 479
+    },
+    {
+      "epoch": 0.18653453803555814,
+      "grad_norm": 0.21646477282047272,
+      "learning_rate": 0.00016309848189957182,
+      "loss": 1.0579,
+      "step": 480
+    },
+    {
+      "epoch": 0.18692315165646556,
+      "grad_norm": 0.21077193319797516,
+      "learning_rate": 0.00016302063059556247,
+      "loss": 1.0638,
+      "step": 481
+    },
+    {
+      "epoch": 0.18731176527737298,
+      "grad_norm": 0.20357473194599152,
+      "learning_rate": 0.00016294277929155315,
+      "loss": 1.0635,
+      "step": 482
+    },
+    {
+      "epoch": 0.1877003788982804,
+      "grad_norm": 0.2188001275062561,
+      "learning_rate": 0.0001628649279875438,
+      "loss": 1.0267,
+      "step": 483
+    },
+    {
+      "epoch": 0.1880889925191878,
+      "grad_norm": 0.2128928154706955,
+      "learning_rate": 0.00016278707668353445,
+      "loss": 0.9706,
+      "step": 484
+    },
+    {
+      "epoch": 0.1884776061400952,
+      "grad_norm": 0.22081372141838074,
+      "learning_rate": 0.0001627092253795251,
+      "loss": 1.08,
+      "step": 485
+    },
+    {
+      "epoch": 0.18886621976100262,
+      "grad_norm": 0.2250615805387497,
+      "learning_rate": 0.00016263137407551576,
+      "loss": 1.1451,
+      "step": 486
+    },
+    {
+      "epoch": 0.18925483338191004,
+      "grad_norm": 0.1984967589378357,
+      "learning_rate": 0.00016255352277150644,
+      "loss": 1.0744,
+      "step": 487
+    },
+    {
+      "epoch": 0.18964344700281746,
+      "grad_norm": 0.20778900384902954,
+      "learning_rate": 0.0001624756714674971,
+      "loss": 1.0623,
+      "step": 488
+    },
+    {
+      "epoch": 0.19003206062372485,
+      "grad_norm": 0.2026563137769699,
+      "learning_rate": 0.00016239782016348774,
+      "loss": 1.0714,
+      "step": 489
+    },
+    {
+      "epoch": 0.19042067424463227,
+      "grad_norm": 0.21598374843597412,
+      "learning_rate": 0.0001623199688594784,
+      "loss": 1.0869,
+      "step": 490
+    },
+    {
+      "epoch": 0.1908092878655397,
+      "grad_norm": 0.18944978713989258,
+      "learning_rate": 0.00016224211755546904,
+      "loss": 1.055,
+      "step": 491
+    },
+    {
+      "epoch": 0.1911979014864471,
+      "grad_norm": 0.20698946714401245,
+      "learning_rate": 0.00016216426625145972,
+      "loss": 1.0392,
+      "step": 492
+    },
+    {
+      "epoch": 0.1915865151073545,
+      "grad_norm": 0.22395353019237518,
+      "learning_rate": 0.00016208641494745038,
+      "loss": 1.0681,
+      "step": 493
+    },
+    {
+      "epoch": 0.19197512872826192,
+      "grad_norm": 0.22372962534427643,
+      "learning_rate": 0.00016200856364344103,
+      "loss": 1.0767,
+      "step": 494
+    },
+    {
+      "epoch": 0.19236374234916934,
+      "grad_norm": 0.2066701054573059,
+      "learning_rate": 0.00016193071233943168,
+      "loss": 1.0061,
+      "step": 495
+    },
+    {
+      "epoch": 0.19275235597007676,
+      "grad_norm": 0.19716408848762512,
+      "learning_rate": 0.00016185286103542233,
+      "loss": 1.039,
+      "step": 496
+    },
+    {
+      "epoch": 0.19314096959098417,
+      "grad_norm": 0.22159601747989655,
+      "learning_rate": 0.000161775009731413,
+      "loss": 1.0832,
+      "step": 497
+    },
+    {
+      "epoch": 0.19352958321189156,
+      "grad_norm": 0.21509626507759094,
+      "learning_rate": 0.00016169715842740366,
+      "loss": 1.0264,
+      "step": 498
+    },
+    {
+      "epoch": 0.19391819683279898,
+      "grad_norm": 0.21598199009895325,
+      "learning_rate": 0.00016161930712339431,
+      "loss": 1.049,
+      "step": 499
+    },
+    {
+      "epoch": 0.1943068104537064,
+      "grad_norm": 0.20279590785503387,
+      "learning_rate": 0.00016154145581938497,
+      "loss": 1.0505,
+      "step": 500
+    },
+    {
+      "epoch": 0.19469542407461382,
+      "grad_norm": 0.21796855330467224,
+      "learning_rate": 0.00016146360451537565,
+      "loss": 1.0885,
+      "step": 501
+    },
+    {
+      "epoch": 0.19508403769552124,
+      "grad_norm": 0.22128933668136597,
+      "learning_rate": 0.0001613857532113663,
+      "loss": 1.0903,
+      "step": 502
+    },
+    {
+      "epoch": 0.19547265131642863,
+      "grad_norm": 0.2032536417245865,
+      "learning_rate": 0.00016130790190735695,
+      "loss": 1.0285,
+      "step": 503
+    },
+    {
+      "epoch": 0.19586126493733605,
+      "grad_norm": 0.23738974332809448,
+      "learning_rate": 0.0001612300506033476,
+      "loss": 1.1188,
+      "step": 504
+    },
+    {
+      "epoch": 0.19624987855824347,
+      "grad_norm": 0.19614790380001068,
+      "learning_rate": 0.00016115219929933828,
+      "loss": 1.04,
+      "step": 505
+    },
+    {
+      "epoch": 0.1966384921791509,
+      "grad_norm": 0.2198178917169571,
+      "learning_rate": 0.00016107434799532893,
+      "loss": 1.0696,
+      "step": 506
+    },
+    {
+      "epoch": 0.1970271058000583,
+      "grad_norm": 0.18814648687839508,
+      "learning_rate": 0.00016099649669131959,
+      "loss": 1.0203,
+      "step": 507
+    },
+    {
+      "epoch": 0.1974157194209657,
+      "grad_norm": 0.20699037611484528,
+      "learning_rate": 0.00016091864538731026,
+      "loss": 1.1074,
+      "step": 508
+    },
+    {
+      "epoch": 0.19780433304187311,
+      "grad_norm": 0.21490445733070374,
+      "learning_rate": 0.00016084079408330092,
+      "loss": 1.0682,
+      "step": 509
+    },
+    {
+      "epoch": 0.19819294666278053,
+      "grad_norm": 0.2363848090171814,
+      "learning_rate": 0.00016076294277929157,
+      "loss": 1.0408,
+      "step": 510
+    },
+    {
+      "epoch": 0.19858156028368795,
+      "grad_norm": 0.20186659693717957,
+      "learning_rate": 0.00016068509147528222,
+      "loss": 1.026,
+      "step": 511
+    },
+    {
+      "epoch": 0.19897017390459534,
+      "grad_norm": 0.21564024686813354,
+      "learning_rate": 0.00016060724017127287,
+      "loss": 1.0418,
+      "step": 512
+    },
+    {
+      "epoch": 0.19935878752550276,
+      "grad_norm": 0.19151560962200165,
+      "learning_rate": 0.00016052938886726355,
+      "loss": 1.0037,
+      "step": 513
+    },
+    {
+      "epoch": 0.19974740114641018,
+      "grad_norm": 0.21038194000720978,
+      "learning_rate": 0.0001604515375632542,
+      "loss": 1.0545,
+      "step": 514
+    },
+    {
+      "epoch": 0.2001360147673176,
+      "grad_norm": 0.20496582984924316,
+      "learning_rate": 0.00016037368625924486,
+      "loss": 1.0543,
+      "step": 515
+    },
+    {
+      "epoch": 0.20052462838822502,
+      "grad_norm": 0.20689113438129425,
+      "learning_rate": 0.0001602958349552355,
+      "loss": 1.0905,
+      "step": 516
+    },
+    {
+      "epoch": 0.2009132420091324,
+      "grad_norm": 0.2284041792154312,
+      "learning_rate": 0.00016021798365122616,
+      "loss": 1.0717,
+      "step": 517
+    },
+    {
+      "epoch": 0.20130185563003983,
+      "grad_norm": 0.23457761108875275,
+      "learning_rate": 0.00016014013234721684,
+      "loss": 1.106,
+      "step": 518
+    },
+    {
+      "epoch": 0.20169046925094725,
+      "grad_norm": 0.2088528722524643,
+      "learning_rate": 0.0001600622810432075,
+      "loss": 1.0428,
+      "step": 519
+    },
+    {
+      "epoch": 0.20207908287185467,
+      "grad_norm": 0.2170068770647049,
+      "learning_rate": 0.00015998442973919814,
+      "loss": 0.9875,
+      "step": 520
+    },
+    {
+      "epoch": 0.20246769649276208,
+      "grad_norm": 0.2270561158657074,
+      "learning_rate": 0.0001599065784351888,
+      "loss": 1.0676,
+      "step": 521
+    },
+    {
+      "epoch": 0.20285631011366947,
+      "grad_norm": 0.2151324599981308,
+      "learning_rate": 0.00015982872713117945,
+      "loss": 1.0675,
+      "step": 522
+    },
+    {
+      "epoch": 0.2032449237345769,
+      "grad_norm": 0.23113249242305756,
+      "learning_rate": 0.00015975087582717013,
+      "loss": 1.0608,
+      "step": 523
+    },
+    {
+      "epoch": 0.2036335373554843,
+      "grad_norm": 0.2587106227874756,
+      "learning_rate": 0.00015967302452316078,
+      "loss": 1.0867,
+      "step": 524
+    },
+    {
+      "epoch": 0.20402215097639173,
+      "grad_norm": 0.21842992305755615,
+      "learning_rate": 0.00015959517321915143,
+      "loss": 1.0726,
+      "step": 525
+    },
+    {
+      "epoch": 0.20441076459729912,
+      "grad_norm": 0.20867805182933807,
+      "learning_rate": 0.00015951732191514208,
+      "loss": 1.0578,
+      "step": 526
+    },
+    {
+      "epoch": 0.20479937821820654,
+      "grad_norm": 0.2396962195634842,
+      "learning_rate": 0.00015943947061113273,
+      "loss": 1.0292,
+      "step": 527
+    },
+    {
+      "epoch": 0.20518799183911396,
+      "grad_norm": 0.221155047416687,
+      "learning_rate": 0.00015936161930712341,
+      "loss": 1.0019,
+      "step": 528
+    },
+    {
+      "epoch": 0.20557660546002138,
+      "grad_norm": 0.20032119750976562,
+      "learning_rate": 0.00015928376800311407,
+      "loss": 1.0435,
+      "step": 529
+    },
+    {
+      "epoch": 0.2059652190809288,
+      "grad_norm": 0.24095888435840607,
+      "learning_rate": 0.00015920591669910472,
+      "loss": 1.0355,
+      "step": 530
+    },
+    {
+      "epoch": 0.2063538327018362,
+      "grad_norm": 0.2286604344844818,
+      "learning_rate": 0.00015912806539509537,
+      "loss": 0.9989,
+      "step": 531
+    },
+    {
+      "epoch": 0.2067424463227436,
+      "grad_norm": 0.21537137031555176,
+      "learning_rate": 0.00015905021409108602,
+      "loss": 1.0642,
+      "step": 532
+    },
+    {
+      "epoch": 0.20713105994365102,
+      "grad_norm": 0.22447925806045532,
+      "learning_rate": 0.0001589723627870767,
+      "loss": 1.1244,
+      "step": 533
+    },
+    {
+      "epoch": 0.20751967356455844,
+      "grad_norm": 0.21077273786067963,
+      "learning_rate": 0.00015889451148306735,
+      "loss": 1.0167,
+      "step": 534
+    },
+    {
+      "epoch": 0.20790828718546586,
+      "grad_norm": 0.22340558469295502,
+      "learning_rate": 0.000158816660179058,
+      "loss": 1.0991,
+      "step": 535
+    },
+    {
+      "epoch": 0.20829690080637325,
+      "grad_norm": 0.223599374294281,
+      "learning_rate": 0.00015873880887504866,
+      "loss": 1.086,
+      "step": 536
+    },
+    {
+      "epoch": 0.20868551442728067,
+      "grad_norm": 0.2615208923816681,
+      "learning_rate": 0.0001586609575710393,
+      "loss": 1.0584,
+      "step": 537
+    },
+    {
+      "epoch": 0.2090741280481881,
+      "grad_norm": 0.2085907757282257,
+      "learning_rate": 0.00015858310626703,
+      "loss": 1.0994,
+      "step": 538
+    },
+    {
+      "epoch": 0.2094627416690955,
+      "grad_norm": 0.2170211672782898,
+      "learning_rate": 0.00015850525496302064,
+      "loss": 1.1105,
+      "step": 539
+    },
+    {
+      "epoch": 0.20985135529000293,
+      "grad_norm": 0.21978625655174255,
+      "learning_rate": 0.0001584274036590113,
+      "loss": 1.002,
+      "step": 540
+    },
+    {
+      "epoch": 0.21023996891091032,
+      "grad_norm": 0.23684021830558777,
+      "learning_rate": 0.00015834955235500194,
+      "loss": 1.1216,
+      "step": 541
+    },
+    {
+      "epoch": 0.21062858253181774,
+      "grad_norm": 0.220269113779068,
+      "learning_rate": 0.0001582717010509926,
+      "loss": 1.0773,
+      "step": 542
+    },
+    {
+      "epoch": 0.21101719615272516,
+      "grad_norm": 0.22447973489761353,
+      "learning_rate": 0.00015819384974698328,
+      "loss": 1.0941,
+      "step": 543
+    },
+    {
+      "epoch": 0.21140580977363257,
+      "grad_norm": 0.22435730695724487,
+      "learning_rate": 0.00015811599844297393,
+      "loss": 1.0138,
+      "step": 544
+    },
+    {
+      "epoch": 0.21179442339453997,
+      "grad_norm": 0.2230793684720993,
+      "learning_rate": 0.00015803814713896458,
+      "loss": 1.0343,
+      "step": 545
+    },
+    {
+      "epoch": 0.21218303701544738,
+      "grad_norm": 0.23491905629634857,
+      "learning_rate": 0.00015796029583495523,
+      "loss": 1.11,
+      "step": 546
+    },
+    {
+      "epoch": 0.2125716506363548,
+      "grad_norm": 0.213560551404953,
+      "learning_rate": 0.00015788244453094588,
+      "loss": 1.0615,
+      "step": 547
+    },
+    {
+      "epoch": 0.21296026425726222,
+      "grad_norm": 0.21392837166786194,
+      "learning_rate": 0.00015780459322693654,
+      "loss": 1.0872,
+      "step": 548
+    },
+    {
+      "epoch": 0.21334887787816964,
+      "grad_norm": 0.20007692277431488,
+      "learning_rate": 0.00015772674192292722,
+      "loss": 1.0394,
+      "step": 549
+    },
+    {
+      "epoch": 0.21373749149907703,
+      "grad_norm": 0.1969841718673706,
+      "learning_rate": 0.00015764889061891787,
+      "loss": 1.0381,
+      "step": 550
+    },
+    {
+      "epoch": 0.21412610511998445,
+      "grad_norm": 0.21874025464057922,
+      "learning_rate": 0.00015757103931490852,
+      "loss": 1.0822,
+      "step": 551
+    },
+    {
+      "epoch": 0.21451471874089187,
+      "grad_norm": 0.21824273467063904,
+      "learning_rate": 0.00015749318801089917,
+      "loss": 1.0802,
+      "step": 552
+    },
+    {
+      "epoch": 0.2149033323617993,
+      "grad_norm": 0.20942047238349915,
+      "learning_rate": 0.00015741533670688985,
+      "loss": 1.0634,
+      "step": 553
+    },
+    {
+      "epoch": 0.2152919459827067,
+      "grad_norm": 0.1940152943134308,
+      "learning_rate": 0.0001573374854028805,
+      "loss": 1.0264,
+      "step": 554
+    },
+    {
+      "epoch": 0.2156805596036141,
+      "grad_norm": 0.19859059154987335,
+      "learning_rate": 0.00015725963409887115,
+      "loss": 0.9701,
+      "step": 555
+    },
+    {
+      "epoch": 0.21606917322452152,
+      "grad_norm": 0.22239404916763306,
+      "learning_rate": 0.0001571817827948618,
+      "loss": 1.1282,
+      "step": 556
+    },
+    {
+      "epoch": 0.21645778684542893,
+      "grad_norm": 0.23820599913597107,
+      "learning_rate": 0.00015710393149085249,
+      "loss": 1.1123,
+      "step": 557
+    },
+    {
+      "epoch": 0.21684640046633635,
+      "grad_norm": 0.21279917657375336,
+      "learning_rate": 0.00015702608018684314,
+      "loss": 1.0542,
+      "step": 558
+    },
+    {
+      "epoch": 0.21723501408724374,
+      "grad_norm": 0.2065514773130417,
+      "learning_rate": 0.0001569482288828338,
+      "loss": 1.0685,
+      "step": 559
+    },
+    {
+      "epoch": 0.21762362770815116,
+      "grad_norm": 0.20130831003189087,
+      "learning_rate": 0.00015687037757882447,
+      "loss": 0.9869,
+      "step": 560
+    },
+    {
+      "epoch": 0.21801224132905858,
+      "grad_norm": 0.2187541127204895,
+      "learning_rate": 0.00015679252627481512,
+      "loss": 1.1095,
+      "step": 561
+    },
+    {
+      "epoch": 0.218400854949966,
+      "grad_norm": 0.21028277277946472,
+      "learning_rate": 0.00015671467497080577,
+      "loss": 1.0804,
+      "step": 562
+    },
+    {
+      "epoch": 0.21878946857087342,
+      "grad_norm": 0.8187636733055115,
+      "learning_rate": 0.00015663682366679643,
+      "loss": 1.0782,
+      "step": 563
+    },
+    {
+      "epoch": 0.2191780821917808,
+      "grad_norm": 0.20059974491596222,
+      "learning_rate": 0.0001565589723627871,
+      "loss": 1.0279,
+      "step": 564
+    },
+    {
+      "epoch": 0.21956669581268823,
+      "grad_norm": 0.20440839231014252,
+      "learning_rate": 0.00015648112105877776,
+      "loss": 0.9863,
+      "step": 565
+    },
+    {
+      "epoch": 0.21995530943359565,
+      "grad_norm": 0.21423624455928802,
+      "learning_rate": 0.0001564032697547684,
+      "loss": 1.0685,
+      "step": 566
+    },
+    {
+      "epoch": 0.22034392305450307,
+      "grad_norm": 0.22430062294006348,
+      "learning_rate": 0.00015632541845075906,
+      "loss": 1.0761,
+      "step": 567
+    },
+    {
+      "epoch": 0.22073253667541048,
+      "grad_norm": 0.22782258689403534,
+      "learning_rate": 0.0001562475671467497,
+      "loss": 1.1024,
+      "step": 568
+    },
+    {
+      "epoch": 0.22112115029631788,
+      "grad_norm": 0.21150320768356323,
+      "learning_rate": 0.0001561697158427404,
+      "loss": 1.0621,
+      "step": 569
+    },
+    {
+      "epoch": 0.2215097639172253,
+      "grad_norm": 0.20342351496219635,
+      "learning_rate": 0.00015609186453873104,
+      "loss": 1.0667,
+      "step": 570
+    },
+    {
+      "epoch": 0.2218983775381327,
+      "grad_norm": 0.22866711020469666,
+      "learning_rate": 0.0001560140132347217,
+      "loss": 1.0631,
+      "step": 571
+    },
+    {
+      "epoch": 0.22228699115904013,
+      "grad_norm": 0.2200063169002533,
+      "learning_rate": 0.00015593616193071235,
+      "loss": 1.0448,
+      "step": 572
+    },
+    {
+      "epoch": 0.22267560477994755,
+      "grad_norm": 0.19440248608589172,
+      "learning_rate": 0.000155858310626703,
+      "loss": 1.037,
+      "step": 573
+    },
+    {
+      "epoch": 0.22306421840085494,
+      "grad_norm": 0.205752432346344,
+      "learning_rate": 0.00015578045932269368,
+      "loss": 1.0465,
+      "step": 574
+    },
+    {
+      "epoch": 0.22345283202176236,
+      "grad_norm": 0.22247998416423798,
+      "learning_rate": 0.00015570260801868433,
+      "loss": 0.997,
+      "step": 575
+    },
+    {
+      "epoch": 0.22384144564266978,
+      "grad_norm": 0.22199274599552155,
+      "learning_rate": 0.00015562475671467498,
+      "loss": 1.0178,
+      "step": 576
+    },
+    {
+      "epoch": 0.2242300592635772,
+      "grad_norm": 0.2114989310503006,
+      "learning_rate": 0.00015554690541066564,
+      "loss": 1.0457,
+      "step": 577
+    },
+    {
+      "epoch": 0.2246186728844846,
+      "grad_norm": 0.24248506128787994,
+      "learning_rate": 0.0001554690541066563,
+      "loss": 1.002,
+      "step": 578
+    },
+    {
+      "epoch": 0.225007286505392,
+      "grad_norm": 0.2565505802631378,
+      "learning_rate": 0.00015539120280264697,
+      "loss": 1.0541,
+      "step": 579
+    },
+    {
+      "epoch": 0.22539590012629943,
+      "grad_norm": 0.22799409925937653,
+      "learning_rate": 0.00015531335149863762,
+      "loss": 1.0788,
+      "step": 580
+    },
+    {
+      "epoch": 0.22578451374720684,
+      "grad_norm": 0.2196080982685089,
+      "learning_rate": 0.00015523550019462827,
+      "loss": 1.0877,
+      "step": 581
+    },
+    {
+      "epoch": 0.22617312736811426,
+      "grad_norm": 0.21992824971675873,
+      "learning_rate": 0.00015515764889061892,
+      "loss": 1.0213,
+      "step": 582
+    },
+    {
+      "epoch": 0.22656174098902165,
+      "grad_norm": 0.22793298959732056,
+      "learning_rate": 0.00015507979758660957,
+      "loss": 1.0633,
+      "step": 583
+    },
+    {
+      "epoch": 0.22695035460992907,
+      "grad_norm": 0.21707972884178162,
+      "learning_rate": 0.00015500194628260023,
+      "loss": 1.081,
+      "step": 584
+    },
+    {
+      "epoch": 0.2273389682308365,
+      "grad_norm": 0.220685675740242,
+      "learning_rate": 0.0001549240949785909,
+      "loss": 1.0658,
+      "step": 585
+    },
+    {
+      "epoch": 0.2277275818517439,
+      "grad_norm": 0.22576668858528137,
+      "learning_rate": 0.00015484624367458156,
+      "loss": 1.0795,
+      "step": 586
+    },
+    {
+      "epoch": 0.22811619547265133,
+      "grad_norm": 0.21778982877731323,
+      "learning_rate": 0.0001547683923705722,
+      "loss": 1.033,
+      "step": 587
+    },
+    {
+      "epoch": 0.22850480909355872,
+      "grad_norm": 0.22748610377311707,
+      "learning_rate": 0.00015469054106656286,
+      "loss": 1.0948,
+      "step": 588
+    },
+    {
+      "epoch": 0.22889342271446614,
+      "grad_norm": 0.21561284363269806,
+      "learning_rate": 0.00015461268976255351,
+      "loss": 1.0022,
+      "step": 589
+    },
+    {
+      "epoch": 0.22928203633537356,
+      "grad_norm": 0.2419756054878235,
+      "learning_rate": 0.0001545348384585442,
+      "loss": 1.0786,
+      "step": 590
+    },
+    {
+      "epoch": 0.22967064995628098,
+      "grad_norm": 0.20479315519332886,
+      "learning_rate": 0.00015445698715453485,
+      "loss": 1.027,
+      "step": 591
+    },
+    {
+      "epoch": 0.2300592635771884,
+      "grad_norm": 0.21365883946418762,
+      "learning_rate": 0.0001543791358505255,
+      "loss": 1.0773,
+      "step": 592
+    },
+    {
+      "epoch": 0.23044787719809579,
+      "grad_norm": 0.23133166134357452,
+      "learning_rate": 0.00015430128454651615,
+      "loss": 1.0877,
+      "step": 593
+    },
+    {
+      "epoch": 0.2308364908190032,
+      "grad_norm": 0.2110515981912613,
+      "learning_rate": 0.0001542234332425068,
+      "loss": 1.0509,
+      "step": 594
+    },
+    {
+      "epoch": 0.23122510443991062,
+      "grad_norm": 0.20658442378044128,
+      "learning_rate": 0.00015414558193849748,
+      "loss": 1.0623,
+      "step": 595
+    },
+    {
+      "epoch": 0.23161371806081804,
+      "grad_norm": 0.21831996738910675,
+      "learning_rate": 0.00015406773063448813,
+      "loss": 1.021,
+      "step": 596
+    },
+    {
+      "epoch": 0.23200233168172543,
+      "grad_norm": 0.23015642166137695,
+      "learning_rate": 0.00015398987933047878,
+      "loss": 1.0358,
+      "step": 597
+    },
+    {
+      "epoch": 0.23239094530263285,
+      "grad_norm": 0.23071645200252533,
+      "learning_rate": 0.00015391202802646944,
+      "loss": 1.1255,
+      "step": 598
+    },
+    {
+      "epoch": 0.23277955892354027,
+      "grad_norm": 0.19513486325740814,
+      "learning_rate": 0.0001538341767224601,
+      "loss": 1.0189,
+      "step": 599
+    },
+    {
+      "epoch": 0.2331681725444477,
+      "grad_norm": 0.20821452140808105,
+      "learning_rate": 0.00015375632541845077,
+      "loss": 1.0843,
+      "step": 600
+    },
+    {
+      "epoch": 0.2335567861653551,
+      "grad_norm": 0.20563223958015442,
+      "learning_rate": 0.00015367847411444142,
+      "loss": 1.0012,
+      "step": 601
+    },
+    {
+      "epoch": 0.2339453997862625,
+      "grad_norm": 0.22674202919006348,
+      "learning_rate": 0.00015360062281043207,
+      "loss": 1.0371,
+      "step": 602
+    },
+    {
+      "epoch": 0.23433401340716992,
+      "grad_norm": 0.20744135975837708,
+      "learning_rate": 0.00015352277150642272,
+      "loss": 1.0466,
+      "step": 603
+    },
+    {
+      "epoch": 0.23472262702807734,
+      "grad_norm": 0.22103577852249146,
+      "learning_rate": 0.00015344492020241338,
+      "loss": 1.0942,
+      "step": 604
+    },
+    {
+      "epoch": 0.23511124064898475,
+      "grad_norm": 0.20643098652362823,
+      "learning_rate": 0.00015336706889840406,
+      "loss": 1.0682,
+      "step": 605
+    },
+    {
+      "epoch": 0.23549985426989217,
+      "grad_norm": 0.23436777293682098,
+      "learning_rate": 0.0001532892175943947,
+      "loss": 1.0613,
+      "step": 606
+    },
+    {
+      "epoch": 0.23588846789079956,
+      "grad_norm": 0.21898899972438812,
+      "learning_rate": 0.00015321136629038536,
+      "loss": 1.0571,
+      "step": 607
+    },
+    {
+      "epoch": 0.23627708151170698,
+      "grad_norm": 0.20569247007369995,
+      "learning_rate": 0.00015313351498637604,
+      "loss": 1.061,
+      "step": 608
+    },
+    {
+      "epoch": 0.2366656951326144,
+      "grad_norm": 0.2099207490682602,
+      "learning_rate": 0.0001530556636823667,
+      "loss": 1.0776,
+      "step": 609
+    },
+    {
+      "epoch": 0.23705430875352182,
+      "grad_norm": 0.20078738033771515,
+      "learning_rate": 0.00015297781237835734,
+      "loss": 1.0341,
+      "step": 610
+    },
+    {
+      "epoch": 0.2374429223744292,
+      "grad_norm": 0.20327065885066986,
+      "learning_rate": 0.000152899961074348,
+      "loss": 1.0168,
+      "step": 611
+    },
+    {
+      "epoch": 0.23783153599533663,
+      "grad_norm": 0.21741214394569397,
+      "learning_rate": 0.00015282210977033867,
+      "loss": 1.0726,
+      "step": 612
+    },
+    {
+      "epoch": 0.23822014961624405,
+      "grad_norm": 0.2065727263689041,
+      "learning_rate": 0.00015274425846632933,
+      "loss": 1.0474,
+      "step": 613
+    },
+    {
+      "epoch": 0.23860876323715147,
+      "grad_norm": 0.21241194009780884,
+      "learning_rate": 0.00015266640716231998,
+      "loss": 1.0666,
+      "step": 614
+    },
+    {
+      "epoch": 0.23899737685805889,
+      "grad_norm": 0.2194201797246933,
+      "learning_rate": 0.00015258855585831066,
+      "loss": 1.1411,
+      "step": 615
+    },
+    {
+      "epoch": 0.23938599047896628,
+      "grad_norm": 0.21537193655967712,
+      "learning_rate": 0.0001525107045543013,
+      "loss": 1.081,
+      "step": 616
+    },
+    {
+      "epoch": 0.2397746040998737,
+      "grad_norm": 0.21125951409339905,
+      "learning_rate": 0.00015243285325029196,
+      "loss": 1.0679,
+      "step": 617
+    },
+    {
+      "epoch": 0.2401632177207811,
+      "grad_norm": 0.21342721581459045,
+      "learning_rate": 0.0001523550019462826,
+      "loss": 1.0564,
+      "step": 618
+    },
+    {
+      "epoch": 0.24055183134168853,
+      "grad_norm": 0.2223503291606903,
+      "learning_rate": 0.00015227715064227327,
+      "loss": 1.1163,
+      "step": 619
+    },
+    {
+      "epoch": 0.24094044496259595,
+      "grad_norm": 0.21626527607440948,
+      "learning_rate": 0.00015219929933826394,
+      "loss": 1.0793,
+      "step": 620
+    },
+    {
+      "epoch": 0.24132905858350334,
+      "grad_norm": 0.21899500489234924,
+      "learning_rate": 0.0001521214480342546,
+      "loss": 1.0864,
+      "step": 621
+    },
+    {
+      "epoch": 0.24171767220441076,
+      "grad_norm": 0.2499915212392807,
+      "learning_rate": 0.00015204359673024525,
+      "loss": 1.1381,
+      "step": 622
+    },
+    {
+      "epoch": 0.24210628582531818,
+      "grad_norm": 0.2108345925807953,
+      "learning_rate": 0.0001519657454262359,
+      "loss": 1.0534,
+      "step": 623
+    },
+    {
+      "epoch": 0.2424948994462256,
+      "grad_norm": 0.2224910855293274,
+      "learning_rate": 0.00015188789412222655,
+      "loss": 1.0235,
+      "step": 624
+    },
+    {
+      "epoch": 0.24288351306713302,
+      "grad_norm": 0.22163094580173492,
+      "learning_rate": 0.0001518100428182172,
+      "loss": 1.0143,
+      "step": 625
+    },
+    {
+      "epoch": 0.2432721266880404,
+      "grad_norm": 0.20709283649921417,
+      "learning_rate": 0.00015173219151420788,
+      "loss": 1.0506,
+      "step": 626
+    },
+    {
+      "epoch": 0.24366074030894783,
+      "grad_norm": 0.2112802267074585,
+      "learning_rate": 0.00015165434021019854,
+      "loss": 1.0692,
+      "step": 627
+    },
+    {
+      "epoch": 0.24404935392985525,
+      "grad_norm": 0.23622830212116241,
+      "learning_rate": 0.0001515764889061892,
+      "loss": 1.0769,
+      "step": 628
+    },
+    {
+      "epoch": 0.24443796755076266,
+      "grad_norm": 0.23328271508216858,
+      "learning_rate": 0.00015149863760217984,
+      "loss": 1.1158,
+      "step": 629
+    },
+    {
+      "epoch": 0.24482658117167005,
+      "grad_norm": 0.2071760892868042,
+      "learning_rate": 0.0001514207862981705,
+      "loss": 1.0133,
+      "step": 630
+    },
+    {
+      "epoch": 0.24521519479257747,
+      "grad_norm": 0.21428920328617096,
+      "learning_rate": 0.00015134293499416117,
+      "loss": 1.0342,
+      "step": 631
+    },
+    {
+      "epoch": 0.2456038084134849,
+      "grad_norm": 0.22225375473499298,
+      "learning_rate": 0.00015126508369015182,
+      "loss": 1.1054,
+      "step": 632
+    },
+    {
+      "epoch": 0.2459924220343923,
+      "grad_norm": 0.2096671611070633,
+      "learning_rate": 0.00015118723238614248,
+      "loss": 1.0229,
+      "step": 633
+    },
+    {
+      "epoch": 0.24638103565529973,
+      "grad_norm": 0.21473252773284912,
+      "learning_rate": 0.00015110938108213313,
+      "loss": 1.0915,
+      "step": 634
+    },
+    {
+      "epoch": 0.24676964927620712,
+      "grad_norm": 0.2071562111377716,
+      "learning_rate": 0.00015103152977812378,
+      "loss": 1.047,
+      "step": 635
+    },
+    {
+      "epoch": 0.24715826289711454,
+      "grad_norm": 0.19868609309196472,
+      "learning_rate": 0.00015095367847411446,
+      "loss": 1.0073,
+      "step": 636
+    },
+    {
+      "epoch": 0.24754687651802196,
+      "grad_norm": 0.20937366783618927,
+      "learning_rate": 0.0001508758271701051,
+      "loss": 1.0155,
+      "step": 637
+    },
+    {
+      "epoch": 0.24793549013892938,
+      "grad_norm": 0.19225911796092987,
+      "learning_rate": 0.00015079797586609576,
+      "loss": 1.0163,
+      "step": 638
+    },
+    {
+      "epoch": 0.2483241037598368,
+      "grad_norm": 0.20427283644676208,
+      "learning_rate": 0.00015072012456208641,
+      "loss": 1.062,
+      "step": 639
+    },
+    {
+      "epoch": 0.24871271738074419,
+      "grad_norm": 0.21640253067016602,
+      "learning_rate": 0.00015064227325807707,
+      "loss": 1.025,
+      "step": 640
+    },
+    {
+      "epoch": 0.2491013310016516,
+      "grad_norm": 0.20416739583015442,
+      "learning_rate": 0.00015056442195406775,
+      "loss": 1.0635,
+      "step": 641
+    },
+    {
+      "epoch": 0.24948994462255902,
+      "grad_norm": 0.1990521252155304,
+      "learning_rate": 0.0001504865706500584,
+      "loss": 1.0757,
+      "step": 642
+    },
+    {
+      "epoch": 0.24987855824346644,
+      "grad_norm": 0.21636444330215454,
+      "learning_rate": 0.00015040871934604905,
+      "loss": 1.0441,
+      "step": 643
+    },
+    {
+      "epoch": 0.25026717186437386,
+      "grad_norm": 0.21253719925880432,
+      "learning_rate": 0.0001503308680420397,
+      "loss": 1.0574,
+      "step": 644
+    },
+    {
+      "epoch": 0.2506557854852813,
+      "grad_norm": 0.2134159356355667,
+      "learning_rate": 0.00015025301673803035,
+      "loss": 1.0396,
+      "step": 645
+    },
+    {
+      "epoch": 0.2510443991061887,
+      "grad_norm": 0.2018527239561081,
+      "learning_rate": 0.00015017516543402103,
+      "loss": 1.0606,
+      "step": 646
+    },
+    {
+      "epoch": 0.25143301272709606,
+      "grad_norm": 0.20320741832256317,
+      "learning_rate": 0.00015009731413001169,
+      "loss": 1.0093,
+      "step": 647
+    },
+    {
+      "epoch": 0.2518216263480035,
+      "grad_norm": 0.21007056534290314,
+      "learning_rate": 0.00015001946282600234,
+      "loss": 1.0284,
+      "step": 648
+    },
+    {
+      "epoch": 0.2522102399689109,
+      "grad_norm": 0.22453372180461884,
+      "learning_rate": 0.000149941611521993,
+      "loss": 1.0271,
+      "step": 649
+    },
+    {
+      "epoch": 0.2525988535898183,
+      "grad_norm": 0.19889335334300995,
+      "learning_rate": 0.00014986376021798364,
+      "loss": 1.0238,
+      "step": 650
+    },
+    {
+      "epoch": 0.25298746721072574,
+      "grad_norm": 0.19339965283870697,
+      "learning_rate": 0.00014978590891397432,
+      "loss": 1.024,
+      "step": 651
+    },
+    {
+      "epoch": 0.25337608083163315,
+      "grad_norm": 0.22362011671066284,
+      "learning_rate": 0.00014970805760996497,
+      "loss": 1.0722,
+      "step": 652
+    },
+    {
+      "epoch": 0.2537646944525406,
+      "grad_norm": 0.2110588103532791,
+      "learning_rate": 0.00014963020630595562,
+      "loss": 1.0541,
+      "step": 653
+    },
+    {
+      "epoch": 0.254153308073448,
+      "grad_norm": 0.203025683760643,
+      "learning_rate": 0.00014955235500194628,
+      "loss": 1.0335,
+      "step": 654
+    },
+    {
+      "epoch": 0.2545419216943554,
+      "grad_norm": 0.20884902775287628,
+      "learning_rate": 0.00014947450369793693,
+      "loss": 1.0507,
+      "step": 655
+    },
+    {
+      "epoch": 0.2549305353152628,
+      "grad_norm": 0.21234256029129028,
+      "learning_rate": 0.0001493966523939276,
+      "loss": 1.0372,
+      "step": 656
+    },
+    {
+      "epoch": 0.2553191489361702,
+      "grad_norm": 0.1984352171421051,
+      "learning_rate": 0.00014931880108991826,
+      "loss": 0.9979,
+      "step": 657
+    },
+    {
+      "epoch": 0.2557077625570776,
+      "grad_norm": 0.18848282098770142,
+      "learning_rate": 0.0001492409497859089,
+      "loss": 0.9973,
+      "step": 658
+    },
+    {
+      "epoch": 0.25609637617798503,
+      "grad_norm": 0.2201709896326065,
+      "learning_rate": 0.00014916309848189956,
+      "loss": 1.0386,
+      "step": 659
+    },
+    {
+      "epoch": 0.25648498979889245,
+      "grad_norm": 0.23094095289707184,
+      "learning_rate": 0.00014908524717789024,
+      "loss": 1.1205,
+      "step": 660
+    },
+    {
+      "epoch": 0.25687360341979987,
+      "grad_norm": 0.21087734401226044,
+      "learning_rate": 0.0001490073958738809,
+      "loss": 1.0231,
+      "step": 661
+    },
+    {
+      "epoch": 0.2572622170407073,
+      "grad_norm": 0.24970979988574982,
+      "learning_rate": 0.00014892954456987155,
+      "loss": 1.0421,
+      "step": 662
+    },
+    {
+      "epoch": 0.2576508306616147,
+      "grad_norm": 0.22024711966514587,
+      "learning_rate": 0.00014885169326586223,
+      "loss": 1.1033,
+      "step": 663
+    },
+    {
+      "epoch": 0.2580394442825221,
+      "grad_norm": 0.2195248156785965,
+      "learning_rate": 0.00014877384196185288,
+      "loss": 1.089,
+      "step": 664
+    },
+    {
+      "epoch": 0.25842805790342954,
+      "grad_norm": 0.20236417651176453,
+      "learning_rate": 0.00014869599065784353,
+      "loss": 1.0196,
+      "step": 665
+    },
+    {
+      "epoch": 0.2588166715243369,
+      "grad_norm": 0.21973329782485962,
+      "learning_rate": 0.00014861813935383418,
+      "loss": 1.0844,
+      "step": 666
+    },
+    {
+      "epoch": 0.2592052851452443,
+      "grad_norm": 0.2069879174232483,
+      "learning_rate": 0.00014854028804982486,
+      "loss": 1.0312,
+      "step": 667
+    },
+    {
+      "epoch": 0.25959389876615174,
+      "grad_norm": 0.2037455290555954,
+      "learning_rate": 0.00014846243674581551,
+      "loss": 1.0018,
+      "step": 668
+    },
+    {
+      "epoch": 0.25998251238705916,
+      "grad_norm": 0.24176378548145294,
+      "learning_rate": 0.00014838458544180617,
+      "loss": 1.0749,
+      "step": 669
+    },
+    {
+      "epoch": 0.2603711260079666,
+      "grad_norm": 0.2007879763841629,
+      "learning_rate": 0.00014830673413779682,
+      "loss": 1.0443,
+      "step": 670
+    },
+    {
+      "epoch": 0.260759739628874,
+      "grad_norm": 0.23503245413303375,
+      "learning_rate": 0.00014822888283378747,
+      "loss": 1.0674,
+      "step": 671
+    },
+    {
+      "epoch": 0.2611483532497814,
+      "grad_norm": 0.2166167050600052,
+      "learning_rate": 0.00014815103152977815,
+      "loss": 1.079,
+      "step": 672
+    },
+    {
+      "epoch": 0.26153696687068884,
+      "grad_norm": 0.2293982058763504,
+      "learning_rate": 0.0001480731802257688,
+      "loss": 1.0517,
+      "step": 673
+    },
+    {
+      "epoch": 0.26192558049159625,
+      "grad_norm": 0.21040330827236176,
+      "learning_rate": 0.00014799532892175945,
+      "loss": 1.0475,
+      "step": 674
+    },
+    {
+      "epoch": 0.2623141941125036,
+      "grad_norm": 0.20750463008880615,
+      "learning_rate": 0.0001479174776177501,
+      "loss": 1.025,
+      "step": 675
+    },
+    {
+      "epoch": 0.26270280773341104,
+      "grad_norm": 0.2748873233795166,
+      "learning_rate": 0.00014783962631374076,
+      "loss": 1.0212,
+      "step": 676
+    },
+    {
+      "epoch": 0.26309142135431846,
+      "grad_norm": 0.19212333858013153,
+      "learning_rate": 0.00014776177500973144,
+      "loss": 1.0049,
+      "step": 677
+    },
+    {
+      "epoch": 0.2634800349752259,
+      "grad_norm": 0.207731693983078,
+      "learning_rate": 0.0001476839237057221,
+      "loss": 1.0062,
+      "step": 678
+    },
+    {
+      "epoch": 0.2638686485961333,
+      "grad_norm": 0.2177981585264206,
+      "learning_rate": 0.00014760607240171274,
+      "loss": 1.0489,
+      "step": 679
+    },
+    {
+      "epoch": 0.2642572622170407,
+      "grad_norm": 0.23239290714263916,
+      "learning_rate": 0.0001475282210977034,
+      "loss": 1.0856,
+      "step": 680
+    },
+    {
+      "epoch": 0.26464587583794813,
+      "grad_norm": 0.2033151388168335,
+      "learning_rate": 0.00014745036979369404,
+      "loss": 1.0389,
+      "step": 681
+    },
+    {
+      "epoch": 0.26503448945885555,
+      "grad_norm": 0.20917408168315887,
+      "learning_rate": 0.00014737251848968472,
+      "loss": 1.1208,
+      "step": 682
+    },
+    {
+      "epoch": 0.26542310307976297,
+      "grad_norm": 0.22075454890727997,
+      "learning_rate": 0.00014729466718567538,
+      "loss": 1.0435,
+      "step": 683
+    },
+    {
+      "epoch": 0.26581171670067033,
+      "grad_norm": 0.23094993829727173,
+      "learning_rate": 0.00014721681588166603,
+      "loss": 1.0649,
+      "step": 684
+    },
+    {
+      "epoch": 0.26620033032157775,
+      "grad_norm": 0.21209536492824554,
+      "learning_rate": 0.00014713896457765668,
+      "loss": 1.0578,
+      "step": 685
+    },
+    {
+      "epoch": 0.26658894394248517,
+      "grad_norm": 0.21412219107151031,
+      "learning_rate": 0.00014706111327364733,
+      "loss": 1.1137,
+      "step": 686
+    },
+    {
+      "epoch": 0.2669775575633926,
+      "grad_norm": 0.21175475418567657,
+      "learning_rate": 0.000146983261969638,
+      "loss": 1.023,
+      "step": 687
+    },
+    {
+      "epoch": 0.2673661711843,
+      "grad_norm": 0.21968993544578552,
+      "learning_rate": 0.00014690541066562866,
+      "loss": 1.1183,
+      "step": 688
+    },
+    {
+      "epoch": 0.2677547848052074,
+      "grad_norm": 0.20414218306541443,
+      "learning_rate": 0.00014682755936161932,
+      "loss": 1.078,
+      "step": 689
+    },
+    {
+      "epoch": 0.26814339842611484,
+      "grad_norm": 0.18986597657203674,
+      "learning_rate": 0.00014674970805760997,
+      "loss": 1.0029,
+      "step": 690
+    },
+    {
+      "epoch": 0.26853201204702226,
+      "grad_norm": 0.21215832233428955,
+      "learning_rate": 0.00014667185675360062,
+      "loss": 1.0759,
+      "step": 691
+    },
+    {
+      "epoch": 0.2689206256679297,
+      "grad_norm": 0.2113744169473648,
+      "learning_rate": 0.0001465940054495913,
+      "loss": 1.1027,
+      "step": 692
+    },
+    {
+      "epoch": 0.2693092392888371,
+      "grad_norm": 0.22010880708694458,
+      "learning_rate": 0.00014651615414558195,
+      "loss": 1.0984,
+      "step": 693
+    },
+    {
+      "epoch": 0.26969785290974446,
+      "grad_norm": 0.203857421875,
+      "learning_rate": 0.0001464383028415726,
+      "loss": 1.0407,
+      "step": 694
+    },
+    {
+      "epoch": 0.2700864665306519,
+      "grad_norm": 0.21120867133140564,
+      "learning_rate": 0.00014636045153756325,
+      "loss": 1.0521,
+      "step": 695
+    },
+    {
+      "epoch": 0.2704750801515593,
+      "grad_norm": 0.20039112865924835,
+      "learning_rate": 0.0001462826002335539,
+      "loss": 1.0897,
+      "step": 696
+    },
+    {
+      "epoch": 0.2708636937724667,
+      "grad_norm": 0.22893202304840088,
+      "learning_rate": 0.00014620474892954456,
+      "loss": 1.0903,
+      "step": 697
+    },
+    {
+      "epoch": 0.27125230739337414,
+      "grad_norm": 0.19886267185211182,
+      "learning_rate": 0.00014612689762553524,
+      "loss": 1.0889,
+      "step": 698
+    },
+    {
+      "epoch": 0.27164092101428156,
+      "grad_norm": 0.18892349302768707,
+      "learning_rate": 0.0001460490463215259,
+      "loss": 0.981,
+      "step": 699
+    },
+    {
+      "epoch": 0.272029534635189,
+      "grad_norm": 0.20602507889270782,
+      "learning_rate": 0.00014597119501751654,
+      "loss": 1.0223,
+      "step": 700
+    },
+    {
+      "epoch": 0.2724181482560964,
+      "grad_norm": 0.21480505168437958,
+      "learning_rate": 0.0001458933437135072,
+      "loss": 1.0355,
+      "step": 701
+    },
+    {
+      "epoch": 0.2728067618770038,
+      "grad_norm": 0.21011753380298615,
+      "learning_rate": 0.00014581549240949785,
+      "loss": 1.0613,
+      "step": 702
+    },
+    {
+      "epoch": 0.2731953754979112,
+      "grad_norm": 0.19350819289684296,
+      "learning_rate": 0.00014573764110548853,
+      "loss": 1.0144,
+      "step": 703
+    },
+    {
+      "epoch": 0.2735839891188186,
+      "grad_norm": 0.207548126578331,
+      "learning_rate": 0.00014565978980147918,
+      "loss": 1.0465,
+      "step": 704
+    },
+    {
+      "epoch": 0.273972602739726,
+      "grad_norm": 0.22220565378665924,
+      "learning_rate": 0.00014558193849746983,
+      "loss": 1.1073,
+      "step": 705
+    },
+    {
+      "epoch": 0.27436121636063343,
+      "grad_norm": 0.193622425198555,
+      "learning_rate": 0.00014550408719346048,
+      "loss": 1.0357,
+      "step": 706
+    },
+    {
+      "epoch": 0.27474982998154085,
+      "grad_norm": 0.2067158818244934,
+      "learning_rate": 0.00014542623588945113,
+      "loss": 1.0502,
+      "step": 707
+    },
+    {
+      "epoch": 0.27513844360244827,
+      "grad_norm": 0.2218742072582245,
+      "learning_rate": 0.0001453483845854418,
+      "loss": 0.9934,
+      "step": 708
+    },
+    {
+      "epoch": 0.2755270572233557,
+      "grad_norm": 0.22316142916679382,
+      "learning_rate": 0.00014527053328143246,
+      "loss": 1.0707,
+      "step": 709
+    },
+    {
+      "epoch": 0.2759156708442631,
+      "grad_norm": 0.21004025638103485,
+      "learning_rate": 0.00014519268197742312,
+      "loss": 1.0543,
+      "step": 710
+    },
+    {
+      "epoch": 0.2763042844651705,
+      "grad_norm": 0.22070440649986267,
+      "learning_rate": 0.00014511483067341377,
+      "loss": 1.0467,
+      "step": 711
+    },
+    {
+      "epoch": 0.27669289808607794,
+      "grad_norm": 0.21463747322559357,
+      "learning_rate": 0.00014503697936940445,
+      "loss": 1.0793,
+      "step": 712
+    },
+    {
+      "epoch": 0.2770815117069853,
+      "grad_norm": 0.23452533781528473,
+      "learning_rate": 0.0001449591280653951,
+      "loss": 1.043,
+      "step": 713
+    },
+    {
+      "epoch": 0.2774701253278927,
+      "grad_norm": 0.2405795156955719,
+      "learning_rate": 0.00014488127676138575,
+      "loss": 1.0752,
+      "step": 714
+    },
+    {
+      "epoch": 0.27785873894880014,
+      "grad_norm": 0.21546585857868195,
+      "learning_rate": 0.00014480342545737643,
+      "loss": 1.0834,
+      "step": 715
+    },
+    {
+      "epoch": 0.27824735256970756,
+      "grad_norm": 0.22675828635692596,
+      "learning_rate": 0.00014472557415336708,
+      "loss": 1.055,
+      "step": 716
+    },
+    {
+      "epoch": 0.278635966190615,
+      "grad_norm": 0.2117871195077896,
+      "learning_rate": 0.00014464772284935774,
+      "loss": 1.03,
+      "step": 717
+    },
+    {
+      "epoch": 0.2790245798115224,
+      "grad_norm": 0.2193155735731125,
+      "learning_rate": 0.00014456987154534841,
+      "loss": 1.0073,
+      "step": 718
+    },
+    {
+      "epoch": 0.2794131934324298,
+      "grad_norm": 0.21447965502738953,
+      "learning_rate": 0.00014449202024133907,
+      "loss": 1.0174,
+      "step": 719
+    },
+    {
+      "epoch": 0.27980180705333724,
+      "grad_norm": 0.22867532074451447,
+      "learning_rate": 0.00014441416893732972,
+      "loss": 1.0948,
+      "step": 720
+    },
+    {
+      "epoch": 0.28019042067424466,
+      "grad_norm": 0.21570557355880737,
+      "learning_rate": 0.00014433631763332037,
+      "loss": 1.0105,
+      "step": 721
+    },
+    {
+      "epoch": 0.280579034295152,
+      "grad_norm": 0.20787014067173004,
+      "learning_rate": 0.00014425846632931102,
+      "loss": 1.0384,
+      "step": 722
+    },
+    {
+      "epoch": 0.28096764791605944,
+      "grad_norm": 0.19924762845039368,
+      "learning_rate": 0.0001441806150253017,
+      "loss": 1.0653,
+      "step": 723
+    },
+    {
+      "epoch": 0.28135626153696686,
+      "grad_norm": 0.1996215283870697,
+      "learning_rate": 0.00014410276372129235,
+      "loss": 1.0439,
+      "step": 724
+    },
+    {
+      "epoch": 0.2817448751578743,
+      "grad_norm": 0.2054813802242279,
+      "learning_rate": 0.000144024912417283,
+      "loss": 0.9895,
+      "step": 725
+    },
+    {
+      "epoch": 0.2821334887787817,
+      "grad_norm": 0.2268310785293579,
+      "learning_rate": 0.00014394706111327366,
+      "loss": 1.0993,
+      "step": 726
+    },
+    {
+      "epoch": 0.2825221023996891,
+      "grad_norm": 0.19867680966854095,
+      "learning_rate": 0.0001438692098092643,
+      "loss": 0.985,
+      "step": 727
+    },
+    {
+      "epoch": 0.28291071602059653,
+      "grad_norm": 0.21099598705768585,
+      "learning_rate": 0.000143791358505255,
+      "loss": 1.0333,
+      "step": 728
+    },
+    {
+      "epoch": 0.28329932964150395,
+      "grad_norm": 0.22479215264320374,
+      "learning_rate": 0.00014371350720124564,
+      "loss": 1.0449,
+      "step": 729
+    },
+    {
+      "epoch": 0.28368794326241137,
+      "grad_norm": 0.22717688977718353,
+      "learning_rate": 0.0001436356558972363,
+      "loss": 1.0482,
+      "step": 730
+    },
+    {
+      "epoch": 0.2840765568833188,
+      "grad_norm": 0.20389345288276672,
+      "learning_rate": 0.00014355780459322695,
+      "loss": 0.956,
+      "step": 731
+    },
+    {
+      "epoch": 0.28446517050422615,
+      "grad_norm": 0.21583619713783264,
+      "learning_rate": 0.0001434799532892176,
+      "loss": 1.0154,
+      "step": 732
+    },
+    {
+      "epoch": 0.28485378412513357,
+      "grad_norm": 0.2219148874282837,
+      "learning_rate": 0.00014340210198520825,
+      "loss": 1.0553,
+      "step": 733
+    },
+    {
+      "epoch": 0.285242397746041,
+      "grad_norm": 0.19920189678668976,
+      "learning_rate": 0.00014332425068119893,
+      "loss": 0.9881,
+      "step": 734
+    },
+    {
+      "epoch": 0.2856310113669484,
+      "grad_norm": 0.2295670360326767,
+      "learning_rate": 0.00014324639937718958,
+      "loss": 1.0529,
+      "step": 735
+    },
+    {
+      "epoch": 0.2860196249878558,
+      "grad_norm": 0.21271567046642303,
+      "learning_rate": 0.00014316854807318023,
+      "loss": 1.037,
+      "step": 736
+    },
+    {
+      "epoch": 0.28640823860876324,
+      "grad_norm": 0.21304361522197723,
+      "learning_rate": 0.00014309069676917088,
+      "loss": 1.048,
+      "step": 737
+    },
+    {
+      "epoch": 0.28679685222967066,
+      "grad_norm": 0.19902732968330383,
+      "learning_rate": 0.00014301284546516154,
+      "loss": 1.0306,
+      "step": 738
+    },
+    {
+      "epoch": 0.2871854658505781,
+      "grad_norm": 0.1995929330587387,
+      "learning_rate": 0.00014293499416115222,
+      "loss": 1.0394,
+      "step": 739
+    },
+    {
+      "epoch": 0.2875740794714855,
+      "grad_norm": 0.20426060259342194,
+      "learning_rate": 0.00014285714285714287,
+      "loss": 1.0052,
+      "step": 740
+    },
+    {
+      "epoch": 0.28796269309239286,
+      "grad_norm": 0.20284566283226013,
+      "learning_rate": 0.00014277929155313352,
+      "loss": 1.0115,
+      "step": 741
+    },
+    {
+      "epoch": 0.2883513067133003,
+      "grad_norm": 0.2041557878255844,
+      "learning_rate": 0.00014270144024912417,
+      "loss": 1.0473,
+      "step": 742
+    },
+    {
+      "epoch": 0.2887399203342077,
+      "grad_norm": 0.2152249962091446,
+      "learning_rate": 0.00014262358894511482,
+      "loss": 1.0802,
+      "step": 743
+    },
+    {
+      "epoch": 0.2891285339551151,
+      "grad_norm": 0.20569871366024017,
+      "learning_rate": 0.0001425457376411055,
+      "loss": 1.0203,
+      "step": 744
+    },
+    {
+      "epoch": 0.28951714757602254,
+      "grad_norm": 0.21128378808498383,
+      "learning_rate": 0.00014246788633709616,
+      "loss": 1.108,
+      "step": 745
+    },
+    {
+      "epoch": 0.28990576119692996,
+      "grad_norm": 0.19587135314941406,
+      "learning_rate": 0.0001423900350330868,
+      "loss": 1.0427,
+      "step": 746
+    },
+    {
+      "epoch": 0.2902943748178374,
+      "grad_norm": 0.22052550315856934,
+      "learning_rate": 0.00014231218372907746,
+      "loss": 1.055,
+      "step": 747
+    },
+    {
+      "epoch": 0.2906829884387448,
+      "grad_norm": 0.21291717886924744,
+      "learning_rate": 0.0001422343324250681,
+      "loss": 1.0591,
+      "step": 748
+    },
+    {
+      "epoch": 0.2910716020596522,
+      "grad_norm": 0.20634084939956665,
+      "learning_rate": 0.0001421564811210588,
+      "loss": 1.0527,
+      "step": 749
+    },
+    {
+      "epoch": 0.29146021568055963,
+      "grad_norm": 0.2075488269329071,
+      "learning_rate": 0.00014207862981704944,
+      "loss": 1.0786,
+      "step": 750
+    },
+    {
+      "epoch": 0.291848829301467,
+      "grad_norm": 0.19780080020427704,
+      "learning_rate": 0.0001420007785130401,
+      "loss": 1.059,
+      "step": 751
+    },
+    {
+      "epoch": 0.2922374429223744,
+      "grad_norm": 0.21212074160575867,
+      "learning_rate": 0.00014192292720903075,
+      "loss": 1.0346,
+      "step": 752
+    },
+    {
+      "epoch": 0.29262605654328183,
+      "grad_norm": 0.2218451350927353,
+      "learning_rate": 0.0001418450759050214,
+      "loss": 1.0908,
+      "step": 753
+    },
+    {
+      "epoch": 0.29301467016418925,
+      "grad_norm": 0.20107759535312653,
+      "learning_rate": 0.00014176722460101208,
+      "loss": 1.0202,
+      "step": 754
+    },
+    {
+      "epoch": 0.29340328378509667,
+      "grad_norm": 0.20933273434638977,
+      "learning_rate": 0.00014168937329700273,
+      "loss": 1.0719,
+      "step": 755
+    },
+    {
+      "epoch": 0.2937918974060041,
+      "grad_norm": 0.22369107604026794,
+      "learning_rate": 0.00014161152199299338,
+      "loss": 1.0433,
+      "step": 756
+    },
+    {
+      "epoch": 0.2941805110269115,
+      "grad_norm": 0.2113707810640335,
+      "learning_rate": 0.00014153367068898403,
+      "loss": 1.0637,
+      "step": 757
+    },
+    {
+      "epoch": 0.2945691246478189,
+      "grad_norm": 0.21105700731277466,
+      "learning_rate": 0.00014145581938497469,
+      "loss": 1.0468,
+      "step": 758
+    },
+    {
+      "epoch": 0.29495773826872634,
+      "grad_norm": 0.20189693570137024,
+      "learning_rate": 0.00014137796808096537,
+      "loss": 1.0281,
+      "step": 759
+    },
+    {
+      "epoch": 0.2953463518896337,
+      "grad_norm": 0.1954152137041092,
+      "learning_rate": 0.00014130011677695602,
+      "loss": 1.0519,
+      "step": 760
+    },
+    {
+      "epoch": 0.2957349655105411,
+      "grad_norm": 0.24295592308044434,
+      "learning_rate": 0.00014122226547294667,
+      "loss": 1.1303,
+      "step": 761
+    },
+    {
+      "epoch": 0.29612357913144854,
+      "grad_norm": 0.20158620178699493,
+      "learning_rate": 0.00014114441416893732,
+      "loss": 1.0367,
+      "step": 762
+    },
+    {
+      "epoch": 0.29651219275235596,
+      "grad_norm": 0.20734666287899017,
+      "learning_rate": 0.00014106656286492797,
+      "loss": 1.0392,
+      "step": 763
+    },
+    {
+      "epoch": 0.2969008063732634,
+      "grad_norm": 0.2177533656358719,
+      "learning_rate": 0.00014098871156091865,
+      "loss": 1.0619,
+      "step": 764
+    },
+    {
+      "epoch": 0.2972894199941708,
+      "grad_norm": 0.1961720883846283,
+      "learning_rate": 0.0001409108602569093,
+      "loss": 0.9872,
+      "step": 765
+    },
+    {
+      "epoch": 0.2976780336150782,
+      "grad_norm": 0.21530941128730774,
+      "learning_rate": 0.00014083300895289996,
+      "loss": 1.1246,
+      "step": 766
+    },
+    {
+      "epoch": 0.29806664723598564,
+      "grad_norm": 0.2039783000946045,
+      "learning_rate": 0.00014075515764889064,
+      "loss": 1.0789,
+      "step": 767
+    },
+    {
+      "epoch": 0.29845526085689306,
+      "grad_norm": 0.20641569793224335,
+      "learning_rate": 0.0001406773063448813,
+      "loss": 1.05,
+      "step": 768
+    },
+    {
+      "epoch": 0.2988438744778004,
+      "grad_norm": 0.2071225494146347,
+      "learning_rate": 0.00014059945504087194,
+      "loss": 1.047,
+      "step": 769
+    },
+    {
+      "epoch": 0.29923248809870784,
+      "grad_norm": 0.20367531478405,
+      "learning_rate": 0.00014052160373686262,
+      "loss": 1.0734,
+      "step": 770
+    },
+    {
+      "epoch": 0.29962110171961526,
+      "grad_norm": 0.21718619763851166,
+      "learning_rate": 0.00014044375243285327,
+      "loss": 1.0613,
+      "step": 771
+    },
+    {
+      "epoch": 0.3000097153405227,
+      "grad_norm": 0.21649087965488434,
+      "learning_rate": 0.00014036590112884392,
+      "loss": 1.0671,
+      "step": 772
+    },
+    {
+      "epoch": 0.3003983289614301,
+      "grad_norm": 0.22223225235939026,
+      "learning_rate": 0.00014028804982483458,
+      "loss": 1.0977,
+      "step": 773
+    },
+    {
+      "epoch": 0.3007869425823375,
+      "grad_norm": 0.23101870715618134,
+      "learning_rate": 0.00014021019852082523,
+      "loss": 1.1236,
+      "step": 774
+    },
+    {
+      "epoch": 0.30117555620324493,
+      "grad_norm": 0.22855506837368011,
+      "learning_rate": 0.0001401323472168159,
+      "loss": 1.0517,
+      "step": 775
+    },
+    {
+      "epoch": 0.30156416982415235,
+      "grad_norm": 0.20862117409706116,
+      "learning_rate": 0.00014005449591280656,
+      "loss": 1.0493,
+      "step": 776
+    },
+    {
+      "epoch": 0.30195278344505977,
+      "grad_norm": 0.21692048013210297,
+      "learning_rate": 0.0001399766446087972,
+      "loss": 1.0681,
+      "step": 777
+    },
+    {
+      "epoch": 0.3023413970659672,
+      "grad_norm": 0.21541331708431244,
+      "learning_rate": 0.00013989879330478786,
+      "loss": 1.0775,
+      "step": 778
+    },
+    {
+      "epoch": 0.30273001068687455,
+      "grad_norm": 0.21221749484539032,
+      "learning_rate": 0.00013982094200077851,
+      "loss": 1.0421,
+      "step": 779
+    },
+    {
+      "epoch": 0.30311862430778197,
+      "grad_norm": 0.22497743368148804,
+      "learning_rate": 0.0001397430906967692,
+      "loss": 1.1115,
+      "step": 780
+    },
+    {
+      "epoch": 0.3035072379286894,
+      "grad_norm": 0.1974119246006012,
+      "learning_rate": 0.00013966523939275985,
+      "loss": 1.0264,
+      "step": 781
+    },
+    {
+      "epoch": 0.3038958515495968,
+      "grad_norm": 0.20349323749542236,
+      "learning_rate": 0.0001395873880887505,
+      "loss": 1.0512,
+      "step": 782
+    },
+    {
+      "epoch": 0.3042844651705042,
+      "grad_norm": 0.21116937696933746,
+      "learning_rate": 0.00013950953678474115,
+      "loss": 1.0135,
+      "step": 783
+    },
+    {
+      "epoch": 0.30467307879141164,
+      "grad_norm": 0.2133677899837494,
+      "learning_rate": 0.0001394316854807318,
+      "loss": 1.0694,
+      "step": 784
+    },
+    {
+      "epoch": 0.30506169241231906,
+      "grad_norm": 0.20406191051006317,
+      "learning_rate": 0.00013935383417672248,
+      "loss": 1.0179,
+      "step": 785
+    },
+    {
+      "epoch": 0.3054503060332265,
+      "grad_norm": 0.21428678929805756,
+      "learning_rate": 0.00013927598287271313,
+      "loss": 1.0577,
+      "step": 786
+    },
+    {
+      "epoch": 0.3058389196541339,
+      "grad_norm": 0.20878921449184418,
+      "learning_rate": 0.00013919813156870379,
+      "loss": 1.0311,
+      "step": 787
+    },
+    {
+      "epoch": 0.30622753327504126,
+      "grad_norm": 0.19033175706863403,
+      "learning_rate": 0.00013912028026469444,
+      "loss": 0.976,
+      "step": 788
+    },
+    {
+      "epoch": 0.3066161468959487,
+      "grad_norm": 0.22138020396232605,
+      "learning_rate": 0.0001390424289606851,
+      "loss": 1.0438,
+      "step": 789
+    },
+    {
+      "epoch": 0.3070047605168561,
+      "grad_norm": 0.20765596628189087,
+      "learning_rate": 0.00013896457765667577,
+      "loss": 1.0865,
+      "step": 790
+    },
+    {
+      "epoch": 0.3073933741377635,
+      "grad_norm": 0.209733247756958,
+      "learning_rate": 0.00013888672635266642,
+      "loss": 1.0648,
+      "step": 791
+    },
+    {
+      "epoch": 0.30778198775867094,
+      "grad_norm": 0.1896686851978302,
+      "learning_rate": 0.00013880887504865707,
+      "loss": 1.0133,
+      "step": 792
+    },
+    {
+      "epoch": 0.30817060137957836,
+      "grad_norm": 0.21651998162269592,
+      "learning_rate": 0.00013873102374464772,
+      "loss": 1.0729,
+      "step": 793
+    },
+    {
+      "epoch": 0.3085592150004858,
+      "grad_norm": 0.21751996874809265,
+      "learning_rate": 0.00013865317244063838,
+      "loss": 1.0444,
+      "step": 794
+    },
+    {
+      "epoch": 0.3089478286213932,
+      "grad_norm": 0.20593520998954773,
+      "learning_rate": 0.00013857532113662906,
+      "loss": 1.0304,
+      "step": 795
+    },
+    {
+      "epoch": 0.3093364422423006,
+      "grad_norm": 0.19937261939048767,
+      "learning_rate": 0.0001384974698326197,
+      "loss": 1.0017,
+      "step": 796
+    },
+    {
+      "epoch": 0.30972505586320803,
+      "grad_norm": 0.18901696801185608,
+      "learning_rate": 0.00013841961852861036,
+      "loss": 1.0362,
+      "step": 797
+    },
+    {
+      "epoch": 0.3101136694841154,
+      "grad_norm": 0.2079760730266571,
+      "learning_rate": 0.000138341767224601,
+      "loss": 1.0784,
+      "step": 798
+    },
+    {
+      "epoch": 0.3105022831050228,
+      "grad_norm": 0.24873265624046326,
+      "learning_rate": 0.00013826391592059166,
+      "loss": 1.1026,
+      "step": 799
+    },
+    {
+      "epoch": 0.31089089672593023,
+      "grad_norm": 0.20185396075248718,
+      "learning_rate": 0.00013818606461658234,
+      "loss": 1.0235,
+      "step": 800
+    },
+    {
+      "epoch": 0.31127951034683765,
+      "grad_norm": 0.211393803358078,
+      "learning_rate": 0.000138108213312573,
+      "loss": 1.0999,
+      "step": 801
+    },
+    {
+      "epoch": 0.31166812396774507,
+      "grad_norm": 0.19948823750019073,
+      "learning_rate": 0.00013803036200856365,
+      "loss": 1.0242,
+      "step": 802
+    },
+    {
+      "epoch": 0.3120567375886525,
+      "grad_norm": 0.21470944583415985,
+      "learning_rate": 0.0001379525107045543,
+      "loss": 1.0736,
+      "step": 803
+    },
+    {
+      "epoch": 0.3124453512095599,
+      "grad_norm": 0.2195902317762375,
+      "learning_rate": 0.00013787465940054495,
+      "loss": 1.0368,
+      "step": 804
+    },
+    {
+      "epoch": 0.3128339648304673,
+      "grad_norm": 0.22142355144023895,
+      "learning_rate": 0.00013779680809653563,
+      "loss": 1.1022,
+      "step": 805
+    },
+    {
+      "epoch": 0.31322257845137474,
+      "grad_norm": 0.20487886667251587,
+      "learning_rate": 0.00013771895679252628,
+      "loss": 1.0478,
+      "step": 806
+    },
+    {
+      "epoch": 0.3136111920722821,
+      "grad_norm": 0.217549130320549,
+      "learning_rate": 0.00013764110548851693,
+      "loss": 1.0526,
+      "step": 807
+    },
+    {
+      "epoch": 0.3139998056931895,
+      "grad_norm": 0.20199982821941376,
+      "learning_rate": 0.0001375632541845076,
+      "loss": 0.9992,
+      "step": 808
+    },
+    {
+      "epoch": 0.31438841931409695,
+      "grad_norm": 0.19496634602546692,
+      "learning_rate": 0.00013748540288049824,
+      "loss": 1.0179,
+      "step": 809
+    },
+    {
+      "epoch": 0.31477703293500436,
+      "grad_norm": 0.21999460458755493,
+      "learning_rate": 0.0001374075515764889,
+      "loss": 1.0547,
+      "step": 810
+    },
+    {
+      "epoch": 0.3151656465559118,
+      "grad_norm": 0.21421074867248535,
+      "learning_rate": 0.00013732970027247957,
+      "loss": 1.0283,
+      "step": 811
+    },
+    {
+      "epoch": 0.3155542601768192,
+      "grad_norm": 0.1913364827632904,
+      "learning_rate": 0.00013725184896847022,
+      "loss": 0.9826,
+      "step": 812
+    },
+    {
+      "epoch": 0.3159428737977266,
+      "grad_norm": 0.20509806275367737,
+      "learning_rate": 0.00013717399766446087,
+      "loss": 1.0303,
+      "step": 813
+    },
+    {
+      "epoch": 0.31633148741863404,
+      "grad_norm": 0.20309868454933167,
+      "learning_rate": 0.00013709614636045153,
+      "loss": 1.0479,
+      "step": 814
+    },
+    {
+      "epoch": 0.31672010103954146,
+      "grad_norm": 0.2274443656206131,
+      "learning_rate": 0.0001370182950564422,
+      "loss": 1.1311,
+      "step": 815
+    },
+    {
+      "epoch": 0.3171087146604489,
+      "grad_norm": 0.22785170376300812,
+      "learning_rate": 0.00013694044375243286,
+      "loss": 1.1009,
+      "step": 816
+    },
+    {
+      "epoch": 0.31749732828135624,
+      "grad_norm": 0.2105439007282257,
+      "learning_rate": 0.0001368625924484235,
+      "loss": 1.0251,
+      "step": 817
+    },
+    {
+      "epoch": 0.31788594190226366,
+      "grad_norm": 0.20583970844745636,
+      "learning_rate": 0.00013678474114441416,
+      "loss": 1.0833,
+      "step": 818
+    },
+    {
+      "epoch": 0.3182745555231711,
+      "grad_norm": 0.21091191470623016,
+      "learning_rate": 0.00013670688984040484,
+      "loss": 1.071,
+      "step": 819
+    },
+    {
+      "epoch": 0.3186631691440785,
+      "grad_norm": 0.20645928382873535,
+      "learning_rate": 0.0001366290385363955,
+      "loss": 1.0605,
+      "step": 820
+    },
+    {
+      "epoch": 0.3190517827649859,
+      "grad_norm": 0.1990513950586319,
+      "learning_rate": 0.00013655118723238614,
+      "loss": 1.0461,
+      "step": 821
+    },
+    {
+      "epoch": 0.31944039638589333,
+      "grad_norm": 0.2192249745130539,
+      "learning_rate": 0.00013647333592837682,
+      "loss": 1.0975,
+      "step": 822
+    },
+    {
+      "epoch": 0.31982901000680075,
+      "grad_norm": 0.2157617211341858,
+      "learning_rate": 0.00013639548462436748,
+      "loss": 1.091,
+      "step": 823
+    },
+    {
+      "epoch": 0.32021762362770817,
+      "grad_norm": 0.21964526176452637,
+      "learning_rate": 0.00013631763332035813,
+      "loss": 1.0286,
+      "step": 824
+    },
+    {
+      "epoch": 0.3206062372486156,
+      "grad_norm": 0.2079797089099884,
+      "learning_rate": 0.00013623978201634878,
+      "loss": 1.0257,
+      "step": 825
+    },
+    {
+      "epoch": 0.32099485086952295,
+      "grad_norm": 0.21220168471336365,
+      "learning_rate": 0.00013616193071233946,
+      "loss": 1.0046,
+      "step": 826
+    },
+    {
+      "epoch": 0.32138346449043037,
+      "grad_norm": 0.2885231673717499,
+      "learning_rate": 0.0001360840794083301,
+      "loss": 1.1442,
+      "step": 827
+    },
+    {
+      "epoch": 0.3217720781113378,
+      "grad_norm": 0.2096511274576187,
+      "learning_rate": 0.00013600622810432076,
+      "loss": 1.0209,
+      "step": 828
+    },
+    {
+      "epoch": 0.3221606917322452,
+      "grad_norm": 0.2179451286792755,
+      "learning_rate": 0.00013592837680031142,
+      "loss": 1.0548,
+      "step": 829
+    },
+    {
+      "epoch": 0.3225493053531526,
+      "grad_norm": 0.2096329927444458,
+      "learning_rate": 0.00013585052549630207,
+      "loss": 1.0279,
+      "step": 830
+    },
+    {
+      "epoch": 0.32293791897406005,
+      "grad_norm": 0.22531811892986298,
+      "learning_rate": 0.00013577267419229275,
+      "loss": 1.0463,
+      "step": 831
+    },
+    {
+      "epoch": 0.32332653259496746,
+      "grad_norm": 0.22516901791095734,
+      "learning_rate": 0.0001356948228882834,
+      "loss": 1.1127,
+      "step": 832
+    },
+    {
+      "epoch": 0.3237151462158749,
+      "grad_norm": 0.22487780451774597,
+      "learning_rate": 0.00013561697158427405,
+      "loss": 1.0707,
+      "step": 833
+    },
+    {
+      "epoch": 0.3241037598367823,
+      "grad_norm": 0.20976543426513672,
+      "learning_rate": 0.0001355391202802647,
+      "loss": 1.0217,
+      "step": 834
+    },
+    {
+      "epoch": 0.32449237345768966,
+      "grad_norm": 0.19849295914173126,
+      "learning_rate": 0.00013546126897625535,
+      "loss": 1.021,
+      "step": 835
+    },
+    {
+      "epoch": 0.3248809870785971,
+      "grad_norm": 0.21772268414497375,
+      "learning_rate": 0.00013538341767224603,
+      "loss": 1.0605,
+      "step": 836
+    },
+    {
+      "epoch": 0.3252696006995045,
+      "grad_norm": 0.19670265913009644,
+      "learning_rate": 0.00013530556636823669,
+      "loss": 1.0165,
+      "step": 837
+    },
+    {
+      "epoch": 0.3256582143204119,
+      "grad_norm": 0.19339734315872192,
+      "learning_rate": 0.00013522771506422734,
+      "loss": 1.0203,
+      "step": 838
+    },
+    {
+      "epoch": 0.32604682794131934,
+      "grad_norm": 0.21289557218551636,
+      "learning_rate": 0.000135149863760218,
+      "loss": 1.0252,
+      "step": 839
+    },
+    {
+      "epoch": 0.32643544156222676,
+      "grad_norm": 0.1964789777994156,
+      "learning_rate": 0.00013507201245620864,
+      "loss": 1.0392,
+      "step": 840
+    },
+    {
+      "epoch": 0.3268240551831342,
+      "grad_norm": 0.20783716440200806,
+      "learning_rate": 0.00013499416115219932,
+      "loss": 1.0569,
+      "step": 841
+    },
+    {
+      "epoch": 0.3272126688040416,
+      "grad_norm": 0.22782161831855774,
+      "learning_rate": 0.00013491630984818997,
+      "loss": 1.0555,
+      "step": 842
+    },
+    {
+      "epoch": 0.327601282424949,
+      "grad_norm": 0.22771142423152924,
+      "learning_rate": 0.00013483845854418063,
+      "loss": 1.085,
+      "step": 843
+    },
+    {
+      "epoch": 0.32798989604585643,
+      "grad_norm": 0.19773711264133453,
+      "learning_rate": 0.00013476060724017128,
+      "loss": 1.008,
+      "step": 844
+    },
+    {
+      "epoch": 0.3283785096667638,
+      "grad_norm": 0.22399166226387024,
+      "learning_rate": 0.00013468275593616193,
+      "loss": 1.0511,
+      "step": 845
+    },
+    {
+      "epoch": 0.3287671232876712,
+      "grad_norm": 0.20488236844539642,
+      "learning_rate": 0.00013460490463215258,
+      "loss": 1.0883,
+      "step": 846
+    },
+    {
+      "epoch": 0.32915573690857863,
+      "grad_norm": 0.21387654542922974,
+      "learning_rate": 0.00013452705332814326,
+      "loss": 1.0808,
+      "step": 847
+    },
+    {
+      "epoch": 0.32954435052948605,
+      "grad_norm": 0.1972568780183792,
+      "learning_rate": 0.0001344492020241339,
+      "loss": 1.0555,
+      "step": 848
+    },
+    {
+      "epoch": 0.32993296415039347,
+      "grad_norm": 0.20835663378238678,
+      "learning_rate": 0.00013437135072012456,
+      "loss": 1.0473,
+      "step": 849
+    },
+    {
+      "epoch": 0.3303215777713009,
+      "grad_norm": 0.19707520306110382,
+      "learning_rate": 0.00013429349941611522,
+      "loss": 0.9585,
+      "step": 850
+    },
+    {
+      "epoch": 0.3307101913922083,
+      "grad_norm": 0.19163411855697632,
+      "learning_rate": 0.00013421564811210587,
+      "loss": 1.0025,
+      "step": 851
+    },
+    {
+      "epoch": 0.3310988050131157,
+      "grad_norm": 0.19730083644390106,
+      "learning_rate": 0.00013413779680809655,
+      "loss": 1.0696,
+      "step": 852
+    },
+    {
+      "epoch": 0.33148741863402315,
+      "grad_norm": 0.19537493586540222,
+      "learning_rate": 0.0001340599455040872,
+      "loss": 1.0466,
+      "step": 853
+    },
+    {
+      "epoch": 0.3318760322549305,
+      "grad_norm": 0.2255164235830307,
+      "learning_rate": 0.00013398209420007785,
+      "loss": 1.0659,
+      "step": 854
+    },
+    {
+      "epoch": 0.3322646458758379,
+      "grad_norm": 0.19774770736694336,
+      "learning_rate": 0.0001339042428960685,
+      "loss": 1.0326,
+      "step": 855
+    },
+    {
+      "epoch": 0.33265325949674535,
+      "grad_norm": 0.2004510909318924,
+      "learning_rate": 0.00013382639159205916,
+      "loss": 1.0327,
+      "step": 856
+    },
+    {
+      "epoch": 0.33304187311765276,
+      "grad_norm": 0.19187591969966888,
+      "learning_rate": 0.00013374854028804984,
+      "loss": 1.0069,
+      "step": 857
+    },
+    {
+      "epoch": 0.3334304867385602,
+      "grad_norm": 0.18775832653045654,
+      "learning_rate": 0.0001336706889840405,
+      "loss": 1.0083,
+      "step": 858
+    },
+    {
+      "epoch": 0.3338191003594676,
+      "grad_norm": 0.2005717158317566,
+      "learning_rate": 0.00013359283768003114,
+      "loss": 1.0398,
+      "step": 859
+    },
+    {
+      "epoch": 0.334207713980375,
+      "grad_norm": 0.19705893099308014,
+      "learning_rate": 0.0001335149863760218,
+      "loss": 1.0031,
+      "step": 860
+    },
+    {
+      "epoch": 0.33459632760128244,
+      "grad_norm": 0.19589562714099884,
+      "learning_rate": 0.00013343713507201244,
+      "loss": 0.9831,
+      "step": 861
+    },
+    {
+      "epoch": 0.33498494122218986,
+      "grad_norm": 0.19302591681480408,
+      "learning_rate": 0.00013335928376800312,
+      "loss": 1.0009,
+      "step": 862
+    },
+    {
+      "epoch": 0.3353735548430973,
+      "grad_norm": 0.20499618351459503,
+      "learning_rate": 0.00013328143246399377,
+      "loss": 1.0205,
+      "step": 863
+    },
+    {
+      "epoch": 0.33576216846400464,
+      "grad_norm": 0.20514456927776337,
+      "learning_rate": 0.00013320358115998443,
+      "loss": 1.0837,
+      "step": 864
+    },
+    {
+      "epoch": 0.33615078208491206,
+      "grad_norm": 0.19285848736763,
+      "learning_rate": 0.00013312572985597508,
+      "loss": 1.0167,
+      "step": 865
+    },
+    {
+      "epoch": 0.3365393957058195,
+      "grad_norm": 0.20891553163528442,
+      "learning_rate": 0.00013304787855196573,
+      "loss": 1.0127,
+      "step": 866
+    },
+    {
+      "epoch": 0.3369280093267269,
+      "grad_norm": 0.20511706173419952,
+      "learning_rate": 0.0001329700272479564,
+      "loss": 0.964,
+      "step": 867
+    },
+    {
+      "epoch": 0.3373166229476343,
+      "grad_norm": 0.1855512261390686,
+      "learning_rate": 0.00013289217594394706,
+      "loss": 0.9721,
+      "step": 868
+    },
+    {
+      "epoch": 0.33770523656854173,
+      "grad_norm": 0.20010098814964294,
+      "learning_rate": 0.00013281432463993771,
+      "loss": 1.0411,
+      "step": 869
+    },
+    {
+      "epoch": 0.33809385018944915,
+      "grad_norm": 0.1991325318813324,
+      "learning_rate": 0.0001327364733359284,
+      "loss": 0.9658,
+      "step": 870
+    },
+    {
+      "epoch": 0.33848246381035657,
+      "grad_norm": 0.19895736873149872,
+      "learning_rate": 0.00013265862203191905,
+      "loss": 1.0744,
+      "step": 871
+    },
+    {
+      "epoch": 0.338871077431264,
+      "grad_norm": 0.2091255635023117,
+      "learning_rate": 0.0001325807707279097,
+      "loss": 1.0375,
+      "step": 872
+    },
+    {
+      "epoch": 0.33925969105217135,
+      "grad_norm": 0.21355532109737396,
+      "learning_rate": 0.00013250291942390035,
+      "loss": 1.09,
+      "step": 873
+    },
+    {
+      "epoch": 0.33964830467307877,
+      "grad_norm": 0.21844851970672607,
+      "learning_rate": 0.00013242506811989103,
+      "loss": 1.0769,
+      "step": 874
+    },
+    {
+      "epoch": 0.3400369182939862,
+      "grad_norm": 0.1877543330192566,
+      "learning_rate": 0.00013234721681588168,
+      "loss": 1.0199,
+      "step": 875
+    },
+    {
+      "epoch": 0.3404255319148936,
+      "grad_norm": 0.2020038366317749,
+      "learning_rate": 0.00013226936551187233,
+      "loss": 1.0218,
+      "step": 876
+    },
+    {
+      "epoch": 0.340814145535801,
+      "grad_norm": 0.20682141184806824,
+      "learning_rate": 0.000132191514207863,
+      "loss": 1.0891,
+      "step": 877
+    },
+    {
+      "epoch": 0.34120275915670845,
+      "grad_norm": 0.21942824125289917,
+      "learning_rate": 0.00013211366290385366,
+      "loss": 0.9877,
+      "step": 878
+    },
+    {
+      "epoch": 0.34159137277761586,
+      "grad_norm": 0.21150313317775726,
+      "learning_rate": 0.00013203581159984432,
+      "loss": 1.0815,
+      "step": 879
+    },
+    {
+      "epoch": 0.3419799863985233,
+      "grad_norm": 0.2073293924331665,
+      "learning_rate": 0.00013195796029583497,
+      "loss": 1.0579,
+      "step": 880
+    },
+    {
+      "epoch": 0.3423686000194307,
+      "grad_norm": 0.221574068069458,
+      "learning_rate": 0.00013188010899182562,
+      "loss": 1.0279,
+      "step": 881
+    },
+    {
+      "epoch": 0.3427572136403381,
+      "grad_norm": 0.22334492206573486,
+      "learning_rate": 0.00013180225768781627,
+      "loss": 1.0837,
+      "step": 882
+    },
+    {
+      "epoch": 0.3431458272612455,
+      "grad_norm": 0.18817654252052307,
+      "learning_rate": 0.00013172440638380695,
+      "loss": 1.0262,
+      "step": 883
+    },
+    {
+      "epoch": 0.3435344408821529,
+      "grad_norm": 0.20126822590827942,
+      "learning_rate": 0.0001316465550797976,
+      "loss": 1.0679,
+      "step": 884
+    },
+    {
+      "epoch": 0.3439230545030603,
+      "grad_norm": 0.2128864973783493,
+      "learning_rate": 0.00013156870377578825,
+      "loss": 1.0316,
+      "step": 885
+    },
+    {
+      "epoch": 0.34431166812396774,
+      "grad_norm": 0.20054499804973602,
+      "learning_rate": 0.0001314908524717789,
+      "loss": 1.0024,
+      "step": 886
+    },
+    {
+      "epoch": 0.34470028174487516,
+      "grad_norm": 0.21358034014701843,
+      "learning_rate": 0.00013141300116776956,
+      "loss": 1.0475,
+      "step": 887
+    },
+    {
+      "epoch": 0.3450888953657826,
+      "grad_norm": 0.21377703547477722,
+      "learning_rate": 0.00013133514986376024,
+      "loss": 1.0957,
+      "step": 888
+    },
+    {
+      "epoch": 0.34547750898669,
+      "grad_norm": 0.20166514813899994,
+      "learning_rate": 0.0001312572985597509,
+      "loss": 1.0189,
+      "step": 889
+    },
+    {
+      "epoch": 0.3458661226075974,
+      "grad_norm": 0.20424878597259521,
+      "learning_rate": 0.00013117944725574154,
+      "loss": 1.0896,
+      "step": 890
+    },
+    {
+      "epoch": 0.34625473622850483,
+      "grad_norm": 0.19028648734092712,
+      "learning_rate": 0.0001311015959517322,
+      "loss": 0.9881,
+      "step": 891
+    },
+    {
+      "epoch": 0.3466433498494122,
+      "grad_norm": 0.20828665792942047,
+      "learning_rate": 0.00013102374464772285,
+      "loss": 0.9932,
+      "step": 892
+    },
+    {
+      "epoch": 0.3470319634703196,
+      "grad_norm": 0.20756572484970093,
+      "learning_rate": 0.00013094589334371353,
+      "loss": 1.0406,
+      "step": 893
+    },
+    {
+      "epoch": 0.34742057709122703,
+      "grad_norm": 0.20768921077251434,
+      "learning_rate": 0.00013086804203970418,
+      "loss": 0.9652,
+      "step": 894
+    },
+    {
+      "epoch": 0.34780919071213445,
+      "grad_norm": 0.20660027861595154,
+      "learning_rate": 0.00013079019073569483,
+      "loss": 1.0728,
+      "step": 895
+    },
+    {
+      "epoch": 0.34819780433304187,
+      "grad_norm": 0.20186837017536163,
+      "learning_rate": 0.00013071233943168548,
+      "loss": 1.0407,
+      "step": 896
+    },
+    {
+      "epoch": 0.3485864179539493,
+      "grad_norm": 0.20880667865276337,
+      "learning_rate": 0.00013063448812767613,
+      "loss": 1.0275,
+      "step": 897
+    },
+    {
+      "epoch": 0.3489750315748567,
+      "grad_norm": 0.22212949395179749,
+      "learning_rate": 0.0001305566368236668,
+      "loss": 1.0293,
+      "step": 898
+    },
+    {
+      "epoch": 0.3493636451957641,
+      "grad_norm": 0.20552745461463928,
+      "learning_rate": 0.00013047878551965746,
+      "loss": 1.0434,
+      "step": 899
+    },
+    {
+      "epoch": 0.34975225881667155,
+      "grad_norm": 0.21239839494228363,
+      "learning_rate": 0.00013040093421564812,
+      "loss": 1.052,
+      "step": 900
+    },
+    {
+      "epoch": 0.3501408724375789,
+      "grad_norm": 0.22420544922351837,
+      "learning_rate": 0.00013032308291163877,
+      "loss": 1.0236,
+      "step": 901
+    },
+    {
+      "epoch": 0.35052948605848633,
+      "grad_norm": 0.23435090482234955,
+      "learning_rate": 0.00013024523160762942,
+      "loss": 1.0876,
+      "step": 902
+    },
+    {
+      "epoch": 0.35091809967939375,
+      "grad_norm": 0.22763386368751526,
+      "learning_rate": 0.0001301673803036201,
+      "loss": 1.0636,
+      "step": 903
+    },
+    {
+      "epoch": 0.35130671330030117,
+      "grad_norm": 0.20948883891105652,
+      "learning_rate": 0.00013008952899961075,
+      "loss": 1.0083,
+      "step": 904
+    },
+    {
+      "epoch": 0.3516953269212086,
+      "grad_norm": 0.20408779382705688,
+      "learning_rate": 0.0001300116776956014,
+      "loss": 1.039,
+      "step": 905
+    },
+    {
+      "epoch": 0.352083940542116,
+      "grad_norm": 0.2126050591468811,
+      "learning_rate": 0.00012993382639159206,
+      "loss": 1.0365,
+      "step": 906
+    },
+    {
+      "epoch": 0.3524725541630234,
+      "grad_norm": 0.20314334332942963,
+      "learning_rate": 0.0001298559750875827,
+      "loss": 1.0474,
+      "step": 907
+    },
+    {
+      "epoch": 0.35286116778393084,
+      "grad_norm": 0.23720984160900116,
+      "learning_rate": 0.0001297781237835734,
+      "loss": 1.0529,
+      "step": 908
+    },
+    {
+      "epoch": 0.35324978140483826,
+      "grad_norm": 0.22642800211906433,
+      "learning_rate": 0.00012970027247956404,
+      "loss": 1.0586,
+      "step": 909
+    },
+    {
+      "epoch": 0.3536383950257457,
+      "grad_norm": 0.20469972491264343,
+      "learning_rate": 0.0001296224211755547,
+      "loss": 1.0267,
+      "step": 910
+    },
+    {
+      "epoch": 0.35402700864665304,
+      "grad_norm": 0.197368785738945,
+      "learning_rate": 0.00012954456987154534,
+      "loss": 1.0348,
+      "step": 911
+    },
+    {
+      "epoch": 0.35441562226756046,
+      "grad_norm": 0.21924498677253723,
+      "learning_rate": 0.000129466718567536,
+      "loss": 1.0861,
+      "step": 912
+    },
+    {
+      "epoch": 0.3548042358884679,
+      "grad_norm": 0.22006285190582275,
+      "learning_rate": 0.00012938886726352667,
+      "loss": 1.0545,
+      "step": 913
+    },
+    {
+      "epoch": 0.3551928495093753,
+      "grad_norm": 0.22419220209121704,
+      "learning_rate": 0.00012931101595951733,
+      "loss": 1.0716,
+      "step": 914
+    },
+    {
+      "epoch": 0.3555814631302827,
+      "grad_norm": 0.215990349650383,
+      "learning_rate": 0.00012923316465550798,
+      "loss": 1.0619,
+      "step": 915
+    },
+    {
+      "epoch": 0.35597007675119013,
+      "grad_norm": 0.20783264935016632,
+      "learning_rate": 0.00012915531335149863,
+      "loss": 1.0412,
+      "step": 916
+    },
+    {
+      "epoch": 0.35635869037209755,
+      "grad_norm": 0.24584618210792542,
+      "learning_rate": 0.00012907746204748928,
+      "loss": 1.1165,
+      "step": 917
+    },
+    {
+      "epoch": 0.35674730399300497,
+      "grad_norm": 0.23146122694015503,
+      "learning_rate": 0.00012899961074347996,
+      "loss": 1.1111,
+      "step": 918
+    },
+    {
+      "epoch": 0.3571359176139124,
+      "grad_norm": 0.19983729720115662,
+      "learning_rate": 0.00012892175943947061,
+      "loss": 1.0674,
+      "step": 919
+    },
+    {
+      "epoch": 0.35752453123481975,
+      "grad_norm": 0.2161000818014145,
+      "learning_rate": 0.00012884390813546127,
+      "loss": 1.076,
+      "step": 920
+    },
+    {
+      "epoch": 0.35791314485572717,
+      "grad_norm": 0.21042793989181519,
+      "learning_rate": 0.00012876605683145192,
+      "loss": 1.0535,
+      "step": 921
+    },
+    {
+      "epoch": 0.3583017584766346,
+      "grad_norm": 0.20135439932346344,
+      "learning_rate": 0.0001286882055274426,
+      "loss": 1.0059,
+      "step": 922
+    },
+    {
+      "epoch": 0.358690372097542,
+      "grad_norm": 0.19394971430301666,
+      "learning_rate": 0.00012861035422343325,
+      "loss": 1.0381,
+      "step": 923
+    },
+    {
+      "epoch": 0.35907898571844943,
+      "grad_norm": 0.21171030402183533,
+      "learning_rate": 0.0001285325029194239,
+      "loss": 1.0513,
+      "step": 924
+    },
+    {
+      "epoch": 0.35946759933935685,
+      "grad_norm": 0.19476690888404846,
+      "learning_rate": 0.00012845465161541458,
+      "loss": 1.0003,
+      "step": 925
+    },
+    {
+      "epoch": 0.35985621296026427,
+      "grad_norm": 0.20468670129776,
+      "learning_rate": 0.00012837680031140523,
+      "loss": 1.0608,
+      "step": 926
+    },
+    {
+      "epoch": 0.3602448265811717,
+      "grad_norm": 0.21159446239471436,
+      "learning_rate": 0.00012829894900739588,
+      "loss": 1.0734,
+      "step": 927
+    },
+    {
+      "epoch": 0.3606334402020791,
+      "grad_norm": 0.21179519593715668,
+      "learning_rate": 0.00012822109770338654,
+      "loss": 1.0957,
+      "step": 928
+    },
+    {
+      "epoch": 0.3610220538229865,
+      "grad_norm": 0.20997527241706848,
+      "learning_rate": 0.00012814324639937722,
+      "loss": 1.0644,
+      "step": 929
+    },
+    {
+      "epoch": 0.3614106674438939,
+      "grad_norm": 0.21178296208381653,
+      "learning_rate": 0.00012806539509536787,
+      "loss": 1.0208,
+      "step": 930
+    },
+    {
+      "epoch": 0.3617992810648013,
+      "grad_norm": 0.20890356600284576,
+      "learning_rate": 0.00012798754379135852,
+      "loss": 1.0888,
+      "step": 931
+    },
+    {
+      "epoch": 0.3621878946857087,
+      "grad_norm": 0.20177409052848816,
+      "learning_rate": 0.00012790969248734917,
+      "loss": 0.9741,
+      "step": 932
+    },
+    {
+      "epoch": 0.36257650830661614,
+      "grad_norm": 0.23504556715488434,
+      "learning_rate": 0.00012783184118333982,
+      "loss": 1.1048,
+      "step": 933
+    },
+    {
+      "epoch": 0.36296512192752356,
+      "grad_norm": 0.22829356789588928,
+      "learning_rate": 0.0001277539898793305,
+      "loss": 1.0798,
+      "step": 934
+    },
+    {
+      "epoch": 0.363353735548431,
+      "grad_norm": 0.2068483531475067,
+      "learning_rate": 0.00012767613857532116,
+      "loss": 1.0452,
+      "step": 935
+    },
+    {
+      "epoch": 0.3637423491693384,
+      "grad_norm": 0.2093171775341034,
+      "learning_rate": 0.0001275982872713118,
+      "loss": 1.0742,
+      "step": 936
+    },
+    {
+      "epoch": 0.3641309627902458,
+      "grad_norm": 0.21478736400604248,
+      "learning_rate": 0.00012752043596730246,
+      "loss": 1.0572,
+      "step": 937
+    },
+    {
+      "epoch": 0.36451957641115323,
+      "grad_norm": 0.1906953752040863,
+      "learning_rate": 0.0001274425846632931,
+      "loss": 1.0107,
+      "step": 938
+    },
+    {
+      "epoch": 0.3649081900320606,
+      "grad_norm": 0.20580604672431946,
+      "learning_rate": 0.0001273647333592838,
+      "loss": 1.0677,
+      "step": 939
+    },
+    {
+      "epoch": 0.365296803652968,
+      "grad_norm": 0.22586850821971893,
+      "learning_rate": 0.00012728688205527444,
+      "loss": 1.0389,
+      "step": 940
+    },
+    {
+      "epoch": 0.36568541727387543,
+      "grad_norm": 0.199899360537529,
+      "learning_rate": 0.0001272090307512651,
+      "loss": 1.0462,
+      "step": 941
+    },
+    {
+      "epoch": 0.36607403089478285,
+      "grad_norm": 0.19881689548492432,
+      "learning_rate": 0.00012713117944725575,
+      "loss": 1.0565,
+      "step": 942
+    },
+    {
+      "epoch": 0.3664626445156903,
+      "grad_norm": 0.21748925745487213,
+      "learning_rate": 0.0001270533281432464,
+      "loss": 1.0659,
+      "step": 943
+    },
+    {
+      "epoch": 0.3668512581365977,
+      "grad_norm": 0.19363689422607422,
+      "learning_rate": 0.00012697547683923708,
+      "loss": 1.0307,
+      "step": 944
+    },
+    {
+      "epoch": 0.3672398717575051,
+      "grad_norm": 0.21701784431934357,
+      "learning_rate": 0.00012689762553522773,
+      "loss": 1.0684,
+      "step": 945
+    },
+    {
+      "epoch": 0.36762848537841253,
+      "grad_norm": 0.21406958997249603,
+      "learning_rate": 0.00012681977423121838,
+      "loss": 1.0703,
+      "step": 946
+    },
+    {
+      "epoch": 0.36801709899931995,
+      "grad_norm": 0.23539729416370392,
+      "learning_rate": 0.00012674192292720903,
+      "loss": 1.1537,
+      "step": 947
+    },
+    {
+      "epoch": 0.36840571262022737,
+      "grad_norm": 0.2177354395389557,
+      "learning_rate": 0.00012666407162319969,
+      "loss": 1.0131,
+      "step": 948
+    },
+    {
+      "epoch": 0.36879432624113473,
+      "grad_norm": 0.255346417427063,
+      "learning_rate": 0.00012658622031919037,
+      "loss": 0.9807,
+      "step": 949
+    },
+    {
+      "epoch": 0.36918293986204215,
+      "grad_norm": 0.2139921486377716,
+      "learning_rate": 0.00012650836901518102,
+      "loss": 1.0392,
+      "step": 950
+    },
+    {
+      "epoch": 0.36957155348294957,
+      "grad_norm": 0.22490833699703217,
+      "learning_rate": 0.00012643051771117167,
+      "loss": 1.0512,
+      "step": 951
+    },
+    {
+      "epoch": 0.369960167103857,
+      "grad_norm": 0.20698820054531097,
+      "learning_rate": 0.00012635266640716232,
+      "loss": 1.0391,
+      "step": 952
+    },
+    {
+      "epoch": 0.3703487807247644,
+      "grad_norm": 0.2276201844215393,
+      "learning_rate": 0.00012627481510315297,
+      "loss": 1.0513,
+      "step": 953
+    },
+    {
+      "epoch": 0.3707373943456718,
+      "grad_norm": 0.2493600994348526,
+      "learning_rate": 0.00012619696379914365,
+      "loss": 1.0136,
+      "step": 954
+    },
+    {
+      "epoch": 0.37112600796657924,
+      "grad_norm": 0.2155001014471054,
+      "learning_rate": 0.0001261191124951343,
+      "loss": 1.0523,
+      "step": 955
+    },
+    {
+      "epoch": 0.37151462158748666,
+      "grad_norm": 0.21571211516857147,
+      "learning_rate": 0.00012604126119112496,
+      "loss": 1.0288,
+      "step": 956
+    },
+    {
+      "epoch": 0.3719032352083941,
+      "grad_norm": 0.23238877952098846,
+      "learning_rate": 0.0001259634098871156,
+      "loss": 1.0638,
+      "step": 957
+    },
+    {
+      "epoch": 0.37229184882930144,
+      "grad_norm": 0.2002813220024109,
+      "learning_rate": 0.00012588555858310626,
+      "loss": 0.9665,
+      "step": 958
+    },
+    {
+      "epoch": 0.37268046245020886,
+      "grad_norm": 0.21712858974933624,
+      "learning_rate": 0.0001258077072790969,
+      "loss": 1.0469,
+      "step": 959
+    },
+    {
+      "epoch": 0.3730690760711163,
+      "grad_norm": 0.2178192287683487,
+      "learning_rate": 0.0001257298559750876,
+      "loss": 1.0267,
+      "step": 960
+    },
+    {
+      "epoch": 0.3734576896920237,
+      "grad_norm": 0.25488024950027466,
+      "learning_rate": 0.00012565200467107824,
+      "loss": 1.0153,
+      "step": 961
+    },
+    {
+      "epoch": 0.3738463033129311,
+      "grad_norm": 0.20070038735866547,
+      "learning_rate": 0.0001255741533670689,
+      "loss": 1.0279,
+      "step": 962
+    },
+    {
+      "epoch": 0.37423491693383854,
+      "grad_norm": 0.21885356307029724,
+      "learning_rate": 0.00012549630206305955,
+      "loss": 1.0395,
+      "step": 963
+    },
+    {
+      "epoch": 0.37462353055474595,
+      "grad_norm": 0.2407921701669693,
+      "learning_rate": 0.0001254184507590502,
+      "loss": 1.0767,
+      "step": 964
+    },
+    {
+      "epoch": 0.3750121441756534,
+      "grad_norm": 0.20645053684711456,
+      "learning_rate": 0.00012534059945504088,
+      "loss": 1.0318,
+      "step": 965
+    },
+    {
+      "epoch": 0.3754007577965608,
+      "grad_norm": 0.21275092661380768,
+      "learning_rate": 0.00012526274815103153,
+      "loss": 1.0546,
+      "step": 966
+    },
+    {
+      "epoch": 0.3757893714174682,
+      "grad_norm": 0.21574917435646057,
+      "learning_rate": 0.00012518489684702218,
+      "loss": 1.032,
+      "step": 967
+    },
+    {
+      "epoch": 0.3761779850383756,
+      "grad_norm": 0.21589480340480804,
+      "learning_rate": 0.00012510704554301284,
+      "loss": 1.0834,
+      "step": 968
+    },
+    {
+      "epoch": 0.376566598659283,
+      "grad_norm": 0.19576796889305115,
+      "learning_rate": 0.0001250291942390035,
+      "loss": 1.0178,
+      "step": 969
+    },
+    {
+      "epoch": 0.3769552122801904,
+      "grad_norm": 0.20941287279129028,
+      "learning_rate": 0.00012495134293499417,
+      "loss": 1.0712,
+      "step": 970
+    },
+    {
+      "epoch": 0.37734382590109783,
+      "grad_norm": 0.22585494816303253,
+      "learning_rate": 0.00012487349163098482,
+      "loss": 1.0401,
+      "step": 971
+    },
+    {
+      "epoch": 0.37773243952200525,
+      "grad_norm": 0.21093420684337616,
+      "learning_rate": 0.00012479564032697547,
+      "loss": 1.0569,
+      "step": 972
+    },
+    {
+      "epoch": 0.37812105314291267,
+      "grad_norm": 0.22375014424324036,
+      "learning_rate": 0.00012471778902296612,
+      "loss": 1.0687,
+      "step": 973
+    },
+    {
+      "epoch": 0.3785096667638201,
+      "grad_norm": 0.19787487387657166,
+      "learning_rate": 0.0001246399377189568,
+      "loss": 1.0266,
+      "step": 974
+    },
+    {
+      "epoch": 0.3788982803847275,
+      "grad_norm": 0.20633013546466827,
+      "learning_rate": 0.00012456208641494745,
+      "loss": 0.9996,
+      "step": 975
+    },
+    {
+      "epoch": 0.3792868940056349,
+      "grad_norm": 0.21559873223304749,
+      "learning_rate": 0.0001244842351109381,
+      "loss": 1.0851,
+      "step": 976
+    },
+    {
+      "epoch": 0.3796755076265423,
+      "grad_norm": 0.2166333943605423,
+      "learning_rate": 0.00012440638380692879,
+      "loss": 1.0859,
+      "step": 977
+    },
+    {
+      "epoch": 0.3800641212474497,
+      "grad_norm": 0.18558773398399353,
+      "learning_rate": 0.00012432853250291944,
+      "loss": 0.9534,
+      "step": 978
+    },
+    {
+      "epoch": 0.3804527348683571,
+      "grad_norm": 0.2086942344903946,
+      "learning_rate": 0.0001242506811989101,
+      "loss": 1.0786,
+      "step": 979
+    },
+    {
+      "epoch": 0.38084134848926454,
+      "grad_norm": 0.2207823544740677,
+      "learning_rate": 0.00012417282989490074,
+      "loss": 1.0626,
+      "step": 980
+    },
+    {
+      "epoch": 0.38122996211017196,
+      "grad_norm": 0.21255749464035034,
+      "learning_rate": 0.00012409497859089142,
+      "loss": 1.063,
+      "step": 981
+    },
+    {
+      "epoch": 0.3816185757310794,
+      "grad_norm": 0.20682042837142944,
+      "learning_rate": 0.00012401712728688207,
+      "loss": 1.034,
+      "step": 982
+    },
+    {
+      "epoch": 0.3820071893519868,
+      "grad_norm": 0.2084134966135025,
+      "learning_rate": 0.00012393927598287272,
+      "loss": 1.0481,
+      "step": 983
+    },
+    {
+      "epoch": 0.3823958029728942,
+      "grad_norm": 0.1922312080860138,
+      "learning_rate": 0.00012386142467886338,
+      "loss": 1.0461,
+      "step": 984
+    },
+    {
+      "epoch": 0.38278441659380164,
+      "grad_norm": 0.20893707871437073,
+      "learning_rate": 0.00012378357337485406,
+      "loss": 1.0797,
+      "step": 985
+    },
+    {
+      "epoch": 0.383173030214709,
+      "grad_norm": 0.19717541337013245,
+      "learning_rate": 0.0001237057220708447,
+      "loss": 1.0028,
+      "step": 986
+    },
+    {
+      "epoch": 0.3835616438356164,
+      "grad_norm": 0.20688053965568542,
+      "learning_rate": 0.00012362787076683536,
+      "loss": 0.989,
+      "step": 987
+    },
+    {
+      "epoch": 0.38395025745652384,
+      "grad_norm": 0.20580583810806274,
+      "learning_rate": 0.000123550019462826,
+      "loss": 1.06,
+      "step": 988
+    },
+    {
+      "epoch": 0.38433887107743125,
+      "grad_norm": 0.2151709794998169,
+      "learning_rate": 0.00012347216815881666,
+      "loss": 1.0685,
+      "step": 989
+    },
+    {
+      "epoch": 0.3847274846983387,
+      "grad_norm": 0.19573980569839478,
+      "learning_rate": 0.00012339431685480734,
+      "loss": 1.0072,
+      "step": 990
+    },
+    {
+      "epoch": 0.3851160983192461,
+      "grad_norm": 0.1949119120836258,
+      "learning_rate": 0.000123316465550798,
+      "loss": 0.9995,
+      "step": 991
+    },
+    {
+      "epoch": 0.3855047119401535,
+      "grad_norm": 0.2062375247478485,
+      "learning_rate": 0.00012323861424678865,
+      "loss": 1.0694,
+      "step": 992
+    },
+    {
+      "epoch": 0.38589332556106093,
+      "grad_norm": 0.2007209211587906,
+      "learning_rate": 0.0001231607629427793,
+      "loss": 1.0397,
+      "step": 993
+    },
+    {
+      "epoch": 0.38628193918196835,
+      "grad_norm": 0.2231544405221939,
+      "learning_rate": 0.00012308291163876995,
+      "loss": 1.0755,
+      "step": 994
+    },
+    {
+      "epoch": 0.38667055280287577,
+      "grad_norm": 0.2103337049484253,
+      "learning_rate": 0.0001230050603347606,
+      "loss": 1.0505,
+      "step": 995
+    },
+    {
+      "epoch": 0.38705916642378313,
+      "grad_norm": 0.20178386569023132,
+      "learning_rate": 0.00012292720903075128,
+      "loss": 1.0696,
+      "step": 996
+    },
+    {
+      "epoch": 0.38744778004469055,
+      "grad_norm": 0.21268007159233093,
+      "learning_rate": 0.00012284935772674193,
+      "loss": 1.0262,
+      "step": 997
+    },
+    {
+      "epoch": 0.38783639366559797,
+      "grad_norm": 0.21439722180366516,
+      "learning_rate": 0.0001227715064227326,
+      "loss": 1.0718,
+      "step": 998
+    },
+    {
+      "epoch": 0.3882250072865054,
+      "grad_norm": 0.19691336154937744,
+      "learning_rate": 0.00012269365511872324,
+      "loss": 0.9663,
+      "step": 999
+    },
+    {
+      "epoch": 0.3886136209074128,
+      "grad_norm": 0.2165926694869995,
+      "learning_rate": 0.0001226158038147139,
+      "loss": 1.0432,
+      "step": 1000
+    },
+    {
+      "epoch": 0.3890022345283202,
+      "grad_norm": 0.20730604231357574,
+      "learning_rate": 0.00012253795251070457,
+      "loss": 1.0386,
+      "step": 1001
+    },
+    {
+      "epoch": 0.38939084814922764,
+      "grad_norm": 0.2138068974018097,
+      "learning_rate": 0.00012246010120669522,
+      "loss": 1.0683,
+      "step": 1002
+    },
+    {
+      "epoch": 0.38977946177013506,
+      "grad_norm": 0.2118951678276062,
+      "learning_rate": 0.00012238224990268587,
+      "loss": 1.0393,
+      "step": 1003
+    },
+    {
+      "epoch": 0.3901680753910425,
+      "grad_norm": 0.20879961550235748,
+      "learning_rate": 0.00012230439859867653,
+      "loss": 1.0349,
+      "step": 1004
+    },
+    {
+      "epoch": 0.39055668901194984,
+      "grad_norm": 0.19588464498519897,
+      "learning_rate": 0.00012222654729466718,
+      "loss": 1.0226,
+      "step": 1005
+    },
+    {
+      "epoch": 0.39094530263285726,
+      "grad_norm": 0.2059485912322998,
+      "learning_rate": 0.00012214869599065786,
+      "loss": 1.052,
+      "step": 1006
+    },
+    {
+      "epoch": 0.3913339162537647,
+      "grad_norm": 0.2299761176109314,
+      "learning_rate": 0.0001220708446866485,
+      "loss": 1.1055,
+      "step": 1007
+    },
+    {
+      "epoch": 0.3917225298746721,
+      "grad_norm": 0.20196737349033356,
+      "learning_rate": 0.00012199299338263916,
+      "loss": 1.0497,
+      "step": 1008
+    },
+    {
+      "epoch": 0.3921111434955795,
+      "grad_norm": 0.20615293085575104,
+      "learning_rate": 0.00012191514207862981,
+      "loss": 1.047,
+      "step": 1009
+    },
+    {
+      "epoch": 0.39249975711648694,
+      "grad_norm": 0.20265278220176697,
+      "learning_rate": 0.00012183729077462047,
+      "loss": 1.0035,
+      "step": 1010
+    },
+    {
+      "epoch": 0.39288837073739435,
+      "grad_norm": 0.20197926461696625,
+      "learning_rate": 0.00012175943947061114,
+      "loss": 0.9847,
+      "step": 1011
+    },
+    {
+      "epoch": 0.3932769843583018,
+      "grad_norm": 0.19974152743816376,
+      "learning_rate": 0.0001216815881666018,
+      "loss": 1.0669,
+      "step": 1012
+    },
+    {
+      "epoch": 0.3936655979792092,
+      "grad_norm": 0.21684005856513977,
+      "learning_rate": 0.00012160373686259245,
+      "loss": 1.0562,
+      "step": 1013
+    },
+    {
+      "epoch": 0.3940542116001166,
+      "grad_norm": 0.2030404955148697,
+      "learning_rate": 0.00012152588555858311,
+      "loss": 1.0159,
+      "step": 1014
+    },
+    {
+      "epoch": 0.394442825221024,
+      "grad_norm": 0.2123572677373886,
+      "learning_rate": 0.00012144803425457377,
+      "loss": 1.0757,
+      "step": 1015
+    },
+    {
+      "epoch": 0.3948314388419314,
+      "grad_norm": 0.20320011675357819,
+      "learning_rate": 0.00012137018295056443,
+      "loss": 1.038,
+      "step": 1016
+    },
+    {
+      "epoch": 0.3952200524628388,
+      "grad_norm": 0.20120739936828613,
+      "learning_rate": 0.00012129233164655508,
+      "loss": 1.1015,
+      "step": 1017
+    },
+    {
+      "epoch": 0.39560866608374623,
+      "grad_norm": 0.19862449169158936,
+      "learning_rate": 0.00012121448034254575,
+      "loss": 1.0328,
+      "step": 1018
+    },
+    {
+      "epoch": 0.39599727970465365,
+      "grad_norm": 0.19761312007904053,
+      "learning_rate": 0.0001211366290385364,
+      "loss": 0.997,
+      "step": 1019
+    },
+    {
+      "epoch": 0.39638589332556107,
+      "grad_norm": 0.1943569928407669,
+      "learning_rate": 0.00012105877773452705,
+      "loss": 1.0099,
+      "step": 1020
+    },
+    {
+      "epoch": 0.3967745069464685,
+      "grad_norm": 0.2109062373638153,
+      "learning_rate": 0.00012098092643051773,
+      "loss": 1.1039,
+      "step": 1021
+    },
+    {
+      "epoch": 0.3971631205673759,
+      "grad_norm": 0.20966266095638275,
+      "learning_rate": 0.00012090307512650839,
+      "loss": 1.1208,
+      "step": 1022
+    },
+    {
+      "epoch": 0.3975517341882833,
+      "grad_norm": 0.19208088517189026,
+      "learning_rate": 0.00012082522382249904,
+      "loss": 1.0147,
+      "step": 1023
+    },
+    {
+      "epoch": 0.3979403478091907,
+      "grad_norm": 0.21821236610412598,
+      "learning_rate": 0.00012074737251848969,
+      "loss": 1.0615,
+      "step": 1024
+    },
+    {
+      "epoch": 0.3983289614300981,
+      "grad_norm": 0.20031368732452393,
+      "learning_rate": 0.00012066952121448034,
+      "loss": 1.0303,
+      "step": 1025
+    },
+    {
+      "epoch": 0.3987175750510055,
+      "grad_norm": 0.22910597920417786,
+      "learning_rate": 0.00012059166991047102,
+      "loss": 1.0182,
+      "step": 1026
+    },
+    {
+      "epoch": 0.39910618867191294,
+      "grad_norm": 0.20816978812217712,
+      "learning_rate": 0.00012051381860646167,
+      "loss": 1.0142,
+      "step": 1027
+    },
+    {
+      "epoch": 0.39949480229282036,
+      "grad_norm": 0.20989780128002167,
+      "learning_rate": 0.00012043596730245232,
+      "loss": 1.0676,
+      "step": 1028
+    },
+    {
+      "epoch": 0.3998834159137278,
+      "grad_norm": 0.21894055604934692,
+      "learning_rate": 0.00012035811599844298,
+      "loss": 1.0222,
+      "step": 1029
+    },
+    {
+      "epoch": 0.4002720295346352,
+      "grad_norm": 0.2170870155096054,
+      "learning_rate": 0.00012028026469443363,
+      "loss": 1.0319,
+      "step": 1030
+    },
+    {
+      "epoch": 0.4006606431555426,
+      "grad_norm": 0.20869679749011993,
+      "learning_rate": 0.00012020241339042428,
+      "loss": 1.055,
+      "step": 1031
+    },
+    {
+      "epoch": 0.40104925677645004,
+      "grad_norm": 0.18850640952587128,
+      "learning_rate": 0.00012012456208641496,
+      "loss": 0.9993,
+      "step": 1032
+    },
+    {
+      "epoch": 0.40143787039735745,
+      "grad_norm": 0.21462580561637878,
+      "learning_rate": 0.00012004671078240561,
+      "loss": 1.0115,
+      "step": 1033
+    },
+    {
+      "epoch": 0.4018264840182648,
+      "grad_norm": 0.2008499950170517,
+      "learning_rate": 0.00011996885947839626,
+      "loss": 1.0229,
+      "step": 1034
+    },
+    {
+      "epoch": 0.40221509763917224,
+      "grad_norm": 0.20063354074954987,
+      "learning_rate": 0.00011989100817438692,
+      "loss": 1.0295,
+      "step": 1035
+    },
+    {
+      "epoch": 0.40260371126007966,
+      "grad_norm": 0.20655786991119385,
+      "learning_rate": 0.00011981315687037757,
+      "loss": 1.0044,
+      "step": 1036
+    },
+    {
+      "epoch": 0.4029923248809871,
+      "grad_norm": 0.1985999196767807,
+      "learning_rate": 0.00011973530556636825,
+      "loss": 1.0063,
+      "step": 1037
+    },
+    {
+      "epoch": 0.4033809385018945,
+      "grad_norm": 0.2039060890674591,
+      "learning_rate": 0.0001196574542623589,
+      "loss": 1.044,
+      "step": 1038
+    },
+    {
+      "epoch": 0.4037695521228019,
+      "grad_norm": 0.21838189661502838,
+      "learning_rate": 0.00011957960295834955,
+      "loss": 1.1101,
+      "step": 1039
+    },
+    {
+      "epoch": 0.40415816574370933,
+      "grad_norm": 0.21508415043354034,
+      "learning_rate": 0.00011950175165434022,
+      "loss": 1.0764,
+      "step": 1040
+    },
+    {
+      "epoch": 0.40454677936461675,
+      "grad_norm": 0.2089119255542755,
+      "learning_rate": 0.00011942390035033087,
+      "loss": 0.9986,
+      "step": 1041
+    },
+    {
+      "epoch": 0.40493539298552417,
+      "grad_norm": 0.19859452545642853,
+      "learning_rate": 0.00011934604904632153,
+      "loss": 1.0122,
+      "step": 1042
+    },
+    {
+      "epoch": 0.40532400660643153,
+      "grad_norm": 0.2018653154373169,
+      "learning_rate": 0.00011926819774231219,
+      "loss": 1.0187,
+      "step": 1043
+    },
+    {
+      "epoch": 0.40571262022733895,
+      "grad_norm": 0.19892063736915588,
+      "learning_rate": 0.00011919034643830285,
+      "loss": 1.0029,
+      "step": 1044
+    },
+    {
+      "epoch": 0.40610123384824637,
+      "grad_norm": 0.20355650782585144,
+      "learning_rate": 0.0001191124951342935,
+      "loss": 1.0484,
+      "step": 1045
+    },
+    {
+      "epoch": 0.4064898474691538,
+      "grad_norm": 0.2033994495868683,
+      "learning_rate": 0.00011903464383028416,
+      "loss": 1.087,
+      "step": 1046
+    },
+    {
+      "epoch": 0.4068784610900612,
+      "grad_norm": 0.2047330141067505,
+      "learning_rate": 0.00011895679252627484,
+      "loss": 1.0774,
+      "step": 1047
+    },
+    {
+      "epoch": 0.4072670747109686,
+      "grad_norm": 0.21420112252235413,
+      "learning_rate": 0.00011887894122226549,
+      "loss": 1.0252,
+      "step": 1048
+    },
+    {
+      "epoch": 0.40765568833187604,
+      "grad_norm": 0.2030097395181656,
+      "learning_rate": 0.00011880108991825614,
+      "loss": 1.0501,
+      "step": 1049
+    },
+    {
+      "epoch": 0.40804430195278346,
+      "grad_norm": 0.2128026783466339,
+      "learning_rate": 0.00011872323861424679,
+      "loss": 1.1031,
+      "step": 1050
+    },
+    {
+      "epoch": 0.4084329155736909,
+      "grad_norm": 0.20724938809871674,
+      "learning_rate": 0.00011864538731023744,
+      "loss": 1.0327,
+      "step": 1051
+    },
+    {
+      "epoch": 0.40882152919459824,
+      "grad_norm": 0.20344072580337524,
+      "learning_rate": 0.00011856753600622812,
+      "loss": 1.0719,
+      "step": 1052
+    },
+    {
+      "epoch": 0.40921014281550566,
+      "grad_norm": 0.2145012468099594,
+      "learning_rate": 0.00011848968470221877,
+      "loss": 1.0582,
+      "step": 1053
+    },
+    {
+      "epoch": 0.4095987564364131,
+      "grad_norm": 0.220048725605011,
+      "learning_rate": 0.00011841183339820943,
+      "loss": 1.0825,
+      "step": 1054
+    },
+    {
+      "epoch": 0.4099873700573205,
+      "grad_norm": 0.19074465334415436,
+      "learning_rate": 0.00011833398209420008,
+      "loss": 0.9657,
+      "step": 1055
+    },
+    {
+      "epoch": 0.4103759836782279,
+      "grad_norm": 0.1958267241716385,
+      "learning_rate": 0.00011825613079019073,
+      "loss": 0.9864,
+      "step": 1056
+    },
+    {
+      "epoch": 0.41076459729913534,
+      "grad_norm": 0.21768233180046082,
+      "learning_rate": 0.00011817827948618141,
+      "loss": 0.9997,
+      "step": 1057
+    },
+    {
+      "epoch": 0.41115321092004276,
+      "grad_norm": 0.20218704640865326,
+      "learning_rate": 0.00011810042818217206,
+      "loss": 1.072,
+      "step": 1058
+    },
+    {
+      "epoch": 0.4115418245409502,
+      "grad_norm": 0.2035023719072342,
+      "learning_rate": 0.00011802257687816271,
+      "loss": 1.0415,
+      "step": 1059
+    },
+    {
+      "epoch": 0.4119304381618576,
+      "grad_norm": 0.22603970766067505,
+      "learning_rate": 0.00011794472557415337,
+      "loss": 1.0751,
+      "step": 1060
+    },
+    {
+      "epoch": 0.412319051782765,
+      "grad_norm": 0.2125842273235321,
+      "learning_rate": 0.00011786687427014402,
+      "loss": 1.0727,
+      "step": 1061
+    },
+    {
+      "epoch": 0.4127076654036724,
+      "grad_norm": 0.2005981206893921,
+      "learning_rate": 0.0001177890229661347,
+      "loss": 1.0191,
+      "step": 1062
+    },
+    {
+      "epoch": 0.4130962790245798,
+      "grad_norm": 0.22252701222896576,
+      "learning_rate": 0.00011771117166212535,
+      "loss": 1.0591,
+      "step": 1063
+    },
+    {
+      "epoch": 0.4134848926454872,
+      "grad_norm": 0.22205251455307007,
+      "learning_rate": 0.000117633320358116,
+      "loss": 1.1198,
+      "step": 1064
+    },
+    {
+      "epoch": 0.41387350626639463,
+      "grad_norm": 0.20037783682346344,
+      "learning_rate": 0.00011755546905410665,
+      "loss": 1.0548,
+      "step": 1065
+    },
+    {
+      "epoch": 0.41426211988730205,
+      "grad_norm": 0.21737834811210632,
+      "learning_rate": 0.00011747761775009732,
+      "loss": 1.0922,
+      "step": 1066
+    },
+    {
+      "epoch": 0.41465073350820947,
+      "grad_norm": 0.19312533736228943,
+      "learning_rate": 0.00011739976644608798,
+      "loss": 0.9836,
+      "step": 1067
+    },
+    {
+      "epoch": 0.4150393471291169,
+      "grad_norm": 0.22055000066757202,
+      "learning_rate": 0.00011732191514207864,
+      "loss": 1.0383,
+      "step": 1068
+    },
+    {
+      "epoch": 0.4154279607500243,
+      "grad_norm": 0.22623857855796814,
+      "learning_rate": 0.0001172440638380693,
+      "loss": 1.0704,
+      "step": 1069
+    },
+    {
+      "epoch": 0.4158165743709317,
+      "grad_norm": 0.21481367945671082,
+      "learning_rate": 0.00011716621253405995,
+      "loss": 1.052,
+      "step": 1070
+    },
+    {
+      "epoch": 0.4162051879918391,
+      "grad_norm": 0.21022087335586548,
+      "learning_rate": 0.0001170883612300506,
+      "loss": 1.1021,
+      "step": 1071
+    },
+    {
+      "epoch": 0.4165938016127465,
+      "grad_norm": 0.2154620885848999,
+      "learning_rate": 0.00011701050992604126,
+      "loss": 1.0128,
+      "step": 1072
+    },
+    {
+      "epoch": 0.4169824152336539,
+      "grad_norm": 0.20545578002929688,
+      "learning_rate": 0.00011693265862203194,
+      "loss": 1.0058,
+      "step": 1073
+    },
+    {
+      "epoch": 0.41737102885456134,
+      "grad_norm": 0.21726195514202118,
+      "learning_rate": 0.00011685480731802259,
+      "loss": 1.0753,
+      "step": 1074
+    },
+    {
+      "epoch": 0.41775964247546876,
+      "grad_norm": 0.2067115604877472,
+      "learning_rate": 0.00011677695601401324,
+      "loss": 1.0594,
+      "step": 1075
+    },
+    {
+      "epoch": 0.4181482560963762,
+      "grad_norm": 0.23024648427963257,
+      "learning_rate": 0.0001166991047100039,
+      "loss": 1.1039,
+      "step": 1076
+    },
+    {
+      "epoch": 0.4185368697172836,
+      "grad_norm": 0.20692144334316254,
+      "learning_rate": 0.00011662125340599455,
+      "loss": 1.0598,
+      "step": 1077
+    },
+    {
+      "epoch": 0.418925483338191,
+      "grad_norm": 0.19839999079704285,
+      "learning_rate": 0.00011654340210198522,
+      "loss": 1.054,
+      "step": 1078
+    },
+    {
+      "epoch": 0.41931409695909844,
+      "grad_norm": 0.19227825105190277,
+      "learning_rate": 0.00011646555079797588,
+      "loss": 0.9453,
+      "step": 1079
+    },
+    {
+      "epoch": 0.41970271058000586,
+      "grad_norm": 0.2112567275762558,
+      "learning_rate": 0.00011638769949396653,
+      "loss": 1.023,
+      "step": 1080
+    },
+    {
+      "epoch": 0.4200913242009132,
+      "grad_norm": 0.185299351811409,
+      "learning_rate": 0.00011630984818995718,
+      "loss": 0.9752,
+      "step": 1081
+    },
+    {
+      "epoch": 0.42047993782182064,
+      "grad_norm": 0.20148858428001404,
+      "learning_rate": 0.00011623199688594783,
+      "loss": 1.0659,
+      "step": 1082
+    },
+    {
+      "epoch": 0.42086855144272806,
+      "grad_norm": 0.1935974359512329,
+      "learning_rate": 0.00011615414558193851,
+      "loss": 1.0116,
+      "step": 1083
+    },
+    {
+      "epoch": 0.4212571650636355,
+      "grad_norm": 0.20433953404426575,
+      "learning_rate": 0.00011607629427792916,
+      "loss": 1.0671,
+      "step": 1084
+    },
+    {
+      "epoch": 0.4216457786845429,
+      "grad_norm": 0.20729799568653107,
+      "learning_rate": 0.00011599844297391982,
+      "loss": 1.0341,
+      "step": 1085
+    },
+    {
+      "epoch": 0.4220343923054503,
+      "grad_norm": 0.2126002460718155,
+      "learning_rate": 0.00011592059166991047,
+      "loss": 1.0188,
+      "step": 1086
+    },
+    {
+      "epoch": 0.42242300592635773,
+      "grad_norm": 0.19453707337379456,
+      "learning_rate": 0.00011584274036590112,
+      "loss": 1.0331,
+      "step": 1087
+    },
+    {
+      "epoch": 0.42281161954726515,
+      "grad_norm": 0.20909856259822845,
+      "learning_rate": 0.0001157648890618918,
+      "loss": 0.9984,
+      "step": 1088
+    },
+    {
+      "epoch": 0.42320023316817257,
+      "grad_norm": 0.19596272706985474,
+      "learning_rate": 0.00011568703775788245,
+      "loss": 1.0121,
+      "step": 1089
+    },
+    {
+      "epoch": 0.42358884678907993,
+      "grad_norm": 0.22045716643333435,
+      "learning_rate": 0.0001156091864538731,
+      "loss": 1.0591,
+      "step": 1090
+    },
+    {
+      "epoch": 0.42397746040998735,
+      "grad_norm": 0.22624897956848145,
+      "learning_rate": 0.00011553133514986376,
+      "loss": 1.0565,
+      "step": 1091
+    },
+    {
+      "epoch": 0.42436607403089477,
+      "grad_norm": 0.20263417065143585,
+      "learning_rate": 0.00011545348384585442,
+      "loss": 1.024,
+      "step": 1092
+    },
+    {
+      "epoch": 0.4247546876518022,
+      "grad_norm": 0.20179417729377747,
+      "learning_rate": 0.00011537563254184509,
+      "loss": 0.9806,
+      "step": 1093
+    },
+    {
+      "epoch": 0.4251433012727096,
+      "grad_norm": 0.30221593379974365,
+      "learning_rate": 0.00011529778123783574,
+      "loss": 1.0683,
+      "step": 1094
+    },
+    {
+      "epoch": 0.425531914893617,
+      "grad_norm": 0.21195146441459656,
+      "learning_rate": 0.0001152199299338264,
+      "loss": 1.1283,
+      "step": 1095
+    },
+    {
+      "epoch": 0.42592052851452444,
+      "grad_norm": 0.21860192716121674,
+      "learning_rate": 0.00011514207862981706,
+      "loss": 1.0046,
+      "step": 1096
+    },
+    {
+      "epoch": 0.42630914213543186,
+      "grad_norm": 0.2234150469303131,
+      "learning_rate": 0.00011506422732580771,
+      "loss": 1.0461,
+      "step": 1097
+    },
+    {
+      "epoch": 0.4266977557563393,
+      "grad_norm": 0.21535125374794006,
+      "learning_rate": 0.00011498637602179837,
+      "loss": 1.0593,
+      "step": 1098
+    },
+    {
+      "epoch": 0.4270863693772467,
+      "grad_norm": 0.19313789904117584,
+      "learning_rate": 0.00011490852471778904,
+      "loss": 1.0357,
+      "step": 1099
+    },
+    {
+      "epoch": 0.42747498299815406,
+      "grad_norm": 0.19886989891529083,
+      "learning_rate": 0.00011483067341377969,
+      "loss": 0.9946,
+      "step": 1100
+    },
+    {
+      "epoch": 0.4278635966190615,
+      "grad_norm": 0.21028490364551544,
+      "learning_rate": 0.00011475282210977034,
+      "loss": 1.0765,
+      "step": 1101
+    },
+    {
+      "epoch": 0.4282522102399689,
+      "grad_norm": 0.2066621333360672,
+      "learning_rate": 0.000114674970805761,
+      "loss": 1.0405,
+      "step": 1102
+    },
+    {
+      "epoch": 0.4286408238608763,
+      "grad_norm": 0.18400220572948456,
+      "learning_rate": 0.00011459711950175168,
+      "loss": 0.9404,
+      "step": 1103
+    },
+    {
+      "epoch": 0.42902943748178374,
+      "grad_norm": 0.2058599591255188,
+      "learning_rate": 0.00011451926819774233,
+      "loss": 1.0505,
+      "step": 1104
+    },
+    {
+      "epoch": 0.42941805110269116,
+      "grad_norm": 0.19696786999702454,
+      "learning_rate": 0.00011444141689373298,
+      "loss": 1.032,
+      "step": 1105
+    },
+    {
+      "epoch": 0.4298066647235986,
+      "grad_norm": 0.2082854062318802,
+      "learning_rate": 0.00011436356558972363,
+      "loss": 1.0914,
+      "step": 1106
+    },
+    {
+      "epoch": 0.430195278344506,
+      "grad_norm": 0.20155015587806702,
+      "learning_rate": 0.00011428571428571428,
+      "loss": 1.0541,
+      "step": 1107
+    },
+    {
+      "epoch": 0.4305838919654134,
+      "grad_norm": 0.23419982194900513,
+      "learning_rate": 0.00011420786298170494,
+      "loss": 1.0684,
+      "step": 1108
+    },
+    {
+      "epoch": 0.4309725055863208,
+      "grad_norm": 0.23493975400924683,
+      "learning_rate": 0.00011413001167769561,
+      "loss": 1.0509,
+      "step": 1109
+    },
+    {
+      "epoch": 0.4313611192072282,
+      "grad_norm": 0.2089843600988388,
+      "learning_rate": 0.00011405216037368627,
+      "loss": 1.0479,
+      "step": 1110
+    },
+    {
+      "epoch": 0.4317497328281356,
+      "grad_norm": 0.21076850593090057,
+      "learning_rate": 0.00011397430906967692,
+      "loss": 1.064,
+      "step": 1111
+    },
+    {
+      "epoch": 0.43213834644904303,
+      "grad_norm": 0.20307987928390503,
+      "learning_rate": 0.00011389645776566757,
+      "loss": 1.0416,
+      "step": 1112
+    },
+    {
+      "epoch": 0.43252696006995045,
+      "grad_norm": 0.20955562591552734,
+      "learning_rate": 0.00011381860646165822,
+      "loss": 1.0158,
+      "step": 1113
+    },
+    {
+      "epoch": 0.43291557369085787,
+      "grad_norm": 0.2074531465768814,
+      "learning_rate": 0.0001137407551576489,
+      "loss": 1.0486,
+      "step": 1114
+    },
+    {
+      "epoch": 0.4333041873117653,
+      "grad_norm": 0.20907235145568848,
+      "learning_rate": 0.00011366290385363955,
+      "loss": 1.0352,
+      "step": 1115
+    },
+    {
+      "epoch": 0.4336928009326727,
+      "grad_norm": 0.21726477146148682,
+      "learning_rate": 0.0001135850525496302,
+      "loss": 1.0068,
+      "step": 1116
+    },
+    {
+      "epoch": 0.4340814145535801,
+      "grad_norm": 0.20231984555721283,
+      "learning_rate": 0.00011350720124562086,
+      "loss": 0.9757,
+      "step": 1117
+    },
+    {
+      "epoch": 0.4344700281744875,
+      "grad_norm": 0.23485834896564484,
+      "learning_rate": 0.00011342934994161152,
+      "loss": 1.0681,
+      "step": 1118
+    },
+    {
+      "epoch": 0.4348586417953949,
+      "grad_norm": 0.21286556124687195,
+      "learning_rate": 0.00011335149863760219,
+      "loss": 1.0399,
+      "step": 1119
+    },
+    {
+      "epoch": 0.4352472554163023,
+      "grad_norm": 0.2097872495651245,
+      "learning_rate": 0.00011327364733359284,
+      "loss": 1.0435,
+      "step": 1120
+    },
+    {
+      "epoch": 0.43563586903720974,
+      "grad_norm": 0.2224377542734146,
+      "learning_rate": 0.00011319579602958351,
+      "loss": 1.1664,
+      "step": 1121
+    },
+    {
+      "epoch": 0.43602448265811716,
+      "grad_norm": 0.19213411211967468,
+      "learning_rate": 0.00011311794472557416,
+      "loss": 1.0424,
+      "step": 1122
+    },
+    {
+      "epoch": 0.4364130962790246,
+      "grad_norm": 0.20974959433078766,
+      "learning_rate": 0.00011304009342156481,
+      "loss": 1.0943,
+      "step": 1123
+    },
+    {
+      "epoch": 0.436801709899932,
+      "grad_norm": 0.19943708181381226,
+      "learning_rate": 0.00011296224211755549,
+      "loss": 1.0652,
+      "step": 1124
+    },
+    {
+      "epoch": 0.4371903235208394,
+      "grad_norm": 0.1832750141620636,
+      "learning_rate": 0.00011288439081354614,
+      "loss": 0.9883,
+      "step": 1125
+    },
+    {
+      "epoch": 0.43757893714174684,
+      "grad_norm": 0.2205052226781845,
+      "learning_rate": 0.0001128065395095368,
+      "loss": 1.0733,
+      "step": 1126
+    },
+    {
+      "epoch": 0.43796755076265426,
+      "grad_norm": 0.2082854062318802,
+      "learning_rate": 0.00011272868820552745,
+      "loss": 1.0141,
+      "step": 1127
+    },
+    {
+      "epoch": 0.4383561643835616,
+      "grad_norm": 0.22755026817321777,
+      "learning_rate": 0.0001126508369015181,
+      "loss": 1.0942,
+      "step": 1128
+    },
+    {
+      "epoch": 0.43874477800446904,
+      "grad_norm": 0.2098863571882248,
+      "learning_rate": 0.00011257298559750878,
+      "loss": 0.9987,
+      "step": 1129
+    },
+    {
+      "epoch": 0.43913339162537646,
+      "grad_norm": 0.20559263229370117,
+      "learning_rate": 0.00011249513429349943,
+      "loss": 1.0345,
+      "step": 1130
+    },
+    {
+      "epoch": 0.4395220052462839,
+      "grad_norm": 0.21955084800720215,
+      "learning_rate": 0.00011241728298949008,
+      "loss": 1.1068,
+      "step": 1131
+    },
+    {
+      "epoch": 0.4399106188671913,
+      "grad_norm": 0.21353478729724884,
+      "learning_rate": 0.00011233943168548073,
+      "loss": 1.0094,
+      "step": 1132
+    },
+    {
+      "epoch": 0.4402992324880987,
+      "grad_norm": 0.19822491705417633,
+      "learning_rate": 0.00011226158038147139,
+      "loss": 0.9758,
+      "step": 1133
+    },
+    {
+      "epoch": 0.44068784610900613,
+      "grad_norm": 0.20079441368579865,
+      "learning_rate": 0.00011218372907746206,
+      "loss": 1.0202,
+      "step": 1134
+    },
+    {
+      "epoch": 0.44107645972991355,
+      "grad_norm": 0.2261926829814911,
+      "learning_rate": 0.00011210587777345272,
+      "loss": 0.9877,
+      "step": 1135
+    },
+    {
+      "epoch": 0.44146507335082097,
+      "grad_norm": 0.2264915257692337,
+      "learning_rate": 0.00011202802646944337,
+      "loss": 0.9887,
+      "step": 1136
+    },
+    {
+      "epoch": 0.44185368697172833,
+      "grad_norm": 0.21853779256343842,
+      "learning_rate": 0.00011195017516543402,
+      "loss": 1.0535,
+      "step": 1137
+    },
+    {
+      "epoch": 0.44224230059263575,
+      "grad_norm": 0.21332694590091705,
+      "learning_rate": 0.00011187232386142467,
+      "loss": 1.0824,
+      "step": 1138
+    },
+    {
+      "epoch": 0.44263091421354317,
+      "grad_norm": 0.21350236237049103,
+      "learning_rate": 0.00011179447255741535,
+      "loss": 1.0758,
+      "step": 1139
+    },
+    {
+      "epoch": 0.4430195278344506,
+      "grad_norm": 0.21305765211582184,
+      "learning_rate": 0.000111716621253406,
+      "loss": 1.035,
+      "step": 1140
+    },
+    {
+      "epoch": 0.443408141455358,
+      "grad_norm": 0.20486389100551605,
+      "learning_rate": 0.00011163876994939666,
+      "loss": 1.0413,
+      "step": 1141
+    },
+    {
+      "epoch": 0.4437967550762654,
+      "grad_norm": 0.19255472719669342,
+      "learning_rate": 0.00011156091864538731,
+      "loss": 0.9583,
+      "step": 1142
+    },
+    {
+      "epoch": 0.44418536869717284,
+      "grad_norm": 0.19824008643627167,
+      "learning_rate": 0.00011148306734137796,
+      "loss": 1.0331,
+      "step": 1143
+    },
+    {
+      "epoch": 0.44457398231808026,
+      "grad_norm": 0.20308080315589905,
+      "learning_rate": 0.00011140521603736863,
+      "loss": 1.0399,
+      "step": 1144
+    },
+    {
+      "epoch": 0.4449625959389877,
+      "grad_norm": 0.2193964123725891,
+      "learning_rate": 0.00011132736473335929,
+      "loss": 1.063,
+      "step": 1145
+    },
+    {
+      "epoch": 0.4453512095598951,
+      "grad_norm": 0.2151576578617096,
+      "learning_rate": 0.00011124951342934994,
+      "loss": 1.0795,
+      "step": 1146
+    },
+    {
+      "epoch": 0.44573982318080246,
+      "grad_norm": 0.23056697845458984,
+      "learning_rate": 0.00011117166212534061,
+      "loss": 1.0351,
+      "step": 1147
+    },
+    {
+      "epoch": 0.4461284368017099,
+      "grad_norm": 0.1973094493150711,
+      "learning_rate": 0.00011109381082133126,
+      "loss": 0.9866,
+      "step": 1148
+    },
+    {
+      "epoch": 0.4465170504226173,
+      "grad_norm": 0.2119562178850174,
+      "learning_rate": 0.00011101595951732191,
+      "loss": 1.0591,
+      "step": 1149
+    },
+    {
+      "epoch": 0.4469056640435247,
+      "grad_norm": 0.20407763123512268,
+      "learning_rate": 0.00011093810821331259,
+      "loss": 0.988,
+      "step": 1150
+    },
+    {
+      "epoch": 0.44729427766443214,
+      "grad_norm": 0.19474107027053833,
+      "learning_rate": 0.00011086025690930324,
+      "loss": 0.9729,
+      "step": 1151
+    },
+    {
+      "epoch": 0.44768289128533956,
+      "grad_norm": 0.2179928421974182,
+      "learning_rate": 0.0001107824056052939,
+      "loss": 1.0558,
+      "step": 1152
+    },
+    {
+      "epoch": 0.448071504906247,
+      "grad_norm": 0.44306451082229614,
+      "learning_rate": 0.00011070455430128455,
+      "loss": 1.0901,
+      "step": 1153
+    },
+    {
+      "epoch": 0.4484601185271544,
+      "grad_norm": 0.22060540318489075,
+      "learning_rate": 0.0001106267029972752,
+      "loss": 1.0009,
+      "step": 1154
+    },
+    {
+      "epoch": 0.4488487321480618,
+      "grad_norm": 0.20534972846508026,
+      "learning_rate": 0.00011054885169326588,
+      "loss": 0.9741,
+      "step": 1155
+    },
+    {
+      "epoch": 0.4492373457689692,
+      "grad_norm": 0.19488993287086487,
+      "learning_rate": 0.00011047100038925653,
+      "loss": 1.0,
+      "step": 1156
+    },
+    {
+      "epoch": 0.4496259593898766,
+      "grad_norm": 0.20462395250797272,
+      "learning_rate": 0.00011039314908524718,
+      "loss": 1.0309,
+      "step": 1157
+    },
+    {
+      "epoch": 0.450014573010784,
+      "grad_norm": 0.2170749306678772,
+      "learning_rate": 0.00011031529778123784,
+      "loss": 1.0726,
+      "step": 1158
+    },
+    {
+      "epoch": 0.45040318663169143,
+      "grad_norm": 0.2066730111837387,
+      "learning_rate": 0.00011023744647722849,
+      "loss": 1.0227,
+      "step": 1159
+    },
+    {
+      "epoch": 0.45079180025259885,
+      "grad_norm": 0.20625676214694977,
+      "learning_rate": 0.00011015959517321917,
+      "loss": 1.0287,
+      "step": 1160
+    },
+    {
+      "epoch": 0.45118041387350627,
+      "grad_norm": 0.19483047723770142,
+      "learning_rate": 0.00011008174386920982,
+      "loss": 0.9639,
+      "step": 1161
+    },
+    {
+      "epoch": 0.4515690274944137,
+      "grad_norm": 0.24705417454242706,
+      "learning_rate": 0.00011000389256520047,
+      "loss": 0.9903,
+      "step": 1162
+    },
+    {
+      "epoch": 0.4519576411153211,
+      "grad_norm": 0.2109205424785614,
+      "learning_rate": 0.00010992604126119112,
+      "loss": 1.054,
+      "step": 1163
+    },
+    {
+      "epoch": 0.4523462547362285,
+      "grad_norm": 0.20904991030693054,
+      "learning_rate": 0.00010984818995718178,
+      "loss": 1.0416,
+      "step": 1164
+    },
+    {
+      "epoch": 0.45273486835713594,
+      "grad_norm": 0.19841328263282776,
+      "learning_rate": 0.00010977033865317245,
+      "loss": 0.9986,
+      "step": 1165
+    },
+    {
+      "epoch": 0.4531234819780433,
+      "grad_norm": 0.20545506477355957,
+      "learning_rate": 0.0001096924873491631,
+      "loss": 1.0337,
+      "step": 1166
+    },
+    {
+      "epoch": 0.4535120955989507,
+      "grad_norm": 0.208644837141037,
+      "learning_rate": 0.00010961463604515376,
+      "loss": 1.0304,
+      "step": 1167
+    },
+    {
+      "epoch": 0.45390070921985815,
+      "grad_norm": 0.2111911028623581,
+      "learning_rate": 0.00010953678474114441,
+      "loss": 1.0398,
+      "step": 1168
+    },
+    {
+      "epoch": 0.45428932284076556,
+      "grad_norm": 0.2600184381008148,
+      "learning_rate": 0.00010945893343713506,
+      "loss": 1.0509,
+      "step": 1169
+    },
+    {
+      "epoch": 0.454677936461673,
+      "grad_norm": 0.2059030532836914,
+      "learning_rate": 0.00010938108213312574,
+      "loss": 0.9347,
+      "step": 1170
+    },
+    {
+      "epoch": 0.4550665500825804,
+      "grad_norm": 0.19232551753520966,
+      "learning_rate": 0.0001093032308291164,
+      "loss": 1.0162,
+      "step": 1171
+    },
+    {
+      "epoch": 0.4554551637034878,
+      "grad_norm": 0.19147330522537231,
+      "learning_rate": 0.00010922537952510705,
+      "loss": 0.9872,
+      "step": 1172
+    },
+    {
+      "epoch": 0.45584377732439524,
+      "grad_norm": 0.2599676251411438,
+      "learning_rate": 0.00010914752822109771,
+      "loss": 1.0402,
+      "step": 1173
+    },
+    {
+      "epoch": 0.45623239094530266,
+      "grad_norm": 0.2159397304058075,
+      "learning_rate": 0.00010906967691708836,
+      "loss": 1.0411,
+      "step": 1174
+    },
+    {
+      "epoch": 0.45662100456621,
+      "grad_norm": 0.23864266276359558,
+      "learning_rate": 0.00010899182561307903,
+      "loss": 1.054,
+      "step": 1175
+    },
+    {
+      "epoch": 0.45700961818711744,
+      "grad_norm": 0.2027217596769333,
+      "learning_rate": 0.0001089139743090697,
+      "loss": 0.9713,
+      "step": 1176
+    },
+    {
+      "epoch": 0.45739823180802486,
+      "grad_norm": 0.1837588995695114,
+      "learning_rate": 0.00010883612300506035,
+      "loss": 0.9698,
+      "step": 1177
+    },
+    {
+      "epoch": 0.4577868454289323,
+      "grad_norm": 0.20038527250289917,
+      "learning_rate": 0.000108758271701051,
+      "loss": 1.0456,
+      "step": 1178
+    },
+    {
+      "epoch": 0.4581754590498397,
+      "grad_norm": 0.21525044739246368,
+      "learning_rate": 0.00010868042039704165,
+      "loss": 1.021,
+      "step": 1179
+    },
+    {
+      "epoch": 0.4585640726707471,
+      "grad_norm": 0.18813730776309967,
+      "learning_rate": 0.0001086025690930323,
+      "loss": 0.9673,
+      "step": 1180
+    },
+    {
+      "epoch": 0.45895268629165453,
+      "grad_norm": 0.2056179642677307,
+      "learning_rate": 0.00010852471778902298,
+      "loss": 1.0119,
+      "step": 1181
+    },
+    {
+      "epoch": 0.45934129991256195,
+      "grad_norm": 0.21599683165550232,
+      "learning_rate": 0.00010844686648501363,
+      "loss": 1.0537,
+      "step": 1182
+    },
+    {
+      "epoch": 0.45972991353346937,
+      "grad_norm": 0.19750265777111053,
+      "learning_rate": 0.00010836901518100429,
+      "loss": 1.0203,
+      "step": 1183
+    },
+    {
+      "epoch": 0.4601185271543768,
+      "grad_norm": 0.22186161577701569,
+      "learning_rate": 0.00010829116387699494,
+      "loss": 1.0583,
+      "step": 1184
+    },
+    {
+      "epoch": 0.46050714077528415,
+      "grad_norm": 0.2109905481338501,
+      "learning_rate": 0.00010821331257298559,
+      "loss": 1.0022,
+      "step": 1185
+    },
+    {
+      "epoch": 0.46089575439619157,
+      "grad_norm": 0.2032858431339264,
+      "learning_rate": 0.00010813546126897627,
+      "loss": 0.9774,
+      "step": 1186
+    },
+    {
+      "epoch": 0.461284368017099,
+      "grad_norm": 0.20381197333335876,
+      "learning_rate": 0.00010805760996496692,
+      "loss": 0.9768,
+      "step": 1187
+    },
+    {
+      "epoch": 0.4616729816380064,
+      "grad_norm": 0.20488987863063812,
+      "learning_rate": 0.00010797975866095757,
+      "loss": 1.0448,
+      "step": 1188
+    },
+    {
+      "epoch": 0.4620615952589138,
+      "grad_norm": 0.20257477462291718,
+      "learning_rate": 0.00010790190735694823,
+      "loss": 1.0157,
+      "step": 1189
+    },
+    {
+      "epoch": 0.46245020887982125,
+      "grad_norm": 0.20761239528656006,
+      "learning_rate": 0.00010782405605293888,
+      "loss": 1.0328,
+      "step": 1190
+    },
+    {
+      "epoch": 0.46283882250072866,
+      "grad_norm": 0.22062581777572632,
+      "learning_rate": 0.00010774620474892956,
+      "loss": 1.0362,
+      "step": 1191
+    },
+    {
+      "epoch": 0.4632274361216361,
+      "grad_norm": 0.19970272481441498,
+      "learning_rate": 0.00010766835344492021,
+      "loss": 1.0783,
+      "step": 1192
+    },
+    {
+      "epoch": 0.4636160497425435,
+      "grad_norm": 0.2221893072128296,
+      "learning_rate": 0.00010759050214091086,
+      "loss": 1.0136,
+      "step": 1193
+    },
+    {
+      "epoch": 0.46400466336345086,
+      "grad_norm": 0.2124665081501007,
+      "learning_rate": 0.00010751265083690151,
+      "loss": 1.0528,
+      "step": 1194
+    },
+    {
+      "epoch": 0.4643932769843583,
+      "grad_norm": 0.2001204937696457,
+      "learning_rate": 0.00010743479953289218,
+      "loss": 1.0495,
+      "step": 1195
+    },
+    {
+      "epoch": 0.4647818906052657,
+      "grad_norm": 0.20979635417461395,
+      "learning_rate": 0.00010735694822888284,
+      "loss": 1.0664,
+      "step": 1196
+    },
+    {
+      "epoch": 0.4651705042261731,
+      "grad_norm": 0.190982848405838,
+      "learning_rate": 0.0001072790969248735,
+      "loss": 1.0256,
+      "step": 1197
+    },
+    {
+      "epoch": 0.46555911784708054,
+      "grad_norm": 0.19910745322704315,
+      "learning_rate": 0.00010720124562086415,
+      "loss": 1.0263,
+      "step": 1198
+    },
+    {
+      "epoch": 0.46594773146798796,
+      "grad_norm": 0.21624085307121277,
+      "learning_rate": 0.00010712339431685481,
+      "loss": 1.0768,
+      "step": 1199
+    },
+    {
+      "epoch": 0.4663363450888954,
+      "grad_norm": 0.20857703685760498,
+      "learning_rate": 0.00010704554301284547,
+      "loss": 1.0892,
+      "step": 1200
+    },
+    {
+      "epoch": 0.4667249587098028,
+      "grad_norm": 0.21897061169147491,
+      "learning_rate": 0.00010696769170883613,
+      "loss": 1.0873,
+      "step": 1201
+    },
+    {
+      "epoch": 0.4671135723307102,
+      "grad_norm": 0.1943386346101761,
+      "learning_rate": 0.0001068898404048268,
+      "loss": 1.0116,
+      "step": 1202
+    },
+    {
+      "epoch": 0.4675021859516176,
+      "grad_norm": 0.22607874870300293,
+      "learning_rate": 0.00010681198910081745,
+      "loss": 1.0328,
+      "step": 1203
+    },
+    {
+      "epoch": 0.467890799572525,
+      "grad_norm": 0.1898999959230423,
+      "learning_rate": 0.0001067341377968081,
+      "loss": 0.9791,
+      "step": 1204
+    },
+    {
+      "epoch": 0.4682794131934324,
+      "grad_norm": 0.2193334400653839,
+      "learning_rate": 0.00010665628649279875,
+      "loss": 1.0742,
+      "step": 1205
+    },
+    {
+      "epoch": 0.46866802681433983,
+      "grad_norm": 0.2096349149942398,
+      "learning_rate": 0.00010657843518878943,
+      "loss": 1.0683,
+      "step": 1206
+    },
+    {
+      "epoch": 0.46905664043524725,
+      "grad_norm": 0.2040576934814453,
+      "learning_rate": 0.00010650058388478008,
+      "loss": 1.0516,
+      "step": 1207
+    },
+    {
+      "epoch": 0.46944525405615467,
+      "grad_norm": 0.20619645714759827,
+      "learning_rate": 0.00010642273258077074,
+      "loss": 1.0429,
+      "step": 1208
+    },
+    {
+      "epoch": 0.4698338676770621,
+      "grad_norm": 0.19753660261631012,
+      "learning_rate": 0.00010634488127676139,
+      "loss": 1.0268,
+      "step": 1209
+    },
+    {
+      "epoch": 0.4702224812979695,
+      "grad_norm": 0.2201426476240158,
+      "learning_rate": 0.00010626702997275204,
+      "loss": 1.0879,
+      "step": 1210
+    },
+    {
+      "epoch": 0.4706110949188769,
+      "grad_norm": 0.21307805180549622,
+      "learning_rate": 0.00010618917866874272,
+      "loss": 1.0186,
+      "step": 1211
+    },
+    {
+      "epoch": 0.47099970853978435,
+      "grad_norm": 0.21142373979091644,
+      "learning_rate": 0.00010611132736473337,
+      "loss": 1.0417,
+      "step": 1212
+    },
+    {
+      "epoch": 0.4713883221606917,
+      "grad_norm": 0.20523706078529358,
+      "learning_rate": 0.00010603347606072402,
+      "loss": 1.0372,
+      "step": 1213
+    },
+    {
+      "epoch": 0.4717769357815991,
+      "grad_norm": 0.19843094050884247,
+      "learning_rate": 0.00010595562475671468,
+      "loss": 1.0062,
+      "step": 1214
+    },
+    {
+      "epoch": 0.47216554940250655,
+      "grad_norm": 0.2146739959716797,
+      "learning_rate": 0.00010587777345270533,
+      "loss": 1.0528,
+      "step": 1215
+    },
+    {
+      "epoch": 0.47255416302341396,
+      "grad_norm": 0.2136303037405014,
+      "learning_rate": 0.00010579992214869601,
+      "loss": 1.0521,
+      "step": 1216
+    },
+    {
+      "epoch": 0.4729427766443214,
+      "grad_norm": 0.21379397809505463,
+      "learning_rate": 0.00010572207084468666,
+      "loss": 1.0362,
+      "step": 1217
+    },
+    {
+      "epoch": 0.4733313902652288,
+      "grad_norm": 0.20459088683128357,
+      "learning_rate": 0.00010564421954067731,
+      "loss": 1.0455,
+      "step": 1218
+    },
+    {
+      "epoch": 0.4737200038861362,
+      "grad_norm": 0.20667988061904907,
+      "learning_rate": 0.00010556636823666796,
+      "loss": 1.0284,
+      "step": 1219
+    },
+    {
+      "epoch": 0.47410861750704364,
+      "grad_norm": 0.21820449829101562,
+      "learning_rate": 0.00010548851693265862,
+      "loss": 1.0584,
+      "step": 1220
+    },
+    {
+      "epoch": 0.47449723112795106,
+      "grad_norm": 0.19705156981945038,
+      "learning_rate": 0.00010541066562864928,
+      "loss": 1.004,
+      "step": 1221
+    },
+    {
+      "epoch": 0.4748858447488584,
+      "grad_norm": 0.19806528091430664,
+      "learning_rate": 0.00010533281432463995,
+      "loss": 1.0519,
+      "step": 1222
+    },
+    {
+      "epoch": 0.47527445836976584,
+      "grad_norm": 0.2006833702325821,
+      "learning_rate": 0.0001052549630206306,
+      "loss": 1.0119,
+      "step": 1223
+    },
+    {
+      "epoch": 0.47566307199067326,
+      "grad_norm": 0.21757058799266815,
+      "learning_rate": 0.00010517711171662125,
+      "loss": 1.0961,
+      "step": 1224
+    },
+    {
+      "epoch": 0.4760516856115807,
+      "grad_norm": 0.2015775889158249,
+      "learning_rate": 0.00010509926041261192,
+      "loss": 1.0419,
+      "step": 1225
+    },
+    {
+      "epoch": 0.4764402992324881,
+      "grad_norm": 0.19691923260688782,
+      "learning_rate": 0.00010502140910860257,
+      "loss": 1.0555,
+      "step": 1226
+    },
+    {
+      "epoch": 0.4768289128533955,
+      "grad_norm": 0.19924800097942352,
+      "learning_rate": 0.00010494355780459323,
+      "loss": 1.0106,
+      "step": 1227
+    },
+    {
+      "epoch": 0.47721752647430293,
+      "grad_norm": 0.21416346728801727,
+      "learning_rate": 0.0001048657065005839,
+      "loss": 1.0741,
+      "step": 1228
+    },
+    {
+      "epoch": 0.47760614009521035,
+      "grad_norm": 0.21823547780513763,
+      "learning_rate": 0.00010478785519657455,
+      "loss": 1.023,
+      "step": 1229
+    },
+    {
+      "epoch": 0.47799475371611777,
+      "grad_norm": 0.2083735466003418,
+      "learning_rate": 0.0001047100038925652,
+      "loss": 1.0424,
+      "step": 1230
+    },
+    {
+      "epoch": 0.4783833673370252,
+      "grad_norm": 0.2219141572713852,
+      "learning_rate": 0.00010463215258855586,
+      "loss": 1.0839,
+      "step": 1231
+    },
+    {
+      "epoch": 0.47877198095793255,
+      "grad_norm": 0.21334600448608398,
+      "learning_rate": 0.00010455430128454653,
+      "loss": 0.9888,
+      "step": 1232
+    },
+    {
+      "epoch": 0.47916059457883997,
+      "grad_norm": 0.2140086442232132,
+      "learning_rate": 0.00010447644998053719,
+      "loss": 1.0119,
+      "step": 1233
+    },
+    {
+      "epoch": 0.4795492081997474,
+      "grad_norm": 0.25360551476478577,
+      "learning_rate": 0.00010439859867652784,
+      "loss": 1.0026,
+      "step": 1234
+    },
+    {
+      "epoch": 0.4799378218206548,
+      "grad_norm": 0.20200380682945251,
+      "learning_rate": 0.00010432074737251849,
+      "loss": 1.0,
+      "step": 1235
+    },
+    {
+      "epoch": 0.4803264354415622,
+      "grad_norm": 0.22641289234161377,
+      "learning_rate": 0.00010424289606850914,
+      "loss": 1.1022,
+      "step": 1236
+    },
+    {
+      "epoch": 0.48071504906246965,
+      "grad_norm": 0.20538561046123505,
+      "learning_rate": 0.00010416504476449982,
+      "loss": 0.9847,
+      "step": 1237
+    },
+    {
+      "epoch": 0.48110366268337706,
+      "grad_norm": 0.206883504986763,
+      "learning_rate": 0.00010408719346049047,
+      "loss": 1.0152,
+      "step": 1238
+    },
+    {
+      "epoch": 0.4814922763042845,
+      "grad_norm": 0.21584320068359375,
+      "learning_rate": 0.00010400934215648113,
+      "loss": 1.0361,
+      "step": 1239
+    },
+    {
+      "epoch": 0.4818808899251919,
+      "grad_norm": 0.20963703095912933,
+      "learning_rate": 0.00010393149085247178,
+      "loss": 1.0814,
+      "step": 1240
+    },
+    {
+      "epoch": 0.48226950354609927,
+      "grad_norm": 0.1965872198343277,
+      "learning_rate": 0.00010385363954846243,
+      "loss": 1.0365,
+      "step": 1241
+    },
+    {
+      "epoch": 0.4826581171670067,
+      "grad_norm": 0.2030191719532013,
+      "learning_rate": 0.00010377578824445311,
+      "loss": 1.0374,
+      "step": 1242
+    },
+    {
+      "epoch": 0.4830467307879141,
+      "grad_norm": 0.21448804438114166,
+      "learning_rate": 0.00010369793694044376,
+      "loss": 0.9686,
+      "step": 1243
+    },
+    {
+      "epoch": 0.4834353444088215,
+      "grad_norm": 0.2181752622127533,
+      "learning_rate": 0.00010362008563643441,
+      "loss": 1.0812,
+      "step": 1244
+    },
+    {
+      "epoch": 0.48382395802972894,
+      "grad_norm": 0.19887101650238037,
+      "learning_rate": 0.00010354223433242507,
+      "loss": 1.036,
+      "step": 1245
+    },
+    {
+      "epoch": 0.48421257165063636,
+      "grad_norm": 0.19007287919521332,
+      "learning_rate": 0.00010346438302841572,
+      "loss": 1.0292,
+      "step": 1246
+    },
+    {
+      "epoch": 0.4846011852715438,
+      "grad_norm": 0.21390347182750702,
+      "learning_rate": 0.0001033865317244064,
+      "loss": 1.0284,
+      "step": 1247
+    },
+    {
+      "epoch": 0.4849897988924512,
+      "grad_norm": 0.23822663724422455,
+      "learning_rate": 0.00010330868042039705,
+      "loss": 1.1044,
+      "step": 1248
+    },
+    {
+      "epoch": 0.4853784125133586,
+      "grad_norm": 0.20779070258140564,
+      "learning_rate": 0.0001032308291163877,
+      "loss": 1.0475,
+      "step": 1249
+    },
+    {
+      "epoch": 0.48576702613426603,
+      "grad_norm": 0.19232134521007538,
+      "learning_rate": 0.00010315297781237835,
+      "loss": 0.9945,
+      "step": 1250
+    },
+    {
+      "epoch": 0.4861556397551734,
+      "grad_norm": 0.22378556430339813,
+      "learning_rate": 0.00010307512650836902,
+      "loss": 1.0462,
+      "step": 1251
+    },
+    {
+      "epoch": 0.4865442533760808,
+      "grad_norm": 0.22156798839569092,
+      "learning_rate": 0.00010299727520435968,
+      "loss": 1.051,
+      "step": 1252
+    },
+    {
+      "epoch": 0.48693286699698823,
+      "grad_norm": 0.19885733723640442,
+      "learning_rate": 0.00010291942390035034,
+      "loss": 1.0593,
+      "step": 1253
+    },
+    {
+      "epoch": 0.48732148061789565,
+      "grad_norm": 0.2172418236732483,
+      "learning_rate": 0.000102841572596341,
+      "loss": 1.0513,
+      "step": 1254
+    },
+    {
+      "epoch": 0.48771009423880307,
+      "grad_norm": 0.22136956453323364,
+      "learning_rate": 0.00010276372129233165,
+      "loss": 1.0438,
+      "step": 1255
+    },
+    {
+      "epoch": 0.4880987078597105,
+      "grad_norm": 0.21337302029132843,
+      "learning_rate": 0.0001026858699883223,
+      "loss": 1.0551,
+      "step": 1256
+    },
+    {
+      "epoch": 0.4884873214806179,
+      "grad_norm": 0.21376267075538635,
+      "learning_rate": 0.00010260801868431296,
+      "loss": 1.054,
+      "step": 1257
+    },
+    {
+      "epoch": 0.4888759351015253,
+      "grad_norm": 0.19498860836029053,
+      "learning_rate": 0.00010253016738030364,
+      "loss": 1.0045,
+      "step": 1258
+    },
+    {
+      "epoch": 0.48926454872243275,
+      "grad_norm": 0.22354961931705475,
+      "learning_rate": 0.00010245231607629429,
+      "loss": 1.096,
+      "step": 1259
+    },
+    {
+      "epoch": 0.4896531623433401,
+      "grad_norm": 0.2078939527273178,
+      "learning_rate": 0.00010237446477228494,
+      "loss": 1.0102,
+      "step": 1260
+    },
+    {
+      "epoch": 0.49004177596424753,
+      "grad_norm": 0.20992495119571686,
+      "learning_rate": 0.00010229661346827559,
+      "loss": 0.9814,
+      "step": 1261
+    },
+    {
+      "epoch": 0.49043038958515495,
+      "grad_norm": 0.2178875207901001,
+      "learning_rate": 0.00010221876216426625,
+      "loss": 1.0489,
+      "step": 1262
+    },
+    {
+      "epoch": 0.49081900320606237,
+      "grad_norm": 0.22152946889400482,
+      "learning_rate": 0.00010214091086025692,
+      "loss": 1.0808,
+      "step": 1263
+    },
+    {
+      "epoch": 0.4912076168269698,
+      "grad_norm": 0.21179009974002838,
+      "learning_rate": 0.00010206305955624758,
+      "loss": 1.0323,
+      "step": 1264
+    },
+    {
+      "epoch": 0.4915962304478772,
+      "grad_norm": 0.2126997411251068,
+      "learning_rate": 0.00010198520825223823,
+      "loss": 1.0093,
+      "step": 1265
+    },
+    {
+      "epoch": 0.4919848440687846,
+      "grad_norm": 0.20912809669971466,
+      "learning_rate": 0.00010190735694822888,
+      "loss": 1.0343,
+      "step": 1266
+    },
+    {
+      "epoch": 0.49237345768969204,
+      "grad_norm": 0.2231636494398117,
+      "learning_rate": 0.00010182950564421953,
+      "loss": 1.0587,
+      "step": 1267
+    },
+    {
+      "epoch": 0.49276207131059946,
+      "grad_norm": 0.1954583376646042,
+      "learning_rate": 0.00010175165434021021,
+      "loss": 0.9566,
+      "step": 1268
+    },
+    {
+      "epoch": 0.4931506849315068,
+      "grad_norm": 0.20520909130573273,
+      "learning_rate": 0.00010167380303620086,
+      "loss": 1.024,
+      "step": 1269
+    },
+    {
+      "epoch": 0.49353929855241424,
+      "grad_norm": 0.21736180782318115,
+      "learning_rate": 0.00010159595173219152,
+      "loss": 1.0434,
+      "step": 1270
+    },
+    {
+      "epoch": 0.49392791217332166,
+      "grad_norm": 0.2360561490058899,
+      "learning_rate": 0.00010151810042818217,
+      "loss": 1.114,
+      "step": 1271
+    },
+    {
+      "epoch": 0.4943165257942291,
+      "grad_norm": 0.20595967769622803,
+      "learning_rate": 0.00010144024912417282,
+      "loss": 0.9909,
+      "step": 1272
+    },
+    {
+      "epoch": 0.4947051394151365,
+      "grad_norm": 0.2161860466003418,
+      "learning_rate": 0.0001013623978201635,
+      "loss": 1.0536,
+      "step": 1273
+    },
+    {
+      "epoch": 0.4950937530360439,
+      "grad_norm": 0.19852355122566223,
+      "learning_rate": 0.00010128454651615415,
+      "loss": 1.0001,
+      "step": 1274
+    },
+    {
+      "epoch": 0.49548236665695133,
+      "grad_norm": 0.21081402897834778,
+      "learning_rate": 0.0001012066952121448,
+      "loss": 1.0151,
+      "step": 1275
+    },
+    {
+      "epoch": 0.49587098027785875,
+      "grad_norm": 0.2053362876176834,
+      "learning_rate": 0.00010112884390813547,
+      "loss": 1.018,
+      "step": 1276
+    },
+    {
+      "epoch": 0.49625959389876617,
+      "grad_norm": 0.21205593645572662,
+      "learning_rate": 0.00010105099260412612,
+      "loss": 0.9912,
+      "step": 1277
+    },
+    {
+      "epoch": 0.4966482075196736,
+      "grad_norm": 0.2005016952753067,
+      "learning_rate": 0.00010097314130011679,
+      "loss": 1.0069,
+      "step": 1278
+    },
+    {
+      "epoch": 0.49703682114058095,
+      "grad_norm": 0.21688181161880493,
+      "learning_rate": 0.00010089528999610744,
+      "loss": 1.0364,
+      "step": 1279
+    },
+    {
+      "epoch": 0.49742543476148837,
+      "grad_norm": 0.20582237839698792,
+      "learning_rate": 0.0001008174386920981,
+      "loss": 1.0138,
+      "step": 1280
+    },
+    {
+      "epoch": 0.4978140483823958,
+      "grad_norm": 0.20824448764324188,
+      "learning_rate": 0.00010073958738808876,
+      "loss": 0.9941,
+      "step": 1281
+    },
+    {
+      "epoch": 0.4982026620033032,
+      "grad_norm": 0.20749075710773468,
+      "learning_rate": 0.00010066173608407941,
+      "loss": 1.0478,
+      "step": 1282
+    },
+    {
+      "epoch": 0.49859127562421063,
+      "grad_norm": 0.20012183487415314,
+      "learning_rate": 0.00010058388478007009,
+      "loss": 0.995,
+      "step": 1283
+    },
+    {
+      "epoch": 0.49897988924511805,
+      "grad_norm": 0.20275959372520447,
+      "learning_rate": 0.00010050603347606074,
+      "loss": 1.097,
+      "step": 1284
+    },
+    {
+      "epoch": 0.49936850286602547,
+      "grad_norm": 0.19588243961334229,
+      "learning_rate": 0.00010042818217205139,
+      "loss": 1.0,
+      "step": 1285
+    },
+    {
+      "epoch": 0.4997571164869329,
+      "grad_norm": 0.20693185925483704,
+      "learning_rate": 0.00010035033086804204,
+      "loss": 1.0527,
+      "step": 1286
+    },
+    {
+      "epoch": 0.5001457301078402,
+      "grad_norm": 0.20330573618412018,
+      "learning_rate": 0.0001002724795640327,
+      "loss": 1.0137,
+      "step": 1287
+    },
+    {
+      "epoch": 0.5005343437287477,
+      "grad_norm": 0.19123876094818115,
+      "learning_rate": 0.00010019462826002337,
+      "loss": 0.9688,
+      "step": 1288
+    },
+    {
+      "epoch": 0.5009229573496551,
+      "grad_norm": 0.2184276431798935,
+      "learning_rate": 0.00010011677695601403,
+      "loss": 1.0367,
+      "step": 1289
+    },
+    {
+      "epoch": 0.5013115709705626,
+      "grad_norm": 0.21642108261585236,
+      "learning_rate": 0.00010003892565200468,
+      "loss": 1.102,
+      "step": 1290
+    },
+    {
+      "epoch": 0.5017001845914699,
+      "grad_norm": 0.20351074635982513,
+      "learning_rate": 9.996107434799533e-05,
+      "loss": 1.0327,
+      "step": 1291
+    },
+    {
+      "epoch": 0.5020887982123774,
+      "grad_norm": 0.22771553695201874,
+      "learning_rate": 9.9883223043986e-05,
+      "loss": 1.104,
+      "step": 1292
+    },
+    {
+      "epoch": 0.5024774118332848,
+      "grad_norm": 0.2271403968334198,
+      "learning_rate": 9.980537173997665e-05,
+      "loss": 1.1313,
+      "step": 1293
+    },
+    {
+      "epoch": 0.5028660254541921,
+      "grad_norm": 0.2157830148935318,
+      "learning_rate": 9.97275204359673e-05,
+      "loss": 1.0203,
+      "step": 1294
+    },
+    {
+      "epoch": 0.5032546390750996,
+      "grad_norm": 0.19555307924747467,
+      "learning_rate": 9.964966913195797e-05,
+      "loss": 1.0194,
+      "step": 1295
+    },
+    {
+      "epoch": 0.503643252696007,
+      "grad_norm": 0.1898549199104309,
+      "learning_rate": 9.957181782794862e-05,
+      "loss": 1.0034,
+      "step": 1296
+    },
+    {
+      "epoch": 0.5040318663169144,
+      "grad_norm": 0.23555906116962433,
+      "learning_rate": 9.949396652393928e-05,
+      "loss": 1.0298,
+      "step": 1297
+    },
+    {
+      "epoch": 0.5044204799378218,
+      "grad_norm": 0.20434850454330444,
+      "learning_rate": 9.941611521992994e-05,
+      "loss": 0.9999,
+      "step": 1298
+    },
+    {
+      "epoch": 0.5048090935587293,
+      "grad_norm": 0.21015289425849915,
+      "learning_rate": 9.933826391592059e-05,
+      "loss": 1.006,
+      "step": 1299
+    },
+    {
+      "epoch": 0.5051977071796366,
+      "grad_norm": 0.21147851645946503,
+      "learning_rate": 9.926041261191125e-05,
+      "loss": 1.0854,
+      "step": 1300
+    },
+    {
+      "epoch": 0.5055863208005441,
+      "grad_norm": 0.19666944444179535,
+      "learning_rate": 9.91825613079019e-05,
+      "loss": 1.0057,
+      "step": 1301
+    },
+    {
+      "epoch": 0.5059749344214515,
+      "grad_norm": 0.21233728528022766,
+      "learning_rate": 9.910471000389257e-05,
+      "loss": 1.0675,
+      "step": 1302
+    },
+    {
+      "epoch": 0.5063635480423588,
+      "grad_norm": 0.21905581653118134,
+      "learning_rate": 9.902685869988322e-05,
+      "loss": 1.0054,
+      "step": 1303
+    },
+    {
+      "epoch": 0.5067521616632663,
+      "grad_norm": 0.23434993624687195,
+      "learning_rate": 9.894900739587389e-05,
+      "loss": 0.9915,
+      "step": 1304
+    },
+    {
+      "epoch": 0.5071407752841737,
+      "grad_norm": 0.21684227883815765,
+      "learning_rate": 9.887115609186454e-05,
+      "loss": 1.1131,
+      "step": 1305
+    },
+    {
+      "epoch": 0.5075293889050811,
+      "grad_norm": 0.21699552237987518,
+      "learning_rate": 9.87933047878552e-05,
+      "loss": 1.0782,
+      "step": 1306
+    },
+    {
+      "epoch": 0.5079180025259885,
+      "grad_norm": 0.2218221127986908,
+      "learning_rate": 9.871545348384586e-05,
+      "loss": 1.0388,
+      "step": 1307
+    },
+    {
+      "epoch": 0.508306616146896,
+      "grad_norm": 0.20104359090328217,
+      "learning_rate": 9.863760217983652e-05,
+      "loss": 1.0336,
+      "step": 1308
+    },
+    {
+      "epoch": 0.5086952297678033,
+      "grad_norm": 0.21907050907611847,
+      "learning_rate": 9.855975087582718e-05,
+      "loss": 1.0587,
+      "step": 1309
+    },
+    {
+      "epoch": 0.5090838433887108,
+      "grad_norm": 0.2140391767024994,
+      "learning_rate": 9.848189957181784e-05,
+      "loss": 1.0351,
+      "step": 1310
+    },
+    {
+      "epoch": 0.5094724570096182,
+      "grad_norm": 0.33287563920021057,
+      "learning_rate": 9.84040482678085e-05,
+      "loss": 0.9908,
+      "step": 1311
+    },
+    {
+      "epoch": 0.5098610706305255,
+      "grad_norm": 0.2706705927848816,
+      "learning_rate": 9.832619696379915e-05,
+      "loss": 1.0078,
+      "step": 1312
+    },
+    {
+      "epoch": 0.510249684251433,
+      "grad_norm": 0.20216278731822968,
+      "learning_rate": 9.824834565978981e-05,
+      "loss": 1.0253,
+      "step": 1313
+    },
+    {
+      "epoch": 0.5106382978723404,
+      "grad_norm": 0.20736576616764069,
+      "learning_rate": 9.817049435578046e-05,
+      "loss": 1.0217,
+      "step": 1314
+    },
+    {
+      "epoch": 0.5110269114932479,
+      "grad_norm": 0.2275344580411911,
+      "learning_rate": 9.809264305177113e-05,
+      "loss": 1.0139,
+      "step": 1315
+    },
+    {
+      "epoch": 0.5114155251141552,
+      "grad_norm": 0.22243620455265045,
+      "learning_rate": 9.801479174776178e-05,
+      "loss": 1.0427,
+      "step": 1316
+    },
+    {
+      "epoch": 0.5118041387350627,
+      "grad_norm": 0.198841854929924,
+      "learning_rate": 9.793694044375243e-05,
+      "loss": 1.0231,
+      "step": 1317
+    },
+    {
+      "epoch": 0.5121927523559701,
+      "grad_norm": 0.2031068503856659,
+      "learning_rate": 9.78590891397431e-05,
+      "loss": 1.0184,
+      "step": 1318
+    },
+    {
+      "epoch": 0.5125813659768775,
+      "grad_norm": 0.21712587773799896,
+      "learning_rate": 9.778123783573375e-05,
+      "loss": 1.0205,
+      "step": 1319
+    },
+    {
+      "epoch": 0.5129699795977849,
+      "grad_norm": 0.19366060197353363,
+      "learning_rate": 9.77033865317244e-05,
+      "loss": 0.9623,
+      "step": 1320
+    },
+    {
+      "epoch": 0.5133585932186923,
+      "grad_norm": 0.19845952093601227,
+      "learning_rate": 9.762553522771507e-05,
+      "loss": 1.0209,
+      "step": 1321
+    },
+    {
+      "epoch": 0.5137472068395997,
+      "grad_norm": 0.19700276851654053,
+      "learning_rate": 9.754768392370572e-05,
+      "loss": 0.9506,
+      "step": 1322
+    },
+    {
+      "epoch": 0.5141358204605071,
+      "grad_norm": 0.19797460734844208,
+      "learning_rate": 9.746983261969639e-05,
+      "loss": 1.0928,
+      "step": 1323
+    },
+    {
+      "epoch": 0.5145244340814146,
+      "grad_norm": 0.20470699667930603,
+      "learning_rate": 9.739198131568704e-05,
+      "loss": 1.0835,
+      "step": 1324
+    },
+    {
+      "epoch": 0.5149130477023219,
+      "grad_norm": 0.19121742248535156,
+      "learning_rate": 9.731413001167769e-05,
+      "loss": 0.9877,
+      "step": 1325
+    },
+    {
+      "epoch": 0.5153016613232294,
+      "grad_norm": 0.20026616752147675,
+      "learning_rate": 9.723627870766836e-05,
+      "loss": 1.0094,
+      "step": 1326
+    },
+    {
+      "epoch": 0.5156902749441368,
+      "grad_norm": 0.2214539796113968,
+      "learning_rate": 9.715842740365901e-05,
+      "loss": 0.9867,
+      "step": 1327
+    },
+    {
+      "epoch": 0.5160788885650442,
+      "grad_norm": 0.22674603760242462,
+      "learning_rate": 9.708057609964967e-05,
+      "loss": 1.0738,
+      "step": 1328
+    },
+    {
+      "epoch": 0.5164675021859516,
+      "grad_norm": 0.21274834871292114,
+      "learning_rate": 9.700272479564033e-05,
+      "loss": 1.0458,
+      "step": 1329
+    },
+    {
+      "epoch": 0.5168561158068591,
+      "grad_norm": 0.20305052399635315,
+      "learning_rate": 9.692487349163099e-05,
+      "loss": 1.0041,
+      "step": 1330
+    },
+    {
+      "epoch": 0.5172447294277664,
+      "grad_norm": 0.1840772181749344,
+      "learning_rate": 9.684702218762166e-05,
+      "loss": 0.9498,
+      "step": 1331
+    },
+    {
+      "epoch": 0.5176333430486738,
+      "grad_norm": 0.2055782824754715,
+      "learning_rate": 9.676917088361231e-05,
+      "loss": 1.0223,
+      "step": 1332
+    },
+    {
+      "epoch": 0.5180219566695813,
+      "grad_norm": 0.21826402842998505,
+      "learning_rate": 9.669131957960297e-05,
+      "loss": 1.1068,
+      "step": 1333
+    },
+    {
+      "epoch": 0.5184105702904886,
+      "grad_norm": 0.22516922652721405,
+      "learning_rate": 9.661346827559363e-05,
+      "loss": 1.0957,
+      "step": 1334
+    },
+    {
+      "epoch": 0.5187991839113961,
+      "grad_norm": 0.21044284105300903,
+      "learning_rate": 9.653561697158428e-05,
+      "loss": 1.0384,
+      "step": 1335
+    },
+    {
+      "epoch": 0.5191877975323035,
+      "grad_norm": 0.20275571942329407,
+      "learning_rate": 9.645776566757494e-05,
+      "loss": 0.9978,
+      "step": 1336
+    },
+    {
+      "epoch": 0.519576411153211,
+      "grad_norm": 0.2077122926712036,
+      "learning_rate": 9.63799143635656e-05,
+      "loss": 1.0418,
+      "step": 1337
+    },
+    {
+      "epoch": 0.5199650247741183,
+      "grad_norm": 0.19158867001533508,
+      "learning_rate": 9.630206305955625e-05,
+      "loss": 1.0527,
+      "step": 1338
+    },
+    {
+      "epoch": 0.5203536383950258,
+      "grad_norm": 0.1932496577501297,
+      "learning_rate": 9.622421175554691e-05,
+      "loss": 1.0039,
+      "step": 1339
+    },
+    {
+      "epoch": 0.5207422520159332,
+      "grad_norm": 0.21937766671180725,
+      "learning_rate": 9.614636045153757e-05,
+      "loss": 1.0373,
+      "step": 1340
+    },
+    {
+      "epoch": 0.5211308656368405,
+      "grad_norm": 0.2268432229757309,
+      "learning_rate": 9.606850914752823e-05,
+      "loss": 1.0815,
+      "step": 1341
+    },
+    {
+      "epoch": 0.521519479257748,
+      "grad_norm": 0.2147454470396042,
+      "learning_rate": 9.599065784351888e-05,
+      "loss": 1.0331,
+      "step": 1342
+    },
+    {
+      "epoch": 0.5219080928786554,
+      "grad_norm": 0.19899709522724152,
+      "learning_rate": 9.591280653950954e-05,
+      "loss": 1.032,
+      "step": 1343
+    },
+    {
+      "epoch": 0.5222967064995628,
+      "grad_norm": 0.19646069407463074,
+      "learning_rate": 9.58349552355002e-05,
+      "loss": 0.9788,
+      "step": 1344
+    },
+    {
+      "epoch": 0.5226853201204702,
+      "grad_norm": 0.2146075963973999,
+      "learning_rate": 9.575710393149085e-05,
+      "loss": 1.0201,
+      "step": 1345
+    },
+    {
+      "epoch": 0.5230739337413777,
+      "grad_norm": 0.1968650370836258,
+      "learning_rate": 9.567925262748152e-05,
+      "loss": 0.9894,
+      "step": 1346
+    },
+    {
+      "epoch": 0.523462547362285,
+      "grad_norm": 0.21111296117305756,
+      "learning_rate": 9.560140132347217e-05,
+      "loss": 1.0961,
+      "step": 1347
+    },
+    {
+      "epoch": 0.5238511609831925,
+      "grad_norm": 0.20917272567749023,
+      "learning_rate": 9.552355001946282e-05,
+      "loss": 1.0435,
+      "step": 1348
+    },
+    {
+      "epoch": 0.5242397746040999,
+      "grad_norm": 0.2029752880334854,
+      "learning_rate": 9.544569871545349e-05,
+      "loss": 1.0328,
+      "step": 1349
+    },
+    {
+      "epoch": 0.5246283882250072,
+      "grad_norm": 0.20726613700389862,
+      "learning_rate": 9.536784741144414e-05,
+      "loss": 1.0465,
+      "step": 1350
+    },
+    {
+      "epoch": 0.5250170018459147,
+      "grad_norm": 0.19778740406036377,
+      "learning_rate": 9.52899961074348e-05,
+      "loss": 1.0058,
+      "step": 1351
+    },
+    {
+      "epoch": 0.5254056154668221,
+      "grad_norm": 0.19958540797233582,
+      "learning_rate": 9.521214480342546e-05,
+      "loss": 1.0164,
+      "step": 1352
+    },
+    {
+      "epoch": 0.5257942290877295,
+      "grad_norm": 0.2151395082473755,
+      "learning_rate": 9.513429349941611e-05,
+      "loss": 1.0703,
+      "step": 1353
+    },
+    {
+      "epoch": 0.5261828427086369,
+      "grad_norm": 0.2366979569196701,
+      "learning_rate": 9.505644219540678e-05,
+      "loss": 0.9832,
+      "step": 1354
+    },
+    {
+      "epoch": 0.5265714563295444,
+      "grad_norm": 0.22064165771007538,
+      "learning_rate": 9.497859089139743e-05,
+      "loss": 1.0181,
+      "step": 1355
+    },
+    {
+      "epoch": 0.5269600699504517,
+      "grad_norm": 0.20221936702728271,
+      "learning_rate": 9.49007395873881e-05,
+      "loss": 1.0424,
+      "step": 1356
+    },
+    {
+      "epoch": 0.5273486835713592,
+      "grad_norm": 0.19608759880065918,
+      "learning_rate": 9.482288828337876e-05,
+      "loss": 1.0074,
+      "step": 1357
+    },
+    {
+      "epoch": 0.5277372971922666,
+      "grad_norm": 0.20686689019203186,
+      "learning_rate": 9.474503697936941e-05,
+      "loss": 1.0213,
+      "step": 1358
+    },
+    {
+      "epoch": 0.528125910813174,
+      "grad_norm": 0.223610520362854,
+      "learning_rate": 9.466718567536008e-05,
+      "loss": 1.05,
+      "step": 1359
+    },
+    {
+      "epoch": 0.5285145244340814,
+      "grad_norm": 0.2135966569185257,
+      "learning_rate": 9.458933437135073e-05,
+      "loss": 1.034,
+      "step": 1360
+    },
+    {
+      "epoch": 0.5289031380549888,
+      "grad_norm": 0.1933239996433258,
+      "learning_rate": 9.451148306734138e-05,
+      "loss": 0.9883,
+      "step": 1361
+    },
+    {
+      "epoch": 0.5292917516758963,
+      "grad_norm": 0.20794694125652313,
+      "learning_rate": 9.443363176333205e-05,
+      "loss": 1.0103,
+      "step": 1362
+    },
+    {
+      "epoch": 0.5296803652968036,
+      "grad_norm": 0.20128493010997772,
+      "learning_rate": 9.43557804593227e-05,
+      "loss": 1.015,
+      "step": 1363
+    },
+    {
+      "epoch": 0.5300689789177111,
+      "grad_norm": 0.2128933072090149,
+      "learning_rate": 9.427792915531336e-05,
+      "loss": 1.0038,
+      "step": 1364
+    },
+    {
+      "epoch": 0.5304575925386185,
+      "grad_norm": 0.2046983689069748,
+      "learning_rate": 9.420007785130402e-05,
+      "loss": 0.9948,
+      "step": 1365
+    },
+    {
+      "epoch": 0.5308462061595259,
+      "grad_norm": 0.20909680426120758,
+      "learning_rate": 9.412222654729467e-05,
+      "loss": 1.0308,
+      "step": 1366
+    },
+    {
+      "epoch": 0.5312348197804333,
+      "grad_norm": 0.2182164192199707,
+      "learning_rate": 9.404437524328533e-05,
+      "loss": 1.0018,
+      "step": 1367
+    },
+    {
+      "epoch": 0.5316234334013407,
+      "grad_norm": 0.2107028216123581,
+      "learning_rate": 9.396652393927599e-05,
+      "loss": 1.0419,
+      "step": 1368
+    },
+    {
+      "epoch": 0.5320120470222481,
+      "grad_norm": 0.24631445109844208,
+      "learning_rate": 9.388867263526665e-05,
+      "loss": 1.0171,
+      "step": 1369
+    },
+    {
+      "epoch": 0.5324006606431555,
+      "grad_norm": 0.20331013202667236,
+      "learning_rate": 9.38108213312573e-05,
+      "loss": 1.0592,
+      "step": 1370
+    },
+    {
+      "epoch": 0.532789274264063,
+      "grad_norm": 0.19266058504581451,
+      "learning_rate": 9.373297002724796e-05,
+      "loss": 0.9912,
+      "step": 1371
+    },
+    {
+      "epoch": 0.5331778878849703,
+      "grad_norm": 0.22874227166175842,
+      "learning_rate": 9.365511872323862e-05,
+      "loss": 1.0533,
+      "step": 1372
+    },
+    {
+      "epoch": 0.5335665015058778,
+      "grad_norm": 0.2088235765695572,
+      "learning_rate": 9.357726741922927e-05,
+      "loss": 1.0464,
+      "step": 1373
+    },
+    {
+      "epoch": 0.5339551151267852,
+      "grad_norm": 0.2112397700548172,
+      "learning_rate": 9.349941611521994e-05,
+      "loss": 1.0503,
+      "step": 1374
+    },
+    {
+      "epoch": 0.5343437287476926,
+      "grad_norm": 0.20712170004844666,
+      "learning_rate": 9.342156481121059e-05,
+      "loss": 1.0237,
+      "step": 1375
+    },
+    {
+      "epoch": 0.5347323423686,
+      "grad_norm": 0.20077116787433624,
+      "learning_rate": 9.334371350720124e-05,
+      "loss": 1.0467,
+      "step": 1376
+    },
+    {
+      "epoch": 0.5351209559895075,
+      "grad_norm": 0.20394501090049744,
+      "learning_rate": 9.326586220319191e-05,
+      "loss": 1.0054,
+      "step": 1377
+    },
+    {
+      "epoch": 0.5355095696104148,
+      "grad_norm": 0.19459395110607147,
+      "learning_rate": 9.318801089918256e-05,
+      "loss": 0.9792,
+      "step": 1378
+    },
+    {
+      "epoch": 0.5358981832313222,
+      "grad_norm": 0.2116049826145172,
+      "learning_rate": 9.311015959517321e-05,
+      "loss": 1.0345,
+      "step": 1379
+    },
+    {
+      "epoch": 0.5362867968522297,
+      "grad_norm": 0.21672269701957703,
+      "learning_rate": 9.303230829116388e-05,
+      "loss": 1.0709,
+      "step": 1380
+    },
+    {
+      "epoch": 0.536675410473137,
+      "grad_norm": 0.20358407497406006,
+      "learning_rate": 9.295445698715453e-05,
+      "loss": 1.0534,
+      "step": 1381
+    },
+    {
+      "epoch": 0.5370640240940445,
+      "grad_norm": 0.19512853026390076,
+      "learning_rate": 9.28766056831452e-05,
+      "loss": 0.9397,
+      "step": 1382
+    },
+    {
+      "epoch": 0.5374526377149519,
+      "grad_norm": 0.2140122503042221,
+      "learning_rate": 9.279875437913586e-05,
+      "loss": 1.0164,
+      "step": 1383
+    },
+    {
+      "epoch": 0.5378412513358594,
+      "grad_norm": 0.20486049354076385,
+      "learning_rate": 9.272090307512651e-05,
+      "loss": 0.9892,
+      "step": 1384
+    },
+    {
+      "epoch": 0.5382298649567667,
+      "grad_norm": 0.20023222267627716,
+      "learning_rate": 9.264305177111718e-05,
+      "loss": 1.0019,
+      "step": 1385
+    },
+    {
+      "epoch": 0.5386184785776742,
+      "grad_norm": 0.20024439692497253,
+      "learning_rate": 9.256520046710783e-05,
+      "loss": 0.9717,
+      "step": 1386
+    },
+    {
+      "epoch": 0.5390070921985816,
+      "grad_norm": 0.21021386981010437,
+      "learning_rate": 9.24873491630985e-05,
+      "loss": 1.028,
+      "step": 1387
+    },
+    {
+      "epoch": 0.5393957058194889,
+      "grad_norm": 0.18508704006671906,
+      "learning_rate": 9.240949785908915e-05,
+      "loss": 1.0008,
+      "step": 1388
+    },
+    {
+      "epoch": 0.5397843194403964,
+      "grad_norm": 0.19351208209991455,
+      "learning_rate": 9.23316465550798e-05,
+      "loss": 0.9898,
+      "step": 1389
+    },
+    {
+      "epoch": 0.5401729330613038,
+      "grad_norm": 0.20341919362545013,
+      "learning_rate": 9.225379525107047e-05,
+      "loss": 1.0203,
+      "step": 1390
+    },
+    {
+      "epoch": 0.5405615466822112,
+      "grad_norm": 0.1942797303199768,
+      "learning_rate": 9.217594394706112e-05,
+      "loss": 1.003,
+      "step": 1391
+    },
+    {
+      "epoch": 0.5409501603031186,
+      "grad_norm": 0.2056138813495636,
+      "learning_rate": 9.209809264305178e-05,
+      "loss": 1.0149,
+      "step": 1392
+    },
+    {
+      "epoch": 0.5413387739240261,
+      "grad_norm": 0.21572062373161316,
+      "learning_rate": 9.202024133904244e-05,
+      "loss": 0.9808,
+      "step": 1393
+    },
+    {
+      "epoch": 0.5417273875449334,
+      "grad_norm": 0.19841499626636505,
+      "learning_rate": 9.194239003503309e-05,
+      "loss": 1.0467,
+      "step": 1394
+    },
+    {
+      "epoch": 0.5421160011658409,
+      "grad_norm": 0.20452147722244263,
+      "learning_rate": 9.186453873102375e-05,
+      "loss": 1.0378,
+      "step": 1395
+    },
+    {
+      "epoch": 0.5425046147867483,
+      "grad_norm": 0.2090451419353485,
+      "learning_rate": 9.17866874270144e-05,
+      "loss": 1.0823,
+      "step": 1396
+    },
+    {
+      "epoch": 0.5428932284076556,
+      "grad_norm": 0.215814009308815,
+      "learning_rate": 9.170883612300506e-05,
+      "loss": 1.0994,
+      "step": 1397
+    },
+    {
+      "epoch": 0.5432818420285631,
+      "grad_norm": 0.19924724102020264,
+      "learning_rate": 9.163098481899572e-05,
+      "loss": 1.0099,
+      "step": 1398
+    },
+    {
+      "epoch": 0.5436704556494705,
+      "grad_norm": 0.20074865221977234,
+      "learning_rate": 9.155313351498638e-05,
+      "loss": 1.0163,
+      "step": 1399
+    },
+    {
+      "epoch": 0.544059069270378,
+      "grad_norm": 0.21737203001976013,
+      "learning_rate": 9.147528221097704e-05,
+      "loss": 1.0527,
+      "step": 1400
+    },
+    {
+      "epoch": 0.5444476828912853,
+      "grad_norm": 0.2036885768175125,
+      "learning_rate": 9.139743090696769e-05,
+      "loss": 1.0208,
+      "step": 1401
+    },
+    {
+      "epoch": 0.5448362965121928,
+      "grad_norm": 0.20861585438251495,
+      "learning_rate": 9.131957960295835e-05,
+      "loss": 1.0175,
+      "step": 1402
+    },
+    {
+      "epoch": 0.5452249101331001,
+      "grad_norm": 0.23425570130348206,
+      "learning_rate": 9.124172829894901e-05,
+      "loss": 1.053,
+      "step": 1403
+    },
+    {
+      "epoch": 0.5456135237540076,
+      "grad_norm": 0.20389291644096375,
+      "learning_rate": 9.116387699493966e-05,
+      "loss": 1.0479,
+      "step": 1404
+    },
+    {
+      "epoch": 0.546002137374915,
+      "grad_norm": 0.20166678726673126,
+      "learning_rate": 9.108602569093033e-05,
+      "loss": 1.0064,
+      "step": 1405
+    },
+    {
+      "epoch": 0.5463907509958223,
+      "grad_norm": 0.21419203281402588,
+      "learning_rate": 9.100817438692098e-05,
+      "loss": 1.0122,
+      "step": 1406
+    },
+    {
+      "epoch": 0.5467793646167298,
+      "grad_norm": 0.20541758835315704,
+      "learning_rate": 9.093032308291165e-05,
+      "loss": 1.0355,
+      "step": 1407
+    },
+    {
+      "epoch": 0.5471679782376372,
+      "grad_norm": 0.21865367889404297,
+      "learning_rate": 9.08524717789023e-05,
+      "loss": 1.0201,
+      "step": 1408
+    },
+    {
+      "epoch": 0.5475565918585447,
+      "grad_norm": 0.21181468665599823,
+      "learning_rate": 9.077462047489296e-05,
+      "loss": 1.0501,
+      "step": 1409
+    },
+    {
+      "epoch": 0.547945205479452,
+      "grad_norm": 0.21016767621040344,
+      "learning_rate": 9.069676917088362e-05,
+      "loss": 1.0452,
+      "step": 1410
+    },
+    {
+      "epoch": 0.5483338191003595,
+      "grad_norm": 0.21119755506515503,
+      "learning_rate": 9.061891786687428e-05,
+      "loss": 1.0935,
+      "step": 1411
+    },
+    {
+      "epoch": 0.5487224327212669,
+      "grad_norm": 0.20688095688819885,
+      "learning_rate": 9.054106656286493e-05,
+      "loss": 1.0526,
+      "step": 1412
+    },
+    {
+      "epoch": 0.5491110463421743,
+      "grad_norm": 0.21857528388500214,
+      "learning_rate": 9.04632152588556e-05,
+      "loss": 1.0067,
+      "step": 1413
+    },
+    {
+      "epoch": 0.5494996599630817,
+      "grad_norm": 0.2196548581123352,
+      "learning_rate": 9.038536395484625e-05,
+      "loss": 1.0263,
+      "step": 1414
+    },
+    {
+      "epoch": 0.5498882735839892,
+      "grad_norm": 0.21952040493488312,
+      "learning_rate": 9.03075126508369e-05,
+      "loss": 1.0009,
+      "step": 1415
+    },
+    {
+      "epoch": 0.5502768872048965,
+      "grad_norm": 0.20059294998645782,
+      "learning_rate": 9.022966134682757e-05,
+      "loss": 1.0481,
+      "step": 1416
+    },
+    {
+      "epoch": 0.5506655008258039,
+      "grad_norm": 0.1960824728012085,
+      "learning_rate": 9.015181004281822e-05,
+      "loss": 1.0003,
+      "step": 1417
+    },
+    {
+      "epoch": 0.5510541144467114,
+      "grad_norm": 0.19051724672317505,
+      "learning_rate": 9.007395873880889e-05,
+      "loss": 0.9556,
+      "step": 1418
+    },
+    {
+      "epoch": 0.5514427280676187,
+      "grad_norm": 0.21008028090000153,
+      "learning_rate": 8.999610743479954e-05,
+      "loss": 1.0457,
+      "step": 1419
+    },
+    {
+      "epoch": 0.5518313416885262,
+      "grad_norm": 0.21465444564819336,
+      "learning_rate": 8.991825613079019e-05,
+      "loss": 1.0196,
+      "step": 1420
+    },
+    {
+      "epoch": 0.5522199553094336,
+      "grad_norm": 0.2062770277261734,
+      "learning_rate": 8.984040482678086e-05,
+      "loss": 1.0501,
+      "step": 1421
+    },
+    {
+      "epoch": 0.552608568930341,
+      "grad_norm": 0.21400012075901031,
+      "learning_rate": 8.976255352277151e-05,
+      "loss": 1.0711,
+      "step": 1422
+    },
+    {
+      "epoch": 0.5529971825512484,
+      "grad_norm": 0.19617624580860138,
+      "learning_rate": 8.968470221876217e-05,
+      "loss": 0.9858,
+      "step": 1423
+    },
+    {
+      "epoch": 0.5533857961721559,
+      "grad_norm": 0.20835624635219574,
+      "learning_rate": 8.960685091475283e-05,
+      "loss": 1.0122,
+      "step": 1424
+    },
+    {
+      "epoch": 0.5537744097930632,
+      "grad_norm": 0.21708111464977264,
+      "learning_rate": 8.952899961074348e-05,
+      "loss": 1.0108,
+      "step": 1425
+    },
+    {
+      "epoch": 0.5541630234139706,
+      "grad_norm": 0.20877864956855774,
+      "learning_rate": 8.945114830673414e-05,
+      "loss": 1.0389,
+      "step": 1426
+    },
+    {
+      "epoch": 0.5545516370348781,
+      "grad_norm": 0.1924441158771515,
+      "learning_rate": 8.93732970027248e-05,
+      "loss": 1.0088,
+      "step": 1427
+    },
+    {
+      "epoch": 0.5549402506557854,
+      "grad_norm": 0.20288826525211334,
+      "learning_rate": 8.929544569871546e-05,
+      "loss": 1.0296,
+      "step": 1428
+    },
+    {
+      "epoch": 0.5553288642766929,
+      "grad_norm": 0.2008143663406372,
+      "learning_rate": 8.921759439470611e-05,
+      "loss": 1.0521,
+      "step": 1429
+    },
+    {
+      "epoch": 0.5557174778976003,
+      "grad_norm": 0.24407047033309937,
+      "learning_rate": 8.913974309069677e-05,
+      "loss": 1.1038,
+      "step": 1430
+    },
+    {
+      "epoch": 0.5561060915185078,
+      "grad_norm": 0.2172536998987198,
+      "learning_rate": 8.906189178668743e-05,
+      "loss": 1.0811,
+      "step": 1431
+    },
+    {
+      "epoch": 0.5564947051394151,
+      "grad_norm": 0.21712054312229156,
+      "learning_rate": 8.898404048267808e-05,
+      "loss": 1.0642,
+      "step": 1432
+    },
+    {
+      "epoch": 0.5568833187603226,
+      "grad_norm": 0.22482797503471375,
+      "learning_rate": 8.890618917866875e-05,
+      "loss": 1.0742,
+      "step": 1433
+    },
+    {
+      "epoch": 0.55727193238123,
+      "grad_norm": 0.1974876970052719,
+      "learning_rate": 8.88283378746594e-05,
+      "loss": 0.9954,
+      "step": 1434
+    },
+    {
+      "epoch": 0.5576605460021373,
+      "grad_norm": 0.19162166118621826,
+      "learning_rate": 8.875048657065007e-05,
+      "loss": 1.0074,
+      "step": 1435
+    },
+    {
+      "epoch": 0.5580491596230448,
+      "grad_norm": 0.20439045131206512,
+      "learning_rate": 8.867263526664072e-05,
+      "loss": 1.026,
+      "step": 1436
+    },
+    {
+      "epoch": 0.5584377732439522,
+      "grad_norm": 0.1947651207447052,
+      "learning_rate": 8.859478396263138e-05,
+      "loss": 0.9848,
+      "step": 1437
+    },
+    {
+      "epoch": 0.5588263868648596,
+      "grad_norm": 0.21434316039085388,
+      "learning_rate": 8.851693265862204e-05,
+      "loss": 1.0843,
+      "step": 1438
+    },
+    {
+      "epoch": 0.559215000485767,
+      "grad_norm": 1.3314417600631714,
+      "learning_rate": 8.84390813546127e-05,
+      "loss": 1.0356,
+      "step": 1439
+    },
+    {
+      "epoch": 0.5596036141066745,
+      "grad_norm": 0.20131289958953857,
+      "learning_rate": 8.836123005060335e-05,
+      "loss": 1.0214,
+      "step": 1440
+    },
+    {
+      "epoch": 0.5599922277275818,
+      "grad_norm": 0.21596461534500122,
+      "learning_rate": 8.828337874659402e-05,
+      "loss": 1.0962,
+      "step": 1441
+    },
+    {
+      "epoch": 0.5603808413484893,
+      "grad_norm": 0.20477193593978882,
+      "learning_rate": 8.820552744258467e-05,
+      "loss": 1.0643,
+      "step": 1442
+    },
+    {
+      "epoch": 0.5607694549693967,
+      "grad_norm": 0.1978107988834381,
+      "learning_rate": 8.812767613857532e-05,
+      "loss": 1.0054,
+      "step": 1443
+    },
+    {
+      "epoch": 0.561158068590304,
+      "grad_norm": 0.219422847032547,
+      "learning_rate": 8.804982483456599e-05,
+      "loss": 1.0009,
+      "step": 1444
+    },
+    {
+      "epoch": 0.5615466822112115,
+      "grad_norm": 0.21489015221595764,
+      "learning_rate": 8.797197353055664e-05,
+      "loss": 1.052,
+      "step": 1445
+    },
+    {
+      "epoch": 0.5619352958321189,
+      "grad_norm": 0.2235930860042572,
+      "learning_rate": 8.78941222265473e-05,
+      "loss": 1.037,
+      "step": 1446
+    },
+    {
+      "epoch": 0.5623239094530263,
+      "grad_norm": 0.19922038912773132,
+      "learning_rate": 8.781627092253796e-05,
+      "loss": 1.0006,
+      "step": 1447
+    },
+    {
+      "epoch": 0.5627125230739337,
+      "grad_norm": 0.24740247428417206,
+      "learning_rate": 8.773841961852861e-05,
+      "loss": 1.0753,
+      "step": 1448
+    },
+    {
+      "epoch": 0.5631011366948412,
+      "grad_norm": 0.2148803174495697,
+      "learning_rate": 8.766056831451928e-05,
+      "loss": 1.0712,
+      "step": 1449
+    },
+    {
+      "epoch": 0.5634897503157485,
+      "grad_norm": 0.19838745892047882,
+      "learning_rate": 8.758271701050993e-05,
+      "loss": 1.027,
+      "step": 1450
+    },
+    {
+      "epoch": 0.563878363936656,
+      "grad_norm": 0.20328201353549957,
+      "learning_rate": 8.750486570650058e-05,
+      "loss": 1.0117,
+      "step": 1451
+    },
+    {
+      "epoch": 0.5642669775575634,
+      "grad_norm": 0.21230114996433258,
+      "learning_rate": 8.742701440249125e-05,
+      "loss": 1.0658,
+      "step": 1452
+    },
+    {
+      "epoch": 0.5646555911784708,
+      "grad_norm": 0.2030259519815445,
+      "learning_rate": 8.73491630984819e-05,
+      "loss": 1.0002,
+      "step": 1453
+    },
+    {
+      "epoch": 0.5650442047993782,
+      "grad_norm": 0.21404659748077393,
+      "learning_rate": 8.727131179447256e-05,
+      "loss": 1.0572,
+      "step": 1454
+    },
+    {
+      "epoch": 0.5654328184202856,
+      "grad_norm": 0.2148464322090149,
+      "learning_rate": 8.719346049046322e-05,
+      "loss": 1.0164,
+      "step": 1455
+    },
+    {
+      "epoch": 0.5658214320411931,
+      "grad_norm": 0.22083118557929993,
+      "learning_rate": 8.711560918645387e-05,
+      "loss": 0.9704,
+      "step": 1456
+    },
+    {
+      "epoch": 0.5662100456621004,
+      "grad_norm": 0.19305935502052307,
+      "learning_rate": 8.703775788244453e-05,
+      "loss": 1.0034,
+      "step": 1457
+    },
+    {
+      "epoch": 0.5665986592830079,
+      "grad_norm": 0.2100098729133606,
+      "learning_rate": 8.695990657843518e-05,
+      "loss": 1.0907,
+      "step": 1458
+    },
+    {
+      "epoch": 0.5669872729039153,
+      "grad_norm": 0.18947799503803253,
+      "learning_rate": 8.688205527442585e-05,
+      "loss": 0.9664,
+      "step": 1459
+    },
+    {
+      "epoch": 0.5673758865248227,
+      "grad_norm": 0.22341710329055786,
+      "learning_rate": 8.68042039704165e-05,
+      "loss": 1.0551,
+      "step": 1460
+    },
+    {
+      "epoch": 0.5677645001457301,
+      "grad_norm": 0.219679057598114,
+      "learning_rate": 8.672635266640717e-05,
+      "loss": 1.0398,
+      "step": 1461
+    },
+    {
+      "epoch": 0.5681531137666376,
+      "grad_norm": 0.22389841079711914,
+      "learning_rate": 8.664850136239782e-05,
+      "loss": 1.0472,
+      "step": 1462
+    },
+    {
+      "epoch": 0.5685417273875449,
+      "grad_norm": 0.21402975916862488,
+      "learning_rate": 8.657065005838849e-05,
+      "loss": 1.0224,
+      "step": 1463
+    },
+    {
+      "epoch": 0.5689303410084523,
+      "grad_norm": 0.20917154848575592,
+      "learning_rate": 8.649279875437915e-05,
+      "loss": 1.0526,
+      "step": 1464
+    },
+    {
+      "epoch": 0.5693189546293598,
+      "grad_norm": 0.2252056896686554,
+      "learning_rate": 8.64149474503698e-05,
+      "loss": 1.1064,
+      "step": 1465
+    },
+    {
+      "epoch": 0.5697075682502671,
+      "grad_norm": 0.21834802627563477,
+      "learning_rate": 8.633709614636046e-05,
+      "loss": 1.0318,
+      "step": 1466
+    },
+    {
+      "epoch": 0.5700961818711746,
+      "grad_norm": 0.21882353723049164,
+      "learning_rate": 8.625924484235112e-05,
+      "loss": 1.0285,
+      "step": 1467
+    },
+    {
+      "epoch": 0.570484795492082,
+      "grad_norm": 0.2028426229953766,
+      "learning_rate": 8.618139353834177e-05,
+      "loss": 1.0356,
+      "step": 1468
+    },
+    {
+      "epoch": 0.5708734091129894,
+      "grad_norm": 0.22297166287899017,
+      "learning_rate": 8.610354223433243e-05,
+      "loss": 1.0804,
+      "step": 1469
+    },
+    {
+      "epoch": 0.5712620227338968,
+      "grad_norm": 0.21775268018245697,
+      "learning_rate": 8.602569093032309e-05,
+      "loss": 0.9978,
+      "step": 1470
+    },
+    {
+      "epoch": 0.5716506363548043,
+      "grad_norm": 0.20362353324890137,
+      "learning_rate": 8.594783962631374e-05,
+      "loss": 0.9982,
+      "step": 1471
+    },
+    {
+      "epoch": 0.5720392499757117,
+      "grad_norm": 0.21854591369628906,
+      "learning_rate": 8.586998832230441e-05,
+      "loss": 1.0465,
+      "step": 1472
+    },
+    {
+      "epoch": 0.572427863596619,
+      "grad_norm": 0.20501428842544556,
+      "learning_rate": 8.579213701829506e-05,
+      "loss": 1.0468,
+      "step": 1473
+    },
+    {
+      "epoch": 0.5728164772175265,
+      "grad_norm": 0.21606214344501495,
+      "learning_rate": 8.571428571428571e-05,
+      "loss": 1.0477,
+      "step": 1474
+    },
+    {
+      "epoch": 0.5732050908384339,
+      "grad_norm": 0.2100660502910614,
+      "learning_rate": 8.563643441027638e-05,
+      "loss": 1.0071,
+      "step": 1475
+    },
+    {
+      "epoch": 0.5735937044593413,
+      "grad_norm": 0.21008896827697754,
+      "learning_rate": 8.555858310626703e-05,
+      "loss": 0.9914,
+      "step": 1476
+    },
+    {
+      "epoch": 0.5739823180802487,
+      "grad_norm": 0.22192159295082092,
+      "learning_rate": 8.54807318022577e-05,
+      "loss": 1.0385,
+      "step": 1477
+    },
+    {
+      "epoch": 0.5743709317011562,
+      "grad_norm": 0.20123356580734253,
+      "learning_rate": 8.540288049824835e-05,
+      "loss": 1.0062,
+      "step": 1478
+    },
+    {
+      "epoch": 0.5747595453220635,
+      "grad_norm": 0.201947420835495,
+      "learning_rate": 8.5325029194239e-05,
+      "loss": 1.0218,
+      "step": 1479
+    },
+    {
+      "epoch": 0.575148158942971,
+      "grad_norm": 0.22804415225982666,
+      "learning_rate": 8.524717789022967e-05,
+      "loss": 1.0445,
+      "step": 1480
+    },
+    {
+      "epoch": 0.5755367725638784,
+      "grad_norm": 0.20527036488056183,
+      "learning_rate": 8.516932658622032e-05,
+      "loss": 0.9972,
+      "step": 1481
+    },
+    {
+      "epoch": 0.5759253861847857,
+      "grad_norm": 0.20298773050308228,
+      "learning_rate": 8.509147528221098e-05,
+      "loss": 1.0272,
+      "step": 1482
+    },
+    {
+      "epoch": 0.5763139998056932,
+      "grad_norm": 0.22500957548618317,
+      "learning_rate": 8.501362397820164e-05,
+      "loss": 1.0982,
+      "step": 1483
+    },
+    {
+      "epoch": 0.5767026134266006,
+      "grad_norm": 0.1950521320104599,
+      "learning_rate": 8.493577267419229e-05,
+      "loss": 0.9848,
+      "step": 1484
+    },
+    {
+      "epoch": 0.577091227047508,
+      "grad_norm": 0.21087585389614105,
+      "learning_rate": 8.485792137018295e-05,
+      "loss": 1.0125,
+      "step": 1485
+    },
+    {
+      "epoch": 0.5774798406684154,
+      "grad_norm": 0.20122238993644714,
+      "learning_rate": 8.47800700661736e-05,
+      "loss": 1.0533,
+      "step": 1486
+    },
+    {
+      "epoch": 0.5778684542893229,
+      "grad_norm": 0.20149008929729462,
+      "learning_rate": 8.470221876216427e-05,
+      "loss": 1.0719,
+      "step": 1487
+    },
+    {
+      "epoch": 0.5782570679102302,
+      "grad_norm": 0.21307213604450226,
+      "learning_rate": 8.462436745815494e-05,
+      "loss": 1.0522,
+      "step": 1488
+    },
+    {
+      "epoch": 0.5786456815311377,
+      "grad_norm": 0.21828554570674896,
+      "learning_rate": 8.454651615414559e-05,
+      "loss": 1.0184,
+      "step": 1489
+    },
+    {
+      "epoch": 0.5790342951520451,
+      "grad_norm": 0.22002705931663513,
+      "learning_rate": 8.446866485013625e-05,
+      "loss": 1.0101,
+      "step": 1490
+    },
+    {
+      "epoch": 0.5794229087729524,
+      "grad_norm": 0.19479142129421234,
+      "learning_rate": 8.43908135461269e-05,
+      "loss": 0.9889,
+      "step": 1491
+    },
+    {
+      "epoch": 0.5798115223938599,
+      "grad_norm": 0.21346086263656616,
+      "learning_rate": 8.431296224211756e-05,
+      "loss": 1.0373,
+      "step": 1492
+    },
+    {
+      "epoch": 0.5802001360147673,
+      "grad_norm": 0.20177558064460754,
+      "learning_rate": 8.423511093810822e-05,
+      "loss": 1.0215,
+      "step": 1493
+    },
+    {
+      "epoch": 0.5805887496356748,
+      "grad_norm": 0.2117915153503418,
+      "learning_rate": 8.415725963409888e-05,
+      "loss": 1.0321,
+      "step": 1494
+    },
+    {
+      "epoch": 0.5809773632565821,
+      "grad_norm": 0.21304374933242798,
+      "learning_rate": 8.407940833008954e-05,
+      "loss": 1.0123,
+      "step": 1495
+    },
+    {
+      "epoch": 0.5813659768774896,
+      "grad_norm": 0.21173715591430664,
+      "learning_rate": 8.400155702608019e-05,
+      "loss": 1.0696,
+      "step": 1496
+    },
+    {
+      "epoch": 0.581754590498397,
+      "grad_norm": 0.20407019555568695,
+      "learning_rate": 8.392370572207085e-05,
+      "loss": 1.0086,
+      "step": 1497
+    },
+    {
+      "epoch": 0.5821432041193044,
+      "grad_norm": 0.209481880068779,
+      "learning_rate": 8.384585441806151e-05,
+      "loss": 0.9975,
+      "step": 1498
+    },
+    {
+      "epoch": 0.5825318177402118,
+      "grad_norm": 0.22184531390666962,
+      "learning_rate": 8.376800311405216e-05,
+      "loss": 1.0956,
+      "step": 1499
+    },
+    {
+      "epoch": 0.5829204313611193,
+      "grad_norm": 0.21344684064388275,
+      "learning_rate": 8.369015181004283e-05,
+      "loss": 1.0685,
+      "step": 1500
+    },
+    {
+      "epoch": 0.5833090449820266,
+      "grad_norm": 0.19837221503257751,
+      "learning_rate": 8.361230050603348e-05,
+      "loss": 1.0149,
+      "step": 1501
+    },
+    {
+      "epoch": 0.583697658602934,
+      "grad_norm": 0.2133672833442688,
+      "learning_rate": 8.353444920202413e-05,
+      "loss": 1.0453,
+      "step": 1502
+    },
+    {
+      "epoch": 0.5840862722238415,
+      "grad_norm": 0.21944090723991394,
+      "learning_rate": 8.34565978980148e-05,
+      "loss": 1.04,
+      "step": 1503
+    },
+    {
+      "epoch": 0.5844748858447488,
+      "grad_norm": 0.1983667016029358,
+      "learning_rate": 8.337874659400545e-05,
+      "loss": 0.9919,
+      "step": 1504
+    },
+    {
+      "epoch": 0.5848634994656563,
+      "grad_norm": 0.2025303989648819,
+      "learning_rate": 8.33008952899961e-05,
+      "loss": 1.0021,
+      "step": 1505
+    },
+    {
+      "epoch": 0.5852521130865637,
+      "grad_norm": 0.2015170007944107,
+      "learning_rate": 8.322304398598677e-05,
+      "loss": 0.9945,
+      "step": 1506
+    },
+    {
+      "epoch": 0.5856407267074711,
+      "grad_norm": 0.20768272876739502,
+      "learning_rate": 8.314519268197742e-05,
+      "loss": 1.0465,
+      "step": 1507
+    },
+    {
+      "epoch": 0.5860293403283785,
+      "grad_norm": 0.20513412356376648,
+      "learning_rate": 8.306734137796809e-05,
+      "loss": 1.0124,
+      "step": 1508
+    },
+    {
+      "epoch": 0.586417953949286,
+      "grad_norm": 0.20268471539020538,
+      "learning_rate": 8.298949007395874e-05,
+      "loss": 1.0586,
+      "step": 1509
+    },
+    {
+      "epoch": 0.5868065675701933,
+      "grad_norm": 0.20915938913822174,
+      "learning_rate": 8.291163876994939e-05,
+      "loss": 1.0047,
+      "step": 1510
+    },
+    {
+      "epoch": 0.5871951811911007,
+      "grad_norm": 0.2161451131105423,
+      "learning_rate": 8.283378746594006e-05,
+      "loss": 1.0184,
+      "step": 1511
+    },
+    {
+      "epoch": 0.5875837948120082,
+      "grad_norm": 0.1915571093559265,
+      "learning_rate": 8.275593616193071e-05,
+      "loss": 1.0187,
+      "step": 1512
+    },
+    {
+      "epoch": 0.5879724084329155,
+      "grad_norm": 0.20907992124557495,
+      "learning_rate": 8.267808485792137e-05,
+      "loss": 1.0212,
+      "step": 1513
+    },
+    {
+      "epoch": 0.588361022053823,
+      "grad_norm": 0.20140786468982697,
+      "learning_rate": 8.260023355391204e-05,
+      "loss": 1.014,
+      "step": 1514
+    },
+    {
+      "epoch": 0.5887496356747304,
+      "grad_norm": 0.208252415060997,
+      "learning_rate": 8.252238224990269e-05,
+      "loss": 1.0806,
+      "step": 1515
+    },
+    {
+      "epoch": 0.5891382492956379,
+      "grad_norm": 0.20596125721931458,
+      "learning_rate": 8.244453094589336e-05,
+      "loss": 0.9823,
+      "step": 1516
+    },
+    {
+      "epoch": 0.5895268629165452,
+      "grad_norm": 0.18832452595233917,
+      "learning_rate": 8.236667964188401e-05,
+      "loss": 0.9925,
+      "step": 1517
+    },
+    {
+      "epoch": 0.5899154765374527,
+      "grad_norm": 0.2078334391117096,
+      "learning_rate": 8.228882833787467e-05,
+      "loss": 1.0587,
+      "step": 1518
+    },
+    {
+      "epoch": 0.59030409015836,
+      "grad_norm": 0.20121365785598755,
+      "learning_rate": 8.221097703386533e-05,
+      "loss": 1.0607,
+      "step": 1519
+    },
+    {
+      "epoch": 0.5906927037792674,
+      "grad_norm": 0.19666099548339844,
+      "learning_rate": 8.213312572985598e-05,
+      "loss": 1.0124,
+      "step": 1520
+    },
+    {
+      "epoch": 0.5910813174001749,
+      "grad_norm": 0.20176006853580475,
+      "learning_rate": 8.205527442584664e-05,
+      "loss": 1.0297,
+      "step": 1521
+    },
+    {
+      "epoch": 0.5914699310210823,
+      "grad_norm": 0.2038574516773224,
+      "learning_rate": 8.19774231218373e-05,
+      "loss": 1.0311,
+      "step": 1522
+    },
+    {
+      "epoch": 0.5918585446419897,
+      "grad_norm": 0.19517424702644348,
+      "learning_rate": 8.189957181782796e-05,
+      "loss": 0.9945,
+      "step": 1523
+    },
+    {
+      "epoch": 0.5922471582628971,
+      "grad_norm": 0.19599094986915588,
+      "learning_rate": 8.182172051381861e-05,
+      "loss": 1.0255,
+      "step": 1524
+    },
+    {
+      "epoch": 0.5926357718838046,
+      "grad_norm": 0.21409402787685394,
+      "learning_rate": 8.174386920980927e-05,
+      "loss": 1.0868,
+      "step": 1525
+    },
+    {
+      "epoch": 0.5930243855047119,
+      "grad_norm": 0.19567830860614777,
+      "learning_rate": 8.166601790579993e-05,
+      "loss": 0.9654,
+      "step": 1526
+    },
+    {
+      "epoch": 0.5934129991256194,
+      "grad_norm": 0.2275007963180542,
+      "learning_rate": 8.158816660179058e-05,
+      "loss": 1.0867,
+      "step": 1527
+    },
+    {
+      "epoch": 0.5938016127465268,
+      "grad_norm": 0.19826427102088928,
+      "learning_rate": 8.151031529778123e-05,
+      "loss": 1.0301,
+      "step": 1528
+    },
+    {
+      "epoch": 0.5941902263674341,
+      "grad_norm": 0.2051352709531784,
+      "learning_rate": 8.14324639937719e-05,
+      "loss": 1.023,
+      "step": 1529
+    },
+    {
+      "epoch": 0.5945788399883416,
+      "grad_norm": 0.19492043554782867,
+      "learning_rate": 8.135461268976255e-05,
+      "loss": 0.9608,
+      "step": 1530
+    },
+    {
+      "epoch": 0.594967453609249,
+      "grad_norm": 0.21521608531475067,
+      "learning_rate": 8.127676138575322e-05,
+      "loss": 1.0612,
+      "step": 1531
+    },
+    {
+      "epoch": 0.5953560672301564,
+      "grad_norm": 0.22739367187023163,
+      "learning_rate": 8.119891008174387e-05,
+      "loss": 1.0603,
+      "step": 1532
+    },
+    {
+      "epoch": 0.5957446808510638,
+      "grad_norm": 0.20334595441818237,
+      "learning_rate": 8.112105877773452e-05,
+      "loss": 1.0191,
+      "step": 1533
+    },
+    {
+      "epoch": 0.5961332944719713,
+      "grad_norm": 0.20985397696495056,
+      "learning_rate": 8.104320747372519e-05,
+      "loss": 1.0721,
+      "step": 1534
+    },
+    {
+      "epoch": 0.5965219080928786,
+      "grad_norm": 0.20472954213619232,
+      "learning_rate": 8.096535616971584e-05,
+      "loss": 1.0556,
+      "step": 1535
+    },
+    {
+      "epoch": 0.5969105217137861,
+      "grad_norm": 0.2112964689731598,
+      "learning_rate": 8.08875048657065e-05,
+      "loss": 1.0016,
+      "step": 1536
+    },
+    {
+      "epoch": 0.5972991353346935,
+      "grad_norm": 0.21330617368221283,
+      "learning_rate": 8.080965356169716e-05,
+      "loss": 1.0783,
+      "step": 1537
+    },
+    {
+      "epoch": 0.5976877489556008,
+      "grad_norm": 0.20907814800739288,
+      "learning_rate": 8.073180225768782e-05,
+      "loss": 1.071,
+      "step": 1538
+    },
+    {
+      "epoch": 0.5980763625765083,
+      "grad_norm": 0.2038964033126831,
+      "learning_rate": 8.065395095367848e-05,
+      "loss": 1.0039,
+      "step": 1539
+    },
+    {
+      "epoch": 0.5984649761974157,
+      "grad_norm": 0.2175542712211609,
+      "learning_rate": 8.057609964966914e-05,
+      "loss": 1.0015,
+      "step": 1540
+    },
+    {
+      "epoch": 0.5988535898183232,
+      "grad_norm": 0.21474529802799225,
+      "learning_rate": 8.049824834565979e-05,
+      "loss": 1.0273,
+      "step": 1541
+    },
+    {
+      "epoch": 0.5992422034392305,
+      "grad_norm": 0.21428482234477997,
+      "learning_rate": 8.042039704165046e-05,
+      "loss": 1.0767,
+      "step": 1542
+    },
+    {
+      "epoch": 0.599630817060138,
+      "grad_norm": 0.20287524163722992,
+      "learning_rate": 8.034254573764111e-05,
+      "loss": 1.064,
+      "step": 1543
+    },
+    {
+      "epoch": 0.6000194306810454,
+      "grad_norm": 0.20689848065376282,
+      "learning_rate": 8.026469443363178e-05,
+      "loss": 1.0084,
+      "step": 1544
+    },
+    {
+      "epoch": 0.6004080443019528,
+      "grad_norm": 0.22451332211494446,
+      "learning_rate": 8.018684312962243e-05,
+      "loss": 1.1039,
+      "step": 1545
+    },
+    {
+      "epoch": 0.6007966579228602,
+      "grad_norm": 0.21381956338882446,
+      "learning_rate": 8.010899182561308e-05,
+      "loss": 1.0551,
+      "step": 1546
+    },
+    {
+      "epoch": 0.6011852715437677,
+      "grad_norm": 0.20108483731746674,
+      "learning_rate": 8.003114052160375e-05,
+      "loss": 1.0326,
+      "step": 1547
+    },
+    {
+      "epoch": 0.601573885164675,
+      "grad_norm": 0.19739678502082825,
+      "learning_rate": 7.99532892175944e-05,
+      "loss": 1.0319,
+      "step": 1548
+    },
+    {
+      "epoch": 0.6019624987855824,
+      "grad_norm": 0.21635359525680542,
+      "learning_rate": 7.987543791358506e-05,
+      "loss": 1.0465,
+      "step": 1549
+    },
+    {
+      "epoch": 0.6023511124064899,
+      "grad_norm": 0.1949319988489151,
+      "learning_rate": 7.979758660957572e-05,
+      "loss": 1.0026,
+      "step": 1550
+    },
+    {
+      "epoch": 0.6027397260273972,
+      "grad_norm": 0.1989699900150299,
+      "learning_rate": 7.971973530556637e-05,
+      "loss": 1.021,
+      "step": 1551
+    },
+    {
+      "epoch": 0.6031283396483047,
+      "grad_norm": 0.24031391739845276,
+      "learning_rate": 7.964188400155703e-05,
+      "loss": 1.0293,
+      "step": 1552
+    },
+    {
+      "epoch": 0.6035169532692121,
+      "grad_norm": 0.21247251331806183,
+      "learning_rate": 7.956403269754769e-05,
+      "loss": 1.023,
+      "step": 1553
+    },
+    {
+      "epoch": 0.6039055668901195,
+      "grad_norm": 0.21565628051757812,
+      "learning_rate": 7.948618139353835e-05,
+      "loss": 1.1027,
+      "step": 1554
+    },
+    {
+      "epoch": 0.6042941805110269,
+      "grad_norm": 0.21207931637763977,
+      "learning_rate": 7.9408330089529e-05,
+      "loss": 1.0634,
+      "step": 1555
+    },
+    {
+      "epoch": 0.6046827941319344,
+      "grad_norm": 0.21354155242443085,
+      "learning_rate": 7.933047878551965e-05,
+      "loss": 1.0433,
+      "step": 1556
+    },
+    {
+      "epoch": 0.6050714077528417,
+      "grad_norm": 0.21708370745182037,
+      "learning_rate": 7.925262748151032e-05,
+      "loss": 1.0499,
+      "step": 1557
+    },
+    {
+      "epoch": 0.6054600213737491,
+      "grad_norm": 0.2051447182893753,
+      "learning_rate": 7.917477617750097e-05,
+      "loss": 1.0042,
+      "step": 1558
+    },
+    {
+      "epoch": 0.6058486349946566,
+      "grad_norm": 0.18768000602722168,
+      "learning_rate": 7.909692487349164e-05,
+      "loss": 1.009,
+      "step": 1559
+    },
+    {
+      "epoch": 0.6062372486155639,
+      "grad_norm": 0.2142931967973709,
+      "learning_rate": 7.901907356948229e-05,
+      "loss": 1.0458,
+      "step": 1560
+    },
+    {
+      "epoch": 0.6066258622364714,
+      "grad_norm": 0.21006444096565247,
+      "learning_rate": 7.894122226547294e-05,
+      "loss": 1.0286,
+      "step": 1561
+    },
+    {
+      "epoch": 0.6070144758573788,
+      "grad_norm": 0.2187039703130722,
+      "learning_rate": 7.886337096146361e-05,
+      "loss": 1.0103,
+      "step": 1562
+    },
+    {
+      "epoch": 0.6074030894782863,
+      "grad_norm": 0.19863669574260712,
+      "learning_rate": 7.878551965745426e-05,
+      "loss": 0.9925,
+      "step": 1563
+    },
+    {
+      "epoch": 0.6077917030991936,
+      "grad_norm": 0.21771976351737976,
+      "learning_rate": 7.870766835344493e-05,
+      "loss": 0.9853,
+      "step": 1564
+    },
+    {
+      "epoch": 0.6081803167201011,
+      "grad_norm": 0.21714983880519867,
+      "learning_rate": 7.862981704943558e-05,
+      "loss": 1.0123,
+      "step": 1565
+    },
+    {
+      "epoch": 0.6085689303410085,
+      "grad_norm": 0.2251398265361786,
+      "learning_rate": 7.855196574542624e-05,
+      "loss": 1.0265,
+      "step": 1566
+    },
+    {
+      "epoch": 0.6089575439619158,
+      "grad_norm": 0.22089716792106628,
+      "learning_rate": 7.84741144414169e-05,
+      "loss": 1.0689,
+      "step": 1567
+    },
+    {
+      "epoch": 0.6093461575828233,
+      "grad_norm": 0.2453841269016266,
+      "learning_rate": 7.839626313740756e-05,
+      "loss": 1.0185,
+      "step": 1568
+    },
+    {
+      "epoch": 0.6097347712037307,
+      "grad_norm": 0.21866528689861298,
+      "learning_rate": 7.831841183339821e-05,
+      "loss": 1.0361,
+      "step": 1569
+    },
+    {
+      "epoch": 0.6101233848246381,
+      "grad_norm": 0.22421486675739288,
+      "learning_rate": 7.824056052938888e-05,
+      "loss": 1.024,
+      "step": 1570
+    },
+    {
+      "epoch": 0.6105119984455455,
+      "grad_norm": 0.21107137203216553,
+      "learning_rate": 7.816270922537953e-05,
+      "loss": 1.0335,
+      "step": 1571
+    },
+    {
+      "epoch": 0.610900612066453,
+      "grad_norm": 0.20731772482395172,
+      "learning_rate": 7.80848579213702e-05,
+      "loss": 1.0563,
+      "step": 1572
+    },
+    {
+      "epoch": 0.6112892256873603,
+      "grad_norm": 0.19535884261131287,
+      "learning_rate": 7.800700661736085e-05,
+      "loss": 0.9698,
+      "step": 1573
+    },
+    {
+      "epoch": 0.6116778393082678,
+      "grad_norm": 0.20449021458625793,
+      "learning_rate": 7.79291553133515e-05,
+      "loss": 1.0125,
+      "step": 1574
+    },
+    {
+      "epoch": 0.6120664529291752,
+      "grad_norm": 0.19576509296894073,
+      "learning_rate": 7.785130400934217e-05,
+      "loss": 0.9326,
+      "step": 1575
+    },
+    {
+      "epoch": 0.6124550665500825,
+      "grad_norm": 0.18914124369621277,
+      "learning_rate": 7.777345270533282e-05,
+      "loss": 0.9939,
+      "step": 1576
+    },
+    {
+      "epoch": 0.61284368017099,
+      "grad_norm": 0.21239091455936432,
+      "learning_rate": 7.769560140132348e-05,
+      "loss": 1.0271,
+      "step": 1577
+    },
+    {
+      "epoch": 0.6132322937918974,
+      "grad_norm": 0.22204811871051788,
+      "learning_rate": 7.761775009731414e-05,
+      "loss": 1.0524,
+      "step": 1578
+    },
+    {
+      "epoch": 0.6136209074128048,
+      "grad_norm": 0.20047850906848907,
+      "learning_rate": 7.753989879330479e-05,
+      "loss": 1.0076,
+      "step": 1579
+    },
+    {
+      "epoch": 0.6140095210337122,
+      "grad_norm": 0.22619746625423431,
+      "learning_rate": 7.746204748929545e-05,
+      "loss": 1.0611,
+      "step": 1580
+    },
+    {
+      "epoch": 0.6143981346546197,
+      "grad_norm": 0.2500879466533661,
+      "learning_rate": 7.73841961852861e-05,
+      "loss": 1.0364,
+      "step": 1581
+    },
+    {
+      "epoch": 0.614786748275527,
+      "grad_norm": 0.23486928641796112,
+      "learning_rate": 7.730634488127676e-05,
+      "loss": 1.0472,
+      "step": 1582
+    },
+    {
+      "epoch": 0.6151753618964345,
+      "grad_norm": 0.19849038124084473,
+      "learning_rate": 7.722849357726742e-05,
+      "loss": 0.9847,
+      "step": 1583
+    },
+    {
+      "epoch": 0.6155639755173419,
+      "grad_norm": 0.21516263484954834,
+      "learning_rate": 7.715064227325807e-05,
+      "loss": 1.0351,
+      "step": 1584
+    },
+    {
+      "epoch": 0.6159525891382492,
+      "grad_norm": 0.20137760043144226,
+      "learning_rate": 7.707279096924874e-05,
+      "loss": 0.9879,
+      "step": 1585
+    },
+    {
+      "epoch": 0.6163412027591567,
+      "grad_norm": 0.2146228402853012,
+      "learning_rate": 7.699493966523939e-05,
+      "loss": 1.0792,
+      "step": 1586
+    },
+    {
+      "epoch": 0.6167298163800641,
+      "grad_norm": 0.19929760694503784,
+      "learning_rate": 7.691708836123004e-05,
+      "loss": 1.0313,
+      "step": 1587
+    },
+    {
+      "epoch": 0.6171184300009716,
+      "grad_norm": 0.201123908162117,
+      "learning_rate": 7.683923705722071e-05,
+      "loss": 1.0279,
+      "step": 1588
+    },
+    {
+      "epoch": 0.6175070436218789,
+      "grad_norm": 0.2154105007648468,
+      "learning_rate": 7.676138575321136e-05,
+      "loss": 1.075,
+      "step": 1589
+    },
+    {
+      "epoch": 0.6178956572427864,
+      "grad_norm": 0.2028442770242691,
+      "learning_rate": 7.668353444920203e-05,
+      "loss": 0.9771,
+      "step": 1590
+    },
+    {
+      "epoch": 0.6182842708636938,
+      "grad_norm": 0.18003074824810028,
+      "learning_rate": 7.660568314519268e-05,
+      "loss": 0.9677,
+      "step": 1591
+    },
+    {
+      "epoch": 0.6186728844846012,
+      "grad_norm": 0.23250891268253326,
+      "learning_rate": 7.652783184118335e-05,
+      "loss": 1.015,
+      "step": 1592
+    },
+    {
+      "epoch": 0.6190614981055086,
+      "grad_norm": 0.2047244906425476,
+      "learning_rate": 7.6449980537174e-05,
+      "loss": 1.0044,
+      "step": 1593
+    },
+    {
+      "epoch": 0.6194501117264161,
+      "grad_norm": 0.20011259615421295,
+      "learning_rate": 7.637212923316466e-05,
+      "loss": 1.0089,
+      "step": 1594
+    },
+    {
+      "epoch": 0.6198387253473234,
+      "grad_norm": 0.2212608903646469,
+      "learning_rate": 7.629427792915533e-05,
+      "loss": 1.0457,
+      "step": 1595
+    },
+    {
+      "epoch": 0.6202273389682308,
+      "grad_norm": 0.22725115716457367,
+      "learning_rate": 7.621642662514598e-05,
+      "loss": 1.1198,
+      "step": 1596
+    },
+    {
+      "epoch": 0.6206159525891383,
+      "grad_norm": 0.2065306007862091,
+      "learning_rate": 7.613857532113663e-05,
+      "loss": 1.0572,
+      "step": 1597
+    },
+    {
+      "epoch": 0.6210045662100456,
+      "grad_norm": 0.2132783830165863,
+      "learning_rate": 7.60607240171273e-05,
+      "loss": 1.0332,
+      "step": 1598
+    },
+    {
+      "epoch": 0.6213931798309531,
+      "grad_norm": 0.20527103543281555,
+      "learning_rate": 7.598287271311795e-05,
+      "loss": 1.0156,
+      "step": 1599
+    },
+    {
+      "epoch": 0.6217817934518605,
+      "grad_norm": 0.23608024418354034,
+      "learning_rate": 7.59050214091086e-05,
+      "loss": 1.0379,
+      "step": 1600
+    },
+    {
+      "epoch": 0.6221704070727679,
+      "grad_norm": 0.22227297723293304,
+      "learning_rate": 7.582717010509927e-05,
+      "loss": 1.0507,
+      "step": 1601
+    },
+    {
+      "epoch": 0.6225590206936753,
+      "grad_norm": 0.22359615564346313,
+      "learning_rate": 7.574931880108992e-05,
+      "loss": 1.0705,
+      "step": 1602
+    },
+    {
+      "epoch": 0.6229476343145828,
+      "grad_norm": 0.20478755235671997,
+      "learning_rate": 7.567146749708059e-05,
+      "loss": 1.0309,
+      "step": 1603
+    },
+    {
+      "epoch": 0.6233362479354901,
+      "grad_norm": 0.2223423272371292,
+      "learning_rate": 7.559361619307124e-05,
+      "loss": 1.0386,
+      "step": 1604
+    },
+    {
+      "epoch": 0.6237248615563975,
+      "grad_norm": 0.21232105791568756,
+      "learning_rate": 7.551576488906189e-05,
+      "loss": 1.0353,
+      "step": 1605
+    },
+    {
+      "epoch": 0.624113475177305,
+      "grad_norm": 0.22431129217147827,
+      "learning_rate": 7.543791358505256e-05,
+      "loss": 1.1017,
+      "step": 1606
+    },
+    {
+      "epoch": 0.6245020887982123,
+      "grad_norm": 0.20826031267642975,
+      "learning_rate": 7.536006228104321e-05,
+      "loss": 1.0172,
+      "step": 1607
+    },
+    {
+      "epoch": 0.6248907024191198,
+      "grad_norm": 0.2803161144256592,
+      "learning_rate": 7.528221097703387e-05,
+      "loss": 1.0554,
+      "step": 1608
+    },
+    {
+      "epoch": 0.6252793160400272,
+      "grad_norm": 0.2185174971818924,
+      "learning_rate": 7.520435967302453e-05,
+      "loss": 0.9842,
+      "step": 1609
+    },
+    {
+      "epoch": 0.6256679296609347,
+      "grad_norm": 0.2091478854417801,
+      "learning_rate": 7.512650836901518e-05,
+      "loss": 0.9783,
+      "step": 1610
+    },
+    {
+      "epoch": 0.626056543281842,
+      "grad_norm": 0.22342967987060547,
+      "learning_rate": 7.504865706500584e-05,
+      "loss": 0.9891,
+      "step": 1611
+    },
+    {
+      "epoch": 0.6264451569027495,
+      "grad_norm": 0.195283442735672,
+      "learning_rate": 7.49708057609965e-05,
+      "loss": 0.9654,
+      "step": 1612
+    },
+    {
+      "epoch": 0.6268337705236569,
+      "grad_norm": 0.21048255264759064,
+      "learning_rate": 7.489295445698716e-05,
+      "loss": 1.0112,
+      "step": 1613
+    },
+    {
+      "epoch": 0.6272223841445642,
+      "grad_norm": 0.21405541896820068,
+      "learning_rate": 7.481510315297781e-05,
+      "loss": 1.0498,
+      "step": 1614
+    },
+    {
+      "epoch": 0.6276109977654717,
+      "grad_norm": 0.2144453227519989,
+      "learning_rate": 7.473725184896846e-05,
+      "loss": 1.0487,
+      "step": 1615
+    },
+    {
+      "epoch": 0.627999611386379,
+      "grad_norm": 0.21963326632976532,
+      "learning_rate": 7.465940054495913e-05,
+      "loss": 1.0634,
+      "step": 1616
+    },
+    {
+      "epoch": 0.6283882250072865,
+      "grad_norm": 0.20100601017475128,
+      "learning_rate": 7.458154924094978e-05,
+      "loss": 1.0407,
+      "step": 1617
+    },
+    {
+      "epoch": 0.6287768386281939,
+      "grad_norm": 0.19469478726387024,
+      "learning_rate": 7.450369793694045e-05,
+      "loss": 0.9923,
+      "step": 1618
+    },
+    {
+      "epoch": 0.6291654522491014,
+      "grad_norm": 0.2114047408103943,
+      "learning_rate": 7.442584663293111e-05,
+      "loss": 1.0263,
+      "step": 1619
+    },
+    {
+      "epoch": 0.6295540658700087,
+      "grad_norm": 0.21080389618873596,
+      "learning_rate": 7.434799532892177e-05,
+      "loss": 1.0012,
+      "step": 1620
+    },
+    {
+      "epoch": 0.6299426794909162,
+      "grad_norm": 0.20366831123828888,
+      "learning_rate": 7.427014402491243e-05,
+      "loss": 1.0254,
+      "step": 1621
+    },
+    {
+      "epoch": 0.6303312931118236,
+      "grad_norm": 0.209821879863739,
+      "learning_rate": 7.419229272090308e-05,
+      "loss": 0.9416,
+      "step": 1622
+    },
+    {
+      "epoch": 0.6307199067327309,
+      "grad_norm": 0.2228868007659912,
+      "learning_rate": 7.411444141689374e-05,
+      "loss": 1.0128,
+      "step": 1623
+    },
+    {
+      "epoch": 0.6311085203536384,
+      "grad_norm": 0.19673995673656464,
+      "learning_rate": 7.40365901128844e-05,
+      "loss": 0.9709,
+      "step": 1624
+    },
+    {
+      "epoch": 0.6314971339745458,
+      "grad_norm": 0.21590839326381683,
+      "learning_rate": 7.395873880887505e-05,
+      "loss": 1.0251,
+      "step": 1625
+    },
+    {
+      "epoch": 0.6318857475954532,
+      "grad_norm": 0.20200593769550323,
+      "learning_rate": 7.388088750486572e-05,
+      "loss": 1.0307,
+      "step": 1626
+    },
+    {
+      "epoch": 0.6322743612163606,
+      "grad_norm": 0.19623909890651703,
+      "learning_rate": 7.380303620085637e-05,
+      "loss": 1.0375,
+      "step": 1627
+    },
+    {
+      "epoch": 0.6326629748372681,
+      "grad_norm": 0.19878128170967102,
+      "learning_rate": 7.372518489684702e-05,
+      "loss": 0.9844,
+      "step": 1628
+    },
+    {
+      "epoch": 0.6330515884581754,
+      "grad_norm": 0.21292422711849213,
+      "learning_rate": 7.364733359283769e-05,
+      "loss": 1.0228,
+      "step": 1629
+    },
+    {
+      "epoch": 0.6334402020790829,
+      "grad_norm": 0.1915559619665146,
+      "learning_rate": 7.356948228882834e-05,
+      "loss": 0.9818,
+      "step": 1630
+    },
+    {
+      "epoch": 0.6338288156999903,
+      "grad_norm": 0.2264430969953537,
+      "learning_rate": 7.3491630984819e-05,
+      "loss": 1.146,
+      "step": 1631
+    },
+    {
+      "epoch": 0.6342174293208978,
+      "grad_norm": 0.19332270324230194,
+      "learning_rate": 7.341377968080966e-05,
+      "loss": 1.0007,
+      "step": 1632
+    },
+    {
+      "epoch": 0.6346060429418051,
+      "grad_norm": 0.217147096991539,
+      "learning_rate": 7.333592837680031e-05,
+      "loss": 1.0498,
+      "step": 1633
+    },
+    {
+      "epoch": 0.6349946565627125,
+      "grad_norm": 0.22200679779052734,
+      "learning_rate": 7.325807707279098e-05,
+      "loss": 1.0358,
+      "step": 1634
+    },
+    {
+      "epoch": 0.63538327018362,
+      "grad_norm": 0.19485117495059967,
+      "learning_rate": 7.318022576878163e-05,
+      "loss": 0.9717,
+      "step": 1635
+    },
+    {
+      "epoch": 0.6357718838045273,
+      "grad_norm": 0.20595680177211761,
+      "learning_rate": 7.310237446477228e-05,
+      "loss": 1.0195,
+      "step": 1636
+    },
+    {
+      "epoch": 0.6361604974254348,
+      "grad_norm": 0.21184709668159485,
+      "learning_rate": 7.302452316076294e-05,
+      "loss": 1.0354,
+      "step": 1637
+    },
+    {
+      "epoch": 0.6365491110463422,
+      "grad_norm": 0.22607794404029846,
+      "learning_rate": 7.29466718567536e-05,
+      "loss": 1.0217,
+      "step": 1638
+    },
+    {
+      "epoch": 0.6369377246672496,
+      "grad_norm": 0.20236065983772278,
+      "learning_rate": 7.286882055274426e-05,
+      "loss": 1.0441,
+      "step": 1639
+    },
+    {
+      "epoch": 0.637326338288157,
+      "grad_norm": 0.19979622960090637,
+      "learning_rate": 7.279096924873491e-05,
+      "loss": 1.0105,
+      "step": 1640
+    },
+    {
+      "epoch": 0.6377149519090645,
+      "grad_norm": 0.2655459940433502,
+      "learning_rate": 7.271311794472557e-05,
+      "loss": 1.0726,
+      "step": 1641
+    },
+    {
+      "epoch": 0.6381035655299718,
+      "grad_norm": 0.25107496976852417,
+      "learning_rate": 7.263526664071623e-05,
+      "loss": 1.037,
+      "step": 1642
+    },
+    {
+      "epoch": 0.6384921791508792,
+      "grad_norm": 0.19250229001045227,
+      "learning_rate": 7.255741533670688e-05,
+      "loss": 0.9741,
+      "step": 1643
+    },
+    {
+      "epoch": 0.6388807927717867,
+      "grad_norm": 0.19324181973934174,
+      "learning_rate": 7.247956403269755e-05,
+      "loss": 1.0333,
+      "step": 1644
+    },
+    {
+      "epoch": 0.639269406392694,
+      "grad_norm": 0.22267483174800873,
+      "learning_rate": 7.240171272868822e-05,
+      "loss": 1.0313,
+      "step": 1645
+    },
+    {
+      "epoch": 0.6396580200136015,
+      "grad_norm": 0.2775348722934723,
+      "learning_rate": 7.232386142467887e-05,
+      "loss": 1.0686,
+      "step": 1646
+    },
+    {
+      "epoch": 0.6400466336345089,
+      "grad_norm": 0.1886623501777649,
+      "learning_rate": 7.224601012066953e-05,
+      "loss": 1.0029,
+      "step": 1647
+    },
+    {
+      "epoch": 0.6404352472554163,
+      "grad_norm": 0.20303374528884888,
+      "learning_rate": 7.216815881666019e-05,
+      "loss": 1.0346,
+      "step": 1648
+    },
+    {
+      "epoch": 0.6408238608763237,
+      "grad_norm": 0.20815756916999817,
+      "learning_rate": 7.209030751265085e-05,
+      "loss": 1.0258,
+      "step": 1649
+    },
+    {
+      "epoch": 0.6412124744972312,
+      "grad_norm": 0.22055703401565552,
+      "learning_rate": 7.20124562086415e-05,
+      "loss": 1.0215,
+      "step": 1650
+    },
+    {
+      "epoch": 0.6416010881181385,
+      "grad_norm": 0.20248562097549438,
+      "learning_rate": 7.193460490463215e-05,
+      "loss": 0.9979,
+      "step": 1651
+    },
+    {
+      "epoch": 0.6419897017390459,
+      "grad_norm": 0.2093247026205063,
+      "learning_rate": 7.185675360062282e-05,
+      "loss": 1.0605,
+      "step": 1652
+    },
+    {
+      "epoch": 0.6423783153599534,
+      "grad_norm": 0.22276204824447632,
+      "learning_rate": 7.177890229661347e-05,
+      "loss": 1.0788,
+      "step": 1653
+    },
+    {
+      "epoch": 0.6427669289808607,
+      "grad_norm": 0.19959624111652374,
+      "learning_rate": 7.170105099260412e-05,
+      "loss": 0.9954,
+      "step": 1654
+    },
+    {
+      "epoch": 0.6431555426017682,
+      "grad_norm": 0.20173248648643494,
+      "learning_rate": 7.162319968859479e-05,
+      "loss": 1.003,
+      "step": 1655
+    },
+    {
+      "epoch": 0.6435441562226756,
+      "grad_norm": 0.207533061504364,
+      "learning_rate": 7.154534838458544e-05,
+      "loss": 1.043,
+      "step": 1656
+    },
+    {
+      "epoch": 0.643932769843583,
+      "grad_norm": 0.21928350627422333,
+      "learning_rate": 7.146749708057611e-05,
+      "loss": 1.0472,
+      "step": 1657
+    },
+    {
+      "epoch": 0.6443213834644904,
+      "grad_norm": 0.2567078173160553,
+      "learning_rate": 7.138964577656676e-05,
+      "loss": 1.0946,
+      "step": 1658
+    },
+    {
+      "epoch": 0.6447099970853979,
+      "grad_norm": 0.19454176723957062,
+      "learning_rate": 7.131179447255741e-05,
+      "loss": 0.9437,
+      "step": 1659
+    },
+    {
+      "epoch": 0.6450986107063053,
+      "grad_norm": 0.19198423624038696,
+      "learning_rate": 7.123394316854808e-05,
+      "loss": 0.9976,
+      "step": 1660
+    },
+    {
+      "epoch": 0.6454872243272126,
+      "grad_norm": 0.1929445117712021,
+      "learning_rate": 7.115609186453873e-05,
+      "loss": 1.0279,
+      "step": 1661
+    },
+    {
+      "epoch": 0.6458758379481201,
+      "grad_norm": 0.2041027694940567,
+      "learning_rate": 7.10782405605294e-05,
+      "loss": 1.0458,
+      "step": 1662
+    },
+    {
+      "epoch": 0.6462644515690275,
+      "grad_norm": 0.23750995099544525,
+      "learning_rate": 7.100038925652005e-05,
+      "loss": 1.0916,
+      "step": 1663
+    },
+    {
+      "epoch": 0.6466530651899349,
+      "grad_norm": 0.1971994787454605,
+      "learning_rate": 7.09225379525107e-05,
+      "loss": 0.951,
+      "step": 1664
+    },
+    {
+      "epoch": 0.6470416788108423,
+      "grad_norm": 0.20459246635437012,
+      "learning_rate": 7.084468664850136e-05,
+      "loss": 0.9653,
+      "step": 1665
+    },
+    {
+      "epoch": 0.6474302924317498,
+      "grad_norm": 0.2137187272310257,
+      "learning_rate": 7.076683534449202e-05,
+      "loss": 1.0291,
+      "step": 1666
+    },
+    {
+      "epoch": 0.6478189060526571,
+      "grad_norm": 0.21235258877277374,
+      "learning_rate": 7.068898404048268e-05,
+      "loss": 1.0104,
+      "step": 1667
+    },
+    {
+      "epoch": 0.6482075196735646,
+      "grad_norm": 0.23120944201946259,
+      "learning_rate": 7.061113273647333e-05,
+      "loss": 1.0693,
+      "step": 1668
+    },
+    {
+      "epoch": 0.648596133294472,
+      "grad_norm": 1.38257896900177,
+      "learning_rate": 7.053328143246399e-05,
+      "loss": 1.0339,
+      "step": 1669
+    },
+    {
+      "epoch": 0.6489847469153793,
+      "grad_norm": 0.20898790657520294,
+      "learning_rate": 7.045543012845465e-05,
+      "loss": 1.004,
+      "step": 1670
+    },
+    {
+      "epoch": 0.6493733605362868,
+      "grad_norm": 0.20251236855983734,
+      "learning_rate": 7.037757882444532e-05,
+      "loss": 0.9992,
+      "step": 1671
+    },
+    {
+      "epoch": 0.6497619741571942,
+      "grad_norm": 0.2358030527830124,
+      "learning_rate": 7.029972752043597e-05,
+      "loss": 0.9854,
+      "step": 1672
+    },
+    {
+      "epoch": 0.6501505877781016,
+      "grad_norm": 0.18945704400539398,
+      "learning_rate": 7.022187621642664e-05,
+      "loss": 0.9677,
+      "step": 1673
+    },
+    {
+      "epoch": 0.650539201399009,
+      "grad_norm": 0.1965213567018509,
+      "learning_rate": 7.014402491241729e-05,
+      "loss": 1.0118,
+      "step": 1674
+    },
+    {
+      "epoch": 0.6509278150199165,
+      "grad_norm": 0.2340148687362671,
+      "learning_rate": 7.006617360840795e-05,
+      "loss": 1.0312,
+      "step": 1675
+    },
+    {
+      "epoch": 0.6513164286408238,
+      "grad_norm": 0.1992296278476715,
+      "learning_rate": 6.99883223043986e-05,
+      "loss": 1.0155,
+      "step": 1676
+    },
+    {
+      "epoch": 0.6517050422617313,
+      "grad_norm": 0.20410223305225372,
+      "learning_rate": 6.991047100038926e-05,
+      "loss": 1.0646,
+      "step": 1677
+    },
+    {
+      "epoch": 0.6520936558826387,
+      "grad_norm": 0.19254536926746368,
+      "learning_rate": 6.983261969637992e-05,
+      "loss": 0.9538,
+      "step": 1678
+    },
+    {
+      "epoch": 0.6524822695035462,
+      "grad_norm": 0.19980847835540771,
+      "learning_rate": 6.975476839237057e-05,
+      "loss": 0.9912,
+      "step": 1679
+    },
+    {
+      "epoch": 0.6528708831244535,
+      "grad_norm": 0.19503261148929596,
+      "learning_rate": 6.967691708836124e-05,
+      "loss": 0.9844,
+      "step": 1680
+    },
+    {
+      "epoch": 0.6532594967453609,
+      "grad_norm": 0.22375883162021637,
+      "learning_rate": 6.959906578435189e-05,
+      "loss": 1.1266,
+      "step": 1681
+    },
+    {
+      "epoch": 0.6536481103662684,
+      "grad_norm": 0.21456514298915863,
+      "learning_rate": 6.952121448034254e-05,
+      "loss": 1.0902,
+      "step": 1682
+    },
+    {
+      "epoch": 0.6540367239871757,
+      "grad_norm": 0.20348122715950012,
+      "learning_rate": 6.944336317633321e-05,
+      "loss": 1.0228,
+      "step": 1683
+    },
+    {
+      "epoch": 0.6544253376080832,
+      "grad_norm": 0.21647393703460693,
+      "learning_rate": 6.936551187232386e-05,
+      "loss": 1.0653,
+      "step": 1684
+    },
+    {
+      "epoch": 0.6548139512289906,
+      "grad_norm": 0.20160923898220062,
+      "learning_rate": 6.928766056831453e-05,
+      "loss": 1.0249,
+      "step": 1685
+    },
+    {
+      "epoch": 0.655202564849898,
+      "grad_norm": 0.20070499181747437,
+      "learning_rate": 6.920980926430518e-05,
+      "loss": 1.0585,
+      "step": 1686
+    },
+    {
+      "epoch": 0.6555911784708054,
+      "grad_norm": 0.2656902074813843,
+      "learning_rate": 6.913195796029583e-05,
+      "loss": 1.0042,
+      "step": 1687
+    },
+    {
+      "epoch": 0.6559797920917129,
+      "grad_norm": 0.1934545785188675,
+      "learning_rate": 6.90541066562865e-05,
+      "loss": 0.9831,
+      "step": 1688
+    },
+    {
+      "epoch": 0.6563684057126202,
+      "grad_norm": 0.21719245612621307,
+      "learning_rate": 6.897625535227715e-05,
+      "loss": 0.9934,
+      "step": 1689
+    },
+    {
+      "epoch": 0.6567570193335276,
+      "grad_norm": 0.20906969904899597,
+      "learning_rate": 6.889840404826782e-05,
+      "loss": 1.023,
+      "step": 1690
+    },
+    {
+      "epoch": 0.6571456329544351,
+      "grad_norm": 0.225227490067482,
+      "learning_rate": 6.882055274425847e-05,
+      "loss": 1.0265,
+      "step": 1691
+    },
+    {
+      "epoch": 0.6575342465753424,
+      "grad_norm": 0.22766710817813873,
+      "learning_rate": 6.874270144024912e-05,
+      "loss": 1.0306,
+      "step": 1692
+    },
+    {
+      "epoch": 0.6579228601962499,
+      "grad_norm": 0.20964065194129944,
+      "learning_rate": 6.866485013623978e-05,
+      "loss": 0.9431,
+      "step": 1693
+    },
+    {
+      "epoch": 0.6583114738171573,
+      "grad_norm": 0.19821231067180634,
+      "learning_rate": 6.858699883223044e-05,
+      "loss": 0.9959,
+      "step": 1694
+    },
+    {
+      "epoch": 0.6587000874380647,
+      "grad_norm": 0.2071307748556137,
+      "learning_rate": 6.85091475282211e-05,
+      "loss": 1.0332,
+      "step": 1695
+    },
+    {
+      "epoch": 0.6590887010589721,
+      "grad_norm": 0.27962490916252136,
+      "learning_rate": 6.843129622421175e-05,
+      "loss": 0.9755,
+      "step": 1696
+    },
+    {
+      "epoch": 0.6594773146798796,
+      "grad_norm": 0.21582698822021484,
+      "learning_rate": 6.835344492020242e-05,
+      "loss": 1.0305,
+      "step": 1697
+    },
+    {
+      "epoch": 0.6598659283007869,
+      "grad_norm": 0.1872921586036682,
+      "learning_rate": 6.827559361619307e-05,
+      "loss": 0.9693,
+      "step": 1698
+    },
+    {
+      "epoch": 0.6602545419216943,
+      "grad_norm": 0.27033379673957825,
+      "learning_rate": 6.819774231218374e-05,
+      "loss": 1.0756,
+      "step": 1699
+    },
+    {
+      "epoch": 0.6606431555426018,
+      "grad_norm": 0.2010008543729782,
+      "learning_rate": 6.811989100817439e-05,
+      "loss": 1.0077,
+      "step": 1700
+    },
+    {
+      "epoch": 0.6610317691635091,
+      "grad_norm": 0.20637495815753937,
+      "learning_rate": 6.804203970416506e-05,
+      "loss": 1.0208,
+      "step": 1701
+    },
+    {
+      "epoch": 0.6614203827844166,
+      "grad_norm": 0.21331818401813507,
+      "learning_rate": 6.796418840015571e-05,
+      "loss": 1.0242,
+      "step": 1702
+    },
+    {
+      "epoch": 0.661808996405324,
+      "grad_norm": 0.2092941552400589,
+      "learning_rate": 6.788633709614637e-05,
+      "loss": 1.0949,
+      "step": 1703
+    },
+    {
+      "epoch": 0.6621976100262315,
+      "grad_norm": 0.22332265973091125,
+      "learning_rate": 6.780848579213703e-05,
+      "loss": 1.1068,
+      "step": 1704
+    },
+    {
+      "epoch": 0.6625862236471388,
+      "grad_norm": 0.20077067613601685,
+      "learning_rate": 6.773063448812768e-05,
+      "loss": 0.9801,
+      "step": 1705
+    },
+    {
+      "epoch": 0.6629748372680463,
+      "grad_norm": 0.2057008296251297,
+      "learning_rate": 6.765278318411834e-05,
+      "loss": 1.0058,
+      "step": 1706
+    },
+    {
+      "epoch": 0.6633634508889537,
+      "grad_norm": 0.20337353646755219,
+      "learning_rate": 6.7574931880109e-05,
+      "loss": 1.0141,
+      "step": 1707
+    },
+    {
+      "epoch": 0.663752064509861,
+      "grad_norm": 0.22756130993366241,
+      "learning_rate": 6.749708057609966e-05,
+      "loss": 1.0287,
+      "step": 1708
+    },
+    {
+      "epoch": 0.6641406781307685,
+      "grad_norm": 0.2052423506975174,
+      "learning_rate": 6.741922927209031e-05,
+      "loss": 1.0069,
+      "step": 1709
+    },
+    {
+      "epoch": 0.6645292917516759,
+      "grad_norm": 0.1988023817539215,
+      "learning_rate": 6.734137796808096e-05,
+      "loss": 0.9761,
+      "step": 1710
+    },
+    {
+      "epoch": 0.6649179053725833,
+      "grad_norm": 0.20491188764572144,
+      "learning_rate": 6.726352666407163e-05,
+      "loss": 0.9767,
+      "step": 1711
+    },
+    {
+      "epoch": 0.6653065189934907,
+      "grad_norm": 0.18790274858474731,
+      "learning_rate": 6.718567536006228e-05,
+      "loss": 0.9944,
+      "step": 1712
+    },
+    {
+      "epoch": 0.6656951326143982,
+      "grad_norm": 0.19979891180992126,
+      "learning_rate": 6.710782405605293e-05,
+      "loss": 1.0842,
+      "step": 1713
+    },
+    {
+      "epoch": 0.6660837462353055,
+      "grad_norm": 0.22204813361167908,
+      "learning_rate": 6.70299727520436e-05,
+      "loss": 1.0561,
+      "step": 1714
+    },
+    {
+      "epoch": 0.666472359856213,
+      "grad_norm": 0.20182965695858002,
+      "learning_rate": 6.695212144803425e-05,
+      "loss": 1.0015,
+      "step": 1715
+    },
+    {
+      "epoch": 0.6668609734771204,
+      "grad_norm": 0.20719997584819794,
+      "learning_rate": 6.687427014402492e-05,
+      "loss": 1.0144,
+      "step": 1716
+    },
+    {
+      "epoch": 0.6672495870980278,
+      "grad_norm": 0.1944626122713089,
+      "learning_rate": 6.679641884001557e-05,
+      "loss": 1.0083,
+      "step": 1717
+    },
+    {
+      "epoch": 0.6676382007189352,
+      "grad_norm": 0.2072264701128006,
+      "learning_rate": 6.671856753600622e-05,
+      "loss": 1.0246,
+      "step": 1718
+    },
+    {
+      "epoch": 0.6680268143398426,
+      "grad_norm": 0.2134973257780075,
+      "learning_rate": 6.664071623199689e-05,
+      "loss": 1.0926,
+      "step": 1719
+    },
+    {
+      "epoch": 0.66841542796075,
+      "grad_norm": 0.2119186669588089,
+      "learning_rate": 6.656286492798754e-05,
+      "loss": 1.0129,
+      "step": 1720
+    },
+    {
+      "epoch": 0.6688040415816574,
+      "grad_norm": 0.21205540001392365,
+      "learning_rate": 6.64850136239782e-05,
+      "loss": 1.0611,
+      "step": 1721
+    },
+    {
+      "epoch": 0.6691926552025649,
+      "grad_norm": 0.21632088720798492,
+      "learning_rate": 6.640716231996886e-05,
+      "loss": 1.0821,
+      "step": 1722
+    },
+    {
+      "epoch": 0.6695812688234722,
+      "grad_norm": 0.21734434366226196,
+      "learning_rate": 6.632931101595952e-05,
+      "loss": 1.0821,
+      "step": 1723
+    },
+    {
+      "epoch": 0.6699698824443797,
+      "grad_norm": 0.2030603289604187,
+      "learning_rate": 6.625145971195017e-05,
+      "loss": 0.9976,
+      "step": 1724
+    },
+    {
+      "epoch": 0.6703584960652871,
+      "grad_norm": 0.19921456277370453,
+      "learning_rate": 6.617360840794084e-05,
+      "loss": 0.9187,
+      "step": 1725
+    },
+    {
+      "epoch": 0.6707471096861946,
+      "grad_norm": 0.20548826456069946,
+      "learning_rate": 6.60957571039315e-05,
+      "loss": 1.0486,
+      "step": 1726
+    },
+    {
+      "epoch": 0.6711357233071019,
+      "grad_norm": 0.21784676611423492,
+      "learning_rate": 6.601790579992216e-05,
+      "loss": 1.1089,
+      "step": 1727
+    },
+    {
+      "epoch": 0.6715243369280093,
+      "grad_norm": 0.2137753963470459,
+      "learning_rate": 6.594005449591281e-05,
+      "loss": 1.0075,
+      "step": 1728
+    },
+    {
+      "epoch": 0.6719129505489168,
+      "grad_norm": 0.20200639963150024,
+      "learning_rate": 6.586220319190348e-05,
+      "loss": 0.9915,
+      "step": 1729
+    },
+    {
+      "epoch": 0.6723015641698241,
+      "grad_norm": 0.20898796617984772,
+      "learning_rate": 6.578435188789413e-05,
+      "loss": 1.0292,
+      "step": 1730
+    },
+    {
+      "epoch": 0.6726901777907316,
+      "grad_norm": 0.22515977919101715,
+      "learning_rate": 6.570650058388478e-05,
+      "loss": 1.0118,
+      "step": 1731
+    },
+    {
+      "epoch": 0.673078791411639,
+      "grad_norm": 0.2132793813943863,
+      "learning_rate": 6.562864927987545e-05,
+      "loss": 1.1097,
+      "step": 1732
+    },
+    {
+      "epoch": 0.6734674050325464,
+      "grad_norm": 0.20358797907829285,
+      "learning_rate": 6.55507979758661e-05,
+      "loss": 1.0241,
+      "step": 1733
+    },
+    {
+      "epoch": 0.6738560186534538,
+      "grad_norm": 0.21155016124248505,
+      "learning_rate": 6.547294667185676e-05,
+      "loss": 1.0235,
+      "step": 1734
+    },
+    {
+      "epoch": 0.6742446322743613,
+      "grad_norm": 0.198009192943573,
+      "learning_rate": 6.539509536784741e-05,
+      "loss": 0.9542,
+      "step": 1735
+    },
+    {
+      "epoch": 0.6746332458952686,
+      "grad_norm": 0.20318005979061127,
+      "learning_rate": 6.531724406383807e-05,
+      "loss": 0.9993,
+      "step": 1736
+    },
+    {
+      "epoch": 0.675021859516176,
+      "grad_norm": 0.21384860575199127,
+      "learning_rate": 6.523939275982873e-05,
+      "loss": 1.1188,
+      "step": 1737
+    },
+    {
+      "epoch": 0.6754104731370835,
+      "grad_norm": 0.18736955523490906,
+      "learning_rate": 6.516154145581938e-05,
+      "loss": 0.9832,
+      "step": 1738
+    },
+    {
+      "epoch": 0.6757990867579908,
+      "grad_norm": 0.2002391368150711,
+      "learning_rate": 6.508369015181005e-05,
+      "loss": 1.0288,
+      "step": 1739
+    },
+    {
+      "epoch": 0.6761877003788983,
+      "grad_norm": 0.20011006295681,
+      "learning_rate": 6.50058388478007e-05,
+      "loss": 0.9588,
+      "step": 1740
+    },
+    {
+      "epoch": 0.6765763139998057,
+      "grad_norm": 0.20782291889190674,
+      "learning_rate": 6.492798754379135e-05,
+      "loss": 1.0033,
+      "step": 1741
+    },
+    {
+      "epoch": 0.6769649276207131,
+      "grad_norm": 0.2056814581155777,
+      "learning_rate": 6.485013623978202e-05,
+      "loss": 1.0648,
+      "step": 1742
+    },
+    {
+      "epoch": 0.6773535412416205,
+      "grad_norm": 0.2207457572221756,
+      "learning_rate": 6.477228493577267e-05,
+      "loss": 1.0758,
+      "step": 1743
+    },
+    {
+      "epoch": 0.677742154862528,
+      "grad_norm": 0.20437198877334595,
+      "learning_rate": 6.469443363176334e-05,
+      "loss": 1.0253,
+      "step": 1744
+    },
+    {
+      "epoch": 0.6781307684834353,
+      "grad_norm": 0.198721781373024,
+      "learning_rate": 6.461658232775399e-05,
+      "loss": 1.0087,
+      "step": 1745
+    },
+    {
+      "epoch": 0.6785193821043427,
+      "grad_norm": 0.22781015932559967,
+      "learning_rate": 6.453873102374464e-05,
+      "loss": 1.0692,
+      "step": 1746
+    },
+    {
+      "epoch": 0.6789079957252502,
+      "grad_norm": 0.21826857328414917,
+      "learning_rate": 6.446087971973531e-05,
+      "loss": 1.0232,
+      "step": 1747
+    },
+    {
+      "epoch": 0.6792966093461575,
+      "grad_norm": 0.2156928926706314,
+      "learning_rate": 6.438302841572596e-05,
+      "loss": 1.0686,
+      "step": 1748
+    },
+    {
+      "epoch": 0.679685222967065,
+      "grad_norm": 0.2161693125963211,
+      "learning_rate": 6.430517711171662e-05,
+      "loss": 1.0298,
+      "step": 1749
+    },
+    {
+      "epoch": 0.6800738365879724,
+      "grad_norm": 0.19139425456523895,
+      "learning_rate": 6.422732580770729e-05,
+      "loss": 0.9545,
+      "step": 1750
+    },
+    {
+      "epoch": 0.6804624502088799,
+      "grad_norm": 0.22626161575317383,
+      "learning_rate": 6.414947450369794e-05,
+      "loss": 1.0669,
+      "step": 1751
+    },
+    {
+      "epoch": 0.6808510638297872,
+      "grad_norm": 0.2135801464319229,
+      "learning_rate": 6.407162319968861e-05,
+      "loss": 1.0187,
+      "step": 1752
+    },
+    {
+      "epoch": 0.6812396774506947,
+      "grad_norm": 0.20803681015968323,
+      "learning_rate": 6.399377189567926e-05,
+      "loss": 1.0856,
+      "step": 1753
+    },
+    {
+      "epoch": 0.681628291071602,
+      "grad_norm": 0.21317154169082642,
+      "learning_rate": 6.391592059166991e-05,
+      "loss": 1.1018,
+      "step": 1754
+    },
+    {
+      "epoch": 0.6820169046925094,
+      "grad_norm": 0.20877891778945923,
+      "learning_rate": 6.383806928766058e-05,
+      "loss": 1.0383,
+      "step": 1755
+    },
+    {
+      "epoch": 0.6824055183134169,
+      "grad_norm": 0.20769146084785461,
+      "learning_rate": 6.376021798365123e-05,
+      "loss": 1.0852,
+      "step": 1756
+    },
+    {
+      "epoch": 0.6827941319343243,
+      "grad_norm": 0.2252657413482666,
+      "learning_rate": 6.36823666796419e-05,
+      "loss": 1.0749,
+      "step": 1757
+    },
+    {
+      "epoch": 0.6831827455552317,
+      "grad_norm": 0.24453257024288177,
+      "learning_rate": 6.360451537563255e-05,
+      "loss": 1.1042,
+      "step": 1758
+    },
+    {
+      "epoch": 0.6835713591761391,
+      "grad_norm": 0.2082965075969696,
+      "learning_rate": 6.35266640716232e-05,
+      "loss": 1.0729,
+      "step": 1759
+    },
+    {
+      "epoch": 0.6839599727970466,
+      "grad_norm": 0.20121856033802032,
+      "learning_rate": 6.344881276761387e-05,
+      "loss": 1.038,
+      "step": 1760
+    },
+    {
+      "epoch": 0.6843485864179539,
+      "grad_norm": 0.20096386969089508,
+      "learning_rate": 6.337096146360452e-05,
+      "loss": 0.9655,
+      "step": 1761
+    },
+    {
+      "epoch": 0.6847372000388614,
+      "grad_norm": 0.20015959441661835,
+      "learning_rate": 6.329311015959518e-05,
+      "loss": 1.0187,
+      "step": 1762
+    },
+    {
+      "epoch": 0.6851258136597688,
+      "grad_norm": 0.21056395769119263,
+      "learning_rate": 6.321525885558583e-05,
+      "loss": 1.0567,
+      "step": 1763
+    },
+    {
+      "epoch": 0.6855144272806762,
+      "grad_norm": 0.2211030125617981,
+      "learning_rate": 6.313740755157649e-05,
+      "loss": 1.0588,
+      "step": 1764
+    },
+    {
+      "epoch": 0.6859030409015836,
+      "grad_norm": 0.20809797942638397,
+      "learning_rate": 6.305955624756715e-05,
+      "loss": 0.9488,
+      "step": 1765
+    },
+    {
+      "epoch": 0.686291654522491,
+      "grad_norm": 0.2331530600786209,
+      "learning_rate": 6.29817049435578e-05,
+      "loss": 1.0789,
+      "step": 1766
+    },
+    {
+      "epoch": 0.6866802681433984,
+      "grad_norm": 0.21708674728870392,
+      "learning_rate": 6.290385363954846e-05,
+      "loss": 1.0518,
+      "step": 1767
+    },
+    {
+      "epoch": 0.6870688817643058,
+      "grad_norm": 0.2088184356689453,
+      "learning_rate": 6.282600233553912e-05,
+      "loss": 1.0178,
+      "step": 1768
+    },
+    {
+      "epoch": 0.6874574953852133,
+      "grad_norm": 0.20285943150520325,
+      "learning_rate": 6.274815103152977e-05,
+      "loss": 1.018,
+      "step": 1769
+    },
+    {
+      "epoch": 0.6878461090061206,
+      "grad_norm": 0.211436927318573,
+      "learning_rate": 6.267029972752044e-05,
+      "loss": 1.0572,
+      "step": 1770
+    },
+    {
+      "epoch": 0.6882347226270281,
+      "grad_norm": 0.21108384430408478,
+      "learning_rate": 6.259244842351109e-05,
+      "loss": 1.0227,
+      "step": 1771
+    },
+    {
+      "epoch": 0.6886233362479355,
+      "grad_norm": 0.2060437649488449,
+      "learning_rate": 6.251459711950174e-05,
+      "loss": 1.0251,
+      "step": 1772
+    },
+    {
+      "epoch": 0.689011949868843,
+      "grad_norm": 0.20819245278835297,
+      "learning_rate": 6.243674581549241e-05,
+      "loss": 1.0643,
+      "step": 1773
+    },
+    {
+      "epoch": 0.6894005634897503,
+      "grad_norm": 0.2172113060951233,
+      "learning_rate": 6.235889451148306e-05,
+      "loss": 1.0869,
+      "step": 1774
+    },
+    {
+      "epoch": 0.6897891771106577,
+      "grad_norm": 0.2087356299161911,
+      "learning_rate": 6.228104320747373e-05,
+      "loss": 1.0622,
+      "step": 1775
+    },
+    {
+      "epoch": 0.6901777907315652,
+      "grad_norm": 0.1958473175764084,
+      "learning_rate": 6.220319190346439e-05,
+      "loss": 0.9542,
+      "step": 1776
+    },
+    {
+      "epoch": 0.6905664043524725,
+      "grad_norm": 0.23630915582180023,
+      "learning_rate": 6.212534059945504e-05,
+      "loss": 1.0535,
+      "step": 1777
+    },
+    {
+      "epoch": 0.69095501797338,
+      "grad_norm": 0.2127649188041687,
+      "learning_rate": 6.204748929544571e-05,
+      "loss": 0.972,
+      "step": 1778
+    },
+    {
+      "epoch": 0.6913436315942874,
+      "grad_norm": 0.19873055815696716,
+      "learning_rate": 6.196963799143636e-05,
+      "loss": 0.9969,
+      "step": 1779
+    },
+    {
+      "epoch": 0.6917322452151948,
+      "grad_norm": 0.2013067901134491,
+      "learning_rate": 6.189178668742703e-05,
+      "loss": 1.0399,
+      "step": 1780
+    },
+    {
+      "epoch": 0.6921208588361022,
+      "grad_norm": 0.21300987899303436,
+      "learning_rate": 6.181393538341768e-05,
+      "loss": 1.0377,
+      "step": 1781
+    },
+    {
+      "epoch": 0.6925094724570097,
+      "grad_norm": 0.21665994822978973,
+      "learning_rate": 6.173608407940833e-05,
+      "loss": 1.008,
+      "step": 1782
+    },
+    {
+      "epoch": 0.692898086077917,
+      "grad_norm": 0.21622590720653534,
+      "learning_rate": 6.1658232775399e-05,
+      "loss": 1.1128,
+      "step": 1783
+    },
+    {
+      "epoch": 0.6932866996988244,
+      "grad_norm": 0.2000272423028946,
+      "learning_rate": 6.158038147138965e-05,
+      "loss": 1.0115,
+      "step": 1784
+    },
+    {
+      "epoch": 0.6936753133197319,
+      "grad_norm": 0.20774856209754944,
+      "learning_rate": 6.15025301673803e-05,
+      "loss": 1.066,
+      "step": 1785
+    },
+    {
+      "epoch": 0.6940639269406392,
+      "grad_norm": 0.18497461080551147,
+      "learning_rate": 6.142467886337097e-05,
+      "loss": 0.9608,
+      "step": 1786
+    },
+    {
+      "epoch": 0.6944525405615467,
+      "grad_norm": 0.19819007813930511,
+      "learning_rate": 6.134682755936162e-05,
+      "loss": 1.0114,
+      "step": 1787
+    },
+    {
+      "epoch": 0.6948411541824541,
+      "grad_norm": 0.22013314068317413,
+      "learning_rate": 6.126897625535229e-05,
+      "loss": 0.976,
+      "step": 1788
+    },
+    {
+      "epoch": 0.6952297678033615,
+      "grad_norm": 0.2066160887479782,
+      "learning_rate": 6.119112495134294e-05,
+      "loss": 1.0585,
+      "step": 1789
+    },
+    {
+      "epoch": 0.6956183814242689,
+      "grad_norm": 0.21364475786685944,
+      "learning_rate": 6.111327364733359e-05,
+      "loss": 1.0842,
+      "step": 1790
+    },
+    {
+      "epoch": 0.6960069950451764,
+      "grad_norm": 0.19731444120407104,
+      "learning_rate": 6.103542234332425e-05,
+      "loss": 0.9936,
+      "step": 1791
+    },
+    {
+      "epoch": 0.6963956086660837,
+      "grad_norm": 0.2162671983242035,
+      "learning_rate": 6.095757103931491e-05,
+      "loss": 1.0446,
+      "step": 1792
+    },
+    {
+      "epoch": 0.6967842222869911,
+      "grad_norm": 0.21486608684062958,
+      "learning_rate": 6.087971973530557e-05,
+      "loss": 1.0441,
+      "step": 1793
+    },
+    {
+      "epoch": 0.6971728359078986,
+      "grad_norm": 0.20850563049316406,
+      "learning_rate": 6.0801868431296224e-05,
+      "loss": 1.0431,
+      "step": 1794
+    },
+    {
+      "epoch": 0.6975614495288059,
+      "grad_norm": 0.20492027699947357,
+      "learning_rate": 6.072401712728688e-05,
+      "loss": 0.9845,
+      "step": 1795
+    },
+    {
+      "epoch": 0.6979500631497134,
+      "grad_norm": 0.1986648142337799,
+      "learning_rate": 6.064616582327754e-05,
+      "loss": 0.9855,
+      "step": 1796
+    },
+    {
+      "epoch": 0.6983386767706208,
+      "grad_norm": 0.20606310665607452,
+      "learning_rate": 6.05683145192682e-05,
+      "loss": 1.0608,
+      "step": 1797
+    },
+    {
+      "epoch": 0.6987272903915283,
+      "grad_norm": 0.20496073365211487,
+      "learning_rate": 6.0490463215258867e-05,
+      "loss": 1.0311,
+      "step": 1798
+    },
+    {
+      "epoch": 0.6991159040124356,
+      "grad_norm": 0.2153409719467163,
+      "learning_rate": 6.041261191124952e-05,
+      "loss": 1.0394,
+      "step": 1799
+    },
+    {
+      "epoch": 0.6995045176333431,
+      "grad_norm": 0.21410655975341797,
+      "learning_rate": 6.033476060724017e-05,
+      "loss": 1.0229,
+      "step": 1800
+    },
+    {
+      "epoch": 0.6998931312542505,
+      "grad_norm": 0.20418782532215118,
+      "learning_rate": 6.0256909303230836e-05,
+      "loss": 1.0382,
+      "step": 1801
+    },
+    {
+      "epoch": 0.7002817448751578,
+      "grad_norm": 0.19154146313667297,
+      "learning_rate": 6.017905799922149e-05,
+      "loss": 0.9891,
+      "step": 1802
+    },
+    {
+      "epoch": 0.7006703584960653,
+      "grad_norm": 0.19138328731060028,
+      "learning_rate": 6.010120669521214e-05,
+      "loss": 0.9638,
+      "step": 1803
+    },
+    {
+      "epoch": 0.7010589721169727,
+      "grad_norm": 0.19704872369766235,
+      "learning_rate": 6.0023355391202806e-05,
+      "loss": 0.9835,
+      "step": 1804
+    },
+    {
+      "epoch": 0.7014475857378801,
+      "grad_norm": 0.2175600379705429,
+      "learning_rate": 5.994550408719346e-05,
+      "loss": 1.1192,
+      "step": 1805
+    },
+    {
+      "epoch": 0.7018361993587875,
+      "grad_norm": 0.21614274382591248,
+      "learning_rate": 5.9867652783184124e-05,
+      "loss": 1.0877,
+      "step": 1806
+    },
+    {
+      "epoch": 0.702224812979695,
+      "grad_norm": 0.20461414754390717,
+      "learning_rate": 5.9789801479174776e-05,
+      "loss": 0.9706,
+      "step": 1807
+    },
+    {
+      "epoch": 0.7026134266006023,
+      "grad_norm": 0.1989748477935791,
+      "learning_rate": 5.9711950175165434e-05,
+      "loss": 1.0004,
+      "step": 1808
+    },
+    {
+      "epoch": 0.7030020402215098,
+      "grad_norm": 0.21304792165756226,
+      "learning_rate": 5.963409887115609e-05,
+      "loss": 1.0177,
+      "step": 1809
+    },
+    {
+      "epoch": 0.7033906538424172,
+      "grad_norm": 0.19023855030536652,
+      "learning_rate": 5.955624756714675e-05,
+      "loss": 0.9759,
+      "step": 1810
+    },
+    {
+      "epoch": 0.7037792674633246,
+      "grad_norm": 0.21915188431739807,
+      "learning_rate": 5.947839626313742e-05,
+      "loss": 1.0621,
+      "step": 1811
+    },
+    {
+      "epoch": 0.704167881084232,
+      "grad_norm": 0.21626822650432587,
+      "learning_rate": 5.940054495912807e-05,
+      "loss": 1.0144,
+      "step": 1812
+    },
+    {
+      "epoch": 0.7045564947051394,
+      "grad_norm": 0.20742040872573853,
+      "learning_rate": 5.932269365511872e-05,
+      "loss": 0.9778,
+      "step": 1813
+    },
+    {
+      "epoch": 0.7049451083260468,
+      "grad_norm": 0.2172158658504486,
+      "learning_rate": 5.924484235110939e-05,
+      "loss": 1.0416,
+      "step": 1814
+    },
+    {
+      "epoch": 0.7053337219469542,
+      "grad_norm": 0.209465891122818,
+      "learning_rate": 5.916699104710004e-05,
+      "loss": 1.0378,
+      "step": 1815
+    },
+    {
+      "epoch": 0.7057223355678617,
+      "grad_norm": 0.2097882628440857,
+      "learning_rate": 5.9089139743090705e-05,
+      "loss": 1.0166,
+      "step": 1816
+    },
+    {
+      "epoch": 0.706110949188769,
+      "grad_norm": 0.2251904308795929,
+      "learning_rate": 5.901128843908136e-05,
+      "loss": 1.0783,
+      "step": 1817
+    },
+    {
+      "epoch": 0.7064995628096765,
+      "grad_norm": 0.1952916979789734,
+      "learning_rate": 5.893343713507201e-05,
+      "loss": 0.993,
+      "step": 1818
+    },
+    {
+      "epoch": 0.7068881764305839,
+      "grad_norm": 0.20997455716133118,
+      "learning_rate": 5.8855585831062675e-05,
+      "loss": 1.0448,
+      "step": 1819
+    },
+    {
+      "epoch": 0.7072767900514914,
+      "grad_norm": 0.20070020854473114,
+      "learning_rate": 5.877773452705333e-05,
+      "loss": 0.9603,
+      "step": 1820
+    },
+    {
+      "epoch": 0.7076654036723987,
+      "grad_norm": 0.25765034556388855,
+      "learning_rate": 5.869988322304399e-05,
+      "loss": 1.0361,
+      "step": 1821
+    },
+    {
+      "epoch": 0.7080540172933061,
+      "grad_norm": 0.21948982775211334,
+      "learning_rate": 5.862203191903465e-05,
+      "loss": 1.0668,
+      "step": 1822
+    },
+    {
+      "epoch": 0.7084426309142136,
+      "grad_norm": 0.1867108792066574,
+      "learning_rate": 5.85441806150253e-05,
+      "loss": 0.9372,
+      "step": 1823
+    },
+    {
+      "epoch": 0.7088312445351209,
+      "grad_norm": 0.2037520408630371,
+      "learning_rate": 5.846632931101597e-05,
+      "loss": 0.9905,
+      "step": 1824
+    },
+    {
+      "epoch": 0.7092198581560284,
+      "grad_norm": 0.21352072060108185,
+      "learning_rate": 5.838847800700662e-05,
+      "loss": 1.0514,
+      "step": 1825
+    },
+    {
+      "epoch": 0.7096084717769358,
+      "grad_norm": 0.1949845850467682,
+      "learning_rate": 5.831062670299727e-05,
+      "loss": 0.9636,
+      "step": 1826
+    },
+    {
+      "epoch": 0.7099970853978432,
+      "grad_norm": 0.2092294692993164,
+      "learning_rate": 5.823277539898794e-05,
+      "loss": 1.0361,
+      "step": 1827
+    },
+    {
+      "epoch": 0.7103856990187506,
+      "grad_norm": 0.20054267346858978,
+      "learning_rate": 5.815492409497859e-05,
+      "loss": 1.0195,
+      "step": 1828
+    },
+    {
+      "epoch": 0.7107743126396581,
+      "grad_norm": 0.2202107012271881,
+      "learning_rate": 5.8077072790969256e-05,
+      "loss": 1.0918,
+      "step": 1829
+    },
+    {
+      "epoch": 0.7111629262605654,
+      "grad_norm": 0.2001042366027832,
+      "learning_rate": 5.799922148695991e-05,
+      "loss": 1.0142,
+      "step": 1830
+    },
+    {
+      "epoch": 0.7115515398814728,
+      "grad_norm": 0.2102631777524948,
+      "learning_rate": 5.792137018295056e-05,
+      "loss": 1.0231,
+      "step": 1831
+    },
+    {
+      "epoch": 0.7119401535023803,
+      "grad_norm": 0.21717461943626404,
+      "learning_rate": 5.7843518878941226e-05,
+      "loss": 1.0295,
+      "step": 1832
+    },
+    {
+      "epoch": 0.7123287671232876,
+      "grad_norm": 0.2001933753490448,
+      "learning_rate": 5.776566757493188e-05,
+      "loss": 1.022,
+      "step": 1833
+    },
+    {
+      "epoch": 0.7127173807441951,
+      "grad_norm": 0.2218201756477356,
+      "learning_rate": 5.7687816270922544e-05,
+      "loss": 1.0762,
+      "step": 1834
+    },
+    {
+      "epoch": 0.7131059943651025,
+      "grad_norm": 0.20680001378059387,
+      "learning_rate": 5.76099649669132e-05,
+      "loss": 1.0017,
+      "step": 1835
+    },
+    {
+      "epoch": 0.7134946079860099,
+      "grad_norm": 0.21511508524417877,
+      "learning_rate": 5.7532113662903854e-05,
+      "loss": 1.048,
+      "step": 1836
+    },
+    {
+      "epoch": 0.7138832216069173,
+      "grad_norm": 0.19720061123371124,
+      "learning_rate": 5.745426235889452e-05,
+      "loss": 0.9983,
+      "step": 1837
+    },
+    {
+      "epoch": 0.7142718352278248,
+      "grad_norm": 0.2005409449338913,
+      "learning_rate": 5.737641105488517e-05,
+      "loss": 0.9941,
+      "step": 1838
+    },
+    {
+      "epoch": 0.7146604488487321,
+      "grad_norm": 0.2222924679517746,
+      "learning_rate": 5.729855975087584e-05,
+      "loss": 1.0476,
+      "step": 1839
+    },
+    {
+      "epoch": 0.7150490624696395,
+      "grad_norm": 0.21131208539009094,
+      "learning_rate": 5.722070844686649e-05,
+      "loss": 1.0208,
+      "step": 1840
+    },
+    {
+      "epoch": 0.715437676090547,
+      "grad_norm": 0.2307305932044983,
+      "learning_rate": 5.714285714285714e-05,
+      "loss": 0.9867,
+      "step": 1841
+    },
+    {
+      "epoch": 0.7158262897114543,
+      "grad_norm": 0.1974973827600479,
+      "learning_rate": 5.706500583884781e-05,
+      "loss": 1.0285,
+      "step": 1842
+    },
+    {
+      "epoch": 0.7162149033323618,
+      "grad_norm": 0.2006559520959854,
+      "learning_rate": 5.698715453483846e-05,
+      "loss": 1.024,
+      "step": 1843
+    },
+    {
+      "epoch": 0.7166035169532692,
+      "grad_norm": 0.21160584688186646,
+      "learning_rate": 5.690930323082911e-05,
+      "loss": 1.0256,
+      "step": 1844
+    },
+    {
+      "epoch": 0.7169921305741767,
+      "grad_norm": 0.28184664249420166,
+      "learning_rate": 5.683145192681978e-05,
+      "loss": 1.0443,
+      "step": 1845
+    },
+    {
+      "epoch": 0.717380744195084,
+      "grad_norm": 0.2206653356552124,
+      "learning_rate": 5.675360062281043e-05,
+      "loss": 1.0458,
+      "step": 1846
+    },
+    {
+      "epoch": 0.7177693578159915,
+      "grad_norm": 0.21346066892147064,
+      "learning_rate": 5.6675749318801095e-05,
+      "loss": 1.0106,
+      "step": 1847
+    },
+    {
+      "epoch": 0.7181579714368989,
+      "grad_norm": 0.20931747555732727,
+      "learning_rate": 5.6597898014791753e-05,
+      "loss": 0.9831,
+      "step": 1848
+    },
+    {
+      "epoch": 0.7185465850578063,
+      "grad_norm": 0.2026771456003189,
+      "learning_rate": 5.6520046710782406e-05,
+      "loss": 1.0162,
+      "step": 1849
+    },
+    {
+      "epoch": 0.7189351986787137,
+      "grad_norm": 0.21388716995716095,
+      "learning_rate": 5.644219540677307e-05,
+      "loss": 1.0867,
+      "step": 1850
+    },
+    {
+      "epoch": 0.7193238122996211,
+      "grad_norm": 0.2039308398962021,
+      "learning_rate": 5.636434410276372e-05,
+      "loss": 1.0325,
+      "step": 1851
+    },
+    {
+      "epoch": 0.7197124259205285,
+      "grad_norm": 0.21741114556789398,
+      "learning_rate": 5.628649279875439e-05,
+      "loss": 1.0251,
+      "step": 1852
+    },
+    {
+      "epoch": 0.7201010395414359,
+      "grad_norm": 0.21343208849430084,
+      "learning_rate": 5.620864149474504e-05,
+      "loss": 1.0766,
+      "step": 1853
+    },
+    {
+      "epoch": 0.7204896531623434,
+      "grad_norm": 0.21712560951709747,
+      "learning_rate": 5.613079019073569e-05,
+      "loss": 1.0643,
+      "step": 1854
+    },
+    {
+      "epoch": 0.7208782667832507,
+      "grad_norm": 0.2176978886127472,
+      "learning_rate": 5.605293888672636e-05,
+      "loss": 1.0375,
+      "step": 1855
+    },
+    {
+      "epoch": 0.7212668804041582,
+      "grad_norm": 0.2065533846616745,
+      "learning_rate": 5.597508758271701e-05,
+      "loss": 1.0385,
+      "step": 1856
+    },
+    {
+      "epoch": 0.7216554940250656,
+      "grad_norm": 0.2169170081615448,
+      "learning_rate": 5.5897236278707676e-05,
+      "loss": 1.0197,
+      "step": 1857
+    },
+    {
+      "epoch": 0.722044107645973,
+      "grad_norm": 0.2047201544046402,
+      "learning_rate": 5.581938497469833e-05,
+      "loss": 0.9794,
+      "step": 1858
+    },
+    {
+      "epoch": 0.7224327212668804,
+      "grad_norm": 0.20898981392383575,
+      "learning_rate": 5.574153367068898e-05,
+      "loss": 1.032,
+      "step": 1859
+    },
+    {
+      "epoch": 0.7228213348877878,
+      "grad_norm": 0.2090533971786499,
+      "learning_rate": 5.5663682366679646e-05,
+      "loss": 1.0694,
+      "step": 1860
+    },
+    {
+      "epoch": 0.7232099485086952,
+      "grad_norm": 0.21963149309158325,
+      "learning_rate": 5.5585831062670305e-05,
+      "loss": 1.0367,
+      "step": 1861
+    },
+    {
+      "epoch": 0.7235985621296026,
+      "grad_norm": 0.1974373459815979,
+      "learning_rate": 5.550797975866096e-05,
+      "loss": 1.0402,
+      "step": 1862
+    },
+    {
+      "epoch": 0.7239871757505101,
+      "grad_norm": 0.1924194097518921,
+      "learning_rate": 5.543012845465162e-05,
+      "loss": 0.9647,
+      "step": 1863
+    },
+    {
+      "epoch": 0.7243757893714174,
+      "grad_norm": 0.21366077661514282,
+      "learning_rate": 5.5352277150642274e-05,
+      "loss": 1.0139,
+      "step": 1864
+    },
+    {
+      "epoch": 0.7247644029923249,
+      "grad_norm": 0.21722929179668427,
+      "learning_rate": 5.527442584663294e-05,
+      "loss": 1.0366,
+      "step": 1865
+    },
+    {
+      "epoch": 0.7251530166132323,
+      "grad_norm": 0.20646587014198303,
+      "learning_rate": 5.519657454262359e-05,
+      "loss": 1.0465,
+      "step": 1866
+    },
+    {
+      "epoch": 0.7255416302341398,
+      "grad_norm": 0.19144394993782043,
+      "learning_rate": 5.5118723238614244e-05,
+      "loss": 0.9645,
+      "step": 1867
+    },
+    {
+      "epoch": 0.7259302438550471,
+      "grad_norm": 0.19553838670253754,
+      "learning_rate": 5.504087193460491e-05,
+      "loss": 0.98,
+      "step": 1868
+    },
+    {
+      "epoch": 0.7263188574759545,
+      "grad_norm": 0.21739792823791504,
+      "learning_rate": 5.496302063059556e-05,
+      "loss": 1.002,
+      "step": 1869
+    },
+    {
+      "epoch": 0.726707471096862,
+      "grad_norm": 0.1910562962293625,
+      "learning_rate": 5.488516932658623e-05,
+      "loss": 0.985,
+      "step": 1870
+    },
+    {
+      "epoch": 0.7270960847177693,
+      "grad_norm": 0.2133384346961975,
+      "learning_rate": 5.480731802257688e-05,
+      "loss": 1.0325,
+      "step": 1871
+    },
+    {
+      "epoch": 0.7274846983386768,
+      "grad_norm": 0.21884119510650635,
+      "learning_rate": 5.472946671856753e-05,
+      "loss": 1.0412,
+      "step": 1872
+    },
+    {
+      "epoch": 0.7278733119595842,
+      "grad_norm": 0.21069306135177612,
+      "learning_rate": 5.46516154145582e-05,
+      "loss": 1.0474,
+      "step": 1873
+    },
+    {
+      "epoch": 0.7282619255804916,
+      "grad_norm": 0.19266243278980255,
+      "learning_rate": 5.4573764110548856e-05,
+      "loss": 0.9941,
+      "step": 1874
+    },
+    {
+      "epoch": 0.728650539201399,
+      "grad_norm": 0.21255099773406982,
+      "learning_rate": 5.4495912806539515e-05,
+      "loss": 1.0211,
+      "step": 1875
+    },
+    {
+      "epoch": 0.7290391528223065,
+      "grad_norm": 0.1924402117729187,
+      "learning_rate": 5.4418061502530173e-05,
+      "loss": 1.0117,
+      "step": 1876
+    },
+    {
+      "epoch": 0.7294277664432138,
+      "grad_norm": 0.2019895315170288,
+      "learning_rate": 5.4340210198520825e-05,
+      "loss": 0.9921,
+      "step": 1877
+    },
+    {
+      "epoch": 0.7298163800641212,
+      "grad_norm": 0.20398026704788208,
+      "learning_rate": 5.426235889451149e-05,
+      "loss": 1.0423,
+      "step": 1878
+    },
+    {
+      "epoch": 0.7302049936850287,
+      "grad_norm": 0.20153217017650604,
+      "learning_rate": 5.418450759050214e-05,
+      "loss": 1.0333,
+      "step": 1879
+    },
+    {
+      "epoch": 0.730593607305936,
+      "grad_norm": 0.21259640157222748,
+      "learning_rate": 5.4106656286492795e-05,
+      "loss": 1.0689,
+      "step": 1880
+    },
+    {
+      "epoch": 0.7309822209268435,
+      "grad_norm": 0.2037276029586792,
+      "learning_rate": 5.402880498248346e-05,
+      "loss": 1.0203,
+      "step": 1881
+    },
+    {
+      "epoch": 0.7313708345477509,
+      "grad_norm": 0.19976729154586792,
+      "learning_rate": 5.395095367847411e-05,
+      "loss": 1.0173,
+      "step": 1882
+    },
+    {
+      "epoch": 0.7317594481686583,
+      "grad_norm": 0.20481806993484497,
+      "learning_rate": 5.387310237446478e-05,
+      "loss": 0.9864,
+      "step": 1883
+    },
+    {
+      "epoch": 0.7321480617895657,
+      "grad_norm": 0.21900932490825653,
+      "learning_rate": 5.379525107045543e-05,
+      "loss": 1.0519,
+      "step": 1884
+    },
+    {
+      "epoch": 0.7325366754104732,
+      "grad_norm": 0.200319305062294,
+      "learning_rate": 5.371739976644609e-05,
+      "loss": 1.0834,
+      "step": 1885
+    },
+    {
+      "epoch": 0.7329252890313805,
+      "grad_norm": 0.19662296772003174,
+      "learning_rate": 5.363954846243675e-05,
+      "loss": 0.9794,
+      "step": 1886
+    },
+    {
+      "epoch": 0.7333139026522879,
+      "grad_norm": 0.2113952785730362,
+      "learning_rate": 5.356169715842741e-05,
+      "loss": 1.0763,
+      "step": 1887
+    },
+    {
+      "epoch": 0.7337025162731954,
+      "grad_norm": 0.21348755061626434,
+      "learning_rate": 5.3483845854418066e-05,
+      "loss": 1.0781,
+      "step": 1888
+    },
+    {
+      "epoch": 0.7340911298941027,
+      "grad_norm": 0.20673702657222748,
+      "learning_rate": 5.3405994550408725e-05,
+      "loss": 1.0513,
+      "step": 1889
+    },
+    {
+      "epoch": 0.7344797435150102,
+      "grad_norm": 0.210855171084404,
+      "learning_rate": 5.332814324639938e-05,
+      "loss": 0.9972,
+      "step": 1890
+    },
+    {
+      "epoch": 0.7348683571359176,
+      "grad_norm": 0.2136204093694687,
+      "learning_rate": 5.325029194239004e-05,
+      "loss": 1.03,
+      "step": 1891
+    },
+    {
+      "epoch": 0.7352569707568251,
+      "grad_norm": 0.20035260915756226,
+      "learning_rate": 5.3172440638380694e-05,
+      "loss": 0.9739,
+      "step": 1892
+    },
+    {
+      "epoch": 0.7356455843777324,
+      "grad_norm": 0.1943352371454239,
+      "learning_rate": 5.309458933437136e-05,
+      "loss": 0.9411,
+      "step": 1893
+    },
+    {
+      "epoch": 0.7360341979986399,
+      "grad_norm": 0.3994326889514923,
+      "learning_rate": 5.301673803036201e-05,
+      "loss": 1.0714,
+      "step": 1894
+    },
+    {
+      "epoch": 0.7364228116195473,
+      "grad_norm": 0.21691356599330902,
+      "learning_rate": 5.2938886726352664e-05,
+      "loss": 1.0648,
+      "step": 1895
+    },
+    {
+      "epoch": 0.7368114252404547,
+      "grad_norm": 0.19853095710277557,
+      "learning_rate": 5.286103542234333e-05,
+      "loss": 0.983,
+      "step": 1896
+    },
+    {
+      "epoch": 0.7372000388613621,
+      "grad_norm": 0.21836897730827332,
+      "learning_rate": 5.278318411833398e-05,
+      "loss": 1.0396,
+      "step": 1897
+    },
+    {
+      "epoch": 0.7375886524822695,
+      "grad_norm": 0.19596605002880096,
+      "learning_rate": 5.270533281432464e-05,
+      "loss": 0.9593,
+      "step": 1898
+    },
+    {
+      "epoch": 0.7379772661031769,
+      "grad_norm": 0.2141752541065216,
+      "learning_rate": 5.26274815103153e-05,
+      "loss": 1.0373,
+      "step": 1899
+    },
+    {
+      "epoch": 0.7383658797240843,
+      "grad_norm": 0.20552939176559448,
+      "learning_rate": 5.254963020630596e-05,
+      "loss": 1.0352,
+      "step": 1900
+    },
+    {
+      "epoch": 0.7387544933449918,
+      "grad_norm": 0.2095794975757599,
+      "learning_rate": 5.247177890229662e-05,
+      "loss": 1.0632,
+      "step": 1901
+    },
+    {
+      "epoch": 0.7391431069658991,
+      "grad_norm": 0.19894710183143616,
+      "learning_rate": 5.2393927598287276e-05,
+      "loss": 0.9886,
+      "step": 1902
+    },
+    {
+      "epoch": 0.7395317205868066,
+      "grad_norm": 0.22996319830417633,
+      "learning_rate": 5.231607629427793e-05,
+      "loss": 1.0826,
+      "step": 1903
+    },
+    {
+      "epoch": 0.739920334207714,
+      "grad_norm": 0.21416957676410675,
+      "learning_rate": 5.2238224990268593e-05,
+      "loss": 1.0161,
+      "step": 1904
+    },
+    {
+      "epoch": 0.7403089478286214,
+      "grad_norm": 0.21819345653057098,
+      "learning_rate": 5.2160373686259245e-05,
+      "loss": 1.0458,
+      "step": 1905
+    },
+    {
+      "epoch": 0.7406975614495288,
+      "grad_norm": 0.21327044069766998,
+      "learning_rate": 5.208252238224991e-05,
+      "loss": 1.0721,
+      "step": 1906
+    },
+    {
+      "epoch": 0.7410861750704362,
+      "grad_norm": 0.21436645090579987,
+      "learning_rate": 5.200467107824056e-05,
+      "loss": 1.0743,
+      "step": 1907
+    },
+    {
+      "epoch": 0.7414747886913436,
+      "grad_norm": 0.215640127658844,
+      "learning_rate": 5.1926819774231215e-05,
+      "loss": 1.0274,
+      "step": 1908
+    },
+    {
+      "epoch": 0.741863402312251,
+      "grad_norm": 0.2043589949607849,
+      "learning_rate": 5.184896847022188e-05,
+      "loss": 1.0618,
+      "step": 1909
+    },
+    {
+      "epoch": 0.7422520159331585,
+      "grad_norm": 0.2014230340719223,
+      "learning_rate": 5.177111716621253e-05,
+      "loss": 0.9892,
+      "step": 1910
+    },
+    {
+      "epoch": 0.7426406295540658,
+      "grad_norm": 0.19954468309879303,
+      "learning_rate": 5.16932658622032e-05,
+      "loss": 0.9815,
+      "step": 1911
+    },
+    {
+      "epoch": 0.7430292431749733,
+      "grad_norm": 0.23119708895683289,
+      "learning_rate": 5.161541455819385e-05,
+      "loss": 1.0783,
+      "step": 1912
+    },
+    {
+      "epoch": 0.7434178567958807,
+      "grad_norm": 0.20650482177734375,
+      "learning_rate": 5.153756325418451e-05,
+      "loss": 1.0162,
+      "step": 1913
+    },
+    {
+      "epoch": 0.7438064704167882,
+      "grad_norm": 0.20021970570087433,
+      "learning_rate": 5.145971195017517e-05,
+      "loss": 1.0062,
+      "step": 1914
+    },
+    {
+      "epoch": 0.7441950840376955,
+      "grad_norm": 0.23300811648368835,
+      "learning_rate": 5.138186064616583e-05,
+      "loss": 1.0049,
+      "step": 1915
+    },
+    {
+      "epoch": 0.7445836976586029,
+      "grad_norm": 0.23268327116966248,
+      "learning_rate": 5.130400934215648e-05,
+      "loss": 1.0138,
+      "step": 1916
+    },
+    {
+      "epoch": 0.7449723112795104,
+      "grad_norm": 0.20413407683372498,
+      "learning_rate": 5.1226158038147145e-05,
+      "loss": 0.9903,
+      "step": 1917
+    },
+    {
+      "epoch": 0.7453609249004177,
+      "grad_norm": 0.20714978873729706,
+      "learning_rate": 5.1148306734137797e-05,
+      "loss": 1.0374,
+      "step": 1918
+    },
+    {
+      "epoch": 0.7457495385213252,
+      "grad_norm": 0.2000850886106491,
+      "learning_rate": 5.107045543012846e-05,
+      "loss": 0.9885,
+      "step": 1919
+    },
+    {
+      "epoch": 0.7461381521422326,
+      "grad_norm": 0.2054719179868698,
+      "learning_rate": 5.0992604126119114e-05,
+      "loss": 1.0551,
+      "step": 1920
+    },
+    {
+      "epoch": 0.74652676576314,
+      "grad_norm": 0.2351357489824295,
+      "learning_rate": 5.0914752822109766e-05,
+      "loss": 1.0693,
+      "step": 1921
+    },
+    {
+      "epoch": 0.7469153793840474,
+      "grad_norm": 0.22370338439941406,
+      "learning_rate": 5.083690151810043e-05,
+      "loss": 0.9781,
+      "step": 1922
+    },
+    {
+      "epoch": 0.7473039930049549,
+      "grad_norm": 0.18734332919120789,
+      "learning_rate": 5.0759050214091084e-05,
+      "loss": 0.9329,
+      "step": 1923
+    },
+    {
+      "epoch": 0.7476926066258622,
+      "grad_norm": 0.22099906206130981,
+      "learning_rate": 5.068119891008175e-05,
+      "loss": 1.0498,
+      "step": 1924
+    },
+    {
+      "epoch": 0.7480812202467696,
+      "grad_norm": 0.20144490897655487,
+      "learning_rate": 5.06033476060724e-05,
+      "loss": 0.9865,
+      "step": 1925
+    },
+    {
+      "epoch": 0.7484698338676771,
+      "grad_norm": 0.21770039200782776,
+      "learning_rate": 5.052549630206306e-05,
+      "loss": 1.0867,
+      "step": 1926
+    },
+    {
+      "epoch": 0.7488584474885844,
+      "grad_norm": 0.19649921357631683,
+      "learning_rate": 5.044764499805372e-05,
+      "loss": 0.9887,
+      "step": 1927
+    },
+    {
+      "epoch": 0.7492470611094919,
+      "grad_norm": 0.1940620392560959,
+      "learning_rate": 5.036979369404438e-05,
+      "loss": 1.0073,
+      "step": 1928
+    },
+    {
+      "epoch": 0.7496356747303993,
+      "grad_norm": 0.20987650752067566,
+      "learning_rate": 5.0291942390035044e-05,
+      "loss": 1.046,
+      "step": 1929
+    },
+    {
+      "epoch": 0.7500242883513067,
+      "grad_norm": 0.2116398960351944,
+      "learning_rate": 5.0214091086025696e-05,
+      "loss": 1.0423,
+      "step": 1930
+    },
+    {
+      "epoch": 0.7504129019722141,
+      "grad_norm": 0.18996965885162354,
+      "learning_rate": 5.013623978201635e-05,
+      "loss": 0.9822,
+      "step": 1931
+    },
+    {
+      "epoch": 0.7508015155931216,
+      "grad_norm": 0.20942547917366028,
+      "learning_rate": 5.005838847800701e-05,
+      "loss": 1.0472,
+      "step": 1932
+    },
+    {
+      "epoch": 0.751190129214029,
+      "grad_norm": 0.19006839394569397,
+      "learning_rate": 4.9980537173997665e-05,
+      "loss": 0.993,
+      "step": 1933
+    },
+    {
+      "epoch": 0.7515787428349364,
+      "grad_norm": 0.21508941054344177,
+      "learning_rate": 4.9902685869988324e-05,
+      "loss": 1.0406,
+      "step": 1934
+    },
+    {
+      "epoch": 0.7519673564558438,
+      "grad_norm": 0.1989334225654602,
+      "learning_rate": 4.982483456597898e-05,
+      "loss": 0.9997,
+      "step": 1935
+    },
+    {
+      "epoch": 0.7523559700767511,
+      "grad_norm": 0.19993600249290466,
+      "learning_rate": 4.974698326196964e-05,
+      "loss": 1.0139,
+      "step": 1936
+    },
+    {
+      "epoch": 0.7527445836976586,
+      "grad_norm": 0.20927831530570984,
+      "learning_rate": 4.9669131957960294e-05,
+      "loss": 0.995,
+      "step": 1937
+    },
+    {
+      "epoch": 0.753133197318566,
+      "grad_norm": 0.20963850617408752,
+      "learning_rate": 4.959128065395095e-05,
+      "loss": 1.0678,
+      "step": 1938
+    },
+    {
+      "epoch": 0.7535218109394735,
+      "grad_norm": 0.19523034989833832,
+      "learning_rate": 4.951342934994161e-05,
+      "loss": 0.9883,
+      "step": 1939
+    },
+    {
+      "epoch": 0.7539104245603808,
+      "grad_norm": 0.21588142216205597,
+      "learning_rate": 4.943557804593227e-05,
+      "loss": 1.0398,
+      "step": 1940
+    },
+    {
+      "epoch": 0.7542990381812883,
+      "grad_norm": 0.19894704222679138,
+      "learning_rate": 4.935772674192293e-05,
+      "loss": 1.0125,
+      "step": 1941
+    },
+    {
+      "epoch": 0.7546876518021957,
+      "grad_norm": 0.2155168056488037,
+      "learning_rate": 4.927987543791359e-05,
+      "loss": 1.0447,
+      "step": 1942
+    },
+    {
+      "epoch": 0.7550762654231031,
+      "grad_norm": 0.212605819106102,
+      "learning_rate": 4.920202413390425e-05,
+      "loss": 1.077,
+      "step": 1943
+    },
+    {
+      "epoch": 0.7554648790440105,
+      "grad_norm": 0.2168148010969162,
+      "learning_rate": 4.9124172829894906e-05,
+      "loss": 1.0029,
+      "step": 1944
+    },
+    {
+      "epoch": 0.7558534926649179,
+      "grad_norm": 0.2020149528980255,
+      "learning_rate": 4.9046321525885565e-05,
+      "loss": 1.0684,
+      "step": 1945
+    },
+    {
+      "epoch": 0.7562421062858253,
+      "grad_norm": 0.21063408255577087,
+      "learning_rate": 4.8968470221876217e-05,
+      "loss": 1.0147,
+      "step": 1946
+    },
+    {
+      "epoch": 0.7566307199067327,
+      "grad_norm": 0.19599388539791107,
+      "learning_rate": 4.8890618917866875e-05,
+      "loss": 0.9719,
+      "step": 1947
+    },
+    {
+      "epoch": 0.7570193335276402,
+      "grad_norm": 0.2158602923154831,
+      "learning_rate": 4.8812767613857534e-05,
+      "loss": 1.0439,
+      "step": 1948
+    },
+    {
+      "epoch": 0.7574079471485475,
+      "grad_norm": 0.21013815701007843,
+      "learning_rate": 4.873491630984819e-05,
+      "loss": 1.0319,
+      "step": 1949
+    },
+    {
+      "epoch": 0.757796560769455,
+      "grad_norm": 0.2020798772573471,
+      "learning_rate": 4.8657065005838845e-05,
+      "loss": 1.0037,
+      "step": 1950
+    },
+    {
+      "epoch": 0.7581851743903624,
+      "grad_norm": 0.21202047169208527,
+      "learning_rate": 4.8579213701829504e-05,
+      "loss": 0.9823,
+      "step": 1951
+    },
+    {
+      "epoch": 0.7585737880112698,
+      "grad_norm": 0.20750083029270172,
+      "learning_rate": 4.850136239782016e-05,
+      "loss": 1.0073,
+      "step": 1952
+    },
+    {
+      "epoch": 0.7589624016321772,
+      "grad_norm": 0.20938372611999512,
+      "learning_rate": 4.842351109381083e-05,
+      "loss": 1.0326,
+      "step": 1953
+    },
+    {
+      "epoch": 0.7593510152530846,
+      "grad_norm": 0.21984544396400452,
+      "learning_rate": 4.834565978980149e-05,
+      "loss": 1.0363,
+      "step": 1954
+    },
+    {
+      "epoch": 0.759739628873992,
+      "grad_norm": 0.20306189358234406,
+      "learning_rate": 4.826780848579214e-05,
+      "loss": 1.0374,
+      "step": 1955
+    },
+    {
+      "epoch": 0.7601282424948994,
+      "grad_norm": 0.20631705224514008,
+      "learning_rate": 4.81899571817828e-05,
+      "loss": 1.0985,
+      "step": 1956
+    },
+    {
+      "epoch": 0.7605168561158069,
+      "grad_norm": 0.22092190384864807,
+      "learning_rate": 4.811210587777346e-05,
+      "loss": 1.0216,
+      "step": 1957
+    },
+    {
+      "epoch": 0.7609054697367142,
+      "grad_norm": 0.21419481933116913,
+      "learning_rate": 4.8034254573764116e-05,
+      "loss": 1.0327,
+      "step": 1958
+    },
+    {
+      "epoch": 0.7612940833576217,
+      "grad_norm": 0.1954476237297058,
+      "learning_rate": 4.795640326975477e-05,
+      "loss": 1.0139,
+      "step": 1959
+    },
+    {
+      "epoch": 0.7616826969785291,
+      "grad_norm": 0.21092113852500916,
+      "learning_rate": 4.7878551965745427e-05,
+      "loss": 1.0934,
+      "step": 1960
+    },
+    {
+      "epoch": 0.7620713105994366,
+      "grad_norm": 0.1998988837003708,
+      "learning_rate": 4.7800700661736085e-05,
+      "loss": 0.9782,
+      "step": 1961
+    },
+    {
+      "epoch": 0.7624599242203439,
+      "grad_norm": 0.20410674810409546,
+      "learning_rate": 4.7722849357726744e-05,
+      "loss": 1.0186,
+      "step": 1962
+    },
+    {
+      "epoch": 0.7628485378412513,
+      "grad_norm": 0.25312289595603943,
+      "learning_rate": 4.76449980537174e-05,
+      "loss": 1.0103,
+      "step": 1963
+    },
+    {
+      "epoch": 0.7632371514621588,
+      "grad_norm": 0.20648318529129028,
+      "learning_rate": 4.7567146749708055e-05,
+      "loss": 1.0314,
+      "step": 1964
+    },
+    {
+      "epoch": 0.7636257650830661,
+      "grad_norm": 0.20513702929019928,
+      "learning_rate": 4.7489295445698714e-05,
+      "loss": 0.981,
+      "step": 1965
+    },
+    {
+      "epoch": 0.7640143787039736,
+      "grad_norm": 0.20063039660453796,
+      "learning_rate": 4.741144414168938e-05,
+      "loss": 1.0218,
+      "step": 1966
+    },
+    {
+      "epoch": 0.764402992324881,
+      "grad_norm": 0.20328521728515625,
+      "learning_rate": 4.733359283768004e-05,
+      "loss": 1.0614,
+      "step": 1967
+    },
+    {
+      "epoch": 0.7647916059457884,
+      "grad_norm": 0.2209623008966446,
+      "learning_rate": 4.725574153367069e-05,
+      "loss": 1.0478,
+      "step": 1968
+    },
+    {
+      "epoch": 0.7651802195666958,
+      "grad_norm": 0.2023559957742691,
+      "learning_rate": 4.717789022966135e-05,
+      "loss": 1.0455,
+      "step": 1969
+    },
+    {
+      "epoch": 0.7655688331876033,
+      "grad_norm": 0.20461297035217285,
+      "learning_rate": 4.710003892565201e-05,
+      "loss": 0.9427,
+      "step": 1970
+    },
+    {
+      "epoch": 0.7659574468085106,
+      "grad_norm": 0.2108335793018341,
+      "learning_rate": 4.702218762164267e-05,
+      "loss": 1.0344,
+      "step": 1971
+    },
+    {
+      "epoch": 0.766346060429418,
+      "grad_norm": 0.20883473753929138,
+      "learning_rate": 4.6944336317633326e-05,
+      "loss": 1.0336,
+      "step": 1972
+    },
+    {
+      "epoch": 0.7667346740503255,
+      "grad_norm": 0.20144741237163544,
+      "learning_rate": 4.686648501362398e-05,
+      "loss": 1.0101,
+      "step": 1973
+    },
+    {
+      "epoch": 0.7671232876712328,
+      "grad_norm": 0.21269328892230988,
+      "learning_rate": 4.6788633709614637e-05,
+      "loss": 0.9989,
+      "step": 1974
+    },
+    {
+      "epoch": 0.7675119012921403,
+      "grad_norm": 0.20673738420009613,
+      "learning_rate": 4.6710782405605295e-05,
+      "loss": 1.0235,
+      "step": 1975
+    },
+    {
+      "epoch": 0.7679005149130477,
+      "grad_norm": 0.1966594159603119,
+      "learning_rate": 4.6632931101595954e-05,
+      "loss": 1.0081,
+      "step": 1976
+    },
+    {
+      "epoch": 0.7682891285339551,
+      "grad_norm": 0.22186829149723053,
+      "learning_rate": 4.6555079797586606e-05,
+      "loss": 1.0081,
+      "step": 1977
+    },
+    {
+      "epoch": 0.7686777421548625,
+      "grad_norm": 0.20602557063102722,
+      "learning_rate": 4.6477228493577265e-05,
+      "loss": 1.0381,
+      "step": 1978
+    },
+    {
+      "epoch": 0.76906635577577,
+      "grad_norm": 0.19581305980682373,
+      "learning_rate": 4.639937718956793e-05,
+      "loss": 1.0196,
+      "step": 1979
+    },
+    {
+      "epoch": 0.7694549693966773,
+      "grad_norm": 0.20162086188793182,
+      "learning_rate": 4.632152588555859e-05,
+      "loss": 1.0168,
+      "step": 1980
+    },
+    {
+      "epoch": 0.7698435830175848,
+      "grad_norm": 0.21967145800590515,
+      "learning_rate": 4.624367458154925e-05,
+      "loss": 1.0339,
+      "step": 1981
+    },
+    {
+      "epoch": 0.7702321966384922,
+      "grad_norm": 0.20245851576328278,
+      "learning_rate": 4.61658232775399e-05,
+      "loss": 1.0349,
+      "step": 1982
+    },
+    {
+      "epoch": 0.7706208102593995,
+      "grad_norm": 0.20409934222698212,
+      "learning_rate": 4.608797197353056e-05,
+      "loss": 1.0296,
+      "step": 1983
+    },
+    {
+      "epoch": 0.771009423880307,
+      "grad_norm": 0.19757163524627686,
+      "learning_rate": 4.601012066952122e-05,
+      "loss": 1.0443,
+      "step": 1984
+    },
+    {
+      "epoch": 0.7713980375012144,
+      "grad_norm": 0.20038221776485443,
+      "learning_rate": 4.593226936551188e-05,
+      "loss": 1.0431,
+      "step": 1985
+    },
+    {
+      "epoch": 0.7717866511221219,
+      "grad_norm": 0.2112458199262619,
+      "learning_rate": 4.585441806150253e-05,
+      "loss": 1.0553,
+      "step": 1986
+    },
+    {
+      "epoch": 0.7721752647430292,
+      "grad_norm": 0.21868042647838593,
+      "learning_rate": 4.577656675749319e-05,
+      "loss": 1.0061,
+      "step": 1987
+    },
+    {
+      "epoch": 0.7725638783639367,
+      "grad_norm": 0.22484582662582397,
+      "learning_rate": 4.5698715453483846e-05,
+      "loss": 1.0831,
+      "step": 1988
+    },
+    {
+      "epoch": 0.7729524919848441,
+      "grad_norm": 0.20265011489391327,
+      "learning_rate": 4.5620864149474505e-05,
+      "loss": 1.0206,
+      "step": 1989
+    },
+    {
+      "epoch": 0.7733411056057515,
+      "grad_norm": 0.2052810937166214,
+      "learning_rate": 4.5543012845465164e-05,
+      "loss": 1.0366,
+      "step": 1990
+    },
+    {
+      "epoch": 0.7737297192266589,
+      "grad_norm": 0.21016088128089905,
+      "learning_rate": 4.546516154145582e-05,
+      "loss": 0.9963,
+      "step": 1991
+    },
+    {
+      "epoch": 0.7741183328475663,
+      "grad_norm": 0.19719412922859192,
+      "learning_rate": 4.538731023744648e-05,
+      "loss": 0.9853,
+      "step": 1992
+    },
+    {
+      "epoch": 0.7745069464684737,
+      "grad_norm": 0.20447245240211487,
+      "learning_rate": 4.530945893343714e-05,
+      "loss": 0.9977,
+      "step": 1993
+    },
+    {
+      "epoch": 0.7748955600893811,
+      "grad_norm": 0.21796588599681854,
+      "learning_rate": 4.52316076294278e-05,
+      "loss": 1.0949,
+      "step": 1994
+    },
+    {
+      "epoch": 0.7752841737102886,
+      "grad_norm": 0.2041284590959549,
+      "learning_rate": 4.515375632541845e-05,
+      "loss": 1.0034,
+      "step": 1995
+    },
+    {
+      "epoch": 0.7756727873311959,
+      "grad_norm": 0.21134726703166962,
+      "learning_rate": 4.507590502140911e-05,
+      "loss": 1.0076,
+      "step": 1996
+    },
+    {
+      "epoch": 0.7760614009521034,
+      "grad_norm": 0.20730996131896973,
+      "learning_rate": 4.499805371739977e-05,
+      "loss": 1.0456,
+      "step": 1997
+    },
+    {
+      "epoch": 0.7764500145730108,
+      "grad_norm": 0.22316931188106537,
+      "learning_rate": 4.492020241339043e-05,
+      "loss": 0.9418,
+      "step": 1998
+    },
+    {
+      "epoch": 0.7768386281939182,
+      "grad_norm": 0.21494819223880768,
+      "learning_rate": 4.484235110938109e-05,
+      "loss": 1.0597,
+      "step": 1999
+    },
+    {
+      "epoch": 0.7772272418148256,
+      "grad_norm": 0.20344491302967072,
+      "learning_rate": 4.476449980537174e-05,
+      "loss": 0.9749,
+      "step": 2000
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 2574,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.7079644445254222e+19,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/outputs/checkpoint-2500/README.md b/outputs/checkpoint-2500/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3abf956c074d00f34a12693c8d6da9738211d7c7
--- /dev/null
+++ b/outputs/checkpoint-2500/README.md
@@ -0,0 +1,209 @@
+---
+base_model: unsloth/gpt-oss-20b-unsloth-bnb-4bit
+library_name: peft
+tags:
+- base_model:adapter:unsloth/gpt-oss-20b-unsloth-bnb-4bit
+- lora
+- sft
+- transformers
+- trl
+- unsloth
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.17.1
\ No newline at end of file
diff --git a/outputs/checkpoint-2500/adapter_config.json b/outputs/checkpoint-2500/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..e285b9b6e018b5b9f23736d6699eb1a4267764e7
--- /dev/null
+++ b/outputs/checkpoint-2500/adapter_config.json
@@ -0,0 +1,45 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": {
+    "base_model_class": "GptOssForCausalLM",
+    "parent_library": "transformers.models.gpt_oss.modeling_gpt_oss"
+  },
+  "base_model_name_or_path": "unsloth/gpt-oss-20b-unsloth-bnb-4bit",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "o_proj",
+    "v_proj",
+    "up_proj",
+    "down_proj",
+    "gate_proj",
+    "k_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": null,
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/outputs/checkpoint-2500/chat_template.jinja b/outputs/checkpoint-2500/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..a3650f886e98b2834c25727759c8e0ab8495f316
--- /dev/null
+++ b/outputs/checkpoint-2500/chat_template.jinja
@@ -0,0 +1,315 @@
+{# Copyright 2025-present Unsloth. Apache 2.0 License. Unsloth chat template fixes. Edited from ggml-org & OpenAI #}
+{#-
+  In addition to the normal inputs of `messages` and `tools`, this template also accepts the
+  following kwargs:
+  - "builtin_tools": A list, can contain "browser" and/or "python".
+  - "model_identity": A string that optionally describes the model identity.
+  - "reasoning_effort": A string that describes the reasoning effort, defaults to "medium".
+ #}
+
+{#- Tool Definition Rendering ============================================== #}
+{%- macro render_typescript_type(param_spec, required_params, is_nullable=false) -%}{#- Renders the schema-style param_spec as a TypeScript-like type string, recursing for array items, oneOf variants and object properties. required_params is only forwarded to recursive calls and is_nullable is not read in this body. #}
+    {%- if param_spec.type == "array" -%}
+        {%- if param_spec['items'] -%}
+            {%- if param_spec['items']['type'] == "string" -%}
+                {{- "string[]" }}
+            {%- elif param_spec['items']['type'] == "number" -%}
+                {{- "number[]" }}
+            {%- elif param_spec['items']['type'] == "integer" -%}
+                {{- "number[]" }}
+            {%- elif param_spec['items']['type'] == "boolean" -%}
+                {{- "boolean[]" }}
+            {%- else -%}
+                {%- set inner_type = render_typescript_type(param_spec['items'], required_params) -%}
+                {%- if inner_type == "object | object" or inner_type|length > 50 -%}
+                    {{- "any[]" }}
+                {%- else -%}
+                    {{- inner_type + "[]" }}
+                {%- endif -%}
+            {%- endif -%}
+            {%- if param_spec.nullable -%}
+                {{- " | null" }}
+            {%- endif -%}
+        {%- else -%}
+            {{- "any[]" }}
+            {%- if param_spec.nullable -%}
+                {{- " | null" }}
+            {%- endif -%}
+        {%- endif -%}
+    {%- elif param_spec.type is defined and param_spec.type is iterable and param_spec.type is not string and param_spec.type is not mapping and param_spec.type[0] is defined -%}
+        {#- Handle array of types like ["object", "object"] from Union[dict, list] #}
+        {%- if param_spec.type | length > 1 -%}
+            {{- param_spec.type | join(" | ") }}
+        {%- else -%}
+            {{- param_spec.type[0] }}
+        {%- endif -%}
+    {%- elif param_spec.oneOf -%}
+        {#- Handle oneOf schemas - check for complex unions and fallback to any. The flag lives in a namespace because a plain set inside the for loop would not be visible once the loop ends. #}
+        {%- set oneof_state = namespace(has_object_variants=false) -%}
+        {%- for variant in param_spec.oneOf -%}
+            {%- if variant.type == "object" -%}
+                {%- set oneof_state.has_object_variants = true -%}
+            {%- endif -%}
+        {%- endfor -%}
+        {%- if oneof_state.has_object_variants and param_spec.oneOf|length > 1 -%}
+            {{- "any" }}
+        {%- else -%}
+            {%- for variant in param_spec.oneOf -%}
+                {{- render_typescript_type(variant, required_params) -}}
+                {%- if variant.description %}
+                    {{- "// " + variant.description }}
+                {%- endif -%}
+                {%- if variant.default is defined %}
+                    {{ "// default: " + variant.default|tojson }}
+                {%- endif -%}
+                {%- if not loop.last %}
+                    {{- " | " }}
+                {% endif -%}
+            {%- endfor -%}
+        {%- endif -%}
+    {%- elif param_spec.type == "string" -%}
+        {%- if param_spec.enum -%}
+            {{- '"' + param_spec.enum|join('" | "') + '"' -}}
+        {%- else -%}
+            {{- "string" }}
+            {%- if param_spec.nullable %}
+                {{- " | null" }}
+            {%- endif -%}
+        {%- endif -%}
+    {%- elif param_spec.type == "number" -%}
+        {{- "number" }}
+    {%- elif param_spec.type == "integer" -%}
+        {{- "number" }}
+    {%- elif param_spec.type == "boolean" -%}
+        {{- "boolean" }}
+
+    {%- elif param_spec.type == "object" -%}
+        {%- if param_spec.properties -%}
+            {{- "{\n" }}
+            {%- for prop_name, prop_spec in param_spec.properties.items() -%}
+                {{- prop_name -}}
+                {%- if prop_name not in (param_spec.required or []) -%}
+                    {{- "?" }}
+                {%- endif -%}
+                {{- ": " }}
+                {{ render_typescript_type(prop_spec, param_spec.required or []) }}
+                {%- if not loop.last -%}
+                    {{-", " }}
+                {%- endif -%}
+            {%- endfor -%}
+            {{- "}" }}
+        {%- else -%}
+            {{- "object" }}
+        {%- endif -%}
+    {%- else -%}
+        {{- "any" }}
+    {%- endif -%}
+{%- endmacro -%}
+
+{%- macro render_tool_namespace(namespace_name, tools) -%}{#- Emits a heading and a namespace block named namespace_name containing one type entry per element of tools; each element is read through its .function attribute. NOTE(review): tool.description and param_spec.description are concatenated with no fallback - presumably callers always supply them; confirm. #}
+    {{- "## " + namespace_name + "\n\n" }}
+    {{- "namespace " + namespace_name + " {\n\n" }}
+    {%- for tool in tools %}
+        {%- set tool = tool.function %}
+        {{- "// " + tool.description + "\n" }}
+        {{- "type "+ tool.name + " = " }}
+        {%- if tool.parameters and tool.parameters.properties -%}
+            {{- "(_: " }}
+            {{- "{\n" }}
+            {%- for param_name, param_spec in tool.parameters.properties.items() %}
+                {{- "// " + param_spec.description + "\n" }}
+                {{- param_name }}
+                {%- if param_name not in (tool.parameters.required or []) -%}
+                    {{- "?" }}
+                {%- endif -%}
+                {{- ": " }}
+                {{- render_typescript_type(param_spec, tool.parameters.required or []) }}
+                {%- if param_spec.default is defined -%}
+                    {%- if param_spec.enum %}
+                        {{- ", // default: " + param_spec.default }}
+                    {%- elif param_spec.oneOf %}
+                        {{- "// default: " + param_spec.default }}
+                    {%- else %}
+                        {{- ", // default: " + param_spec.default|tojson }}
+                    {%- endif -%}
+                {%- endif -%}
+                {%- if not loop.last %}
+                    {{- ",\n" }}
+                {%- else %}
+                    {{- "\n" }}
+                {%- endif -%}
+            {%- endfor %}
+            {{- "}) => any;\n\n" }}
+        {%- else -%}
+            {{- "() => any;\n\n" }}
+        {%- endif -%}
+    {%- endfor %}
+    {{- "} // namespace " + namespace_name }}
+{%- endmacro -%}
+
+{%- macro render_builtin_tools(browser_tool, python_tool) -%}{#- Emits the fixed browser and/or python tool descriptions; each section is printed only when its corresponding argument is truthy. #}
+    {%- if browser_tool %}
+        {{- "## browser\n\n" }}
+        {{- "// Tool for browsing.\n" }}
+        {{- "// The `cursor` appears in brackets before each browsing display: `[{cursor}]`.\n" }}
+        {{- "// Cite information from the tool using the following format:\n" }}
+        {{- "// `【{cursor}†L{line_start}(-L{line_end})?】`, for example: `【6†L9-L11】` or `【8†L3】`.\n" }}
+        {{- "// Do not quote more than 10 words directly from the tool output.\n" }}
+        {{- "// sources=web (default: web)\n" }}
+        {{- "namespace browser {\n\n" }}
+        {{- "// Searches for information related to `query` and displays `topn` results.\n" }}
+        {{- "type search = (_: {\n" }}
+        {{- "query: string,\n" }}
+        {{- "topn?: number, // default: 10\n" }}
+        {{- "source?: string,\n" }}
+        {{- "}) => any;\n\n" }}
+        {{- "// Opens the link `id` from the page indicated by `cursor` starting at line number `loc`, showing `num_lines` lines.\n" }}
+        {{- "// Valid link ids are displayed with the formatting: `【{id}†.*】`.\n" }}
+        {{- "// If `cursor` is not provided, the most recent page is implied.\n" }}
+        {{- "// If `id` is a string, it is treated as a fully qualified URL associated with `source`.\n" }}
+        {{- "// If `loc` is not provided, the viewport will be positioned at the beginning of the document or centered on the most relevant passage, if available.\n" }}
+        {{- "// Use this function without `id` to scroll to a new location of an opened page.\n" }}
+        {{- "type open = (_: {\n" }}
+        {{- "id?: number | string, // default: -1\n" }}
+        {{- "cursor?: number, // default: -1\n" }}
+        {{- "loc?: number, // default: -1\n" }}
+        {{- "num_lines?: number, // default: -1\n" }}
+        {{- "view_source?: boolean, // default: false\n" }}
+        {{- "source?: string,\n" }}
+        {{- "}) => any;\n\n" }}
+        {{- "// Finds exact matches of `pattern` in the current page, or the page given by `cursor`.\n" }}
+        {{- "type find = (_: {\n" }}
+        {{- "pattern: string,\n" }}
+        {{- "cursor?: number, // default: -1\n" }}
+        {{- "}) => any;\n\n" }}
+        {{- "} // namespace browser\n\n" }}
+    {%- endif -%}
+
+    {%- if python_tool %}
+        {{- "## python\n\n" }}
+        {{- "Use this tool to execute Python code in your chain of thought. The code will not be shown to the user. This tool should be used for internal reasoning, but not for code that is intended to be visible to the user (e.g. when creating plots, tables, or files).\n\n" }}
+        {{- "When you send a message containing Python code to python, it will be executed in a stateful Jupyter notebook environment. python will respond with the output of the execution or time out after 120.0 seconds. The drive at '/mnt/data' can be used to save and persist user files. Internet access for this session is UNKNOWN. Depends on the cluster.\n\n" }}
+    {%- endif -%}
+{%- endmacro -%}
+
+{#- System Message Construction ============================================ #}
+{%- macro build_system_message() -%}{#- Builds the system message body: identity line, knowledge cutoff, current date, reasoning effort, optional builtin tools section and the valid-channels line. Reads the template variables model_identity, reasoning_effort, builtin_tools and tools. #}
+    {%- if model_identity is not defined %}
+        {{- "You are ChatGPT, a large language model trained by OpenAI.\n" -}}
+    {%- else %}
+        {{- model_identity }}{#- NOTE(review): unlike the default branch above, no newline is appended after a caller-supplied identity - confirm callers end it with their own newline. #}
+    {%- endif %}
+    {{- "Knowledge cutoff: 2024-06\n" }}
+    {{- "Current date: " + strftime_now("%Y-%m-%d") + "\n\n" }}
+    {%- if reasoning_effort is not defined %}
+        {%- set reasoning_effort = "medium" %}
+    {%- endif %}
+    {{- "Reasoning: " + reasoning_effort + "\n\n" }}
+    {%- if builtin_tools is defined %}
+        {{- "# Tools\n\n" }}
+        {%- set available_builtin_tools = namespace(browser=false, python=false) %}
+        {%- for tool in builtin_tools %}
+            {%- if tool == "browser" %}
+                {%- set available_builtin_tools.browser = true %}
+            {%- elif tool == "python" %}
+                {%- set available_builtin_tools.python = true %}
+            {%- endif %}
+        {%- endfor %}
+        {{- render_builtin_tools(available_builtin_tools.browser, available_builtin_tools.python) }}
+    {%- endif -%}
+    {{- "# Valid channels: analysis, commentary, final. Channel must be included for every message." }}
+    {%- if tools is defined -%}{#- NOTE(review): this tests only that tools is defined, whereas the developer-message section tests its truthiness - confirm the difference is intended. #}
+        {{- "\nCalls to these tools must go to the commentary channel: 'functions'." }}
+    {%- endif -%}
+{%- endmacro -%}
+
+{#- Main Template Logic ================================================= #}
+{#- Renders, in order: the system message, an optional developer message, each conversation message, and an optional generation prompt. No defaults are set at this level; reasoning_effort is defaulted inside build_system_message. #}
+
+{#- Render system message #}
+{{- "<|start|>system<|message|>" }}
+{{- build_system_message() }}
+{{- "<|end|>" }}
+
+{#- Extract developer message: a leading "developer" or "system" message supplies the instructions text and is excluded from the message loop. NOTE(review): messages[0] is indexed unguarded - presumably messages is never empty; confirm. #}
+{%- if messages[0].role == "developer" or messages[0].role == "system" %}
+    {%- set developer_message = messages[0].content %}
+    {%- set loop_messages = messages[1:] %}
+{%- else %}
+    {%- set developer_message = "" %}
+    {%- set loop_messages = messages %}
+{%- endif %}
+
+{#- Render developer message only when there is instructions text or a truthy tools value #}
+{%- if developer_message or tools %}
+    {{- "<|start|>developer<|message|>" }}
+    {%- if developer_message %}
+        {{- "# Instructions\n\n" }}
+        {{- developer_message }}
+    {%- endif %}
+    {%- if tools -%}
+        {{- "\n\n" }}
+        {{- "# Tools\n\n" }}
+        {{- render_tool_namespace("functions", tools) }}
+    {%- endif -%}
+    {{- "<|end|>" }}
+{%- endif %}
+
+{#- Render messages; last_tool_call is a namespace object so the tool name assigned inside the loop is still readable by later iterations #}
+{%- set last_tool_call = namespace(name=none) %}
+{%- for message in loop_messages -%}
+    {#- At this point only assistant/user/tool messages should remain #}
+    {%- if message.role == 'assistant' -%}
+        {%- if "tool_calls" in message %}
+            {#- We assume max 1 tool call per message, and so we infer the tool call name #}
+            {#- in "tool" messages from the most recent assistant tool call name #}
+            {%- set tool_call = message.tool_calls[0] %}
+            {%- if tool_call.function %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {%- if message.content %}
+                {{- "<|start|>assistant<|channel|>analysis<|message|>" + message.content + "<|end|>" }}
+            {%- endif %}
+            {{- "<|start|>assistant to=" }}
+            {{- "functions." + tool_call.name + "<|channel|>commentary json<|message|>" }}
+            {{- tool_call.arguments|tojson }}
+            {{- "<|call|>" }}
+            {%- set last_tool_call.name = tool_call.name %}
+        {%- elif "thinking" in message and loop.last and not add_generation_prompt %}
+            {#- Only render the CoT if the final turn is an assistant turn and add_generation_prompt is false #}
+            {#- This is a situation that should only occur in training, never in inference. #}
+            {{- "<|start|>assistant<|channel|>analysis<|message|>" + message.thinking + "<|end|>" }}
+            {#- <|return|> indicates the end of generation, but <|end|> does not #}
+            {#- <|return|> should never be an input to the model, but we include it as the final token #}
+            {#- when training, so the model learns to emit it. #}
+            {{- "<|start|>assistant<|channel|>final<|message|>" + message.content + "<|return|>" }}
+            {%- set last_tool_call.name = none %}
+        {%- elif "thinking" in message %}
+            {#- CoT is dropped during all previous turns, so we never render it for inference #}
+            {{- "<|start|>assistant<|channel|>final<|message|>" + message.content + "<|end|>" }}
+            {%- set last_tool_call.name = none %}
+        {%- elif loop.last and not add_generation_prompt %}
+            {#- <|return|> indicates the end of generation, but <|end|> does not #}
+            {#- <|return|> should never be an input to the model, but we include it as the final token #}
+            {#- when training, so the model learns to emit it. #}
+            {{- "<|start|>assistant<|message|>" + message.content + "<|return|>" }}
+        {%- else %}
+            {{- "<|start|>assistant<|message|>" + message.content + "<|end|>" }}
+            {%- set last_tool_call.name = none %}
+        {%- endif %}
+    {%- elif message.role == 'tool' -%}
+        {%- if last_tool_call.name is none %}
+            {{- raise_exception("Message has tool role, but there was no previous assistant message with a tool call!") }}
+        {%- endif %}
+        {{- "<|start|>functions." + last_tool_call.name }}
+        {{- " to=assistant<|channel|>commentary<|message|>" + message.content|tojson + "<|end|>" }}
+    {%- else -%}
+        {{- "<|start|>user<|message|>" + message.content + "<|end|>" }}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Generation prompt #}
+{%- if add_generation_prompt -%}
+<|start|>assistant
+{%- endif -%}
+{# Copyright 2025-present Unsloth. Apache 2.0 License. Unsloth chat template fixes. Edited from ggml-org & OpenAI #}
\ No newline at end of file
diff --git a/outputs/checkpoint-2500/optimizer.pt b/outputs/checkpoint-2500/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..b35ff1d3f5514a357050a8186bbc57a31ead7aff
--- /dev/null
+++ b/outputs/checkpoint-2500/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e7e45070f968f70d7a53d726615310d86734504e3bce33d45c6aeee13b2a6a00
+size 16894883
diff --git a/outputs/checkpoint-2500/rng_state.pth b/outputs/checkpoint-2500/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..3ef66339b9befa098183fd5d69faed6838e526b0
--- /dev/null
+++ b/outputs/checkpoint-2500/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f1d565802a8e26c4e8a31328752b7a7fdc186d9401aa008e65697d0ad8c22e33
+size 14645
diff --git a/outputs/checkpoint-2500/special_tokens_map.json b/outputs/checkpoint-2500/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..6fba18753f4d09dbb8fcdf1482daff36b963d639
--- /dev/null
+++ b/outputs/checkpoint-2500/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+  "bos_token": {
+    "content": "<|startoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|return|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|reserved_200017|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/outputs/checkpoint-2500/tokenizer.json b/outputs/checkpoint-2500/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..6ec3ef1795cbbda6b7cb7d1f114919cbe3fdd647
--- /dev/null
+++ b/outputs/checkpoint-2500/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0614fe83cadab421296e664e1f48f4261fa8fef6e03e63bb75c20f38e37d07d3
+size 27868174
diff --git a/outputs/checkpoint-2500/tokenizer_config.json b/outputs/checkpoint-2500/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..482ae30d27a74c38d2228e69dd37c529fc485a45
--- /dev/null
+++ b/outputs/checkpoint-2500/tokenizer_config.json
@@ -0,0 +1,185 @@
+{
+  "added_tokens_decoder": {
+    "199998": {
+      "content": "<|startoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "199999": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200000": {
+      "content": "<|reserved_200000|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200001": {
+      "content": "<|reserved_200001|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200002": {
+      "content": "<|return|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200003": {
+      "content": "<|constrain|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200004": {
+      "content": "<|reserved_200004|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200005": {
+      "content": "<|channel|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200006": {
+      "content": "<|start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200007": {
+      "content": "<|end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200008": {
+      "content": "<|message|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200009": {
+      "content": "<|reserved_200009|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200010": {
+      "content": "<|reserved_200010|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200011": {
+      "content": "<|reserved_200011|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200012": {
+      "content": "<|call|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200013": {
+      "content": "<|reserved_200013|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200014": {
+      "content": "<|reserved_200014|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200015": {
+      "content": "<|reserved_200015|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200016": {
+      "content": "<|reserved_200016|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200017": {
+      "content": "<|reserved_200017|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200018": {
+      "content": "<|endofprompt|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<|startoftext|>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|return|>",
+  "extra_special_tokens": {},
+  "model_input_names": [
+    "input_ids",
+    "attention_mask"
+  ],
+  "model_max_length": 131072,
+  "pad_token": "<|reserved_200017|>",
+  "padding_side": "right",
+  "tokenizer_class": "PreTrainedTokenizerFast",
+  "unk_token": null
+}
diff --git a/outputs/checkpoint-2500/trainer_state.json b/outputs/checkpoint-2500/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..ec05f859eef78c5a2160e814e1eb1c901377dfec
--- /dev/null
+++ b/outputs/checkpoint-2500/trainer_state.json
@@ -0,0 +1,17534 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9715340522685321,
+  "eval_steps": 500,
+  "global_step": 2500,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0032,
+      "grad_norm": 13.684800148010254,
+      "learning_rate": 0.0,
+      "loss": 2.3276,
+      "step": 1
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 13.660787582397461,
+      "learning_rate": 4e-05,
+      "loss": 2.2792,
+      "step": 2
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 13.35280704498291,
+      "learning_rate": 8e-05,
+      "loss": 2.4151,
+      "step": 3
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 6.15027379989624,
+      "learning_rate": 0.00012,
+      "loss": 1.7812,
+      "step": 4
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 1.3168226480484009,
+      "learning_rate": 0.00016,
+      "loss": 1.4536,
+      "step": 5
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.9872580170631409,
+      "learning_rate": 0.0002,
+      "loss": 1.4171,
+      "step": 6
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.7496100664138794,
+      "learning_rate": 0.00019935064935064936,
+      "loss": 1.4168,
+      "step": 7
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.7376005053520203,
+      "learning_rate": 0.00019870129870129872,
+      "loss": 1.3659,
+      "step": 8
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.5281137824058533,
+      "learning_rate": 0.00019805194805194807,
+      "loss": 1.2566,
+      "step": 9
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.5485746264457703,
+      "learning_rate": 0.00019740259740259742,
+      "loss": 1.3761,
+      "step": 10
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.5506592392921448,
+      "learning_rate": 0.00019675324675324675,
+      "loss": 1.3327,
+      "step": 11
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.49382686614990234,
+      "learning_rate": 0.00019610389610389613,
+      "loss": 1.3727,
+      "step": 12
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.36203011870384216,
+      "learning_rate": 0.00019545454545454548,
+      "loss": 1.1515,
+      "step": 13
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.3528599739074707,
+      "learning_rate": 0.0001948051948051948,
+      "loss": 1.2636,
+      "step": 14
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.31244418025016785,
+      "learning_rate": 0.00019415584415584416,
+      "loss": 1.1873,
+      "step": 15
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.3379523754119873,
+      "learning_rate": 0.00019350649350649354,
+      "loss": 1.2657,
+      "step": 16
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.3025083839893341,
+      "learning_rate": 0.00019285714285714286,
+      "loss": 1.2846,
+      "step": 17
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.2560190260410309,
+      "learning_rate": 0.00019220779220779222,
+      "loss": 1.1587,
+      "step": 18
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.2554129958152771,
+      "learning_rate": 0.00019155844155844157,
+      "loss": 1.2812,
+      "step": 19
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.22662702202796936,
+      "learning_rate": 0.00019090909090909092,
+      "loss": 1.1664,
+      "step": 20
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.2515714168548584,
+      "learning_rate": 0.00019025974025974027,
+      "loss": 1.2177,
+      "step": 21
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.24396637082099915,
+      "learning_rate": 0.00018961038961038963,
+      "loss": 1.2053,
+      "step": 22
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.24488303065299988,
+      "learning_rate": 0.00018896103896103895,
+      "loss": 1.2074,
+      "step": 23
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.2168620079755783,
+      "learning_rate": 0.00018831168831168833,
+      "loss": 1.1284,
+      "step": 24
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.24021224677562714,
+      "learning_rate": 0.00018766233766233769,
+      "loss": 1.2169,
+      "step": 25
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.20057056844234467,
+      "learning_rate": 0.000187012987012987,
+      "loss": 1.1031,
+      "step": 26
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.19900795817375183,
+      "learning_rate": 0.00018636363636363636,
+      "loss": 1.1004,
+      "step": 27
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.2019268423318863,
+      "learning_rate": 0.00018571428571428572,
+      "loss": 1.1476,
+      "step": 28
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.1996479034423828,
+      "learning_rate": 0.00018506493506493507,
+      "loss": 1.1455,
+      "step": 29
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.25262022018432617,
+      "learning_rate": 0.00018441558441558442,
+      "loss": 1.1025,
+      "step": 30
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.225438192486763,
+      "learning_rate": 0.00018376623376623378,
+      "loss": 1.1954,
+      "step": 31
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.17834505438804626,
+      "learning_rate": 0.00018311688311688313,
+      "loss": 1.0934,
+      "step": 32
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.20071206986904144,
+      "learning_rate": 0.00018246753246753248,
+      "loss": 1.0488,
+      "step": 33
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.1920139640569687,
+      "learning_rate": 0.00018181818181818183,
+      "loss": 1.123,
+      "step": 34
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.18714852631092072,
+      "learning_rate": 0.0001811688311688312,
+      "loss": 1.0798,
+      "step": 35
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.18315713107585907,
+      "learning_rate": 0.00018051948051948054,
+      "loss": 1.1107,
+      "step": 36
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.19156870245933533,
+      "learning_rate": 0.00017987012987012987,
+      "loss": 1.1125,
+      "step": 37
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.21527768671512604,
+      "learning_rate": 0.00017922077922077922,
+      "loss": 1.1346,
+      "step": 38
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.1871163249015808,
+      "learning_rate": 0.0001785714285714286,
+      "loss": 1.0742,
+      "step": 39
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.17750784754753113,
+      "learning_rate": 0.00017792207792207792,
+      "loss": 1.1323,
+      "step": 40
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.177419051527977,
+      "learning_rate": 0.00017727272727272728,
+      "loss": 1.1405,
+      "step": 41
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.16714292764663696,
+      "learning_rate": 0.00017662337662337663,
+      "loss": 1.1084,
+      "step": 42
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.1610356718301773,
+      "learning_rate": 0.00017597402597402598,
+      "loss": 1.1125,
+      "step": 43
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.2548656761646271,
+      "learning_rate": 0.00017532467532467534,
+      "loss": 1.1114,
+      "step": 44
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.1731044203042984,
+      "learning_rate": 0.0001746753246753247,
+      "loss": 1.1197,
+      "step": 45
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.1739533394575119,
+      "learning_rate": 0.00017402597402597401,
+      "loss": 1.1777,
+      "step": 46
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.2178352177143097,
+      "learning_rate": 0.0001733766233766234,
+      "loss": 1.1111,
+      "step": 47
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.17247150838375092,
+      "learning_rate": 0.00017272727272727275,
+      "loss": 1.1253,
+      "step": 48
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.18075324594974518,
+      "learning_rate": 0.00017207792207792207,
+      "loss": 1.1358,
+      "step": 49
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.15898071229457855,
+      "learning_rate": 0.00017142857142857143,
+      "loss": 1.0606,
+      "step": 50
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.16518613696098328,
+      "learning_rate": 0.0001707792207792208,
+      "loss": 1.0944,
+      "step": 51
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.16035063564777374,
+      "learning_rate": 0.00017012987012987013,
+      "loss": 1.0554,
+      "step": 52
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.1686483472585678,
+      "learning_rate": 0.00016948051948051948,
+      "loss": 1.0384,
+      "step": 53
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.16575631499290466,
+      "learning_rate": 0.00016883116883116884,
+      "loss": 1.0243,
+      "step": 54
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.16840039193630219,
+      "learning_rate": 0.0001681818181818182,
+      "loss": 1.117,
+      "step": 55
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.17616064846515656,
+      "learning_rate": 0.00016753246753246754,
+      "loss": 1.0743,
+      "step": 56
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.168218195438385,
+      "learning_rate": 0.0001668831168831169,
+      "loss": 1.0627,
+      "step": 57
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.17026656866073608,
+      "learning_rate": 0.00016623376623376625,
+      "loss": 1.0059,
+      "step": 58
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.16454458236694336,
+      "learning_rate": 0.0001655844155844156,
+      "loss": 0.9943,
+      "step": 59
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.17185136675834656,
+      "learning_rate": 0.00016493506493506495,
+      "loss": 1.1545,
+      "step": 60
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.17822986841201782,
+      "learning_rate": 0.00016428571428571428,
+      "loss": 1.073,
+      "step": 61
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.1676608771085739,
+      "learning_rate": 0.00016363636363636366,
+      "loss": 1.0886,
+      "step": 62
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.1727771908044815,
+      "learning_rate": 0.000162987012987013,
+      "loss": 1.0432,
+      "step": 63
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.17827573418617249,
+      "learning_rate": 0.00016233766233766234,
+      "loss": 1.083,
+      "step": 64
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.19807517528533936,
+      "learning_rate": 0.0001616883116883117,
+      "loss": 1.1208,
+      "step": 65
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.17693684995174408,
+      "learning_rate": 0.00016103896103896104,
+      "loss": 1.089,
+      "step": 66
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.15489234030246735,
+      "learning_rate": 0.0001603896103896104,
+      "loss": 0.9707,
+      "step": 67
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.16443990170955658,
+      "learning_rate": 0.00015974025974025975,
+      "loss": 1.0643,
+      "step": 68
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.2051103413105011,
+      "learning_rate": 0.0001590909090909091,
+      "loss": 1.1246,
+      "step": 69
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.18824075162410736,
+      "learning_rate": 0.00015844155844155845,
+      "loss": 1.0855,
+      "step": 70
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.18659448623657227,
+      "learning_rate": 0.0001577922077922078,
+      "loss": 1.1412,
+      "step": 71
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.1854114979505539,
+      "learning_rate": 0.00015714285714285716,
+      "loss": 1.0249,
+      "step": 72
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.1876193732023239,
+      "learning_rate": 0.00015649350649350649,
+      "loss": 1.1029,
+      "step": 73
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.1888684630393982,
+      "learning_rate": 0.00015584415584415587,
+      "loss": 1.0789,
+      "step": 74
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.20240606367588043,
+      "learning_rate": 0.0001551948051948052,
+      "loss": 1.0495,
+      "step": 75
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.232120081782341,
+      "learning_rate": 0.00015454545454545454,
+      "loss": 1.0735,
+      "step": 76
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.16897843778133392,
+      "learning_rate": 0.0001538961038961039,
+      "loss": 1.0164,
+      "step": 77
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.18796634674072266,
+      "learning_rate": 0.00015324675324675325,
+      "loss": 1.0676,
+      "step": 78
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.19574032723903656,
+      "learning_rate": 0.0001525974025974026,
+      "loss": 1.0456,
+      "step": 79
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.18007811903953552,
+      "learning_rate": 0.00015194805194805196,
+      "loss": 1.0894,
+      "step": 80
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.18932929635047913,
+      "learning_rate": 0.0001512987012987013,
+      "loss": 1.0729,
+      "step": 81
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.20614288747310638,
+      "learning_rate": 0.00015064935064935066,
+      "loss": 1.0854,
+      "step": 82
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.19291089475154877,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 1.1217,
+      "step": 83
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.18916529417037964,
+      "learning_rate": 0.00014935064935064934,
+      "loss": 1.0963,
+      "step": 84
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.20306220650672913,
+      "learning_rate": 0.00014870129870129872,
+      "loss": 1.0898,
+      "step": 85
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.17870067059993744,
+      "learning_rate": 0.00014805194805194807,
+      "loss": 1.0213,
+      "step": 86
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.18411923944950104,
+      "learning_rate": 0.0001474025974025974,
+      "loss": 1.0844,
+      "step": 87
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.18788227438926697,
+      "learning_rate": 0.00014675324675324675,
+      "loss": 1.0338,
+      "step": 88
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.23874884843826294,
+      "learning_rate": 0.00014610389610389613,
+      "loss": 1.1118,
+      "step": 89
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.19380499422550201,
+      "learning_rate": 0.00014545454545454546,
+      "loss": 1.0464,
+      "step": 90
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.18968750536441803,
+      "learning_rate": 0.0001448051948051948,
+      "loss": 1.0569,
+      "step": 91
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.19545753300189972,
+      "learning_rate": 0.00014415584415584416,
+      "loss": 1.1225,
+      "step": 92
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.19170494377613068,
+      "learning_rate": 0.00014350649350649352,
+      "loss": 1.0602,
+      "step": 93
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.17953918874263763,
+      "learning_rate": 0.00014285714285714287,
+      "loss": 1.032,
+      "step": 94
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.1822536289691925,
+      "learning_rate": 0.00014220779220779222,
+      "loss": 1.0559,
+      "step": 95
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.18591298162937164,
+      "learning_rate": 0.00014155844155844155,
+      "loss": 1.031,
+      "step": 96
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.2129002958536148,
+      "learning_rate": 0.00014090909090909093,
+      "loss": 1.1391,
+      "step": 97
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.18386681377887726,
+      "learning_rate": 0.00014025974025974028,
+      "loss": 0.9919,
+      "step": 98
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.18314239382743835,
+      "learning_rate": 0.0001396103896103896,
+      "loss": 1.0445,
+      "step": 99
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.1999066174030304,
+      "learning_rate": 0.00013896103896103896,
+      "loss": 1.0538,
+      "step": 100
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.18741188943386078,
+      "learning_rate": 0.00013831168831168834,
+      "loss": 1.0722,
+      "step": 101
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.19351010024547577,
+      "learning_rate": 0.00013766233766233766,
+      "loss": 1.0491,
+      "step": 102
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.18859203159809113,
+      "learning_rate": 0.00013701298701298702,
+      "loss": 1.0593,
+      "step": 103
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.1962767392396927,
+      "learning_rate": 0.00013636363636363637,
+      "loss": 1.1344,
+      "step": 104
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.20819440484046936,
+      "learning_rate": 0.00013571428571428572,
+      "loss": 1.1137,
+      "step": 105
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.19590184092521667,
+      "learning_rate": 0.00013506493506493507,
+      "loss": 1.0624,
+      "step": 106
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.18631424009799957,
+      "learning_rate": 0.00013441558441558443,
+      "loss": 1.0587,
+      "step": 107
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.19572143256664276,
+      "learning_rate": 0.00013376623376623375,
+      "loss": 1.0494,
+      "step": 108
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.1910988837480545,
+      "learning_rate": 0.00013311688311688313,
+      "loss": 1.0481,
+      "step": 109
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.19455869495868683,
+      "learning_rate": 0.00013246753246753249,
+      "loss": 1.029,
+      "step": 110
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.18669827282428741,
+      "learning_rate": 0.0001318181818181818,
+      "loss": 1.0513,
+      "step": 111
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.17523664236068726,
+      "learning_rate": 0.0001311688311688312,
+      "loss": 1.0126,
+      "step": 112
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.17929129302501678,
+      "learning_rate": 0.00013051948051948052,
+      "loss": 1.0717,
+      "step": 113
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.19380168616771698,
+      "learning_rate": 0.00012987012987012987,
+      "loss": 1.0324,
+      "step": 114
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.18090228736400604,
+      "learning_rate": 0.00012922077922077922,
+      "loss": 1.0515,
+      "step": 115
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.2067340910434723,
+      "learning_rate": 0.00012857142857142858,
+      "loss": 1.0939,
+      "step": 116
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.1880485862493515,
+      "learning_rate": 0.00012792207792207793,
+      "loss": 1.0986,
+      "step": 117
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.182168647646904,
+      "learning_rate": 0.00012727272727272728,
+      "loss": 1.0109,
+      "step": 118
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.20187129080295563,
+      "learning_rate": 0.00012662337662337663,
+      "loss": 1.0668,
+      "step": 119
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.2082669734954834,
+      "learning_rate": 0.000125974025974026,
+      "loss": 1.054,
+      "step": 120
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.18294434249401093,
+      "learning_rate": 0.00012532467532467534,
+      "loss": 1.0397,
+      "step": 121
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.20515067875385284,
+      "learning_rate": 0.00012467532467532467,
+      "loss": 1.1092,
+      "step": 122
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.1758790761232376,
+      "learning_rate": 0.00012402597402597402,
+      "loss": 0.9755,
+      "step": 123
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.2170792669057846,
+      "learning_rate": 0.0001233766233766234,
+      "loss": 1.0434,
+      "step": 124
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.202157124876976,
+      "learning_rate": 0.00012272727272727272,
+      "loss": 1.1129,
+      "step": 125
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.18556398153305054,
+      "learning_rate": 0.00012207792207792208,
+      "loss": 1.0665,
+      "step": 126
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.20196087658405304,
+      "learning_rate": 0.00012142857142857143,
+      "loss": 1.1,
+      "step": 127
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.1921566128730774,
+      "learning_rate": 0.0001207792207792208,
+      "loss": 1.0918,
+      "step": 128
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.18866224586963654,
+      "learning_rate": 0.00012012987012987014,
+      "loss": 1.0014,
+      "step": 129
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.207601398229599,
+      "learning_rate": 0.00011948051948051949,
+      "loss": 1.0726,
+      "step": 130
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.21592366695404053,
+      "learning_rate": 0.00011883116883116883,
+      "loss": 1.1379,
+      "step": 131
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.2016124576330185,
+      "learning_rate": 0.0001181818181818182,
+      "loss": 1.1428,
+      "step": 132
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.20478437840938568,
+      "learning_rate": 0.00011753246753246753,
+      "loss": 1.121,
+      "step": 133
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.22730594873428345,
+      "learning_rate": 0.00011688311688311689,
+      "loss": 1.0319,
+      "step": 134
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.22592711448669434,
+      "learning_rate": 0.00011623376623376625,
+      "loss": 1.1264,
+      "step": 135
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.20035041868686676,
+      "learning_rate": 0.00011558441558441559,
+      "loss": 1.0686,
+      "step": 136
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.20648567378520966,
+      "learning_rate": 0.00011493506493506494,
+      "loss": 1.0817,
+      "step": 137
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.21222743391990662,
+      "learning_rate": 0.00011428571428571428,
+      "loss": 1.0678,
+      "step": 138
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.2075391560792923,
+      "learning_rate": 0.00011363636363636365,
+      "loss": 1.0897,
+      "step": 139
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.1964101791381836,
+      "learning_rate": 0.000112987012987013,
+      "loss": 1.0906,
+      "step": 140
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.22406511008739471,
+      "learning_rate": 0.00011233766233766234,
+      "loss": 1.0594,
+      "step": 141
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.23787978291511536,
+      "learning_rate": 0.00011168831168831168,
+      "loss": 1.1053,
+      "step": 142
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.21196185052394867,
+      "learning_rate": 0.00011103896103896105,
+      "loss": 1.0923,
+      "step": 143
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.21042804419994354,
+      "learning_rate": 0.0001103896103896104,
+      "loss": 1.0381,
+      "step": 144
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.2267436534166336,
+      "learning_rate": 0.00010974025974025974,
+      "loss": 1.0818,
+      "step": 145
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.23742735385894775,
+      "learning_rate": 0.00010909090909090909,
+      "loss": 1.0872,
+      "step": 146
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.17787213623523712,
+      "learning_rate": 0.00010844155844155846,
+      "loss": 1.03,
+      "step": 147
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.22422832250595093,
+      "learning_rate": 0.0001077922077922078,
+      "loss": 1.0738,
+      "step": 148
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.22946301102638245,
+      "learning_rate": 0.00010714285714285715,
+      "loss": 1.0274,
+      "step": 149
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.2137996405363083,
+      "learning_rate": 0.00010649350649350649,
+      "loss": 1.0539,
+      "step": 150
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.1748756766319275,
+      "learning_rate": 0.00010584415584415586,
+      "loss": 1.0355,
+      "step": 151
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.22275175154209137,
+      "learning_rate": 0.0001051948051948052,
+      "loss": 1.1696,
+      "step": 152
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.20996077358722687,
+      "learning_rate": 0.00010454545454545455,
+      "loss": 1.0303,
+      "step": 153
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.1945938766002655,
+      "learning_rate": 0.00010389610389610389,
+      "loss": 0.9747,
+      "step": 154
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.1970377266407013,
+      "learning_rate": 0.00010324675324675325,
+      "loss": 1.0358,
+      "step": 155
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.18814732134342194,
+      "learning_rate": 0.00010259740259740261,
+      "loss": 0.9612,
+      "step": 156
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.2153233289718628,
+      "learning_rate": 0.00010194805194805195,
+      "loss": 1.0749,
+      "step": 157
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.21788008511066437,
+      "learning_rate": 0.0001012987012987013,
+      "loss": 1.0883,
+      "step": 158
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.214650496840477,
+      "learning_rate": 0.00010064935064935067,
+      "loss": 1.0539,
+      "step": 159
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.19312834739685059,
+      "learning_rate": 0.0001,
+      "loss": 1.0657,
+      "step": 160
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.19916598498821259,
+      "learning_rate": 9.935064935064936e-05,
+      "loss": 1.0478,
+      "step": 161
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.2057606726884842,
+      "learning_rate": 9.870129870129871e-05,
+      "loss": 1.0094,
+      "step": 162
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.22159607708454132,
+      "learning_rate": 9.805194805194806e-05,
+      "loss": 1.0952,
+      "step": 163
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.18274275958538055,
+      "learning_rate": 9.74025974025974e-05,
+      "loss": 1.0065,
+      "step": 164
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.19835162162780762,
+      "learning_rate": 9.675324675324677e-05,
+      "loss": 1.0742,
+      "step": 165
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.2114904820919037,
+      "learning_rate": 9.610389610389611e-05,
+      "loss": 1.1109,
+      "step": 166
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.21488523483276367,
+      "learning_rate": 9.545454545454546e-05,
+      "loss": 1.0465,
+      "step": 167
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.19870303571224213,
+      "learning_rate": 9.480519480519481e-05,
+      "loss": 1.0318,
+      "step": 168
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.20413029193878174,
+      "learning_rate": 9.415584415584417e-05,
+      "loss": 1.0817,
+      "step": 169
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.1847231239080429,
+      "learning_rate": 9.35064935064935e-05,
+      "loss": 1.0144,
+      "step": 170
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.2715964913368225,
+      "learning_rate": 9.285714285714286e-05,
+      "loss": 0.9832,
+      "step": 171
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.2225002497434616,
+      "learning_rate": 9.220779220779221e-05,
+      "loss": 1.1051,
+      "step": 172
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.22931510210037231,
+      "learning_rate": 9.155844155844156e-05,
+      "loss": 1.1042,
+      "step": 173
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.21848627924919128,
+      "learning_rate": 9.090909090909092e-05,
+      "loss": 1.1151,
+      "step": 174
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.19852259755134583,
+      "learning_rate": 9.025974025974027e-05,
+      "loss": 1.0889,
+      "step": 175
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.2080363780260086,
+      "learning_rate": 8.961038961038961e-05,
+      "loss": 1.0777,
+      "step": 176
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.22391024231910706,
+      "learning_rate": 8.896103896103896e-05,
+      "loss": 1.1092,
+      "step": 177
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.21793846786022186,
+      "learning_rate": 8.831168831168831e-05,
+      "loss": 1.044,
+      "step": 178
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.2009749859571457,
+      "learning_rate": 8.766233766233767e-05,
+      "loss": 1.0198,
+      "step": 179
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.19432318210601807,
+      "learning_rate": 8.701298701298701e-05,
+      "loss": 1.075,
+      "step": 180
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.18634547293186188,
+      "learning_rate": 8.636363636363637e-05,
+      "loss": 0.9964,
+      "step": 181
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.1947103589773178,
+      "learning_rate": 8.571428571428571e-05,
+      "loss": 1.0025,
+      "step": 182
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.23098671436309814,
+      "learning_rate": 8.506493506493507e-05,
+      "loss": 1.0562,
+      "step": 183
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.19686414301395416,
+      "learning_rate": 8.441558441558442e-05,
+      "loss": 1.0285,
+      "step": 184
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.19852428138256073,
+      "learning_rate": 8.376623376623377e-05,
+      "loss": 1.0054,
+      "step": 185
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.21483510732650757,
+      "learning_rate": 8.311688311688312e-05,
+      "loss": 1.108,
+      "step": 186
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.23313644528388977,
+      "learning_rate": 8.246753246753248e-05,
+      "loss": 1.1383,
+      "step": 187
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.21453145146369934,
+      "learning_rate": 8.181818181818183e-05,
+      "loss": 1.0911,
+      "step": 188
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.20268195867538452,
+      "learning_rate": 8.116883116883117e-05,
+      "loss": 1.0145,
+      "step": 189
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.20576398074626923,
+      "learning_rate": 8.051948051948052e-05,
+      "loss": 1.0829,
+      "step": 190
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.21732626855373383,
+      "learning_rate": 7.987012987012987e-05,
+      "loss": 1.0152,
+      "step": 191
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.22046895325183868,
+      "learning_rate": 7.922077922077923e-05,
+      "loss": 1.1311,
+      "step": 192
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.19727715849876404,
+      "learning_rate": 7.857142857142858e-05,
+      "loss": 1.0364,
+      "step": 193
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.20861488580703735,
+      "learning_rate": 7.792207792207793e-05,
+      "loss": 1.0435,
+      "step": 194
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.18545083701610565,
+      "learning_rate": 7.727272727272727e-05,
+      "loss": 1.0299,
+      "step": 195
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.19965052604675293,
+      "learning_rate": 7.662337662337662e-05,
+      "loss": 1.0511,
+      "step": 196
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.23673909902572632,
+      "learning_rate": 7.597402597402598e-05,
+      "loss": 1.081,
+      "step": 197
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.17583179473876953,
+      "learning_rate": 7.532467532467533e-05,
+      "loss": 0.9808,
+      "step": 198
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.2129366099834442,
+      "learning_rate": 7.467532467532467e-05,
+      "loss": 1.0522,
+      "step": 199
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.21679140627384186,
+      "learning_rate": 7.402597402597404e-05,
+      "loss": 1.0567,
+      "step": 200
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.2032000720500946,
+      "learning_rate": 7.337662337662338e-05,
+      "loss": 1.0466,
+      "step": 201
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.1887970268726349,
+      "learning_rate": 7.272727272727273e-05,
+      "loss": 1.0329,
+      "step": 202
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.21060192584991455,
+      "learning_rate": 7.207792207792208e-05,
+      "loss": 1.1021,
+      "step": 203
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.21191425621509552,
+      "learning_rate": 7.142857142857143e-05,
+      "loss": 0.99,
+      "step": 204
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.1995989829301834,
+      "learning_rate": 7.077922077922077e-05,
+      "loss": 1.0526,
+      "step": 205
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.1849513053894043,
+      "learning_rate": 7.012987012987014e-05,
+      "loss": 0.9998,
+      "step": 206
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.1948779672384262,
+      "learning_rate": 6.948051948051948e-05,
+      "loss": 1.075,
+      "step": 207
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.20374052226543427,
+      "learning_rate": 6.883116883116883e-05,
+      "loss": 1.0933,
+      "step": 208
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.2102465033531189,
+      "learning_rate": 6.818181818181818e-05,
+      "loss": 1.1123,
+      "step": 209
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.21376173198223114,
+      "learning_rate": 6.753246753246754e-05,
+      "loss": 1.1233,
+      "step": 210
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.20934203267097473,
+      "learning_rate": 6.688311688311688e-05,
+      "loss": 1.1374,
+      "step": 211
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.18604128062725067,
+      "learning_rate": 6.623376623376624e-05,
+      "loss": 1.0213,
+      "step": 212
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.19644233584403992,
+      "learning_rate": 6.55844155844156e-05,
+      "loss": 1.0046,
+      "step": 213
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.18479463458061218,
+      "learning_rate": 6.493506493506494e-05,
+      "loss": 0.9792,
+      "step": 214
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.1945149153470993,
+      "learning_rate": 6.428571428571429e-05,
+      "loss": 1.0584,
+      "step": 215
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.2070147544145584,
+      "learning_rate": 6.363636363636364e-05,
+      "loss": 1.071,
+      "step": 216
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.19645985960960388,
+      "learning_rate": 6.2987012987013e-05,
+      "loss": 1.0721,
+      "step": 217
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.1960117667913437,
+      "learning_rate": 6.233766233766233e-05,
+      "loss": 1.071,
+      "step": 218
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.20168261229991913,
+      "learning_rate": 6.16883116883117e-05,
+      "loss": 1.0808,
+      "step": 219
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.21254412829875946,
+      "learning_rate": 6.103896103896104e-05,
+      "loss": 1.0287,
+      "step": 220
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.21271063387393951,
+      "learning_rate": 6.03896103896104e-05,
+      "loss": 1.0605,
+      "step": 221
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.2081408053636551,
+      "learning_rate": 5.9740259740259744e-05,
+      "loss": 1.091,
+      "step": 222
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.21113798022270203,
+      "learning_rate": 5.90909090909091e-05,
+      "loss": 1.1323,
+      "step": 223
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.20670844614505768,
+      "learning_rate": 5.844155844155844e-05,
+      "loss": 1.0955,
+      "step": 224
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.2010120451450348,
+      "learning_rate": 5.7792207792207796e-05,
+      "loss": 1.1068,
+      "step": 225
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.20379121601581573,
+      "learning_rate": 5.714285714285714e-05,
+      "loss": 1.0419,
+      "step": 226
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.22799807786941528,
+      "learning_rate": 5.64935064935065e-05,
+      "loss": 1.0904,
+      "step": 227
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.2005995213985443,
+      "learning_rate": 5.584415584415584e-05,
+      "loss": 1.078,
+      "step": 228
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.20329605042934418,
+      "learning_rate": 5.51948051948052e-05,
+      "loss": 1.0245,
+      "step": 229
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.19283504784107208,
+      "learning_rate": 5.4545454545454546e-05,
+      "loss": 1.0367,
+      "step": 230
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.20624355971813202,
+      "learning_rate": 5.38961038961039e-05,
+      "loss": 1.1046,
+      "step": 231
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.21362991631031036,
+      "learning_rate": 5.3246753246753245e-05,
+      "loss": 1.1104,
+      "step": 232
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.20447863638401031,
+      "learning_rate": 5.25974025974026e-05,
+      "loss": 1.0514,
+      "step": 233
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.1974381059408188,
+      "learning_rate": 5.1948051948051944e-05,
+      "loss": 1.0048,
+      "step": 234
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.21237170696258545,
+      "learning_rate": 5.1298701298701304e-05,
+      "loss": 1.1299,
+      "step": 235
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.21224971115589142,
+      "learning_rate": 5.064935064935065e-05,
+      "loss": 1.05,
+      "step": 236
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.19865018129348755,
+      "learning_rate": 5e-05,
+      "loss": 1.0665,
+      "step": 237
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.19199275970458984,
+      "learning_rate": 4.9350649350649355e-05,
+      "loss": 0.9531,
+      "step": 238
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.19573214650154114,
+      "learning_rate": 4.87012987012987e-05,
+      "loss": 1.0318,
+      "step": 239
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.21338805556297302,
+      "learning_rate": 4.8051948051948054e-05,
+      "loss": 1.0343,
+      "step": 240
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.2254691869020462,
+      "learning_rate": 4.740259740259741e-05,
+      "loss": 1.0472,
+      "step": 241
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.18101665377616882,
+      "learning_rate": 4.675324675324675e-05,
+      "loss": 1.017,
+      "step": 242
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.22090592980384827,
+      "learning_rate": 4.6103896103896106e-05,
+      "loss": 1.0389,
+      "step": 243
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.20865507423877716,
+      "learning_rate": 4.545454545454546e-05,
+      "loss": 1.0369,
+      "step": 244
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.21619610488414764,
+      "learning_rate": 4.4805194805194805e-05,
+      "loss": 1.109,
+      "step": 245
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.21694771945476532,
+      "learning_rate": 4.415584415584416e-05,
+      "loss": 1.0525,
+      "step": 246
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.2182662934064865,
+      "learning_rate": 4.3506493506493503e-05,
+      "loss": 1.0331,
+      "step": 247
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.2026486098766327,
+      "learning_rate": 4.2857142857142856e-05,
+      "loss": 1.027,
+      "step": 248
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.19606547057628632,
+      "learning_rate": 4.220779220779221e-05,
+      "loss": 1.0242,
+      "step": 249
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.22107470035552979,
+      "learning_rate": 4.155844155844156e-05,
+      "loss": 1.0924,
+      "step": 250
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.19960008561611176,
+      "learning_rate": 4.0909090909090915e-05,
+      "loss": 1.0384,
+      "step": 251
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.1945488154888153,
+      "learning_rate": 4.025974025974026e-05,
+      "loss": 1.0673,
+      "step": 252
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.22067414224147797,
+      "learning_rate": 3.9610389610389614e-05,
+      "loss": 1.0426,
+      "step": 253
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.19010980427265167,
+      "learning_rate": 3.8961038961038966e-05,
+      "loss": 1.0617,
+      "step": 254
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.18781176209449768,
+      "learning_rate": 3.831168831168831e-05,
+      "loss": 1.0243,
+      "step": 255
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.20388829708099365,
+      "learning_rate": 3.7662337662337665e-05,
+      "loss": 1.0476,
+      "step": 256
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.19911155104637146,
+      "learning_rate": 3.701298701298702e-05,
+      "loss": 1.0324,
+      "step": 257
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.19884039461612701,
+      "learning_rate": 3.6363636363636364e-05,
+      "loss": 1.0242,
+      "step": 258
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.19036105275154114,
+      "learning_rate": 3.571428571428572e-05,
+      "loss": 1.0323,
+      "step": 259
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.20039844512939453,
+      "learning_rate": 3.506493506493507e-05,
+      "loss": 1.0749,
+      "step": 260
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.1899934560060501,
+      "learning_rate": 3.4415584415584416e-05,
+      "loss": 1.0115,
+      "step": 261
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.20019090175628662,
+      "learning_rate": 3.376623376623377e-05,
+      "loss": 1.0782,
+      "step": 262
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.2020583152770996,
+      "learning_rate": 3.311688311688312e-05,
+      "loss": 1.0687,
+      "step": 263
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.21407337486743927,
+      "learning_rate": 3.246753246753247e-05,
+      "loss": 1.1015,
+      "step": 264
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.1871640682220459,
+      "learning_rate": 3.181818181818182e-05,
+      "loss": 0.9637,
+      "step": 265
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.21622811257839203,
+      "learning_rate": 3.1168831168831166e-05,
+      "loss": 1.1222,
+      "step": 266
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.22504661977291107,
+      "learning_rate": 3.051948051948052e-05,
+      "loss": 1.132,
+      "step": 267
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.19177629053592682,
+      "learning_rate": 2.9870129870129872e-05,
+      "loss": 1.0281,
+      "step": 268
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.1970544159412384,
+      "learning_rate": 2.922077922077922e-05,
+      "loss": 1.0393,
+      "step": 269
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.21554522216320038,
+      "learning_rate": 2.857142857142857e-05,
+      "loss": 1.074,
+      "step": 270
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.21131229400634766,
+      "learning_rate": 2.792207792207792e-05,
+      "loss": 1.054,
+      "step": 271
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.19816523790359497,
+      "learning_rate": 2.7272727272727273e-05,
+      "loss": 1.0456,
+      "step": 272
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.21075209975242615,
+      "learning_rate": 2.6623376623376623e-05,
+      "loss": 1.0758,
+      "step": 273
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.2296527624130249,
+      "learning_rate": 2.5974025974025972e-05,
+      "loss": 1.0917,
+      "step": 274
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.19722610712051392,
+      "learning_rate": 2.5324675324675325e-05,
+      "loss": 1.0704,
+      "step": 275
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.18721099197864532,
+      "learning_rate": 2.4675324675324678e-05,
+      "loss": 0.9919,
+      "step": 276
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.20244193077087402,
+      "learning_rate": 2.4025974025974027e-05,
+      "loss": 1.0368,
+      "step": 277
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.19518914818763733,
+      "learning_rate": 2.3376623376623376e-05,
+      "loss": 1.0436,
+      "step": 278
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.19650357961654663,
+      "learning_rate": 2.272727272727273e-05,
+      "loss": 1.0306,
+      "step": 279
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.20320096611976624,
+      "learning_rate": 2.207792207792208e-05,
+      "loss": 1.0941,
+      "step": 280
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.18296951055526733,
+      "learning_rate": 2.1428571428571428e-05,
+      "loss": 0.9802,
+      "step": 281
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.21357610821723938,
+      "learning_rate": 2.077922077922078e-05,
+      "loss": 1.0449,
+      "step": 282
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.193921759724617,
+      "learning_rate": 2.012987012987013e-05,
+      "loss": 1.0116,
+      "step": 283
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.1953902244567871,
+      "learning_rate": 1.9480519480519483e-05,
+      "loss": 1.0105,
+      "step": 284
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.19440975785255432,
+      "learning_rate": 1.8831168831168833e-05,
+      "loss": 0.9952,
+      "step": 285
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.21054105460643768,
+      "learning_rate": 1.8181818181818182e-05,
+      "loss": 1.0701,
+      "step": 286
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.18844804167747498,
+      "learning_rate": 1.7532467532467535e-05,
+      "loss": 1.0146,
+      "step": 287
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.2067311704158783,
+      "learning_rate": 1.6883116883116884e-05,
+      "loss": 1.0781,
+      "step": 288
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.1941213756799698,
+      "learning_rate": 1.6233766233766234e-05,
+      "loss": 0.9814,
+      "step": 289
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.22726193070411682,
+      "learning_rate": 1.5584415584415583e-05,
+      "loss": 1.1431,
+      "step": 290
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.18025581538677216,
+      "learning_rate": 1.4935064935064936e-05,
+      "loss": 0.9649,
+      "step": 291
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.21535000205039978,
+      "learning_rate": 1.4285714285714285e-05,
+      "loss": 1.0441,
+      "step": 292
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.20014546811580658,
+      "learning_rate": 1.3636363636363637e-05,
+      "loss": 1.0166,
+      "step": 293
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.22738787531852722,
+      "learning_rate": 1.2987012987012986e-05,
+      "loss": 1.0564,
+      "step": 294
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.2020861804485321,
+      "learning_rate": 1.2337662337662339e-05,
+      "loss": 1.1241,
+      "step": 295
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.19888809323310852,
+      "learning_rate": 1.1688311688311688e-05,
+      "loss": 1.1114,
+      "step": 296
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.20912377536296844,
+      "learning_rate": 1.103896103896104e-05,
+      "loss": 1.0971,
+      "step": 297
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.21206621825695038,
+      "learning_rate": 1.038961038961039e-05,
+      "loss": 1.0601,
+      "step": 298
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.18667680025100708,
+      "learning_rate": 9.740259740259742e-06,
+      "loss": 1.0291,
+      "step": 299
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.21125559508800507,
+      "learning_rate": 9.090909090909091e-06,
+      "loss": 1.0483,
+      "step": 300
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.21776145696640015,
+      "learning_rate": 8.441558441558442e-06,
+      "loss": 0.9912,
+      "step": 301
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.20144303143024445,
+      "learning_rate": 7.792207792207792e-06,
+      "loss": 1.0357,
+      "step": 302
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.1984029859304428,
+      "learning_rate": 7.142857142857143e-06,
+      "loss": 1.0648,
+      "step": 303
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.17972829937934875,
+      "learning_rate": 6.493506493506493e-06,
+      "loss": 1.0033,
+      "step": 304
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.1818286031484604,
+      "learning_rate": 5.844155844155844e-06,
+      "loss": 0.997,
+      "step": 305
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.19670912623405457,
+      "learning_rate": 5.194805194805195e-06,
+      "loss": 1.0256,
+      "step": 306
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.20527283847332,
+      "learning_rate": 4.5454545454545455e-06,
+      "loss": 1.0348,
+      "step": 307
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.19025909900665283,
+      "learning_rate": 3.896103896103896e-06,
+      "loss": 1.0682,
+      "step": 308
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.19544818997383118,
+      "learning_rate": 3.2467532467532465e-06,
+      "loss": 0.9872,
+      "step": 309
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.22112183272838593,
+      "learning_rate": 2.5974025974025976e-06,
+      "loss": 1.0661,
+      "step": 310
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.23328153789043427,
+      "learning_rate": 1.948051948051948e-06,
+      "loss": 1.0691,
+      "step": 311
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.20181375741958618,
+      "learning_rate": 1.2987012987012988e-06,
+      "loss": 0.9416,
+      "step": 312
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.29312625527381897,
+      "learning_rate": 6.493506493506494e-07,
+      "loss": 1.1216,
+      "step": 313
+    },
+    {
+      "epoch": 0.12202467696492762,
+      "grad_norm": 0.2231415957212448,
+      "learning_rate": 0.0,
+      "loss": 1.0468,
+      "step": 314
+    },
+    {
+      "epoch": 0.12241329058583503,
+      "grad_norm": 0.22263288497924805,
+      "learning_rate": 0.00017594394706111328,
+      "loss": 1.0399,
+      "step": 315
+    },
+    {
+      "epoch": 0.12280190420674245,
+      "grad_norm": 0.22909891605377197,
+      "learning_rate": 0.00017586609575710393,
+      "loss": 1.1069,
+      "step": 316
+    },
+    {
+      "epoch": 0.12319051782764986,
+      "grad_norm": 0.23951445519924164,
+      "learning_rate": 0.0001757882444530946,
+      "loss": 1.1036,
+      "step": 317
+    },
+    {
+      "epoch": 0.12357913144855727,
+      "grad_norm": 0.2409268021583557,
+      "learning_rate": 0.00017571039314908526,
+      "loss": 1.1114,
+      "step": 318
+    },
+    {
+      "epoch": 0.12396774506946469,
+      "grad_norm": 0.23753899335861206,
+      "learning_rate": 0.00017563254184507592,
+      "loss": 1.1297,
+      "step": 319
+    },
+    {
+      "epoch": 0.12435635869037209,
+      "grad_norm": 0.2823902666568756,
+      "learning_rate": 0.00017555469054106657,
+      "loss": 1.1293,
+      "step": 320
+    },
+    {
+      "epoch": 0.12474497231127951,
+      "grad_norm": 0.24093545973300934,
+      "learning_rate": 0.00017547683923705722,
+      "loss": 1.0678,
+      "step": 321
+    },
+    {
+      "epoch": 0.12513358593218693,
+      "grad_norm": 0.22565563023090363,
+      "learning_rate": 0.0001753989879330479,
+      "loss": 1.1408,
+      "step": 322
+    },
+    {
+      "epoch": 0.12552219955309435,
+      "grad_norm": 0.22569572925567627,
+      "learning_rate": 0.00017532113662903855,
+      "loss": 1.0543,
+      "step": 323
+    },
+    {
+      "epoch": 0.12591081317400174,
+      "grad_norm": 0.24962866306304932,
+      "learning_rate": 0.0001752432853250292,
+      "loss": 1.0818,
+      "step": 324
+    },
+    {
+      "epoch": 0.12629942679490916,
+      "grad_norm": 0.22184576094150543,
+      "learning_rate": 0.00017516543402101986,
+      "loss": 1.0835,
+      "step": 325
+    },
+    {
+      "epoch": 0.12668804041581658,
+      "grad_norm": 0.2572194039821625,
+      "learning_rate": 0.0001750875827170105,
+      "loss": 1.0767,
+      "step": 326
+    },
+    {
+      "epoch": 0.127076654036724,
+      "grad_norm": 0.24131342768669128,
+      "learning_rate": 0.00017500973141300116,
+      "loss": 1.0981,
+      "step": 327
+    },
+    {
+      "epoch": 0.1274652676576314,
+      "grad_norm": 0.2386389970779419,
+      "learning_rate": 0.00017493188010899184,
+      "loss": 1.0828,
+      "step": 328
+    },
+    {
+      "epoch": 0.1278538812785388,
+      "grad_norm": 0.2654125690460205,
+      "learning_rate": 0.0001748540288049825,
+      "loss": 1.1266,
+      "step": 329
+    },
+    {
+      "epoch": 0.12824249489944622,
+      "grad_norm": 0.2925739884376526,
+      "learning_rate": 0.00017477617750097314,
+      "loss": 1.0983,
+      "step": 330
+    },
+    {
+      "epoch": 0.12863110852035364,
+      "grad_norm": 0.26589342951774597,
+      "learning_rate": 0.0001746983261969638,
+      "loss": 1.1029,
+      "step": 331
+    },
+    {
+      "epoch": 0.12901972214126106,
+      "grad_norm": 0.24565957486629486,
+      "learning_rate": 0.00017462047489295445,
+      "loss": 1.0975,
+      "step": 332
+    },
+    {
+      "epoch": 0.12940833576216845,
+      "grad_norm": 0.2459682673215866,
+      "learning_rate": 0.00017454262358894513,
+      "loss": 1.0566,
+      "step": 333
+    },
+    {
+      "epoch": 0.12979694938307587,
+      "grad_norm": 0.23349183797836304,
+      "learning_rate": 0.00017446477228493578,
+      "loss": 1.0833,
+      "step": 334
+    },
+    {
+      "epoch": 0.1301855630039833,
+      "grad_norm": 0.26166337728500366,
+      "learning_rate": 0.00017438692098092643,
+      "loss": 1.1598,
+      "step": 335
+    },
+    {
+      "epoch": 0.1305741766248907,
+      "grad_norm": 0.24188168346881866,
+      "learning_rate": 0.00017430906967691708,
+      "loss": 1.0728,
+      "step": 336
+    },
+    {
+      "epoch": 0.13096279024579813,
+      "grad_norm": 0.22922398149967194,
+      "learning_rate": 0.00017423121837290773,
+      "loss": 1.0311,
+      "step": 337
+    },
+    {
+      "epoch": 0.13135140386670552,
+      "grad_norm": 0.2652754485607147,
+      "learning_rate": 0.00017415336706889841,
+      "loss": 1.1096,
+      "step": 338
+    },
+    {
+      "epoch": 0.13174001748761294,
+      "grad_norm": 0.2355881780385971,
+      "learning_rate": 0.00017407551576488907,
+      "loss": 1.0964,
+      "step": 339
+    },
+    {
+      "epoch": 0.13212863110852036,
+      "grad_norm": 0.244523823261261,
+      "learning_rate": 0.00017399766446087972,
+      "loss": 1.142,
+      "step": 340
+    },
+    {
+      "epoch": 0.13251724472942777,
+      "grad_norm": 0.24705976247787476,
+      "learning_rate": 0.00017391981315687037,
+      "loss": 1.0943,
+      "step": 341
+    },
+    {
+      "epoch": 0.13290585835033517,
+      "grad_norm": 0.22817552089691162,
+      "learning_rate": 0.00017384196185286102,
+      "loss": 1.0621,
+      "step": 342
+    },
+    {
+      "epoch": 0.13329447197124258,
+      "grad_norm": 0.22605225443840027,
+      "learning_rate": 0.0001737641105488517,
+      "loss": 1.0714,
+      "step": 343
+    },
+    {
+      "epoch": 0.13368308559215,
+      "grad_norm": 0.2584545314311981,
+      "learning_rate": 0.00017368625924484235,
+      "loss": 1.1367,
+      "step": 344
+    },
+    {
+      "epoch": 0.13407169921305742,
+      "grad_norm": 0.2248220443725586,
+      "learning_rate": 0.000173608407940833,
+      "loss": 1.0872,
+      "step": 345
+    },
+    {
+      "epoch": 0.13446031283396484,
+      "grad_norm": 0.2141868770122528,
+      "learning_rate": 0.00017353055663682368,
+      "loss": 1.0572,
+      "step": 346
+    },
+    {
+      "epoch": 0.13484892645487223,
+      "grad_norm": 0.2615523934364319,
+      "learning_rate": 0.00017345270533281434,
+      "loss": 1.1048,
+      "step": 347
+    },
+    {
+      "epoch": 0.13523754007577965,
+      "grad_norm": 0.22990448772907257,
+      "learning_rate": 0.000173374854028805,
+      "loss": 1.0528,
+      "step": 348
+    },
+    {
+      "epoch": 0.13562615369668707,
+      "grad_norm": 0.2132262885570526,
+      "learning_rate": 0.00017329700272479564,
+      "loss": 1.0476,
+      "step": 349
+    },
+    {
+      "epoch": 0.1360147673175945,
+      "grad_norm": 0.2578272819519043,
+      "learning_rate": 0.00017321915142078632,
+      "loss": 1.0852,
+      "step": 350
+    },
+    {
+      "epoch": 0.1364033809385019,
+      "grad_norm": 0.22881457209587097,
+      "learning_rate": 0.00017314130011677697,
+      "loss": 1.1017,
+      "step": 351
+    },
+    {
+      "epoch": 0.1367919945594093,
+      "grad_norm": 0.21067696809768677,
+      "learning_rate": 0.00017306344881276762,
+      "loss": 1.0444,
+      "step": 352
+    },
+    {
+      "epoch": 0.13718060818031672,
+      "grad_norm": 0.2304215282201767,
+      "learning_rate": 0.0001729855975087583,
+      "loss": 1.0737,
+      "step": 353
+    },
+    {
+      "epoch": 0.13756922180122413,
+      "grad_norm": 0.2031925916671753,
+      "learning_rate": 0.00017290774620474895,
+      "loss": 1.0036,
+      "step": 354
+    },
+    {
+      "epoch": 0.13795783542213155,
+      "grad_norm": 0.27281051874160767,
+      "learning_rate": 0.0001728298949007396,
+      "loss": 1.148,
+      "step": 355
+    },
+    {
+      "epoch": 0.13834644904303897,
+      "grad_norm": 0.204191654920578,
+      "learning_rate": 0.00017275204359673026,
+      "loss": 0.9607,
+      "step": 356
+    },
+    {
+      "epoch": 0.13873506266394636,
+      "grad_norm": 0.221976637840271,
+      "learning_rate": 0.0001726741922927209,
+      "loss": 1.1068,
+      "step": 357
+    },
+    {
+      "epoch": 0.13912367628485378,
+      "grad_norm": 0.20831729471683502,
+      "learning_rate": 0.0001725963409887116,
+      "loss": 1.034,
+      "step": 358
+    },
+    {
+      "epoch": 0.1395122899057612,
+      "grad_norm": 0.21639779210090637,
+      "learning_rate": 0.00017251848968470224,
+      "loss": 1.0613,
+      "step": 359
+    },
+    {
+      "epoch": 0.13990090352666862,
+      "grad_norm": 0.1959424465894699,
+      "learning_rate": 0.0001724406383806929,
+      "loss": 1.0506,
+      "step": 360
+    },
+    {
+      "epoch": 0.140289517147576,
+      "grad_norm": 0.2044398933649063,
+      "learning_rate": 0.00017236278707668355,
+      "loss": 1.0316,
+      "step": 361
+    },
+    {
+      "epoch": 0.14067813076848343,
+      "grad_norm": 0.21483004093170166,
+      "learning_rate": 0.0001722849357726742,
+      "loss": 1.0361,
+      "step": 362
+    },
+    {
+      "epoch": 0.14106674438939085,
+      "grad_norm": 0.237701416015625,
+      "learning_rate": 0.00017220708446866485,
+      "loss": 1.1264,
+      "step": 363
+    },
+    {
+      "epoch": 0.14145535801029827,
+      "grad_norm": 0.20750795304775238,
+      "learning_rate": 0.00017212923316465553,
+      "loss": 1.0523,
+      "step": 364
+    },
+    {
+      "epoch": 0.14184397163120568,
+      "grad_norm": 0.2252965271472931,
+      "learning_rate": 0.00017205138186064618,
+      "loss": 1.0764,
+      "step": 365
+    },
+    {
+      "epoch": 0.14223258525211308,
+      "grad_norm": 0.2033565789461136,
+      "learning_rate": 0.00017197353055663683,
+      "loss": 1.064,
+      "step": 366
+    },
+    {
+      "epoch": 0.1426211988730205,
+      "grad_norm": 0.21123190224170685,
+      "learning_rate": 0.00017189567925262749,
+      "loss": 1.0515,
+      "step": 367
+    },
+    {
+      "epoch": 0.1430098124939279,
+      "grad_norm": 0.20646221935749054,
+      "learning_rate": 0.00017181782794861814,
+      "loss": 1.0617,
+      "step": 368
+    },
+    {
+      "epoch": 0.14339842611483533,
+      "grad_norm": 0.2079589068889618,
+      "learning_rate": 0.00017173997664460882,
+      "loss": 1.0569,
+      "step": 369
+    },
+    {
+      "epoch": 0.14378703973574275,
+      "grad_norm": 0.216246098279953,
+      "learning_rate": 0.00017166212534059947,
+      "loss": 1.0986,
+      "step": 370
+    },
+    {
+      "epoch": 0.14417565335665014,
+      "grad_norm": 0.20711806416511536,
+      "learning_rate": 0.00017158427403659012,
+      "loss": 1.1342,
+      "step": 371
+    },
+    {
+      "epoch": 0.14456426697755756,
+      "grad_norm": 0.235435351729393,
+      "learning_rate": 0.00017150642273258077,
+      "loss": 1.1082,
+      "step": 372
+    },
+    {
+      "epoch": 0.14495288059846498,
+      "grad_norm": 0.2273191511631012,
+      "learning_rate": 0.00017142857142857143,
+      "loss": 1.1064,
+      "step": 373
+    },
+    {
+      "epoch": 0.1453414942193724,
+      "grad_norm": 0.2075672745704651,
+      "learning_rate": 0.0001713507201245621,
+      "loss": 1.0536,
+      "step": 374
+    },
+    {
+      "epoch": 0.14573010784027982,
+      "grad_norm": 0.20764274895191193,
+      "learning_rate": 0.00017127286882055276,
+      "loss": 1.0673,
+      "step": 375
+    },
+    {
+      "epoch": 0.1461187214611872,
+      "grad_norm": 0.2441243678331375,
+      "learning_rate": 0.0001711950175165434,
+      "loss": 1.1271,
+      "step": 376
+    },
+    {
+      "epoch": 0.14650733508209463,
+      "grad_norm": 0.2383374124765396,
+      "learning_rate": 0.00017111716621253406,
+      "loss": 1.083,
+      "step": 377
+    },
+    {
+      "epoch": 0.14689594870300204,
+      "grad_norm": 0.2172410786151886,
+      "learning_rate": 0.0001710393149085247,
+      "loss": 1.0605,
+      "step": 378
+    },
+    {
+      "epoch": 0.14728456232390946,
+      "grad_norm": 0.22591541707515717,
+      "learning_rate": 0.0001709614636045154,
+      "loss": 1.0931,
+      "step": 379
+    },
+    {
+      "epoch": 0.14767317594481685,
+      "grad_norm": 0.23099495470523834,
+      "learning_rate": 0.00017088361230050604,
+      "loss": 1.1021,
+      "step": 380
+    },
+    {
+      "epoch": 0.14806178956572427,
+      "grad_norm": 0.21461094915866852,
+      "learning_rate": 0.0001708057609964967,
+      "loss": 1.0959,
+      "step": 381
+    },
+    {
+      "epoch": 0.1484504031866317,
+      "grad_norm": 0.21557241678237915,
+      "learning_rate": 0.00017072790969248735,
+      "loss": 1.0155,
+      "step": 382
+    },
+    {
+      "epoch": 0.1488390168075391,
+      "grad_norm": 0.234396293759346,
+      "learning_rate": 0.000170650058388478,
+      "loss": 1.1289,
+      "step": 383
+    },
+    {
+      "epoch": 0.14922763042844653,
+      "grad_norm": 0.22895503044128418,
+      "learning_rate": 0.00017057220708446868,
+      "loss": 0.9919,
+      "step": 384
+    },
+    {
+      "epoch": 0.14961624404935392,
+      "grad_norm": 0.2054683268070221,
+      "learning_rate": 0.00017049435578045933,
+      "loss": 1.0607,
+      "step": 385
+    },
+    {
+      "epoch": 0.15000485767026134,
+      "grad_norm": 0.25569215416908264,
+      "learning_rate": 0.00017041650447644998,
+      "loss": 1.0517,
+      "step": 386
+    },
+    {
+      "epoch": 0.15039347129116876,
+      "grad_norm": 0.2222641259431839,
+      "learning_rate": 0.00017033865317244064,
+      "loss": 1.0404,
+      "step": 387
+    },
+    {
+      "epoch": 0.15078208491207618,
+      "grad_norm": 0.20501169562339783,
+      "learning_rate": 0.0001702608018684313,
+      "loss": 0.9897,
+      "step": 388
+    },
+    {
+      "epoch": 0.1511706985329836,
+      "grad_norm": 0.22080403566360474,
+      "learning_rate": 0.00017018295056442197,
+      "loss": 1.1013,
+      "step": 389
+    },
+    {
+      "epoch": 0.15155931215389098,
+      "grad_norm": 0.21218529343605042,
+      "learning_rate": 0.00017010509926041262,
+      "loss": 1.0541,
+      "step": 390
+    },
+    {
+      "epoch": 0.1519479257747984,
+      "grad_norm": 0.23064807057380676,
+      "learning_rate": 0.00017002724795640327,
+      "loss": 1.037,
+      "step": 391
+    },
+    {
+      "epoch": 0.15233653939570582,
+      "grad_norm": 0.21164493262767792,
+      "learning_rate": 0.00016994939665239392,
+      "loss": 1.0769,
+      "step": 392
+    },
+    {
+      "epoch": 0.15272515301661324,
+      "grad_norm": 0.22565549612045288,
+      "learning_rate": 0.00016987154534838457,
+      "loss": 1.0638,
+      "step": 393
+    },
+    {
+      "epoch": 0.15311376663752063,
+      "grad_norm": 0.22492647171020508,
+      "learning_rate": 0.00016979369404437525,
+      "loss": 1.063,
+      "step": 394
+    },
+    {
+      "epoch": 0.15350238025842805,
+      "grad_norm": 0.22335395216941833,
+      "learning_rate": 0.0001697158427403659,
+      "loss": 1.1032,
+      "step": 395
+    },
+    {
+      "epoch": 0.15389099387933547,
+      "grad_norm": 0.2164154201745987,
+      "learning_rate": 0.00016963799143635656,
+      "loss": 1.1275,
+      "step": 396
+    },
+    {
+      "epoch": 0.1542796075002429,
+      "grad_norm": 0.22547736763954163,
+      "learning_rate": 0.0001695601401323472,
+      "loss": 1.1324,
+      "step": 397
+    },
+    {
+      "epoch": 0.1546682211211503,
+      "grad_norm": 0.2028045952320099,
+      "learning_rate": 0.0001694822888283379,
+      "loss": 1.0057,
+      "step": 398
+    },
+    {
+      "epoch": 0.1550568347420577,
+      "grad_norm": 0.20770573616027832,
+      "learning_rate": 0.00016940443752432854,
+      "loss": 1.0311,
+      "step": 399
+    },
+    {
+      "epoch": 0.15544544836296512,
+      "grad_norm": 0.2231476902961731,
+      "learning_rate": 0.0001693265862203192,
+      "loss": 1.0535,
+      "step": 400
+    },
+    {
+      "epoch": 0.15583406198387253,
+      "grad_norm": 0.21618099510669708,
+      "learning_rate": 0.00016924873491630987,
+      "loss": 1.0616,
+      "step": 401
+    },
+    {
+      "epoch": 0.15622267560477995,
+      "grad_norm": 0.24024419486522675,
+      "learning_rate": 0.00016917088361230052,
+      "loss": 1.1324,
+      "step": 402
+    },
+    {
+      "epoch": 0.15661128922568737,
+      "grad_norm": 0.2002171128988266,
+      "learning_rate": 0.00016909303230829118,
+      "loss": 1.015,
+      "step": 403
+    },
+    {
+      "epoch": 0.15699990284659476,
+      "grad_norm": 0.21771477162837982,
+      "learning_rate": 0.00016901518100428183,
+      "loss": 1.0817,
+      "step": 404
+    },
+    {
+      "epoch": 0.15738851646750218,
+      "grad_norm": 0.22052259743213654,
+      "learning_rate": 0.0001689373297002725,
+      "loss": 1.0836,
+      "step": 405
+    },
+    {
+      "epoch": 0.1577771300884096,
+      "grad_norm": 0.1964062750339508,
+      "learning_rate": 0.00016885947839626316,
+      "loss": 1.0505,
+      "step": 406
+    },
+    {
+      "epoch": 0.15816574370931702,
+      "grad_norm": 0.22714298963546753,
+      "learning_rate": 0.0001687816270922538,
+      "loss": 1.0702,
+      "step": 407
+    },
+    {
+      "epoch": 0.15855435733022444,
+      "grad_norm": 0.20647728443145752,
+      "learning_rate": 0.00016870377578824446,
+      "loss": 1.0349,
+      "step": 408
+    },
+    {
+      "epoch": 0.15894297095113183,
+      "grad_norm": 0.2355160117149353,
+      "learning_rate": 0.00016862592448423512,
+      "loss": 1.0305,
+      "step": 409
+    },
+    {
+      "epoch": 0.15933158457203925,
+      "grad_norm": 0.22890770435333252,
+      "learning_rate": 0.0001685480731802258,
+      "loss": 1.0854,
+      "step": 410
+    },
+    {
+      "epoch": 0.15972019819294667,
+      "grad_norm": 0.21947838366031647,
+      "learning_rate": 0.00016847022187621645,
+      "loss": 1.0948,
+      "step": 411
+    },
+    {
+      "epoch": 0.16010881181385409,
+      "grad_norm": 0.22334899008274078,
+      "learning_rate": 0.0001683923705722071,
+      "loss": 1.006,
+      "step": 412
+    },
+    {
+      "epoch": 0.16049742543476148,
+      "grad_norm": 0.22324936091899872,
+      "learning_rate": 0.00016831451926819775,
+      "loss": 1.0402,
+      "step": 413
+    },
+    {
+      "epoch": 0.1608860390556689,
+      "grad_norm": 0.21462097764015198,
+      "learning_rate": 0.0001682366679641884,
+      "loss": 1.077,
+      "step": 414
+    },
+    {
+      "epoch": 0.1612746526765763,
+      "grad_norm": 0.24567006528377533,
+      "learning_rate": 0.00016815881666017908,
+      "loss": 1.15,
+      "step": 415
+    },
+    {
+      "epoch": 0.16166326629748373,
+      "grad_norm": 0.26437243819236755,
+      "learning_rate": 0.00016808096535616973,
+      "loss": 1.1251,
+      "step": 416
+    },
+    {
+      "epoch": 0.16205187991839115,
+      "grad_norm": 0.2217959761619568,
+      "learning_rate": 0.00016800311405216039,
+      "loss": 1.1103,
+      "step": 417
+    },
+    {
+      "epoch": 0.16244049353929854,
+      "grad_norm": 0.24402475357055664,
+      "learning_rate": 0.00016792526274815104,
+      "loss": 1.0672,
+      "step": 418
+    },
+    {
+      "epoch": 0.16282910716020596,
+      "grad_norm": 0.21609526872634888,
+      "learning_rate": 0.0001678474114441417,
+      "loss": 1.0291,
+      "step": 419
+    },
+    {
+      "epoch": 0.16321772078111338,
+      "grad_norm": 0.20054642856121063,
+      "learning_rate": 0.00016776956014013237,
+      "loss": 1.0704,
+      "step": 420
+    },
+    {
+      "epoch": 0.1636063344020208,
+      "grad_norm": 0.22864869236946106,
+      "learning_rate": 0.00016769170883612302,
+      "loss": 1.0612,
+      "step": 421
+    },
+    {
+      "epoch": 0.16399494802292822,
+      "grad_norm": 0.22651974856853485,
+      "learning_rate": 0.00016761385753211367,
+      "loss": 1.0749,
+      "step": 422
+    },
+    {
+      "epoch": 0.1643835616438356,
+      "grad_norm": 0.21587328612804413,
+      "learning_rate": 0.00016753600622810433,
+      "loss": 1.0398,
+      "step": 423
+    },
+    {
+      "epoch": 0.16477217526474303,
+      "grad_norm": 0.1953774094581604,
+      "learning_rate": 0.00016745815492409498,
+      "loss": 1.0275,
+      "step": 424
+    },
+    {
+      "epoch": 0.16516078888565044,
+      "grad_norm": 0.21803410351276398,
+      "learning_rate": 0.00016738030362008566,
+      "loss": 1.1219,
+      "step": 425
+    },
+    {
+      "epoch": 0.16554940250655786,
+      "grad_norm": 0.2034682035446167,
+      "learning_rate": 0.0001673024523160763,
+      "loss": 1.0342,
+      "step": 426
+    },
+    {
+      "epoch": 0.16593801612746525,
+      "grad_norm": 0.20135951042175293,
+      "learning_rate": 0.00016722460101206696,
+      "loss": 0.9802,
+      "step": 427
+    },
+    {
+      "epoch": 0.16632662974837267,
+      "grad_norm": 0.23310376703739166,
+      "learning_rate": 0.0001671467497080576,
+      "loss": 1.0789,
+      "step": 428
+    },
+    {
+      "epoch": 0.1667152433692801,
+      "grad_norm": 0.21475404500961304,
+      "learning_rate": 0.00016706889840404827,
+      "loss": 1.0416,
+      "step": 429
+    },
+    {
+      "epoch": 0.1671038569901875,
+      "grad_norm": 0.21661072969436646,
+      "learning_rate": 0.00016699104710003894,
+      "loss": 1.0568,
+      "step": 430
+    },
+    {
+      "epoch": 0.16749247061109493,
+      "grad_norm": 0.20310629904270172,
+      "learning_rate": 0.0001669131957960296,
+      "loss": 0.9968,
+      "step": 431
+    },
+    {
+      "epoch": 0.16788108423200232,
+      "grad_norm": 0.2596947252750397,
+      "learning_rate": 0.00016683534449202025,
+      "loss": 1.0478,
+      "step": 432
+    },
+    {
+      "epoch": 0.16826969785290974,
+      "grad_norm": 0.22226987779140472,
+      "learning_rate": 0.0001667574931880109,
+      "loss": 1.0898,
+      "step": 433
+    },
+    {
+      "epoch": 0.16865831147381716,
+      "grad_norm": 0.22499911487102509,
+      "learning_rate": 0.00016667964188400155,
+      "loss": 1.07,
+      "step": 434
+    },
+    {
+      "epoch": 0.16904692509472458,
+      "grad_norm": 0.2717292308807373,
+      "learning_rate": 0.0001666017905799922,
+      "loss": 1.0562,
+      "step": 435
+    },
+    {
+      "epoch": 0.169435538715632,
+      "grad_norm": 0.22052323818206787,
+      "learning_rate": 0.00016652393927598288,
+      "loss": 1.0732,
+      "step": 436
+    },
+    {
+      "epoch": 0.16982415233653939,
+      "grad_norm": 0.21741728484630585,
+      "learning_rate": 0.00016644608797197354,
+      "loss": 1.0409,
+      "step": 437
+    },
+    {
+      "epoch": 0.1702127659574468,
+      "grad_norm": 0.20701193809509277,
+      "learning_rate": 0.0001663682366679642,
+      "loss": 1.0731,
+      "step": 438
+    },
+    {
+      "epoch": 0.17060137957835422,
+      "grad_norm": 0.22071130573749542,
+      "learning_rate": 0.00016629038536395484,
+      "loss": 1.0992,
+      "step": 439
+    },
+    {
+      "epoch": 0.17098999319926164,
+      "grad_norm": 0.20261412858963013,
+      "learning_rate": 0.0001662125340599455,
+      "loss": 1.0051,
+      "step": 440
+    },
+    {
+      "epoch": 0.17137860682016906,
+      "grad_norm": 0.2082947939634323,
+      "learning_rate": 0.00016613468275593617,
+      "loss": 1.0477,
+      "step": 441
+    },
+    {
+      "epoch": 0.17176722044107645,
+      "grad_norm": 0.22534717619419098,
+      "learning_rate": 0.00016605683145192682,
+      "loss": 1.041,
+      "step": 442
+    },
+    {
+      "epoch": 0.17215583406198387,
+      "grad_norm": 0.21547731757164001,
+      "learning_rate": 0.00016597898014791748,
+      "loss": 1.0528,
+      "step": 443
+    },
+    {
+      "epoch": 0.1725444476828913,
+      "grad_norm": 0.24141089618206024,
+      "learning_rate": 0.00016590112884390813,
+      "loss": 1.0928,
+      "step": 444
+    },
+    {
+      "epoch": 0.1729330613037987,
+      "grad_norm": 0.21910884976387024,
+      "learning_rate": 0.00016582327753989878,
+      "loss": 1.063,
+      "step": 445
+    },
+    {
+      "epoch": 0.1733216749247061,
+      "grad_norm": 0.21782316267490387,
+      "learning_rate": 0.00016574542623588946,
+      "loss": 1.0976,
+      "step": 446
+    },
+    {
+      "epoch": 0.17371028854561352,
+      "grad_norm": 0.21771778166294098,
+      "learning_rate": 0.0001656675749318801,
+      "loss": 1.0677,
+      "step": 447
+    },
+    {
+      "epoch": 0.17409890216652094,
+      "grad_norm": 0.22117659449577332,
+      "learning_rate": 0.00016558972362787076,
+      "loss": 1.0669,
+      "step": 448
+    },
+    {
+      "epoch": 0.17448751578742835,
+      "grad_norm": 0.21918092668056488,
+      "learning_rate": 0.00016551187232386141,
+      "loss": 1.0955,
+      "step": 449
+    },
+    {
+      "epoch": 0.17487612940833577,
+      "grad_norm": 0.22027818858623505,
+      "learning_rate": 0.0001654340210198521,
+      "loss": 1.0201,
+      "step": 450
+    },
+    {
+      "epoch": 0.17526474302924316,
+      "grad_norm": 0.2042885720729828,
+      "learning_rate": 0.00016535616971584275,
+      "loss": 1.0881,
+      "step": 451
+    },
+    {
+      "epoch": 0.17565335665015058,
+      "grad_norm": 0.21788261830806732,
+      "learning_rate": 0.0001652783184118334,
+      "loss": 1.0918,
+      "step": 452
+    },
+    {
+      "epoch": 0.176041970271058,
+      "grad_norm": 0.23332571983337402,
+      "learning_rate": 0.00016520046710782408,
+      "loss": 1.091,
+      "step": 453
+    },
+    {
+      "epoch": 0.17643058389196542,
+      "grad_norm": 0.20204192399978638,
+      "learning_rate": 0.00016512261580381473,
+      "loss": 1.0366,
+      "step": 454
+    },
+    {
+      "epoch": 0.17681919751287284,
+      "grad_norm": 0.21761906147003174,
+      "learning_rate": 0.00016504476449980538,
+      "loss": 1.0131,
+      "step": 455
+    },
+    {
+      "epoch": 0.17720781113378023,
+      "grad_norm": 0.2152051478624344,
+      "learning_rate": 0.00016496691319579606,
+      "loss": 1.0868,
+      "step": 456
+    },
+    {
+      "epoch": 0.17759642475468765,
+      "grad_norm": 0.22776494920253754,
+      "learning_rate": 0.0001648890618917867,
+      "loss": 1.0807,
+      "step": 457
+    },
+    {
+      "epoch": 0.17798503837559507,
+      "grad_norm": 0.2171342968940735,
+      "learning_rate": 0.00016481121058777736,
+      "loss": 1.0537,
+      "step": 458
+    },
+    {
+      "epoch": 0.17837365199650249,
+      "grad_norm": 0.2046273946762085,
+      "learning_rate": 0.00016473335928376802,
+      "loss": 1.0097,
+      "step": 459
+    },
+    {
+      "epoch": 0.17876226561740988,
+      "grad_norm": 0.2047681361436844,
+      "learning_rate": 0.00016465550797975867,
+      "loss": 1.0204,
+      "step": 460
+    },
+    {
+      "epoch": 0.1791508792383173,
+      "grad_norm": 0.1876862645149231,
+      "learning_rate": 0.00016457765667574935,
+      "loss": 0.9383,
+      "step": 461
+    },
+    {
+      "epoch": 0.17953949285922471,
+      "grad_norm": 0.218430757522583,
+      "learning_rate": 0.00016449980537174,
+      "loss": 1.0721,
+      "step": 462
+    },
+    {
+      "epoch": 0.17992810648013213,
+      "grad_norm": 0.2245480865240097,
+      "learning_rate": 0.00016442195406773065,
+      "loss": 1.0859,
+      "step": 463
+    },
+    {
+      "epoch": 0.18031672010103955,
+      "grad_norm": 0.22577151656150818,
+      "learning_rate": 0.0001643441027637213,
+      "loss": 1.0825,
+      "step": 464
+    },
+    {
+      "epoch": 0.18070533372194694,
+      "grad_norm": 0.20132745802402496,
+      "learning_rate": 0.00016426625145971196,
+      "loss": 1.0615,
+      "step": 465
+    },
+    {
+      "epoch": 0.18109394734285436,
+      "grad_norm": 0.2277505248785019,
+      "learning_rate": 0.00016418840015570263,
+      "loss": 1.0426,
+      "step": 466
+    },
+    {
+      "epoch": 0.18148256096376178,
+      "grad_norm": 0.22540105879306793,
+      "learning_rate": 0.0001641105488516933,
+      "loss": 1.0481,
+      "step": 467
+    },
+    {
+      "epoch": 0.1818711745846692,
+      "grad_norm": 0.20358088612556458,
+      "learning_rate": 0.00016403269754768394,
+      "loss": 1.0286,
+      "step": 468
+    },
+    {
+      "epoch": 0.18225978820557662,
+      "grad_norm": 0.22534145414829254,
+      "learning_rate": 0.0001639548462436746,
+      "loss": 1.1183,
+      "step": 469
+    },
+    {
+      "epoch": 0.182648401826484,
+      "grad_norm": 0.2188873142004013,
+      "learning_rate": 0.00016387699493966524,
+      "loss": 1.0439,
+      "step": 470
+    },
+    {
+      "epoch": 0.18303701544739143,
+      "grad_norm": 0.2128048539161682,
+      "learning_rate": 0.00016379914363565592,
+      "loss": 1.027,
+      "step": 471
+    },
+    {
+      "epoch": 0.18342562906829885,
+      "grad_norm": 0.2518141567707062,
+      "learning_rate": 0.00016372129233164657,
+      "loss": 1.0468,
+      "step": 472
+    },
+    {
+      "epoch": 0.18381424268920626,
+      "grad_norm": 0.2189142256975174,
+      "learning_rate": 0.00016364344102763723,
+      "loss": 1.0581,
+      "step": 473
+    },
+    {
+      "epoch": 0.18420285631011368,
+      "grad_norm": 0.31266725063323975,
+      "learning_rate": 0.00016356558972362788,
+      "loss": 1.0554,
+      "step": 474
+    },
+    {
+      "epoch": 0.18459146993102107,
+      "grad_norm": 0.21343916654586792,
+      "learning_rate": 0.00016348773841961853,
+      "loss": 1.0795,
+      "step": 475
+    },
+    {
+      "epoch": 0.1849800835519285,
+      "grad_norm": 0.22907280921936035,
+      "learning_rate": 0.00016340988711560918,
+      "loss": 1.0304,
+      "step": 476
+    },
+    {
+      "epoch": 0.1853686971728359,
+      "grad_norm": 0.2105257511138916,
+      "learning_rate": 0.00016333203581159986,
+      "loss": 1.0231,
+      "step": 477
+    },
+    {
+      "epoch": 0.18575731079374333,
+      "grad_norm": 0.19537831842899323,
+      "learning_rate": 0.00016325418450759051,
+      "loss": 1.0103,
+      "step": 478
+    },
+    {
+      "epoch": 0.18614592441465072,
+      "grad_norm": 0.20522372424602509,
+      "learning_rate": 0.00016317633320358117,
+      "loss": 1.0196,
+      "step": 479
+    },
+    {
+      "epoch": 0.18653453803555814,
+      "grad_norm": 0.21646477282047272,
+      "learning_rate": 0.00016309848189957182,
+      "loss": 1.0579,
+      "step": 480
+    },
+    {
+      "epoch": 0.18692315165646556,
+      "grad_norm": 0.21077193319797516,
+      "learning_rate": 0.00016302063059556247,
+      "loss": 1.0638,
+      "step": 481
+    },
+    {
+      "epoch": 0.18731176527737298,
+      "grad_norm": 0.20357473194599152,
+      "learning_rate": 0.00016294277929155315,
+      "loss": 1.0635,
+      "step": 482
+    },
+    {
+      "epoch": 0.1877003788982804,
+      "grad_norm": 0.2188001275062561,
+      "learning_rate": 0.0001628649279875438,
+      "loss": 1.0267,
+      "step": 483
+    },
+    {
+      "epoch": 0.1880889925191878,
+      "grad_norm": 0.2128928154706955,
+      "learning_rate": 0.00016278707668353445,
+      "loss": 0.9706,
+      "step": 484
+    },
+    {
+      "epoch": 0.1884776061400952,
+      "grad_norm": 0.22081372141838074,
+      "learning_rate": 0.0001627092253795251,
+      "loss": 1.08,
+      "step": 485
+    },
+    {
+      "epoch": 0.18886621976100262,
+      "grad_norm": 0.2250615805387497,
+      "learning_rate": 0.00016263137407551576,
+      "loss": 1.1451,
+      "step": 486
+    },
+    {
+      "epoch": 0.18925483338191004,
+      "grad_norm": 0.1984967589378357,
+      "learning_rate": 0.00016255352277150644,
+      "loss": 1.0744,
+      "step": 487
+    },
+    {
+      "epoch": 0.18964344700281746,
+      "grad_norm": 0.20778900384902954,
+      "learning_rate": 0.0001624756714674971,
+      "loss": 1.0623,
+      "step": 488
+    },
+    {
+      "epoch": 0.19003206062372485,
+      "grad_norm": 0.2026563137769699,
+      "learning_rate": 0.00016239782016348774,
+      "loss": 1.0714,
+      "step": 489
+    },
+    {
+      "epoch": 0.19042067424463227,
+      "grad_norm": 0.21598374843597412,
+      "learning_rate": 0.0001623199688594784,
+      "loss": 1.0869,
+      "step": 490
+    },
+    {
+      "epoch": 0.1908092878655397,
+      "grad_norm": 0.18944978713989258,
+      "learning_rate": 0.00016224211755546904,
+      "loss": 1.055,
+      "step": 491
+    },
+    {
+      "epoch": 0.1911979014864471,
+      "grad_norm": 0.20698946714401245,
+      "learning_rate": 0.00016216426625145972,
+      "loss": 1.0392,
+      "step": 492
+    },
+    {
+      "epoch": 0.1915865151073545,
+      "grad_norm": 0.22395353019237518,
+      "learning_rate": 0.00016208641494745038,
+      "loss": 1.0681,
+      "step": 493
+    },
+    {
+      "epoch": 0.19197512872826192,
+      "grad_norm": 0.22372962534427643,
+      "learning_rate": 0.00016200856364344103,
+      "loss": 1.0767,
+      "step": 494
+    },
+    {
+      "epoch": 0.19236374234916934,
+      "grad_norm": 0.2066701054573059,
+      "learning_rate": 0.00016193071233943168,
+      "loss": 1.0061,
+      "step": 495
+    },
+    {
+      "epoch": 0.19275235597007676,
+      "grad_norm": 0.19716408848762512,
+      "learning_rate": 0.00016185286103542233,
+      "loss": 1.039,
+      "step": 496
+    },
+    {
+      "epoch": 0.19314096959098417,
+      "grad_norm": 0.22159601747989655,
+      "learning_rate": 0.000161775009731413,
+      "loss": 1.0832,
+      "step": 497
+    },
+    {
+      "epoch": 0.19352958321189156,
+      "grad_norm": 0.21509626507759094,
+      "learning_rate": 0.00016169715842740366,
+      "loss": 1.0264,
+      "step": 498
+    },
+    {
+      "epoch": 0.19391819683279898,
+      "grad_norm": 0.21598199009895325,
+      "learning_rate": 0.00016161930712339431,
+      "loss": 1.049,
+      "step": 499
+    },
+    {
+      "epoch": 0.1943068104537064,
+      "grad_norm": 0.20279590785503387,
+      "learning_rate": 0.00016154145581938497,
+      "loss": 1.0505,
+      "step": 500
+    },
+    {
+      "epoch": 0.19469542407461382,
+      "grad_norm": 0.21796855330467224,
+      "learning_rate": 0.00016146360451537565,
+      "loss": 1.0885,
+      "step": 501
+    },
+    {
+      "epoch": 0.19508403769552124,
+      "grad_norm": 0.22128933668136597,
+      "learning_rate": 0.0001613857532113663,
+      "loss": 1.0903,
+      "step": 502
+    },
+    {
+      "epoch": 0.19547265131642863,
+      "grad_norm": 0.2032536417245865,
+      "learning_rate": 0.00016130790190735695,
+      "loss": 1.0285,
+      "step": 503
+    },
+    {
+      "epoch": 0.19586126493733605,
+      "grad_norm": 0.23738974332809448,
+      "learning_rate": 0.0001612300506033476,
+      "loss": 1.1188,
+      "step": 504
+    },
+    {
+      "epoch": 0.19624987855824347,
+      "grad_norm": 0.19614790380001068,
+      "learning_rate": 0.00016115219929933828,
+      "loss": 1.04,
+      "step": 505
+    },
+    {
+      "epoch": 0.1966384921791509,
+      "grad_norm": 0.2198178917169571,
+      "learning_rate": 0.00016107434799532893,
+      "loss": 1.0696,
+      "step": 506
+    },
+    {
+      "epoch": 0.1970271058000583,
+      "grad_norm": 0.18814648687839508,
+      "learning_rate": 0.00016099649669131959,
+      "loss": 1.0203,
+      "step": 507
+    },
+    {
+      "epoch": 0.1974157194209657,
+      "grad_norm": 0.20699037611484528,
+      "learning_rate": 0.00016091864538731026,
+      "loss": 1.1074,
+      "step": 508
+    },
+    {
+      "epoch": 0.19780433304187311,
+      "grad_norm": 0.21490445733070374,
+      "learning_rate": 0.00016084079408330092,
+      "loss": 1.0682,
+      "step": 509
+    },
+    {
+      "epoch": 0.19819294666278053,
+      "grad_norm": 0.2363848090171814,
+      "learning_rate": 0.00016076294277929157,
+      "loss": 1.0408,
+      "step": 510
+    },
+    {
+      "epoch": 0.19858156028368795,
+      "grad_norm": 0.20186659693717957,
+      "learning_rate": 0.00016068509147528222,
+      "loss": 1.026,
+      "step": 511
+    },
+    {
+      "epoch": 0.19897017390459534,
+      "grad_norm": 0.21564024686813354,
+      "learning_rate": 0.00016060724017127287,
+      "loss": 1.0418,
+      "step": 512
+    },
+    {
+      "epoch": 0.19935878752550276,
+      "grad_norm": 0.19151560962200165,
+      "learning_rate": 0.00016052938886726355,
+      "loss": 1.0037,
+      "step": 513
+    },
+    {
+      "epoch": 0.19974740114641018,
+      "grad_norm": 0.21038194000720978,
+      "learning_rate": 0.0001604515375632542,
+      "loss": 1.0545,
+      "step": 514
+    },
+    {
+      "epoch": 0.2001360147673176,
+      "grad_norm": 0.20496582984924316,
+      "learning_rate": 0.00016037368625924486,
+      "loss": 1.0543,
+      "step": 515
+    },
+    {
+      "epoch": 0.20052462838822502,
+      "grad_norm": 0.20689113438129425,
+      "learning_rate": 0.0001602958349552355,
+      "loss": 1.0905,
+      "step": 516
+    },
+    {
+      "epoch": 0.2009132420091324,
+      "grad_norm": 0.2284041792154312,
+      "learning_rate": 0.00016021798365122616,
+      "loss": 1.0717,
+      "step": 517
+    },
+    {
+      "epoch": 0.20130185563003983,
+      "grad_norm": 0.23457761108875275,
+      "learning_rate": 0.00016014013234721684,
+      "loss": 1.106,
+      "step": 518
+    },
+    {
+      "epoch": 0.20169046925094725,
+      "grad_norm": 0.2088528722524643,
+      "learning_rate": 0.0001600622810432075,
+      "loss": 1.0428,
+      "step": 519
+    },
+    {
+      "epoch": 0.20207908287185467,
+      "grad_norm": 0.2170068770647049,
+      "learning_rate": 0.00015998442973919814,
+      "loss": 0.9875,
+      "step": 520
+    },
+    {
+      "epoch": 0.20246769649276208,
+      "grad_norm": 0.2270561158657074,
+      "learning_rate": 0.0001599065784351888,
+      "loss": 1.0676,
+      "step": 521
+    },
+    {
+      "epoch": 0.20285631011366947,
+      "grad_norm": 0.2151324599981308,
+      "learning_rate": 0.00015982872713117945,
+      "loss": 1.0675,
+      "step": 522
+    },
+    {
+      "epoch": 0.2032449237345769,
+      "grad_norm": 0.23113249242305756,
+      "learning_rate": 0.00015975087582717013,
+      "loss": 1.0608,
+      "step": 523
+    },
+    {
+      "epoch": 0.2036335373554843,
+      "grad_norm": 0.2587106227874756,
+      "learning_rate": 0.00015967302452316078,
+      "loss": 1.0867,
+      "step": 524
+    },
+    {
+      "epoch": 0.20402215097639173,
+      "grad_norm": 0.21842992305755615,
+      "learning_rate": 0.00015959517321915143,
+      "loss": 1.0726,
+      "step": 525
+    },
+    {
+      "epoch": 0.20441076459729912,
+      "grad_norm": 0.20867805182933807,
+      "learning_rate": 0.00015951732191514208,
+      "loss": 1.0578,
+      "step": 526
+    },
+    {
+      "epoch": 0.20479937821820654,
+      "grad_norm": 0.2396962195634842,
+      "learning_rate": 0.00015943947061113273,
+      "loss": 1.0292,
+      "step": 527
+    },
+    {
+      "epoch": 0.20518799183911396,
+      "grad_norm": 0.221155047416687,
+      "learning_rate": 0.00015936161930712341,
+      "loss": 1.0019,
+      "step": 528
+    },
+    {
+      "epoch": 0.20557660546002138,
+      "grad_norm": 0.20032119750976562,
+      "learning_rate": 0.00015928376800311407,
+      "loss": 1.0435,
+      "step": 529
+    },
+    {
+      "epoch": 0.2059652190809288,
+      "grad_norm": 0.24095888435840607,
+      "learning_rate": 0.00015920591669910472,
+      "loss": 1.0355,
+      "step": 530
+    },
+    {
+      "epoch": 0.2063538327018362,
+      "grad_norm": 0.2286604344844818,
+      "learning_rate": 0.00015912806539509537,
+      "loss": 0.9989,
+      "step": 531
+    },
+    {
+      "epoch": 0.2067424463227436,
+      "grad_norm": 0.21537137031555176,
+      "learning_rate": 0.00015905021409108602,
+      "loss": 1.0642,
+      "step": 532
+    },
+    {
+      "epoch": 0.20713105994365102,
+      "grad_norm": 0.22447925806045532,
+      "learning_rate": 0.0001589723627870767,
+      "loss": 1.1244,
+      "step": 533
+    },
+    {
+      "epoch": 0.20751967356455844,
+      "grad_norm": 0.21077273786067963,
+      "learning_rate": 0.00015889451148306735,
+      "loss": 1.0167,
+      "step": 534
+    },
+    {
+      "epoch": 0.20790828718546586,
+      "grad_norm": 0.22340558469295502,
+      "learning_rate": 0.000158816660179058,
+      "loss": 1.0991,
+      "step": 535
+    },
+    {
+      "epoch": 0.20829690080637325,
+      "grad_norm": 0.223599374294281,
+      "learning_rate": 0.00015873880887504866,
+      "loss": 1.086,
+      "step": 536
+    },
+    {
+      "epoch": 0.20868551442728067,
+      "grad_norm": 0.2615208923816681,
+      "learning_rate": 0.0001586609575710393,
+      "loss": 1.0584,
+      "step": 537
+    },
+    {
+      "epoch": 0.2090741280481881,
+      "grad_norm": 0.2085907757282257,
+      "learning_rate": 0.00015858310626703,
+      "loss": 1.0994,
+      "step": 538
+    },
+    {
+      "epoch": 0.2094627416690955,
+      "grad_norm": 0.2170211672782898,
+      "learning_rate": 0.00015850525496302064,
+      "loss": 1.1105,
+      "step": 539
+    },
+    {
+      "epoch": 0.20985135529000293,
+      "grad_norm": 0.21978625655174255,
+      "learning_rate": 0.0001584274036590113,
+      "loss": 1.002,
+      "step": 540
+    },
+    {
+      "epoch": 0.21023996891091032,
+      "grad_norm": 0.23684021830558777,
+      "learning_rate": 0.00015834955235500194,
+      "loss": 1.1216,
+      "step": 541
+    },
+    {
+      "epoch": 0.21062858253181774,
+      "grad_norm": 0.220269113779068,
+      "learning_rate": 0.0001582717010509926,
+      "loss": 1.0773,
+      "step": 542
+    },
+    {
+      "epoch": 0.21101719615272516,
+      "grad_norm": 0.22447973489761353,
+      "learning_rate": 0.00015819384974698328,
+      "loss": 1.0941,
+      "step": 543
+    },
+    {
+      "epoch": 0.21140580977363257,
+      "grad_norm": 0.22435730695724487,
+      "learning_rate": 0.00015811599844297393,
+      "loss": 1.0138,
+      "step": 544
+    },
+    {
+      "epoch": 0.21179442339453997,
+      "grad_norm": 0.2230793684720993,
+      "learning_rate": 0.00015803814713896458,
+      "loss": 1.0343,
+      "step": 545
+    },
+    {
+      "epoch": 0.21218303701544738,
+      "grad_norm": 0.23491905629634857,
+      "learning_rate": 0.00015796029583495523,
+      "loss": 1.11,
+      "step": 546
+    },
+    {
+      "epoch": 0.2125716506363548,
+      "grad_norm": 0.213560551404953,
+      "learning_rate": 0.00015788244453094588,
+      "loss": 1.0615,
+      "step": 547
+    },
+    {
+      "epoch": 0.21296026425726222,
+      "grad_norm": 0.21392837166786194,
+      "learning_rate": 0.00015780459322693654,
+      "loss": 1.0872,
+      "step": 548
+    },
+    {
+      "epoch": 0.21334887787816964,
+      "grad_norm": 0.20007692277431488,
+      "learning_rate": 0.00015772674192292722,
+      "loss": 1.0394,
+      "step": 549
+    },
+    {
+      "epoch": 0.21373749149907703,
+      "grad_norm": 0.1969841718673706,
+      "learning_rate": 0.00015764889061891787,
+      "loss": 1.0381,
+      "step": 550
+    },
+    {
+      "epoch": 0.21412610511998445,
+      "grad_norm": 0.21874025464057922,
+      "learning_rate": 0.00015757103931490852,
+      "loss": 1.0822,
+      "step": 551
+    },
+    {
+      "epoch": 0.21451471874089187,
+      "grad_norm": 0.21824273467063904,
+      "learning_rate": 0.00015749318801089917,
+      "loss": 1.0802,
+      "step": 552
+    },
+    {
+      "epoch": 0.2149033323617993,
+      "grad_norm": 0.20942047238349915,
+      "learning_rate": 0.00015741533670688985,
+      "loss": 1.0634,
+      "step": 553
+    },
+    {
+      "epoch": 0.2152919459827067,
+      "grad_norm": 0.1940152943134308,
+      "learning_rate": 0.0001573374854028805,
+      "loss": 1.0264,
+      "step": 554
+    },
+    {
+      "epoch": 0.2156805596036141,
+      "grad_norm": 0.19859059154987335,
+      "learning_rate": 0.00015725963409887115,
+      "loss": 0.9701,
+      "step": 555
+    },
+    {
+      "epoch": 0.21606917322452152,
+      "grad_norm": 0.22239404916763306,
+      "learning_rate": 0.0001571817827948618,
+      "loss": 1.1282,
+      "step": 556
+    },
+    {
+      "epoch": 0.21645778684542893,
+      "grad_norm": 0.23820599913597107,
+      "learning_rate": 0.00015710393149085249,
+      "loss": 1.1123,
+      "step": 557
+    },
+    {
+      "epoch": 0.21684640046633635,
+      "grad_norm": 0.21279917657375336,
+      "learning_rate": 0.00015702608018684314,
+      "loss": 1.0542,
+      "step": 558
+    },
+    {
+      "epoch": 0.21723501408724374,
+      "grad_norm": 0.2065514773130417,
+      "learning_rate": 0.0001569482288828338,
+      "loss": 1.0685,
+      "step": 559
+    },
+    {
+      "epoch": 0.21762362770815116,
+      "grad_norm": 0.20130831003189087,
+      "learning_rate": 0.00015687037757882447,
+      "loss": 0.9869,
+      "step": 560
+    },
+    {
+      "epoch": 0.21801224132905858,
+      "grad_norm": 0.2187541127204895,
+      "learning_rate": 0.00015679252627481512,
+      "loss": 1.1095,
+      "step": 561
+    },
+    {
+      "epoch": 0.218400854949966,
+      "grad_norm": 0.21028277277946472,
+      "learning_rate": 0.00015671467497080577,
+      "loss": 1.0804,
+      "step": 562
+    },
+    {
+      "epoch": 0.21878946857087342,
+      "grad_norm": 0.8187636733055115,
+      "learning_rate": 0.00015663682366679643,
+      "loss": 1.0782,
+      "step": 563
+    },
+    {
+      "epoch": 0.2191780821917808,
+      "grad_norm": 0.20059974491596222,
+      "learning_rate": 0.0001565589723627871,
+      "loss": 1.0279,
+      "step": 564
+    },
+    {
+      "epoch": 0.21956669581268823,
+      "grad_norm": 0.20440839231014252,
+      "learning_rate": 0.00015648112105877776,
+      "loss": 0.9863,
+      "step": 565
+    },
+    {
+      "epoch": 0.21995530943359565,
+      "grad_norm": 0.21423624455928802,
+      "learning_rate": 0.0001564032697547684,
+      "loss": 1.0685,
+      "step": 566
+    },
+    {
+      "epoch": 0.22034392305450307,
+      "grad_norm": 0.22430062294006348,
+      "learning_rate": 0.00015632541845075906,
+      "loss": 1.0761,
+      "step": 567
+    },
+    {
+      "epoch": 0.22073253667541048,
+      "grad_norm": 0.22782258689403534,
+      "learning_rate": 0.0001562475671467497,
+      "loss": 1.1024,
+      "step": 568
+    },
+    {
+      "epoch": 0.22112115029631788,
+      "grad_norm": 0.21150320768356323,
+      "learning_rate": 0.0001561697158427404,
+      "loss": 1.0621,
+      "step": 569
+    },
+    {
+      "epoch": 0.2215097639172253,
+      "grad_norm": 0.20342351496219635,
+      "learning_rate": 0.00015609186453873104,
+      "loss": 1.0667,
+      "step": 570
+    },
+    {
+      "epoch": 0.2218983775381327,
+      "grad_norm": 0.22866711020469666,
+      "learning_rate": 0.0001560140132347217,
+      "loss": 1.0631,
+      "step": 571
+    },
+    {
+      "epoch": 0.22228699115904013,
+      "grad_norm": 0.2200063169002533,
+      "learning_rate": 0.00015593616193071235,
+      "loss": 1.0448,
+      "step": 572
+    },
+    {
+      "epoch": 0.22267560477994755,
+      "grad_norm": 0.19440248608589172,
+      "learning_rate": 0.000155858310626703,
+      "loss": 1.037,
+      "step": 573
+    },
+    {
+      "epoch": 0.22306421840085494,
+      "grad_norm": 0.205752432346344,
+      "learning_rate": 0.00015578045932269368,
+      "loss": 1.0465,
+      "step": 574
+    },
+    {
+      "epoch": 0.22345283202176236,
+      "grad_norm": 0.22247998416423798,
+      "learning_rate": 0.00015570260801868433,
+      "loss": 0.997,
+      "step": 575
+    },
+    {
+      "epoch": 0.22384144564266978,
+      "grad_norm": 0.22199274599552155,
+      "learning_rate": 0.00015562475671467498,
+      "loss": 1.0178,
+      "step": 576
+    },
+    {
+      "epoch": 0.2242300592635772,
+      "grad_norm": 0.2114989310503006,
+      "learning_rate": 0.00015554690541066564,
+      "loss": 1.0457,
+      "step": 577
+    },
+    {
+      "epoch": 0.2246186728844846,
+      "grad_norm": 0.24248506128787994,
+      "learning_rate": 0.0001554690541066563,
+      "loss": 1.002,
+      "step": 578
+    },
+    {
+      "epoch": 0.225007286505392,
+      "grad_norm": 0.2565505802631378,
+      "learning_rate": 0.00015539120280264697,
+      "loss": 1.0541,
+      "step": 579
+    },
+    {
+      "epoch": 0.22539590012629943,
+      "grad_norm": 0.22799409925937653,
+      "learning_rate": 0.00015531335149863762,
+      "loss": 1.0788,
+      "step": 580
+    },
+    {
+      "epoch": 0.22578451374720684,
+      "grad_norm": 0.2196080982685089,
+      "learning_rate": 0.00015523550019462827,
+      "loss": 1.0877,
+      "step": 581
+    },
+    {
+      "epoch": 0.22617312736811426,
+      "grad_norm": 0.21992824971675873,
+      "learning_rate": 0.00015515764889061892,
+      "loss": 1.0213,
+      "step": 582
+    },
+    {
+      "epoch": 0.22656174098902165,
+      "grad_norm": 0.22793298959732056,
+      "learning_rate": 0.00015507979758660957,
+      "loss": 1.0633,
+      "step": 583
+    },
+    {
+      "epoch": 0.22695035460992907,
+      "grad_norm": 0.21707972884178162,
+      "learning_rate": 0.00015500194628260023,
+      "loss": 1.081,
+      "step": 584
+    },
+    {
+      "epoch": 0.2273389682308365,
+      "grad_norm": 0.220685675740242,
+      "learning_rate": 0.0001549240949785909,
+      "loss": 1.0658,
+      "step": 585
+    },
+    {
+      "epoch": 0.2277275818517439,
+      "grad_norm": 0.22576668858528137,
+      "learning_rate": 0.00015484624367458156,
+      "loss": 1.0795,
+      "step": 586
+    },
+    {
+      "epoch": 0.22811619547265133,
+      "grad_norm": 0.21778982877731323,
+      "learning_rate": 0.0001547683923705722,
+      "loss": 1.033,
+      "step": 587
+    },
+    {
+      "epoch": 0.22850480909355872,
+      "grad_norm": 0.22748610377311707,
+      "learning_rate": 0.00015469054106656286,
+      "loss": 1.0948,
+      "step": 588
+    },
+    {
+      "epoch": 0.22889342271446614,
+      "grad_norm": 0.21561284363269806,
+      "learning_rate": 0.00015461268976255351,
+      "loss": 1.0022,
+      "step": 589
+    },
+    {
+      "epoch": 0.22928203633537356,
+      "grad_norm": 0.2419756054878235,
+      "learning_rate": 0.0001545348384585442,
+      "loss": 1.0786,
+      "step": 590
+    },
+    {
+      "epoch": 0.22967064995628098,
+      "grad_norm": 0.20479315519332886,
+      "learning_rate": 0.00015445698715453485,
+      "loss": 1.027,
+      "step": 591
+    },
+    {
+      "epoch": 0.2300592635771884,
+      "grad_norm": 0.21365883946418762,
+      "learning_rate": 0.0001543791358505255,
+      "loss": 1.0773,
+      "step": 592
+    },
+    {
+      "epoch": 0.23044787719809579,
+      "grad_norm": 0.23133166134357452,
+      "learning_rate": 0.00015430128454651615,
+      "loss": 1.0877,
+      "step": 593
+    },
+    {
+      "epoch": 0.2308364908190032,
+      "grad_norm": 0.2110515981912613,
+      "learning_rate": 0.0001542234332425068,
+      "loss": 1.0509,
+      "step": 594
+    },
+    {
+      "epoch": 0.23122510443991062,
+      "grad_norm": 0.20658442378044128,
+      "learning_rate": 0.00015414558193849748,
+      "loss": 1.0623,
+      "step": 595
+    },
+    {
+      "epoch": 0.23161371806081804,
+      "grad_norm": 0.21831996738910675,
+      "learning_rate": 0.00015406773063448813,
+      "loss": 1.021,
+      "step": 596
+    },
+    {
+      "epoch": 0.23200233168172543,
+      "grad_norm": 0.23015642166137695,
+      "learning_rate": 0.00015398987933047878,
+      "loss": 1.0358,
+      "step": 597
+    },
+    {
+      "epoch": 0.23239094530263285,
+      "grad_norm": 0.23071645200252533,
+      "learning_rate": 0.00015391202802646944,
+      "loss": 1.1255,
+      "step": 598
+    },
+    {
+      "epoch": 0.23277955892354027,
+      "grad_norm": 0.19513486325740814,
+      "learning_rate": 0.0001538341767224601,
+      "loss": 1.0189,
+      "step": 599
+    },
+    {
+      "epoch": 0.2331681725444477,
+      "grad_norm": 0.20821452140808105,
+      "learning_rate": 0.00015375632541845077,
+      "loss": 1.0843,
+      "step": 600
+    },
+    {
+      "epoch": 0.2335567861653551,
+      "grad_norm": 0.20563223958015442,
+      "learning_rate": 0.00015367847411444142,
+      "loss": 1.0012,
+      "step": 601
+    },
+    {
+      "epoch": 0.2339453997862625,
+      "grad_norm": 0.22674202919006348,
+      "learning_rate": 0.00015360062281043207,
+      "loss": 1.0371,
+      "step": 602
+    },
+    {
+      "epoch": 0.23433401340716992,
+      "grad_norm": 0.20744135975837708,
+      "learning_rate": 0.00015352277150642272,
+      "loss": 1.0466,
+      "step": 603
+    },
+    {
+      "epoch": 0.23472262702807734,
+      "grad_norm": 0.22103577852249146,
+      "learning_rate": 0.00015344492020241338,
+      "loss": 1.0942,
+      "step": 604
+    },
+    {
+      "epoch": 0.23511124064898475,
+      "grad_norm": 0.20643098652362823,
+      "learning_rate": 0.00015336706889840406,
+      "loss": 1.0682,
+      "step": 605
+    },
+    {
+      "epoch": 0.23549985426989217,
+      "grad_norm": 0.23436777293682098,
+      "learning_rate": 0.0001532892175943947,
+      "loss": 1.0613,
+      "step": 606
+    },
+    {
+      "epoch": 0.23588846789079956,
+      "grad_norm": 0.21898899972438812,
+      "learning_rate": 0.00015321136629038536,
+      "loss": 1.0571,
+      "step": 607
+    },
+    {
+      "epoch": 0.23627708151170698,
+      "grad_norm": 0.20569247007369995,
+      "learning_rate": 0.00015313351498637604,
+      "loss": 1.061,
+      "step": 608
+    },
+    {
+      "epoch": 0.2366656951326144,
+      "grad_norm": 0.2099207490682602,
+      "learning_rate": 0.0001530556636823667,
+      "loss": 1.0776,
+      "step": 609
+    },
+    {
+      "epoch": 0.23705430875352182,
+      "grad_norm": 0.20078738033771515,
+      "learning_rate": 0.00015297781237835734,
+      "loss": 1.0341,
+      "step": 610
+    },
+    {
+      "epoch": 0.2374429223744292,
+      "grad_norm": 0.20327065885066986,
+      "learning_rate": 0.000152899961074348,
+      "loss": 1.0168,
+      "step": 611
+    },
+    {
+      "epoch": 0.23783153599533663,
+      "grad_norm": 0.21741214394569397,
+      "learning_rate": 0.00015282210977033867,
+      "loss": 1.0726,
+      "step": 612
+    },
+    {
+      "epoch": 0.23822014961624405,
+      "grad_norm": 0.2065727263689041,
+      "learning_rate": 0.00015274425846632933,
+      "loss": 1.0474,
+      "step": 613
+    },
+    {
+      "epoch": 0.23860876323715147,
+      "grad_norm": 0.21241194009780884,
+      "learning_rate": 0.00015266640716231998,
+      "loss": 1.0666,
+      "step": 614
+    },
+    {
+      "epoch": 0.23899737685805889,
+      "grad_norm": 0.2194201797246933,
+      "learning_rate": 0.00015258855585831066,
+      "loss": 1.1411,
+      "step": 615
+    },
+    {
+      "epoch": 0.23938599047896628,
+      "grad_norm": 0.21537193655967712,
+      "learning_rate": 0.0001525107045543013,
+      "loss": 1.081,
+      "step": 616
+    },
+    {
+      "epoch": 0.2397746040998737,
+      "grad_norm": 0.21125951409339905,
+      "learning_rate": 0.00015243285325029196,
+      "loss": 1.0679,
+      "step": 617
+    },
+    {
+      "epoch": 0.2401632177207811,
+      "grad_norm": 0.21342721581459045,
+      "learning_rate": 0.0001523550019462826,
+      "loss": 1.0564,
+      "step": 618
+    },
+    {
+      "epoch": 0.24055183134168853,
+      "grad_norm": 0.2223503291606903,
+      "learning_rate": 0.00015227715064227327,
+      "loss": 1.1163,
+      "step": 619
+    },
+    {
+      "epoch": 0.24094044496259595,
+      "grad_norm": 0.21626527607440948,
+      "learning_rate": 0.00015219929933826394,
+      "loss": 1.0793,
+      "step": 620
+    },
+    {
+      "epoch": 0.24132905858350334,
+      "grad_norm": 0.21899500489234924,
+      "learning_rate": 0.0001521214480342546,
+      "loss": 1.0864,
+      "step": 621
+    },
+    {
+      "epoch": 0.24171767220441076,
+      "grad_norm": 0.2499915212392807,
+      "learning_rate": 0.00015204359673024525,
+      "loss": 1.1381,
+      "step": 622
+    },
+    {
+      "epoch": 0.24210628582531818,
+      "grad_norm": 0.2108345925807953,
+      "learning_rate": 0.0001519657454262359,
+      "loss": 1.0534,
+      "step": 623
+    },
+    {
+      "epoch": 0.2424948994462256,
+      "grad_norm": 0.2224910855293274,
+      "learning_rate": 0.00015188789412222655,
+      "loss": 1.0235,
+      "step": 624
+    },
+    {
+      "epoch": 0.24288351306713302,
+      "grad_norm": 0.22163094580173492,
+      "learning_rate": 0.0001518100428182172,
+      "loss": 1.0143,
+      "step": 625
+    },
+    {
+      "epoch": 0.2432721266880404,
+      "grad_norm": 0.20709283649921417,
+      "learning_rate": 0.00015173219151420788,
+      "loss": 1.0506,
+      "step": 626
+    },
+    {
+      "epoch": 0.24366074030894783,
+      "grad_norm": 0.2112802267074585,
+      "learning_rate": 0.00015165434021019854,
+      "loss": 1.0692,
+      "step": 627
+    },
+    {
+      "epoch": 0.24404935392985525,
+      "grad_norm": 0.23622830212116241,
+      "learning_rate": 0.0001515764889061892,
+      "loss": 1.0769,
+      "step": 628
+    },
+    {
+      "epoch": 0.24443796755076266,
+      "grad_norm": 0.23328271508216858,
+      "learning_rate": 0.00015149863760217984,
+      "loss": 1.1158,
+      "step": 629
+    },
+    {
+      "epoch": 0.24482658117167005,
+      "grad_norm": 0.2071760892868042,
+      "learning_rate": 0.0001514207862981705,
+      "loss": 1.0133,
+      "step": 630
+    },
+    {
+      "epoch": 0.24521519479257747,
+      "grad_norm": 0.21428920328617096,
+      "learning_rate": 0.00015134293499416117,
+      "loss": 1.0342,
+      "step": 631
+    },
+    {
+      "epoch": 0.2456038084134849,
+      "grad_norm": 0.22225375473499298,
+      "learning_rate": 0.00015126508369015182,
+      "loss": 1.1054,
+      "step": 632
+    },
+    {
+      "epoch": 0.2459924220343923,
+      "grad_norm": 0.2096671611070633,
+      "learning_rate": 0.00015118723238614248,
+      "loss": 1.0229,
+      "step": 633
+    },
+    {
+      "epoch": 0.24638103565529973,
+      "grad_norm": 0.21473252773284912,
+      "learning_rate": 0.00015110938108213313,
+      "loss": 1.0915,
+      "step": 634
+    },
+    {
+      "epoch": 0.24676964927620712,
+      "grad_norm": 0.2071562111377716,
+      "learning_rate": 0.00015103152977812378,
+      "loss": 1.047,
+      "step": 635
+    },
+    {
+      "epoch": 0.24715826289711454,
+      "grad_norm": 0.19868609309196472,
+      "learning_rate": 0.00015095367847411446,
+      "loss": 1.0073,
+      "step": 636
+    },
+    {
+      "epoch": 0.24754687651802196,
+      "grad_norm": 0.20937366783618927,
+      "learning_rate": 0.0001508758271701051,
+      "loss": 1.0155,
+      "step": 637
+    },
+    {
+      "epoch": 0.24793549013892938,
+      "grad_norm": 0.19225911796092987,
+      "learning_rate": 0.00015079797586609576,
+      "loss": 1.0163,
+      "step": 638
+    },
+    {
+      "epoch": 0.2483241037598368,
+      "grad_norm": 0.20427283644676208,
+      "learning_rate": 0.00015072012456208641,
+      "loss": 1.062,
+      "step": 639
+    },
+    {
+      "epoch": 0.24871271738074419,
+      "grad_norm": 0.21640253067016602,
+      "learning_rate": 0.00015064227325807707,
+      "loss": 1.025,
+      "step": 640
+    },
+    {
+      "epoch": 0.2491013310016516,
+      "grad_norm": 0.20416739583015442,
+      "learning_rate": 0.00015056442195406775,
+      "loss": 1.0635,
+      "step": 641
+    },
+    {
+      "epoch": 0.24948994462255902,
+      "grad_norm": 0.1990521252155304,
+      "learning_rate": 0.0001504865706500584,
+      "loss": 1.0757,
+      "step": 642
+    },
+    {
+      "epoch": 0.24987855824346644,
+      "grad_norm": 0.21636444330215454,
+      "learning_rate": 0.00015040871934604905,
+      "loss": 1.0441,
+      "step": 643
+    },
+    {
+      "epoch": 0.25026717186437386,
+      "grad_norm": 0.21253719925880432,
+      "learning_rate": 0.0001503308680420397,
+      "loss": 1.0574,
+      "step": 644
+    },
+    {
+      "epoch": 0.2506557854852813,
+      "grad_norm": 0.2134159356355667,
+      "learning_rate": 0.00015025301673803035,
+      "loss": 1.0396,
+      "step": 645
+    },
+    {
+      "epoch": 0.2510443991061887,
+      "grad_norm": 0.2018527239561081,
+      "learning_rate": 0.00015017516543402103,
+      "loss": 1.0606,
+      "step": 646
+    },
+    {
+      "epoch": 0.25143301272709606,
+      "grad_norm": 0.20320741832256317,
+      "learning_rate": 0.00015009731413001169,
+      "loss": 1.0093,
+      "step": 647
+    },
+    {
+      "epoch": 0.2518216263480035,
+      "grad_norm": 0.21007056534290314,
+      "learning_rate": 0.00015001946282600234,
+      "loss": 1.0284,
+      "step": 648
+    },
+    {
+      "epoch": 0.2522102399689109,
+      "grad_norm": 0.22453372180461884,
+      "learning_rate": 0.000149941611521993,
+      "loss": 1.0271,
+      "step": 649
+    },
+    {
+      "epoch": 0.2525988535898183,
+      "grad_norm": 0.19889335334300995,
+      "learning_rate": 0.00014986376021798364,
+      "loss": 1.0238,
+      "step": 650
+    },
+    {
+      "epoch": 0.25298746721072574,
+      "grad_norm": 0.19339965283870697,
+      "learning_rate": 0.00014978590891397432,
+      "loss": 1.024,
+      "step": 651
+    },
+    {
+      "epoch": 0.25337608083163315,
+      "grad_norm": 0.22362011671066284,
+      "learning_rate": 0.00014970805760996497,
+      "loss": 1.0722,
+      "step": 652
+    },
+    {
+      "epoch": 0.2537646944525406,
+      "grad_norm": 0.2110588103532791,
+      "learning_rate": 0.00014963020630595562,
+      "loss": 1.0541,
+      "step": 653
+    },
+    {
+      "epoch": 0.254153308073448,
+      "grad_norm": 0.203025683760643,
+      "learning_rate": 0.00014955235500194628,
+      "loss": 1.0335,
+      "step": 654
+    },
+    {
+      "epoch": 0.2545419216943554,
+      "grad_norm": 0.20884902775287628,
+      "learning_rate": 0.00014947450369793693,
+      "loss": 1.0507,
+      "step": 655
+    },
+    {
+      "epoch": 0.2549305353152628,
+      "grad_norm": 0.21234256029129028,
+      "learning_rate": 0.0001493966523939276,
+      "loss": 1.0372,
+      "step": 656
+    },
+    {
+      "epoch": 0.2553191489361702,
+      "grad_norm": 0.1984352171421051,
+      "learning_rate": 0.00014931880108991826,
+      "loss": 0.9979,
+      "step": 657
+    },
+    {
+      "epoch": 0.2557077625570776,
+      "grad_norm": 0.18848282098770142,
+      "learning_rate": 0.0001492409497859089,
+      "loss": 0.9973,
+      "step": 658
+    },
+    {
+      "epoch": 0.25609637617798503,
+      "grad_norm": 0.2201709896326065,
+      "learning_rate": 0.00014916309848189956,
+      "loss": 1.0386,
+      "step": 659
+    },
+    {
+      "epoch": 0.25648498979889245,
+      "grad_norm": 0.23094095289707184,
+      "learning_rate": 0.00014908524717789024,
+      "loss": 1.1205,
+      "step": 660
+    },
+    {
+      "epoch": 0.25687360341979987,
+      "grad_norm": 0.21087734401226044,
+      "learning_rate": 0.0001490073958738809,
+      "loss": 1.0231,
+      "step": 661
+    },
+    {
+      "epoch": 0.2572622170407073,
+      "grad_norm": 0.24970979988574982,
+      "learning_rate": 0.00014892954456987155,
+      "loss": 1.0421,
+      "step": 662
+    },
+    {
+      "epoch": 0.2576508306616147,
+      "grad_norm": 0.22024711966514587,
+      "learning_rate": 0.00014885169326586223,
+      "loss": 1.1033,
+      "step": 663
+    },
+    {
+      "epoch": 0.2580394442825221,
+      "grad_norm": 0.2195248156785965,
+      "learning_rate": 0.00014877384196185288,
+      "loss": 1.089,
+      "step": 664
+    },
+    {
+      "epoch": 0.25842805790342954,
+      "grad_norm": 0.20236417651176453,
+      "learning_rate": 0.00014869599065784353,
+      "loss": 1.0196,
+      "step": 665
+    },
+    {
+      "epoch": 0.2588166715243369,
+      "grad_norm": 0.21973329782485962,
+      "learning_rate": 0.00014861813935383418,
+      "loss": 1.0844,
+      "step": 666
+    },
+    {
+      "epoch": 0.2592052851452443,
+      "grad_norm": 0.2069879174232483,
+      "learning_rate": 0.00014854028804982486,
+      "loss": 1.0312,
+      "step": 667
+    },
+    {
+      "epoch": 0.25959389876615174,
+      "grad_norm": 0.2037455290555954,
+      "learning_rate": 0.00014846243674581551,
+      "loss": 1.0018,
+      "step": 668
+    },
+    {
+      "epoch": 0.25998251238705916,
+      "grad_norm": 0.24176378548145294,
+      "learning_rate": 0.00014838458544180617,
+      "loss": 1.0749,
+      "step": 669
+    },
+    {
+      "epoch": 0.2603711260079666,
+      "grad_norm": 0.2007879763841629,
+      "learning_rate": 0.00014830673413779682,
+      "loss": 1.0443,
+      "step": 670
+    },
+    {
+      "epoch": 0.260759739628874,
+      "grad_norm": 0.23503245413303375,
+      "learning_rate": 0.00014822888283378747,
+      "loss": 1.0674,
+      "step": 671
+    },
+    {
+      "epoch": 0.2611483532497814,
+      "grad_norm": 0.2166167050600052,
+      "learning_rate": 0.00014815103152977815,
+      "loss": 1.079,
+      "step": 672
+    },
+    {
+      "epoch": 0.26153696687068884,
+      "grad_norm": 0.2293982058763504,
+      "learning_rate": 0.0001480731802257688,
+      "loss": 1.0517,
+      "step": 673
+    },
+    {
+      "epoch": 0.26192558049159625,
+      "grad_norm": 0.21040330827236176,
+      "learning_rate": 0.00014799532892175945,
+      "loss": 1.0475,
+      "step": 674
+    },
+    {
+      "epoch": 0.2623141941125036,
+      "grad_norm": 0.20750463008880615,
+      "learning_rate": 0.0001479174776177501,
+      "loss": 1.025,
+      "step": 675
+    },
+    {
+      "epoch": 0.26270280773341104,
+      "grad_norm": 0.2748873233795166,
+      "learning_rate": 0.00014783962631374076,
+      "loss": 1.0212,
+      "step": 676
+    },
+    {
+      "epoch": 0.26309142135431846,
+      "grad_norm": 0.19212333858013153,
+      "learning_rate": 0.00014776177500973144,
+      "loss": 1.0049,
+      "step": 677
+    },
+    {
+      "epoch": 0.2634800349752259,
+      "grad_norm": 0.207731693983078,
+      "learning_rate": 0.0001476839237057221,
+      "loss": 1.0062,
+      "step": 678
+    },
+    {
+      "epoch": 0.2638686485961333,
+      "grad_norm": 0.2177981585264206,
+      "learning_rate": 0.00014760607240171274,
+      "loss": 1.0489,
+      "step": 679
+    },
+    {
+      "epoch": 0.2642572622170407,
+      "grad_norm": 0.23239290714263916,
+      "learning_rate": 0.0001475282210977034,
+      "loss": 1.0856,
+      "step": 680
+    },
+    {
+      "epoch": 0.26464587583794813,
+      "grad_norm": 0.2033151388168335,
+      "learning_rate": 0.00014745036979369404,
+      "loss": 1.0389,
+      "step": 681
+    },
+    {
+      "epoch": 0.26503448945885555,
+      "grad_norm": 0.20917408168315887,
+      "learning_rate": 0.00014737251848968472,
+      "loss": 1.1208,
+      "step": 682
+    },
+    {
+      "epoch": 0.26542310307976297,
+      "grad_norm": 0.22075454890727997,
+      "learning_rate": 0.00014729466718567538,
+      "loss": 1.0435,
+      "step": 683
+    },
+    {
+      "epoch": 0.26581171670067033,
+      "grad_norm": 0.23094993829727173,
+      "learning_rate": 0.00014721681588166603,
+      "loss": 1.0649,
+      "step": 684
+    },
+    {
+      "epoch": 0.26620033032157775,
+      "grad_norm": 0.21209536492824554,
+      "learning_rate": 0.00014713896457765668,
+      "loss": 1.0578,
+      "step": 685
+    },
+    {
+      "epoch": 0.26658894394248517,
+      "grad_norm": 0.21412219107151031,
+      "learning_rate": 0.00014706111327364733,
+      "loss": 1.1137,
+      "step": 686
+    },
+    {
+      "epoch": 0.2669775575633926,
+      "grad_norm": 0.21175475418567657,
+      "learning_rate": 0.000146983261969638,
+      "loss": 1.023,
+      "step": 687
+    },
+    {
+      "epoch": 0.2673661711843,
+      "grad_norm": 0.21968993544578552,
+      "learning_rate": 0.00014690541066562866,
+      "loss": 1.1183,
+      "step": 688
+    },
+    {
+      "epoch": 0.2677547848052074,
+      "grad_norm": 0.20414218306541443,
+      "learning_rate": 0.00014682755936161932,
+      "loss": 1.078,
+      "step": 689
+    },
+    {
+      "epoch": 0.26814339842611484,
+      "grad_norm": 0.18986597657203674,
+      "learning_rate": 0.00014674970805760997,
+      "loss": 1.0029,
+      "step": 690
+    },
+    {
+      "epoch": 0.26853201204702226,
+      "grad_norm": 0.21215832233428955,
+      "learning_rate": 0.00014667185675360062,
+      "loss": 1.0759,
+      "step": 691
+    },
+    {
+      "epoch": 0.2689206256679297,
+      "grad_norm": 0.2113744169473648,
+      "learning_rate": 0.0001465940054495913,
+      "loss": 1.1027,
+      "step": 692
+    },
+    {
+      "epoch": 0.2693092392888371,
+      "grad_norm": 0.22010880708694458,
+      "learning_rate": 0.00014651615414558195,
+      "loss": 1.0984,
+      "step": 693
+    },
+    {
+      "epoch": 0.26969785290974446,
+      "grad_norm": 0.203857421875,
+      "learning_rate": 0.0001464383028415726,
+      "loss": 1.0407,
+      "step": 694
+    },
+    {
+      "epoch": 0.2700864665306519,
+      "grad_norm": 0.21120867133140564,
+      "learning_rate": 0.00014636045153756325,
+      "loss": 1.0521,
+      "step": 695
+    },
+    {
+      "epoch": 0.2704750801515593,
+      "grad_norm": 0.20039112865924835,
+      "learning_rate": 0.0001462826002335539,
+      "loss": 1.0897,
+      "step": 696
+    },
+    {
+      "epoch": 0.2708636937724667,
+      "grad_norm": 0.22893202304840088,
+      "learning_rate": 0.00014620474892954456,
+      "loss": 1.0903,
+      "step": 697
+    },
+    {
+      "epoch": 0.27125230739337414,
+      "grad_norm": 0.19886267185211182,
+      "learning_rate": 0.00014612689762553524,
+      "loss": 1.0889,
+      "step": 698
+    },
+    {
+      "epoch": 0.27164092101428156,
+      "grad_norm": 0.18892349302768707,
+      "learning_rate": 0.0001460490463215259,
+      "loss": 0.981,
+      "step": 699
+    },
+    {
+      "epoch": 0.272029534635189,
+      "grad_norm": 0.20602507889270782,
+      "learning_rate": 0.00014597119501751654,
+      "loss": 1.0223,
+      "step": 700
+    },
+    {
+      "epoch": 0.2724181482560964,
+      "grad_norm": 0.21480505168437958,
+      "learning_rate": 0.0001458933437135072,
+      "loss": 1.0355,
+      "step": 701
+    },
+    {
+      "epoch": 0.2728067618770038,
+      "grad_norm": 0.21011753380298615,
+      "learning_rate": 0.00014581549240949785,
+      "loss": 1.0613,
+      "step": 702
+    },
+    {
+      "epoch": 0.2731953754979112,
+      "grad_norm": 0.19350819289684296,
+      "learning_rate": 0.00014573764110548853,
+      "loss": 1.0144,
+      "step": 703
+    },
+    {
+      "epoch": 0.2735839891188186,
+      "grad_norm": 0.207548126578331,
+      "learning_rate": 0.00014565978980147918,
+      "loss": 1.0465,
+      "step": 704
+    },
+    {
+      "epoch": 0.273972602739726,
+      "grad_norm": 0.22220565378665924,
+      "learning_rate": 0.00014558193849746983,
+      "loss": 1.1073,
+      "step": 705
+    },
+    {
+      "epoch": 0.27436121636063343,
+      "grad_norm": 0.193622425198555,
+      "learning_rate": 0.00014550408719346048,
+      "loss": 1.0357,
+      "step": 706
+    },
+    {
+      "epoch": 0.27474982998154085,
+      "grad_norm": 0.2067158818244934,
+      "learning_rate": 0.00014542623588945113,
+      "loss": 1.0502,
+      "step": 707
+    },
+    {
+      "epoch": 0.27513844360244827,
+      "grad_norm": 0.2218742072582245,
+      "learning_rate": 0.0001453483845854418,
+      "loss": 0.9934,
+      "step": 708
+    },
+    {
+      "epoch": 0.2755270572233557,
+      "grad_norm": 0.22316142916679382,
+      "learning_rate": 0.00014527053328143246,
+      "loss": 1.0707,
+      "step": 709
+    },
+    {
+      "epoch": 0.2759156708442631,
+      "grad_norm": 0.21004025638103485,
+      "learning_rate": 0.00014519268197742312,
+      "loss": 1.0543,
+      "step": 710
+    },
+    {
+      "epoch": 0.2763042844651705,
+      "grad_norm": 0.22070440649986267,
+      "learning_rate": 0.00014511483067341377,
+      "loss": 1.0467,
+      "step": 711
+    },
+    {
+      "epoch": 0.27669289808607794,
+      "grad_norm": 0.21463747322559357,
+      "learning_rate": 0.00014503697936940445,
+      "loss": 1.0793,
+      "step": 712
+    },
+    {
+      "epoch": 0.2770815117069853,
+      "grad_norm": 0.23452533781528473,
+      "learning_rate": 0.0001449591280653951,
+      "loss": 1.043,
+      "step": 713
+    },
+    {
+      "epoch": 0.2774701253278927,
+      "grad_norm": 0.2405795156955719,
+      "learning_rate": 0.00014488127676138575,
+      "loss": 1.0752,
+      "step": 714
+    },
+    {
+      "epoch": 0.27785873894880014,
+      "grad_norm": 0.21546585857868195,
+      "learning_rate": 0.00014480342545737643,
+      "loss": 1.0834,
+      "step": 715
+    },
+    {
+      "epoch": 0.27824735256970756,
+      "grad_norm": 0.22675828635692596,
+      "learning_rate": 0.00014472557415336708,
+      "loss": 1.055,
+      "step": 716
+    },
+    {
+      "epoch": 0.278635966190615,
+      "grad_norm": 0.2117871195077896,
+      "learning_rate": 0.00014464772284935774,
+      "loss": 1.03,
+      "step": 717
+    },
+    {
+      "epoch": 0.2790245798115224,
+      "grad_norm": 0.2193155735731125,
+      "learning_rate": 0.00014456987154534841,
+      "loss": 1.0073,
+      "step": 718
+    },
+    {
+      "epoch": 0.2794131934324298,
+      "grad_norm": 0.21447965502738953,
+      "learning_rate": 0.00014449202024133907,
+      "loss": 1.0174,
+      "step": 719
+    },
+    {
+      "epoch": 0.27980180705333724,
+      "grad_norm": 0.22867532074451447,
+      "learning_rate": 0.00014441416893732972,
+      "loss": 1.0948,
+      "step": 720
+    },
+    {
+      "epoch": 0.28019042067424466,
+      "grad_norm": 0.21570557355880737,
+      "learning_rate": 0.00014433631763332037,
+      "loss": 1.0105,
+      "step": 721
+    },
+    {
+      "epoch": 0.280579034295152,
+      "grad_norm": 0.20787014067173004,
+      "learning_rate": 0.00014425846632931102,
+      "loss": 1.0384,
+      "step": 722
+    },
+    {
+      "epoch": 0.28096764791605944,
+      "grad_norm": 0.19924762845039368,
+      "learning_rate": 0.0001441806150253017,
+      "loss": 1.0653,
+      "step": 723
+    },
+    {
+      "epoch": 0.28135626153696686,
+      "grad_norm": 0.1996215283870697,
+      "learning_rate": 0.00014410276372129235,
+      "loss": 1.0439,
+      "step": 724
+    },
+    {
+      "epoch": 0.2817448751578743,
+      "grad_norm": 0.2054813802242279,
+      "learning_rate": 0.000144024912417283,
+      "loss": 0.9895,
+      "step": 725
+    },
+    {
+      "epoch": 0.2821334887787817,
+      "grad_norm": 0.2268310785293579,
+      "learning_rate": 0.00014394706111327366,
+      "loss": 1.0993,
+      "step": 726
+    },
+    {
+      "epoch": 0.2825221023996891,
+      "grad_norm": 0.19867680966854095,
+      "learning_rate": 0.0001438692098092643,
+      "loss": 0.985,
+      "step": 727
+    },
+    {
+      "epoch": 0.28291071602059653,
+      "grad_norm": 0.21099598705768585,
+      "learning_rate": 0.000143791358505255,
+      "loss": 1.0333,
+      "step": 728
+    },
+    {
+      "epoch": 0.28329932964150395,
+      "grad_norm": 0.22479215264320374,
+      "learning_rate": 0.00014371350720124564,
+      "loss": 1.0449,
+      "step": 729
+    },
+    {
+      "epoch": 0.28368794326241137,
+      "grad_norm": 0.22717688977718353,
+      "learning_rate": 0.0001436356558972363,
+      "loss": 1.0482,
+      "step": 730
+    },
+    {
+      "epoch": 0.2840765568833188,
+      "grad_norm": 0.20389345288276672,
+      "learning_rate": 0.00014355780459322695,
+      "loss": 0.956,
+      "step": 731
+    },
+    {
+      "epoch": 0.28446517050422615,
+      "grad_norm": 0.21583619713783264,
+      "learning_rate": 0.0001434799532892176,
+      "loss": 1.0154,
+      "step": 732
+    },
+    {
+      "epoch": 0.28485378412513357,
+      "grad_norm": 0.2219148874282837,
+      "learning_rate": 0.00014340210198520825,
+      "loss": 1.0553,
+      "step": 733
+    },
+    {
+      "epoch": 0.285242397746041,
+      "grad_norm": 0.19920189678668976,
+      "learning_rate": 0.00014332425068119893,
+      "loss": 0.9881,
+      "step": 734
+    },
+    {
+      "epoch": 0.2856310113669484,
+      "grad_norm": 0.2295670360326767,
+      "learning_rate": 0.00014324639937718958,
+      "loss": 1.0529,
+      "step": 735
+    },
+    {
+      "epoch": 0.2860196249878558,
+      "grad_norm": 0.21271567046642303,
+      "learning_rate": 0.00014316854807318023,
+      "loss": 1.037,
+      "step": 736
+    },
+    {
+      "epoch": 0.28640823860876324,
+      "grad_norm": 0.21304361522197723,
+      "learning_rate": 0.00014309069676917088,
+      "loss": 1.048,
+      "step": 737
+    },
+    {
+      "epoch": 0.28679685222967066,
+      "grad_norm": 0.19902732968330383,
+      "learning_rate": 0.00014301284546516154,
+      "loss": 1.0306,
+      "step": 738
+    },
+    {
+      "epoch": 0.2871854658505781,
+      "grad_norm": 0.1995929330587387,
+      "learning_rate": 0.00014293499416115222,
+      "loss": 1.0394,
+      "step": 739
+    },
+    {
+      "epoch": 0.2875740794714855,
+      "grad_norm": 0.20426060259342194,
+      "learning_rate": 0.00014285714285714287,
+      "loss": 1.0052,
+      "step": 740
+    },
+    {
+      "epoch": 0.28796269309239286,
+      "grad_norm": 0.20284566283226013,
+      "learning_rate": 0.00014277929155313352,
+      "loss": 1.0115,
+      "step": 741
+    },
+    {
+      "epoch": 0.2883513067133003,
+      "grad_norm": 0.2041557878255844,
+      "learning_rate": 0.00014270144024912417,
+      "loss": 1.0473,
+      "step": 742
+    },
+    {
+      "epoch": 0.2887399203342077,
+      "grad_norm": 0.2152249962091446,
+      "learning_rate": 0.00014262358894511482,
+      "loss": 1.0802,
+      "step": 743
+    },
+    {
+      "epoch": 0.2891285339551151,
+      "grad_norm": 0.20569871366024017,
+      "learning_rate": 0.0001425457376411055,
+      "loss": 1.0203,
+      "step": 744
+    },
+    {
+      "epoch": 0.28951714757602254,
+      "grad_norm": 0.21128378808498383,
+      "learning_rate": 0.00014246788633709616,
+      "loss": 1.108,
+      "step": 745
+    },
+    {
+      "epoch": 0.28990576119692996,
+      "grad_norm": 0.19587135314941406,
+      "learning_rate": 0.0001423900350330868,
+      "loss": 1.0427,
+      "step": 746
+    },
+    {
+      "epoch": 0.2902943748178374,
+      "grad_norm": 0.22052550315856934,
+      "learning_rate": 0.00014231218372907746,
+      "loss": 1.055,
+      "step": 747
+    },
+    {
+      "epoch": 0.2906829884387448,
+      "grad_norm": 0.21291717886924744,
+      "learning_rate": 0.0001422343324250681,
+      "loss": 1.0591,
+      "step": 748
+    },
+    {
+      "epoch": 0.2910716020596522,
+      "grad_norm": 0.20634084939956665,
+      "learning_rate": 0.0001421564811210588,
+      "loss": 1.0527,
+      "step": 749
+    },
+    {
+      "epoch": 0.29146021568055963,
+      "grad_norm": 0.2075488269329071,
+      "learning_rate": 0.00014207862981704944,
+      "loss": 1.0786,
+      "step": 750
+    },
+    {
+      "epoch": 0.291848829301467,
+      "grad_norm": 0.19780080020427704,
+      "learning_rate": 0.0001420007785130401,
+      "loss": 1.059,
+      "step": 751
+    },
+    {
+      "epoch": 0.2922374429223744,
+      "grad_norm": 0.21212074160575867,
+      "learning_rate": 0.00014192292720903075,
+      "loss": 1.0346,
+      "step": 752
+    },
+    {
+      "epoch": 0.29262605654328183,
+      "grad_norm": 0.2218451350927353,
+      "learning_rate": 0.0001418450759050214,
+      "loss": 1.0908,
+      "step": 753
+    },
+    {
+      "epoch": 0.29301467016418925,
+      "grad_norm": 0.20107759535312653,
+      "learning_rate": 0.00014176722460101208,
+      "loss": 1.0202,
+      "step": 754
+    },
+    {
+      "epoch": 0.29340328378509667,
+      "grad_norm": 0.20933273434638977,
+      "learning_rate": 0.00014168937329700273,
+      "loss": 1.0719,
+      "step": 755
+    },
+    {
+      "epoch": 0.2937918974060041,
+      "grad_norm": 0.22369107604026794,
+      "learning_rate": 0.00014161152199299338,
+      "loss": 1.0433,
+      "step": 756
+    },
+    {
+      "epoch": 0.2941805110269115,
+      "grad_norm": 0.2113707810640335,
+      "learning_rate": 0.00014153367068898403,
+      "loss": 1.0637,
+      "step": 757
+    },
+    {
+      "epoch": 0.2945691246478189,
+      "grad_norm": 0.21105700731277466,
+      "learning_rate": 0.00014145581938497469,
+      "loss": 1.0468,
+      "step": 758
+    },
+    {
+      "epoch": 0.29495773826872634,
+      "grad_norm": 0.20189693570137024,
+      "learning_rate": 0.00014137796808096537,
+      "loss": 1.0281,
+      "step": 759
+    },
+    {
+      "epoch": 0.2953463518896337,
+      "grad_norm": 0.1954152137041092,
+      "learning_rate": 0.00014130011677695602,
+      "loss": 1.0519,
+      "step": 760
+    },
+    {
+      "epoch": 0.2957349655105411,
+      "grad_norm": 0.24295592308044434,
+      "learning_rate": 0.00014122226547294667,
+      "loss": 1.1303,
+      "step": 761
+    },
+    {
+      "epoch": 0.29612357913144854,
+      "grad_norm": 0.20158620178699493,
+      "learning_rate": 0.00014114441416893732,
+      "loss": 1.0367,
+      "step": 762
+    },
+    {
+      "epoch": 0.29651219275235596,
+      "grad_norm": 0.20734666287899017,
+      "learning_rate": 0.00014106656286492797,
+      "loss": 1.0392,
+      "step": 763
+    },
+    {
+      "epoch": 0.2969008063732634,
+      "grad_norm": 0.2177533656358719,
+      "learning_rate": 0.00014098871156091865,
+      "loss": 1.0619,
+      "step": 764
+    },
+    {
+      "epoch": 0.2972894199941708,
+      "grad_norm": 0.1961720883846283,
+      "learning_rate": 0.0001409108602569093,
+      "loss": 0.9872,
+      "step": 765
+    },
+    {
+      "epoch": 0.2976780336150782,
+      "grad_norm": 0.21530941128730774,
+      "learning_rate": 0.00014083300895289996,
+      "loss": 1.1246,
+      "step": 766
+    },
+    {
+      "epoch": 0.29806664723598564,
+      "grad_norm": 0.2039783000946045,
+      "learning_rate": 0.00014075515764889064,
+      "loss": 1.0789,
+      "step": 767
+    },
+    {
+      "epoch": 0.29845526085689306,
+      "grad_norm": 0.20641569793224335,
+      "learning_rate": 0.0001406773063448813,
+      "loss": 1.05,
+      "step": 768
+    },
+    {
+      "epoch": 0.2988438744778004,
+      "grad_norm": 0.2071225494146347,
+      "learning_rate": 0.00014059945504087194,
+      "loss": 1.047,
+      "step": 769
+    },
+    {
+      "epoch": 0.29923248809870784,
+      "grad_norm": 0.20367531478405,
+      "learning_rate": 0.00014052160373686262,
+      "loss": 1.0734,
+      "step": 770
+    },
+    {
+      "epoch": 0.29962110171961526,
+      "grad_norm": 0.21718619763851166,
+      "learning_rate": 0.00014044375243285327,
+      "loss": 1.0613,
+      "step": 771
+    },
+    {
+      "epoch": 0.3000097153405227,
+      "grad_norm": 0.21649087965488434,
+      "learning_rate": 0.00014036590112884392,
+      "loss": 1.0671,
+      "step": 772
+    },
+    {
+      "epoch": 0.3003983289614301,
+      "grad_norm": 0.22223225235939026,
+      "learning_rate": 0.00014028804982483458,
+      "loss": 1.0977,
+      "step": 773
+    },
+    {
+      "epoch": 0.3007869425823375,
+      "grad_norm": 0.23101870715618134,
+      "learning_rate": 0.00014021019852082523,
+      "loss": 1.1236,
+      "step": 774
+    },
+    {
+      "epoch": 0.30117555620324493,
+      "grad_norm": 0.22855506837368011,
+      "learning_rate": 0.0001401323472168159,
+      "loss": 1.0517,
+      "step": 775
+    },
+    {
+      "epoch": 0.30156416982415235,
+      "grad_norm": 0.20862117409706116,
+      "learning_rate": 0.00014005449591280656,
+      "loss": 1.0493,
+      "step": 776
+    },
+    {
+      "epoch": 0.30195278344505977,
+      "grad_norm": 0.21692048013210297,
+      "learning_rate": 0.0001399766446087972,
+      "loss": 1.0681,
+      "step": 777
+    },
+    {
+      "epoch": 0.3023413970659672,
+      "grad_norm": 0.21541331708431244,
+      "learning_rate": 0.00013989879330478786,
+      "loss": 1.0775,
+      "step": 778
+    },
+    {
+      "epoch": 0.30273001068687455,
+      "grad_norm": 0.21221749484539032,
+      "learning_rate": 0.00013982094200077851,
+      "loss": 1.0421,
+      "step": 779
+    },
+    {
+      "epoch": 0.30311862430778197,
+      "grad_norm": 0.22497743368148804,
+      "learning_rate": 0.0001397430906967692,
+      "loss": 1.1115,
+      "step": 780
+    },
+    {
+      "epoch": 0.3035072379286894,
+      "grad_norm": 0.1974119246006012,
+      "learning_rate": 0.00013966523939275985,
+      "loss": 1.0264,
+      "step": 781
+    },
+    {
+      "epoch": 0.3038958515495968,
+      "grad_norm": 0.20349323749542236,
+      "learning_rate": 0.0001395873880887505,
+      "loss": 1.0512,
+      "step": 782
+    },
+    {
+      "epoch": 0.3042844651705042,
+      "grad_norm": 0.21116937696933746,
+      "learning_rate": 0.00013950953678474115,
+      "loss": 1.0135,
+      "step": 783
+    },
+    {
+      "epoch": 0.30467307879141164,
+      "grad_norm": 0.2133677899837494,
+      "learning_rate": 0.0001394316854807318,
+      "loss": 1.0694,
+      "step": 784
+    },
+    {
+      "epoch": 0.30506169241231906,
+      "grad_norm": 0.20406191051006317,
+      "learning_rate": 0.00013935383417672248,
+      "loss": 1.0179,
+      "step": 785
+    },
+    {
+      "epoch": 0.3054503060332265,
+      "grad_norm": 0.21428678929805756,
+      "learning_rate": 0.00013927598287271313,
+      "loss": 1.0577,
+      "step": 786
+    },
+    {
+      "epoch": 0.3058389196541339,
+      "grad_norm": 0.20878921449184418,
+      "learning_rate": 0.00013919813156870379,
+      "loss": 1.0311,
+      "step": 787
+    },
+    {
+      "epoch": 0.30622753327504126,
+      "grad_norm": 0.19033175706863403,
+      "learning_rate": 0.00013912028026469444,
+      "loss": 0.976,
+      "step": 788
+    },
+    {
+      "epoch": 0.3066161468959487,
+      "grad_norm": 0.22138020396232605,
+      "learning_rate": 0.0001390424289606851,
+      "loss": 1.0438,
+      "step": 789
+    },
+    {
+      "epoch": 0.3070047605168561,
+      "grad_norm": 0.20765596628189087,
+      "learning_rate": 0.00013896457765667577,
+      "loss": 1.0865,
+      "step": 790
+    },
+    {
+      "epoch": 0.3073933741377635,
+      "grad_norm": 0.209733247756958,
+      "learning_rate": 0.00013888672635266642,
+      "loss": 1.0648,
+      "step": 791
+    },
+    {
+      "epoch": 0.30778198775867094,
+      "grad_norm": 0.1896686851978302,
+      "learning_rate": 0.00013880887504865707,
+      "loss": 1.0133,
+      "step": 792
+    },
+    {
+      "epoch": 0.30817060137957836,
+      "grad_norm": 0.21651998162269592,
+      "learning_rate": 0.00013873102374464772,
+      "loss": 1.0729,
+      "step": 793
+    },
+    {
+      "epoch": 0.3085592150004858,
+      "grad_norm": 0.21751996874809265,
+      "learning_rate": 0.00013865317244063838,
+      "loss": 1.0444,
+      "step": 794
+    },
+    {
+      "epoch": 0.3089478286213932,
+      "grad_norm": 0.20593520998954773,
+      "learning_rate": 0.00013857532113662906,
+      "loss": 1.0304,
+      "step": 795
+    },
+    {
+      "epoch": 0.3093364422423006,
+      "grad_norm": 0.19937261939048767,
+      "learning_rate": 0.0001384974698326197,
+      "loss": 1.0017,
+      "step": 796
+    },
+    {
+      "epoch": 0.30972505586320803,
+      "grad_norm": 0.18901696801185608,
+      "learning_rate": 0.00013841961852861036,
+      "loss": 1.0362,
+      "step": 797
+    },
+    {
+      "epoch": 0.3101136694841154,
+      "grad_norm": 0.2079760730266571,
+      "learning_rate": 0.000138341767224601,
+      "loss": 1.0784,
+      "step": 798
+    },
+    {
+      "epoch": 0.3105022831050228,
+      "grad_norm": 0.24873265624046326,
+      "learning_rate": 0.00013826391592059166,
+      "loss": 1.1026,
+      "step": 799
+    },
+    {
+      "epoch": 0.31089089672593023,
+      "grad_norm": 0.20185396075248718,
+      "learning_rate": 0.00013818606461658234,
+      "loss": 1.0235,
+      "step": 800
+    },
+    {
+      "epoch": 0.31127951034683765,
+      "grad_norm": 0.211393803358078,
+      "learning_rate": 0.000138108213312573,
+      "loss": 1.0999,
+      "step": 801
+    },
+    {
+      "epoch": 0.31166812396774507,
+      "grad_norm": 0.19948823750019073,
+      "learning_rate": 0.00013803036200856365,
+      "loss": 1.0242,
+      "step": 802
+    },
+    {
+      "epoch": 0.3120567375886525,
+      "grad_norm": 0.21470944583415985,
+      "learning_rate": 0.0001379525107045543,
+      "loss": 1.0736,
+      "step": 803
+    },
+    {
+      "epoch": 0.3124453512095599,
+      "grad_norm": 0.2195902317762375,
+      "learning_rate": 0.00013787465940054495,
+      "loss": 1.0368,
+      "step": 804
+    },
+    {
+      "epoch": 0.3128339648304673,
+      "grad_norm": 0.22142355144023895,
+      "learning_rate": 0.00013779680809653563,
+      "loss": 1.1022,
+      "step": 805
+    },
+    {
+      "epoch": 0.31322257845137474,
+      "grad_norm": 0.20487886667251587,
+      "learning_rate": 0.00013771895679252628,
+      "loss": 1.0478,
+      "step": 806
+    },
+    {
+      "epoch": 0.3136111920722821,
+      "grad_norm": 0.217549130320549,
+      "learning_rate": 0.00013764110548851693,
+      "loss": 1.0526,
+      "step": 807
+    },
+    {
+      "epoch": 0.3139998056931895,
+      "grad_norm": 0.20199982821941376,
+      "learning_rate": 0.0001375632541845076,
+      "loss": 0.9992,
+      "step": 808
+    },
+    {
+      "epoch": 0.31438841931409695,
+      "grad_norm": 0.19496634602546692,
+      "learning_rate": 0.00013748540288049824,
+      "loss": 1.0179,
+      "step": 809
+    },
+    {
+      "epoch": 0.31477703293500436,
+      "grad_norm": 0.21999460458755493,
+      "learning_rate": 0.0001374075515764889,
+      "loss": 1.0547,
+      "step": 810
+    },
+    {
+      "epoch": 0.3151656465559118,
+      "grad_norm": 0.21421074867248535,
+      "learning_rate": 0.00013732970027247957,
+      "loss": 1.0283,
+      "step": 811
+    },
+    {
+      "epoch": 0.3155542601768192,
+      "grad_norm": 0.1913364827632904,
+      "learning_rate": 0.00013725184896847022,
+      "loss": 0.9826,
+      "step": 812
+    },
+    {
+      "epoch": 0.3159428737977266,
+      "grad_norm": 0.20509806275367737,
+      "learning_rate": 0.00013717399766446087,
+      "loss": 1.0303,
+      "step": 813
+    },
+    {
+      "epoch": 0.31633148741863404,
+      "grad_norm": 0.20309868454933167,
+      "learning_rate": 0.00013709614636045153,
+      "loss": 1.0479,
+      "step": 814
+    },
+    {
+      "epoch": 0.31672010103954146,
+      "grad_norm": 0.2274443656206131,
+      "learning_rate": 0.0001370182950564422,
+      "loss": 1.1311,
+      "step": 815
+    },
+    {
+      "epoch": 0.3171087146604489,
+      "grad_norm": 0.22785170376300812,
+      "learning_rate": 0.00013694044375243286,
+      "loss": 1.1009,
+      "step": 816
+    },
+    {
+      "epoch": 0.31749732828135624,
+      "grad_norm": 0.2105439007282257,
+      "learning_rate": 0.0001368625924484235,
+      "loss": 1.0251,
+      "step": 817
+    },
+    {
+      "epoch": 0.31788594190226366,
+      "grad_norm": 0.20583970844745636,
+      "learning_rate": 0.00013678474114441416,
+      "loss": 1.0833,
+      "step": 818
+    },
+    {
+      "epoch": 0.3182745555231711,
+      "grad_norm": 0.21091191470623016,
+      "learning_rate": 0.00013670688984040484,
+      "loss": 1.071,
+      "step": 819
+    },
+    {
+      "epoch": 0.3186631691440785,
+      "grad_norm": 0.20645928382873535,
+      "learning_rate": 0.0001366290385363955,
+      "loss": 1.0605,
+      "step": 820
+    },
+    {
+      "epoch": 0.3190517827649859,
+      "grad_norm": 0.1990513950586319,
+      "learning_rate": 0.00013655118723238614,
+      "loss": 1.0461,
+      "step": 821
+    },
+    {
+      "epoch": 0.31944039638589333,
+      "grad_norm": 0.2192249745130539,
+      "learning_rate": 0.00013647333592837682,
+      "loss": 1.0975,
+      "step": 822
+    },
+    {
+      "epoch": 0.31982901000680075,
+      "grad_norm": 0.2157617211341858,
+      "learning_rate": 0.00013639548462436748,
+      "loss": 1.091,
+      "step": 823
+    },
+    {
+      "epoch": 0.32021762362770817,
+      "grad_norm": 0.21964526176452637,
+      "learning_rate": 0.00013631763332035813,
+      "loss": 1.0286,
+      "step": 824
+    },
+    {
+      "epoch": 0.3206062372486156,
+      "grad_norm": 0.2079797089099884,
+      "learning_rate": 0.00013623978201634878,
+      "loss": 1.0257,
+      "step": 825
+    },
+    {
+      "epoch": 0.32099485086952295,
+      "grad_norm": 0.21220168471336365,
+      "learning_rate": 0.00013616193071233946,
+      "loss": 1.0046,
+      "step": 826
+    },
+    {
+      "epoch": 0.32138346449043037,
+      "grad_norm": 0.2885231673717499,
+      "learning_rate": 0.0001360840794083301,
+      "loss": 1.1442,
+      "step": 827
+    },
+    {
+      "epoch": 0.3217720781113378,
+      "grad_norm": 0.2096511274576187,
+      "learning_rate": 0.00013600622810432076,
+      "loss": 1.0209,
+      "step": 828
+    },
+    {
+      "epoch": 0.3221606917322452,
+      "grad_norm": 0.2179451286792755,
+      "learning_rate": 0.00013592837680031142,
+      "loss": 1.0548,
+      "step": 829
+    },
+    {
+      "epoch": 0.3225493053531526,
+      "grad_norm": 0.2096329927444458,
+      "learning_rate": 0.00013585052549630207,
+      "loss": 1.0279,
+      "step": 830
+    },
+    {
+      "epoch": 0.32293791897406005,
+      "grad_norm": 0.22531811892986298,
+      "learning_rate": 0.00013577267419229275,
+      "loss": 1.0463,
+      "step": 831
+    },
+    {
+      "epoch": 0.32332653259496746,
+      "grad_norm": 0.22516901791095734,
+      "learning_rate": 0.0001356948228882834,
+      "loss": 1.1127,
+      "step": 832
+    },
+    {
+      "epoch": 0.3237151462158749,
+      "grad_norm": 0.22487780451774597,
+      "learning_rate": 0.00013561697158427405,
+      "loss": 1.0707,
+      "step": 833
+    },
+    {
+      "epoch": 0.3241037598367823,
+      "grad_norm": 0.20976543426513672,
+      "learning_rate": 0.0001355391202802647,
+      "loss": 1.0217,
+      "step": 834
+    },
+    {
+      "epoch": 0.32449237345768966,
+      "grad_norm": 0.19849295914173126,
+      "learning_rate": 0.00013546126897625535,
+      "loss": 1.021,
+      "step": 835
+    },
+    {
+      "epoch": 0.3248809870785971,
+      "grad_norm": 0.21772268414497375,
+      "learning_rate": 0.00013538341767224603,
+      "loss": 1.0605,
+      "step": 836
+    },
+    {
+      "epoch": 0.3252696006995045,
+      "grad_norm": 0.19670265913009644,
+      "learning_rate": 0.00013530556636823669,
+      "loss": 1.0165,
+      "step": 837
+    },
+    {
+      "epoch": 0.3256582143204119,
+      "grad_norm": 0.19339734315872192,
+      "learning_rate": 0.00013522771506422734,
+      "loss": 1.0203,
+      "step": 838
+    },
+    {
+      "epoch": 0.32604682794131934,
+      "grad_norm": 0.21289557218551636,
+      "learning_rate": 0.000135149863760218,
+      "loss": 1.0252,
+      "step": 839
+    },
+    {
+      "epoch": 0.32643544156222676,
+      "grad_norm": 0.1964789777994156,
+      "learning_rate": 0.00013507201245620864,
+      "loss": 1.0392,
+      "step": 840
+    },
+    {
+      "epoch": 0.3268240551831342,
+      "grad_norm": 0.20783716440200806,
+      "learning_rate": 0.00013499416115219932,
+      "loss": 1.0569,
+      "step": 841
+    },
+    {
+      "epoch": 0.3272126688040416,
+      "grad_norm": 0.22782161831855774,
+      "learning_rate": 0.00013491630984818997,
+      "loss": 1.0555,
+      "step": 842
+    },
+    {
+      "epoch": 0.327601282424949,
+      "grad_norm": 0.22771142423152924,
+      "learning_rate": 0.00013483845854418063,
+      "loss": 1.085,
+      "step": 843
+    },
+    {
+      "epoch": 0.32798989604585643,
+      "grad_norm": 0.19773711264133453,
+      "learning_rate": 0.00013476060724017128,
+      "loss": 1.008,
+      "step": 844
+    },
+    {
+      "epoch": 0.3283785096667638,
+      "grad_norm": 0.22399166226387024,
+      "learning_rate": 0.00013468275593616193,
+      "loss": 1.0511,
+      "step": 845
+    },
+    {
+      "epoch": 0.3287671232876712,
+      "grad_norm": 0.20488236844539642,
+      "learning_rate": 0.00013460490463215258,
+      "loss": 1.0883,
+      "step": 846
+    },
+    {
+      "epoch": 0.32915573690857863,
+      "grad_norm": 0.21387654542922974,
+      "learning_rate": 0.00013452705332814326,
+      "loss": 1.0808,
+      "step": 847
+    },
+    {
+      "epoch": 0.32954435052948605,
+      "grad_norm": 0.1972568780183792,
+      "learning_rate": 0.0001344492020241339,
+      "loss": 1.0555,
+      "step": 848
+    },
+    {
+      "epoch": 0.32993296415039347,
+      "grad_norm": 0.20835663378238678,
+      "learning_rate": 0.00013437135072012456,
+      "loss": 1.0473,
+      "step": 849
+    },
+    {
+      "epoch": 0.3303215777713009,
+      "grad_norm": 0.19707520306110382,
+      "learning_rate": 0.00013429349941611522,
+      "loss": 0.9585,
+      "step": 850
+    },
+    {
+      "epoch": 0.3307101913922083,
+      "grad_norm": 0.19163411855697632,
+      "learning_rate": 0.00013421564811210587,
+      "loss": 1.0025,
+      "step": 851
+    },
+    {
+      "epoch": 0.3310988050131157,
+      "grad_norm": 0.19730083644390106,
+      "learning_rate": 0.00013413779680809655,
+      "loss": 1.0696,
+      "step": 852
+    },
+    {
+      "epoch": 0.33148741863402315,
+      "grad_norm": 0.19537493586540222,
+      "learning_rate": 0.0001340599455040872,
+      "loss": 1.0466,
+      "step": 853
+    },
+    {
+      "epoch": 0.3318760322549305,
+      "grad_norm": 0.2255164235830307,
+      "learning_rate": 0.00013398209420007785,
+      "loss": 1.0659,
+      "step": 854
+    },
+    {
+      "epoch": 0.3322646458758379,
+      "grad_norm": 0.19774770736694336,
+      "learning_rate": 0.0001339042428960685,
+      "loss": 1.0326,
+      "step": 855
+    },
+    {
+      "epoch": 0.33265325949674535,
+      "grad_norm": 0.2004510909318924,
+      "learning_rate": 0.00013382639159205916,
+      "loss": 1.0327,
+      "step": 856
+    },
+    {
+      "epoch": 0.33304187311765276,
+      "grad_norm": 0.19187591969966888,
+      "learning_rate": 0.00013374854028804984,
+      "loss": 1.0069,
+      "step": 857
+    },
+    {
+      "epoch": 0.3334304867385602,
+      "grad_norm": 0.18775832653045654,
+      "learning_rate": 0.0001336706889840405,
+      "loss": 1.0083,
+      "step": 858
+    },
+    {
+      "epoch": 0.3338191003594676,
+      "grad_norm": 0.2005717158317566,
+      "learning_rate": 0.00013359283768003114,
+      "loss": 1.0398,
+      "step": 859
+    },
+    {
+      "epoch": 0.334207713980375,
+      "grad_norm": 0.19705893099308014,
+      "learning_rate": 0.0001335149863760218,
+      "loss": 1.0031,
+      "step": 860
+    },
+    {
+      "epoch": 0.33459632760128244,
+      "grad_norm": 0.19589562714099884,
+      "learning_rate": 0.00013343713507201244,
+      "loss": 0.9831,
+      "step": 861
+    },
+    {
+      "epoch": 0.33498494122218986,
+      "grad_norm": 0.19302591681480408,
+      "learning_rate": 0.00013335928376800312,
+      "loss": 1.0009,
+      "step": 862
+    },
+    {
+      "epoch": 0.3353735548430973,
+      "grad_norm": 0.20499618351459503,
+      "learning_rate": 0.00013328143246399377,
+      "loss": 1.0205,
+      "step": 863
+    },
+    {
+      "epoch": 0.33576216846400464,
+      "grad_norm": 0.20514456927776337,
+      "learning_rate": 0.00013320358115998443,
+      "loss": 1.0837,
+      "step": 864
+    },
+    {
+      "epoch": 0.33615078208491206,
+      "grad_norm": 0.19285848736763,
+      "learning_rate": 0.00013312572985597508,
+      "loss": 1.0167,
+      "step": 865
+    },
+    {
+      "epoch": 0.3365393957058195,
+      "grad_norm": 0.20891553163528442,
+      "learning_rate": 0.00013304787855196573,
+      "loss": 1.0127,
+      "step": 866
+    },
+    {
+      "epoch": 0.3369280093267269,
+      "grad_norm": 0.20511706173419952,
+      "learning_rate": 0.0001329700272479564,
+      "loss": 0.964,
+      "step": 867
+    },
+    {
+      "epoch": 0.3373166229476343,
+      "grad_norm": 0.1855512261390686,
+      "learning_rate": 0.00013289217594394706,
+      "loss": 0.9721,
+      "step": 868
+    },
+    {
+      "epoch": 0.33770523656854173,
+      "grad_norm": 0.20010098814964294,
+      "learning_rate": 0.00013281432463993771,
+      "loss": 1.0411,
+      "step": 869
+    },
+    {
+      "epoch": 0.33809385018944915,
+      "grad_norm": 0.1991325318813324,
+      "learning_rate": 0.0001327364733359284,
+      "loss": 0.9658,
+      "step": 870
+    },
+    {
+      "epoch": 0.33848246381035657,
+      "grad_norm": 0.19895736873149872,
+      "learning_rate": 0.00013265862203191905,
+      "loss": 1.0744,
+      "step": 871
+    },
+    {
+      "epoch": 0.338871077431264,
+      "grad_norm": 0.2091255635023117,
+      "learning_rate": 0.0001325807707279097,
+      "loss": 1.0375,
+      "step": 872
+    },
+    {
+      "epoch": 0.33925969105217135,
+      "grad_norm": 0.21355532109737396,
+      "learning_rate": 0.00013250291942390035,
+      "loss": 1.09,
+      "step": 873
+    },
+    {
+      "epoch": 0.33964830467307877,
+      "grad_norm": 0.21844851970672607,
+      "learning_rate": 0.00013242506811989103,
+      "loss": 1.0769,
+      "step": 874
+    },
+    {
+      "epoch": 0.3400369182939862,
+      "grad_norm": 0.1877543330192566,
+      "learning_rate": 0.00013234721681588168,
+      "loss": 1.0199,
+      "step": 875
+    },
+    {
+      "epoch": 0.3404255319148936,
+      "grad_norm": 0.2020038366317749,
+      "learning_rate": 0.00013226936551187233,
+      "loss": 1.0218,
+      "step": 876
+    },
+    {
+      "epoch": 0.340814145535801,
+      "grad_norm": 0.20682141184806824,
+      "learning_rate": 0.000132191514207863,
+      "loss": 1.0891,
+      "step": 877
+    },
+    {
+      "epoch": 0.34120275915670845,
+      "grad_norm": 0.21942824125289917,
+      "learning_rate": 0.00013211366290385366,
+      "loss": 0.9877,
+      "step": 878
+    },
+    {
+      "epoch": 0.34159137277761586,
+      "grad_norm": 0.21150313317775726,
+      "learning_rate": 0.00013203581159984432,
+      "loss": 1.0815,
+      "step": 879
+    },
+    {
+      "epoch": 0.3419799863985233,
+      "grad_norm": 0.2073293924331665,
+      "learning_rate": 0.00013195796029583497,
+      "loss": 1.0579,
+      "step": 880
+    },
+    {
+      "epoch": 0.3423686000194307,
+      "grad_norm": 0.221574068069458,
+      "learning_rate": 0.00013188010899182562,
+      "loss": 1.0279,
+      "step": 881
+    },
+    {
+      "epoch": 0.3427572136403381,
+      "grad_norm": 0.22334492206573486,
+      "learning_rate": 0.00013180225768781627,
+      "loss": 1.0837,
+      "step": 882
+    },
+    {
+      "epoch": 0.3431458272612455,
+      "grad_norm": 0.18817654252052307,
+      "learning_rate": 0.00013172440638380695,
+      "loss": 1.0262,
+      "step": 883
+    },
+    {
+      "epoch": 0.3435344408821529,
+      "grad_norm": 0.20126822590827942,
+      "learning_rate": 0.0001316465550797976,
+      "loss": 1.0679,
+      "step": 884
+    },
+    {
+      "epoch": 0.3439230545030603,
+      "grad_norm": 0.2128864973783493,
+      "learning_rate": 0.00013156870377578825,
+      "loss": 1.0316,
+      "step": 885
+    },
+    {
+      "epoch": 0.34431166812396774,
+      "grad_norm": 0.20054499804973602,
+      "learning_rate": 0.0001314908524717789,
+      "loss": 1.0024,
+      "step": 886
+    },
+    {
+      "epoch": 0.34470028174487516,
+      "grad_norm": 0.21358034014701843,
+      "learning_rate": 0.00013141300116776956,
+      "loss": 1.0475,
+      "step": 887
+    },
+    {
+      "epoch": 0.3450888953657826,
+      "grad_norm": 0.21377703547477722,
+      "learning_rate": 0.00013133514986376024,
+      "loss": 1.0957,
+      "step": 888
+    },
+    {
+      "epoch": 0.34547750898669,
+      "grad_norm": 0.20166514813899994,
+      "learning_rate": 0.0001312572985597509,
+      "loss": 1.0189,
+      "step": 889
+    },
+    {
+      "epoch": 0.3458661226075974,
+      "grad_norm": 0.20424878597259521,
+      "learning_rate": 0.00013117944725574154,
+      "loss": 1.0896,
+      "step": 890
+    },
+    {
+      "epoch": 0.34625473622850483,
+      "grad_norm": 0.19028648734092712,
+      "learning_rate": 0.0001311015959517322,
+      "loss": 0.9881,
+      "step": 891
+    },
+    {
+      "epoch": 0.3466433498494122,
+      "grad_norm": 0.20828665792942047,
+      "learning_rate": 0.00013102374464772285,
+      "loss": 0.9932,
+      "step": 892
+    },
+    {
+      "epoch": 0.3470319634703196,
+      "grad_norm": 0.20756572484970093,
+      "learning_rate": 0.00013094589334371353,
+      "loss": 1.0406,
+      "step": 893
+    },
+    {
+      "epoch": 0.34742057709122703,
+      "grad_norm": 0.20768921077251434,
+      "learning_rate": 0.00013086804203970418,
+      "loss": 0.9652,
+      "step": 894
+    },
+    {
+      "epoch": 0.34780919071213445,
+      "grad_norm": 0.20660027861595154,
+      "learning_rate": 0.00013079019073569483,
+      "loss": 1.0728,
+      "step": 895
+    },
+    {
+      "epoch": 0.34819780433304187,
+      "grad_norm": 0.20186837017536163,
+      "learning_rate": 0.00013071233943168548,
+      "loss": 1.0407,
+      "step": 896
+    },
+    {
+      "epoch": 0.3485864179539493,
+      "grad_norm": 0.20880667865276337,
+      "learning_rate": 0.00013063448812767613,
+      "loss": 1.0275,
+      "step": 897
+    },
+    {
+      "epoch": 0.3489750315748567,
+      "grad_norm": 0.22212949395179749,
+      "learning_rate": 0.0001305566368236668,
+      "loss": 1.0293,
+      "step": 898
+    },
+    {
+      "epoch": 0.3493636451957641,
+      "grad_norm": 0.20552745461463928,
+      "learning_rate": 0.00013047878551965746,
+      "loss": 1.0434,
+      "step": 899
+    },
+    {
+      "epoch": 0.34975225881667155,
+      "grad_norm": 0.21239839494228363,
+      "learning_rate": 0.00013040093421564812,
+      "loss": 1.052,
+      "step": 900
+    },
+    {
+      "epoch": 0.3501408724375789,
+      "grad_norm": 0.22420544922351837,
+      "learning_rate": 0.00013032308291163877,
+      "loss": 1.0236,
+      "step": 901
+    },
+    {
+      "epoch": 0.35052948605848633,
+      "grad_norm": 0.23435090482234955,
+      "learning_rate": 0.00013024523160762942,
+      "loss": 1.0876,
+      "step": 902
+    },
+    {
+      "epoch": 0.35091809967939375,
+      "grad_norm": 0.22763386368751526,
+      "learning_rate": 0.0001301673803036201,
+      "loss": 1.0636,
+      "step": 903
+    },
+    {
+      "epoch": 0.35130671330030117,
+      "grad_norm": 0.20948883891105652,
+      "learning_rate": 0.00013008952899961075,
+      "loss": 1.0083,
+      "step": 904
+    },
+    {
+      "epoch": 0.3516953269212086,
+      "grad_norm": 0.20408779382705688,
+      "learning_rate": 0.0001300116776956014,
+      "loss": 1.039,
+      "step": 905
+    },
+    {
+      "epoch": 0.352083940542116,
+      "grad_norm": 0.2126050591468811,
+      "learning_rate": 0.00012993382639159206,
+      "loss": 1.0365,
+      "step": 906
+    },
+    {
+      "epoch": 0.3524725541630234,
+      "grad_norm": 0.20314334332942963,
+      "learning_rate": 0.0001298559750875827,
+      "loss": 1.0474,
+      "step": 907
+    },
+    {
+      "epoch": 0.35286116778393084,
+      "grad_norm": 0.23720984160900116,
+      "learning_rate": 0.0001297781237835734,
+      "loss": 1.0529,
+      "step": 908
+    },
+    {
+      "epoch": 0.35324978140483826,
+      "grad_norm": 0.22642800211906433,
+      "learning_rate": 0.00012970027247956404,
+      "loss": 1.0586,
+      "step": 909
+    },
+    {
+      "epoch": 0.3536383950257457,
+      "grad_norm": 0.20469972491264343,
+      "learning_rate": 0.0001296224211755547,
+      "loss": 1.0267,
+      "step": 910
+    },
+    {
+      "epoch": 0.35402700864665304,
+      "grad_norm": 0.197368785738945,
+      "learning_rate": 0.00012954456987154534,
+      "loss": 1.0348,
+      "step": 911
+    },
+    {
+      "epoch": 0.35441562226756046,
+      "grad_norm": 0.21924498677253723,
+      "learning_rate": 0.000129466718567536,
+      "loss": 1.0861,
+      "step": 912
+    },
+    {
+      "epoch": 0.3548042358884679,
+      "grad_norm": 0.22006285190582275,
+      "learning_rate": 0.00012938886726352667,
+      "loss": 1.0545,
+      "step": 913
+    },
+    {
+      "epoch": 0.3551928495093753,
+      "grad_norm": 0.22419220209121704,
+      "learning_rate": 0.00012931101595951733,
+      "loss": 1.0716,
+      "step": 914
+    },
+    {
+      "epoch": 0.3555814631302827,
+      "grad_norm": 0.215990349650383,
+      "learning_rate": 0.00012923316465550798,
+      "loss": 1.0619,
+      "step": 915
+    },
+    {
+      "epoch": 0.35597007675119013,
+      "grad_norm": 0.20783264935016632,
+      "learning_rate": 0.00012915531335149863,
+      "loss": 1.0412,
+      "step": 916
+    },
+    {
+      "epoch": 0.35635869037209755,
+      "grad_norm": 0.24584618210792542,
+      "learning_rate": 0.00012907746204748928,
+      "loss": 1.1165,
+      "step": 917
+    },
+    {
+      "epoch": 0.35674730399300497,
+      "grad_norm": 0.23146122694015503,
+      "learning_rate": 0.00012899961074347996,
+      "loss": 1.1111,
+      "step": 918
+    },
+    {
+      "epoch": 0.3571359176139124,
+      "grad_norm": 0.19983729720115662,
+      "learning_rate": 0.00012892175943947061,
+      "loss": 1.0674,
+      "step": 919
+    },
+    {
+      "epoch": 0.35752453123481975,
+      "grad_norm": 0.2161000818014145,
+      "learning_rate": 0.00012884390813546127,
+      "loss": 1.076,
+      "step": 920
+    },
+    {
+      "epoch": 0.35791314485572717,
+      "grad_norm": 0.21042793989181519,
+      "learning_rate": 0.00012876605683145192,
+      "loss": 1.0535,
+      "step": 921
+    },
+    {
+      "epoch": 0.3583017584766346,
+      "grad_norm": 0.20135439932346344,
+      "learning_rate": 0.0001286882055274426,
+      "loss": 1.0059,
+      "step": 922
+    },
+    {
+      "epoch": 0.358690372097542,
+      "grad_norm": 0.19394971430301666,
+      "learning_rate": 0.00012861035422343325,
+      "loss": 1.0381,
+      "step": 923
+    },
+    {
+      "epoch": 0.35907898571844943,
+      "grad_norm": 0.21171030402183533,
+      "learning_rate": 0.0001285325029194239,
+      "loss": 1.0513,
+      "step": 924
+    },
+    {
+      "epoch": 0.35946759933935685,
+      "grad_norm": 0.19476690888404846,
+      "learning_rate": 0.00012845465161541458,
+      "loss": 1.0003,
+      "step": 925
+    },
+    {
+      "epoch": 0.35985621296026427,
+      "grad_norm": 0.20468670129776,
+      "learning_rate": 0.00012837680031140523,
+      "loss": 1.0608,
+      "step": 926
+    },
+    {
+      "epoch": 0.3602448265811717,
+      "grad_norm": 0.21159446239471436,
+      "learning_rate": 0.00012829894900739588,
+      "loss": 1.0734,
+      "step": 927
+    },
+    {
+      "epoch": 0.3606334402020791,
+      "grad_norm": 0.21179519593715668,
+      "learning_rate": 0.00012822109770338654,
+      "loss": 1.0957,
+      "step": 928
+    },
+    {
+      "epoch": 0.3610220538229865,
+      "grad_norm": 0.20997527241706848,
+      "learning_rate": 0.00012814324639937722,
+      "loss": 1.0644,
+      "step": 929
+    },
+    {
+      "epoch": 0.3614106674438939,
+      "grad_norm": 0.21178296208381653,
+      "learning_rate": 0.00012806539509536787,
+      "loss": 1.0208,
+      "step": 930
+    },
+    {
+      "epoch": 0.3617992810648013,
+      "grad_norm": 0.20890356600284576,
+      "learning_rate": 0.00012798754379135852,
+      "loss": 1.0888,
+      "step": 931
+    },
+    {
+      "epoch": 0.3621878946857087,
+      "grad_norm": 0.20177409052848816,
+      "learning_rate": 0.00012790969248734917,
+      "loss": 0.9741,
+      "step": 932
+    },
+    {
+      "epoch": 0.36257650830661614,
+      "grad_norm": 0.23504556715488434,
+      "learning_rate": 0.00012783184118333982,
+      "loss": 1.1048,
+      "step": 933
+    },
+    {
+      "epoch": 0.36296512192752356,
+      "grad_norm": 0.22829356789588928,
+      "learning_rate": 0.0001277539898793305,
+      "loss": 1.0798,
+      "step": 934
+    },
+    {
+      "epoch": 0.363353735548431,
+      "grad_norm": 0.2068483531475067,
+      "learning_rate": 0.00012767613857532116,
+      "loss": 1.0452,
+      "step": 935
+    },
+    {
+      "epoch": 0.3637423491693384,
+      "grad_norm": 0.2093171775341034,
+      "learning_rate": 0.0001275982872713118,
+      "loss": 1.0742,
+      "step": 936
+    },
+    {
+      "epoch": 0.3641309627902458,
+      "grad_norm": 0.21478736400604248,
+      "learning_rate": 0.00012752043596730246,
+      "loss": 1.0572,
+      "step": 937
+    },
+    {
+      "epoch": 0.36451957641115323,
+      "grad_norm": 0.1906953752040863,
+      "learning_rate": 0.0001274425846632931,
+      "loss": 1.0107,
+      "step": 938
+    },
+    {
+      "epoch": 0.3649081900320606,
+      "grad_norm": 0.20580604672431946,
+      "learning_rate": 0.0001273647333592838,
+      "loss": 1.0677,
+      "step": 939
+    },
+    {
+      "epoch": 0.365296803652968,
+      "grad_norm": 0.22586850821971893,
+      "learning_rate": 0.00012728688205527444,
+      "loss": 1.0389,
+      "step": 940
+    },
+    {
+      "epoch": 0.36568541727387543,
+      "grad_norm": 0.199899360537529,
+      "learning_rate": 0.0001272090307512651,
+      "loss": 1.0462,
+      "step": 941
+    },
+    {
+      "epoch": 0.36607403089478285,
+      "grad_norm": 0.19881689548492432,
+      "learning_rate": 0.00012713117944725575,
+      "loss": 1.0565,
+      "step": 942
+    },
+    {
+      "epoch": 0.3664626445156903,
+      "grad_norm": 0.21748925745487213,
+      "learning_rate": 0.0001270533281432464,
+      "loss": 1.0659,
+      "step": 943
+    },
+    {
+      "epoch": 0.3668512581365977,
+      "grad_norm": 0.19363689422607422,
+      "learning_rate": 0.00012697547683923708,
+      "loss": 1.0307,
+      "step": 944
+    },
+    {
+      "epoch": 0.3672398717575051,
+      "grad_norm": 0.21701784431934357,
+      "learning_rate": 0.00012689762553522773,
+      "loss": 1.0684,
+      "step": 945
+    },
+    {
+      "epoch": 0.36762848537841253,
+      "grad_norm": 0.21406958997249603,
+      "learning_rate": 0.00012681977423121838,
+      "loss": 1.0703,
+      "step": 946
+    },
+    {
+      "epoch": 0.36801709899931995,
+      "grad_norm": 0.23539729416370392,
+      "learning_rate": 0.00012674192292720903,
+      "loss": 1.1537,
+      "step": 947
+    },
+    {
+      "epoch": 0.36840571262022737,
+      "grad_norm": 0.2177354395389557,
+      "learning_rate": 0.00012666407162319969,
+      "loss": 1.0131,
+      "step": 948
+    },
+    {
+      "epoch": 0.36879432624113473,
+      "grad_norm": 0.255346417427063,
+      "learning_rate": 0.00012658622031919037,
+      "loss": 0.9807,
+      "step": 949
+    },
+    {
+      "epoch": 0.36918293986204215,
+      "grad_norm": 0.2139921486377716,
+      "learning_rate": 0.00012650836901518102,
+      "loss": 1.0392,
+      "step": 950
+    },
+    {
+      "epoch": 0.36957155348294957,
+      "grad_norm": 0.22490833699703217,
+      "learning_rate": 0.00012643051771117167,
+      "loss": 1.0512,
+      "step": 951
+    },
+    {
+      "epoch": 0.369960167103857,
+      "grad_norm": 0.20698820054531097,
+      "learning_rate": 0.00012635266640716232,
+      "loss": 1.0391,
+      "step": 952
+    },
+    {
+      "epoch": 0.3703487807247644,
+      "grad_norm": 0.2276201844215393,
+      "learning_rate": 0.00012627481510315297,
+      "loss": 1.0513,
+      "step": 953
+    },
+    {
+      "epoch": 0.3707373943456718,
+      "grad_norm": 0.2493600994348526,
+      "learning_rate": 0.00012619696379914365,
+      "loss": 1.0136,
+      "step": 954
+    },
+    {
+      "epoch": 0.37112600796657924,
+      "grad_norm": 0.2155001014471054,
+      "learning_rate": 0.0001261191124951343,
+      "loss": 1.0523,
+      "step": 955
+    },
+    {
+      "epoch": 0.37151462158748666,
+      "grad_norm": 0.21571211516857147,
+      "learning_rate": 0.00012604126119112496,
+      "loss": 1.0288,
+      "step": 956
+    },
+    {
+      "epoch": 0.3719032352083941,
+      "grad_norm": 0.23238877952098846,
+      "learning_rate": 0.0001259634098871156,
+      "loss": 1.0638,
+      "step": 957
+    },
+    {
+      "epoch": 0.37229184882930144,
+      "grad_norm": 0.2002813220024109,
+      "learning_rate": 0.00012588555858310626,
+      "loss": 0.9665,
+      "step": 958
+    },
+    {
+      "epoch": 0.37268046245020886,
+      "grad_norm": 0.21712858974933624,
+      "learning_rate": 0.0001258077072790969,
+      "loss": 1.0469,
+      "step": 959
+    },
+    {
+      "epoch": 0.3730690760711163,
+      "grad_norm": 0.2178192287683487,
+      "learning_rate": 0.0001257298559750876,
+      "loss": 1.0267,
+      "step": 960
+    },
+    {
+      "epoch": 0.3734576896920237,
+      "grad_norm": 0.25488024950027466,
+      "learning_rate": 0.00012565200467107824,
+      "loss": 1.0153,
+      "step": 961
+    },
+    {
+      "epoch": 0.3738463033129311,
+      "grad_norm": 0.20070038735866547,
+      "learning_rate": 0.0001255741533670689,
+      "loss": 1.0279,
+      "step": 962
+    },
+    {
+      "epoch": 0.37423491693383854,
+      "grad_norm": 0.21885356307029724,
+      "learning_rate": 0.00012549630206305955,
+      "loss": 1.0395,
+      "step": 963
+    },
+    {
+      "epoch": 0.37462353055474595,
+      "grad_norm": 0.2407921701669693,
+      "learning_rate": 0.0001254184507590502,
+      "loss": 1.0767,
+      "step": 964
+    },
+    {
+      "epoch": 0.3750121441756534,
+      "grad_norm": 0.20645053684711456,
+      "learning_rate": 0.00012534059945504088,
+      "loss": 1.0318,
+      "step": 965
+    },
+    {
+      "epoch": 0.3754007577965608,
+      "grad_norm": 0.21275092661380768,
+      "learning_rate": 0.00012526274815103153,
+      "loss": 1.0546,
+      "step": 966
+    },
+    {
+      "epoch": 0.3757893714174682,
+      "grad_norm": 0.21574917435646057,
+      "learning_rate": 0.00012518489684702218,
+      "loss": 1.032,
+      "step": 967
+    },
+    {
+      "epoch": 0.3761779850383756,
+      "grad_norm": 0.21589480340480804,
+      "learning_rate": 0.00012510704554301284,
+      "loss": 1.0834,
+      "step": 968
+    },
+    {
+      "epoch": 0.376566598659283,
+      "grad_norm": 0.19576796889305115,
+      "learning_rate": 0.0001250291942390035,
+      "loss": 1.0178,
+      "step": 969
+    },
+    {
+      "epoch": 0.3769552122801904,
+      "grad_norm": 0.20941287279129028,
+      "learning_rate": 0.00012495134293499417,
+      "loss": 1.0712,
+      "step": 970
+    },
+    {
+      "epoch": 0.37734382590109783,
+      "grad_norm": 0.22585494816303253,
+      "learning_rate": 0.00012487349163098482,
+      "loss": 1.0401,
+      "step": 971
+    },
+    {
+      "epoch": 0.37773243952200525,
+      "grad_norm": 0.21093420684337616,
+      "learning_rate": 0.00012479564032697547,
+      "loss": 1.0569,
+      "step": 972
+    },
+    {
+      "epoch": 0.37812105314291267,
+      "grad_norm": 0.22375014424324036,
+      "learning_rate": 0.00012471778902296612,
+      "loss": 1.0687,
+      "step": 973
+    },
+    {
+      "epoch": 0.3785096667638201,
+      "grad_norm": 0.19787487387657166,
+      "learning_rate": 0.0001246399377189568,
+      "loss": 1.0266,
+      "step": 974
+    },
+    {
+      "epoch": 0.3788982803847275,
+      "grad_norm": 0.20633013546466827,
+      "learning_rate": 0.00012456208641494745,
+      "loss": 0.9996,
+      "step": 975
+    },
+    {
+      "epoch": 0.3792868940056349,
+      "grad_norm": 0.21559873223304749,
+      "learning_rate": 0.0001244842351109381,
+      "loss": 1.0851,
+      "step": 976
+    },
+    {
+      "epoch": 0.3796755076265423,
+      "grad_norm": 0.2166333943605423,
+      "learning_rate": 0.00012440638380692879,
+      "loss": 1.0859,
+      "step": 977
+    },
+    {
+      "epoch": 0.3800641212474497,
+      "grad_norm": 0.18558773398399353,
+      "learning_rate": 0.00012432853250291944,
+      "loss": 0.9534,
+      "step": 978
+    },
+    {
+      "epoch": 0.3804527348683571,
+      "grad_norm": 0.2086942344903946,
+      "learning_rate": 0.0001242506811989101,
+      "loss": 1.0786,
+      "step": 979
+    },
+    {
+      "epoch": 0.38084134848926454,
+      "grad_norm": 0.2207823544740677,
+      "learning_rate": 0.00012417282989490074,
+      "loss": 1.0626,
+      "step": 980
+    },
+    {
+      "epoch": 0.38122996211017196,
+      "grad_norm": 0.21255749464035034,
+      "learning_rate": 0.00012409497859089142,
+      "loss": 1.063,
+      "step": 981
+    },
+    {
+      "epoch": 0.3816185757310794,
+      "grad_norm": 0.20682042837142944,
+      "learning_rate": 0.00012401712728688207,
+      "loss": 1.034,
+      "step": 982
+    },
+    {
+      "epoch": 0.3820071893519868,
+      "grad_norm": 0.2084134966135025,
+      "learning_rate": 0.00012393927598287272,
+      "loss": 1.0481,
+      "step": 983
+    },
+    {
+      "epoch": 0.3823958029728942,
+      "grad_norm": 0.1922312080860138,
+      "learning_rate": 0.00012386142467886338,
+      "loss": 1.0461,
+      "step": 984
+    },
+    {
+      "epoch": 0.38278441659380164,
+      "grad_norm": 0.20893707871437073,
+      "learning_rate": 0.00012378357337485406,
+      "loss": 1.0797,
+      "step": 985
+    },
+    {
+      "epoch": 0.383173030214709,
+      "grad_norm": 0.19717541337013245,
+      "learning_rate": 0.0001237057220708447,
+      "loss": 1.0028,
+      "step": 986
+    },
+    {
+      "epoch": 0.3835616438356164,
+      "grad_norm": 0.20688053965568542,
+      "learning_rate": 0.00012362787076683536,
+      "loss": 0.989,
+      "step": 987
+    },
+    {
+      "epoch": 0.38395025745652384,
+      "grad_norm": 0.20580583810806274,
+      "learning_rate": 0.000123550019462826,
+      "loss": 1.06,
+      "step": 988
+    },
+    {
+      "epoch": 0.38433887107743125,
+      "grad_norm": 0.2151709794998169,
+      "learning_rate": 0.00012347216815881666,
+      "loss": 1.0685,
+      "step": 989
+    },
+    {
+      "epoch": 0.3847274846983387,
+      "grad_norm": 0.19573980569839478,
+      "learning_rate": 0.00012339431685480734,
+      "loss": 1.0072,
+      "step": 990
+    },
+    {
+      "epoch": 0.3851160983192461,
+      "grad_norm": 0.1949119120836258,
+      "learning_rate": 0.000123316465550798,
+      "loss": 0.9995,
+      "step": 991
+    },
+    {
+      "epoch": 0.3855047119401535,
+      "grad_norm": 0.2062375247478485,
+      "learning_rate": 0.00012323861424678865,
+      "loss": 1.0694,
+      "step": 992
+    },
+    {
+      "epoch": 0.38589332556106093,
+      "grad_norm": 0.2007209211587906,
+      "learning_rate": 0.0001231607629427793,
+      "loss": 1.0397,
+      "step": 993
+    },
+    {
+      "epoch": 0.38628193918196835,
+      "grad_norm": 0.2231544405221939,
+      "learning_rate": 0.00012308291163876995,
+      "loss": 1.0755,
+      "step": 994
+    },
+    {
+      "epoch": 0.38667055280287577,
+      "grad_norm": 0.2103337049484253,
+      "learning_rate": 0.0001230050603347606,
+      "loss": 1.0505,
+      "step": 995
+    },
+    {
+      "epoch": 0.38705916642378313,
+      "grad_norm": 0.20178386569023132,
+      "learning_rate": 0.00012292720903075128,
+      "loss": 1.0696,
+      "step": 996
+    },
+    {
+      "epoch": 0.38744778004469055,
+      "grad_norm": 0.21268007159233093,
+      "learning_rate": 0.00012284935772674193,
+      "loss": 1.0262,
+      "step": 997
+    },
+    {
+      "epoch": 0.38783639366559797,
+      "grad_norm": 0.21439722180366516,
+      "learning_rate": 0.0001227715064227326,
+      "loss": 1.0718,
+      "step": 998
+    },
+    {
+      "epoch": 0.3882250072865054,
+      "grad_norm": 0.19691336154937744,
+      "learning_rate": 0.00012269365511872324,
+      "loss": 0.9663,
+      "step": 999
+    },
+    {
+      "epoch": 0.3886136209074128,
+      "grad_norm": 0.2165926694869995,
+      "learning_rate": 0.0001226158038147139,
+      "loss": 1.0432,
+      "step": 1000
+    },
+    {
+      "epoch": 0.3890022345283202,
+      "grad_norm": 0.20730604231357574,
+      "learning_rate": 0.00012253795251070457,
+      "loss": 1.0386,
+      "step": 1001
+    },
+    {
+      "epoch": 0.38939084814922764,
+      "grad_norm": 0.2138068974018097,
+      "learning_rate": 0.00012246010120669522,
+      "loss": 1.0683,
+      "step": 1002
+    },
+    {
+      "epoch": 0.38977946177013506,
+      "grad_norm": 0.2118951678276062,
+      "learning_rate": 0.00012238224990268587,
+      "loss": 1.0393,
+      "step": 1003
+    },
+    {
+      "epoch": 0.3901680753910425,
+      "grad_norm": 0.20879961550235748,
+      "learning_rate": 0.00012230439859867653,
+      "loss": 1.0349,
+      "step": 1004
+    },
+    {
+      "epoch": 0.39055668901194984,
+      "grad_norm": 0.19588464498519897,
+      "learning_rate": 0.00012222654729466718,
+      "loss": 1.0226,
+      "step": 1005
+    },
+    {
+      "epoch": 0.39094530263285726,
+      "grad_norm": 0.2059485912322998,
+      "learning_rate": 0.00012214869599065786,
+      "loss": 1.052,
+      "step": 1006
+    },
+    {
+      "epoch": 0.3913339162537647,
+      "grad_norm": 0.2299761176109314,
+      "learning_rate": 0.0001220708446866485,
+      "loss": 1.1055,
+      "step": 1007
+    },
+    {
+      "epoch": 0.3917225298746721,
+      "grad_norm": 0.20196737349033356,
+      "learning_rate": 0.00012199299338263916,
+      "loss": 1.0497,
+      "step": 1008
+    },
+    {
+      "epoch": 0.3921111434955795,
+      "grad_norm": 0.20615293085575104,
+      "learning_rate": 0.00012191514207862981,
+      "loss": 1.047,
+      "step": 1009
+    },
+    {
+      "epoch": 0.39249975711648694,
+      "grad_norm": 0.20265278220176697,
+      "learning_rate": 0.00012183729077462047,
+      "loss": 1.0035,
+      "step": 1010
+    },
+    {
+      "epoch": 0.39288837073739435,
+      "grad_norm": 0.20197926461696625,
+      "learning_rate": 0.00012175943947061114,
+      "loss": 0.9847,
+      "step": 1011
+    },
+    {
+      "epoch": 0.3932769843583018,
+      "grad_norm": 0.19974152743816376,
+      "learning_rate": 0.0001216815881666018,
+      "loss": 1.0669,
+      "step": 1012
+    },
+    {
+      "epoch": 0.3936655979792092,
+      "grad_norm": 0.21684005856513977,
+      "learning_rate": 0.00012160373686259245,
+      "loss": 1.0562,
+      "step": 1013
+    },
+    {
+      "epoch": 0.3940542116001166,
+      "grad_norm": 0.2030404955148697,
+      "learning_rate": 0.00012152588555858311,
+      "loss": 1.0159,
+      "step": 1014
+    },
+    {
+      "epoch": 0.394442825221024,
+      "grad_norm": 0.2123572677373886,
+      "learning_rate": 0.00012144803425457377,
+      "loss": 1.0757,
+      "step": 1015
+    },
+    {
+      "epoch": 0.3948314388419314,
+      "grad_norm": 0.20320011675357819,
+      "learning_rate": 0.00012137018295056443,
+      "loss": 1.038,
+      "step": 1016
+    },
+    {
+      "epoch": 0.3952200524628388,
+      "grad_norm": 0.20120739936828613,
+      "learning_rate": 0.00012129233164655508,
+      "loss": 1.1015,
+      "step": 1017
+    },
+    {
+      "epoch": 0.39560866608374623,
+      "grad_norm": 0.19862449169158936,
+      "learning_rate": 0.00012121448034254575,
+      "loss": 1.0328,
+      "step": 1018
+    },
+    {
+      "epoch": 0.39599727970465365,
+      "grad_norm": 0.19761312007904053,
+      "learning_rate": 0.0001211366290385364,
+      "loss": 0.997,
+      "step": 1019
+    },
+    {
+      "epoch": 0.39638589332556107,
+      "grad_norm": 0.1943569928407669,
+      "learning_rate": 0.00012105877773452705,
+      "loss": 1.0099,
+      "step": 1020
+    },
+    {
+      "epoch": 0.3967745069464685,
+      "grad_norm": 0.2109062373638153,
+      "learning_rate": 0.00012098092643051773,
+      "loss": 1.1039,
+      "step": 1021
+    },
+    {
+      "epoch": 0.3971631205673759,
+      "grad_norm": 0.20966266095638275,
+      "learning_rate": 0.00012090307512650839,
+      "loss": 1.1208,
+      "step": 1022
+    },
+    {
+      "epoch": 0.3975517341882833,
+      "grad_norm": 0.19208088517189026,
+      "learning_rate": 0.00012082522382249904,
+      "loss": 1.0147,
+      "step": 1023
+    },
+    {
+      "epoch": 0.3979403478091907,
+      "grad_norm": 0.21821236610412598,
+      "learning_rate": 0.00012074737251848969,
+      "loss": 1.0615,
+      "step": 1024
+    },
+    {
+      "epoch": 0.3983289614300981,
+      "grad_norm": 0.20031368732452393,
+      "learning_rate": 0.00012066952121448034,
+      "loss": 1.0303,
+      "step": 1025
+    },
+    {
+      "epoch": 0.3987175750510055,
+      "grad_norm": 0.22910597920417786,
+      "learning_rate": 0.00012059166991047102,
+      "loss": 1.0182,
+      "step": 1026
+    },
+    {
+      "epoch": 0.39910618867191294,
+      "grad_norm": 0.20816978812217712,
+      "learning_rate": 0.00012051381860646167,
+      "loss": 1.0142,
+      "step": 1027
+    },
+    {
+      "epoch": 0.39949480229282036,
+      "grad_norm": 0.20989780128002167,
+      "learning_rate": 0.00012043596730245232,
+      "loss": 1.0676,
+      "step": 1028
+    },
+    {
+      "epoch": 0.3998834159137278,
+      "grad_norm": 0.21894055604934692,
+      "learning_rate": 0.00012035811599844298,
+      "loss": 1.0222,
+      "step": 1029
+    },
+    {
+      "epoch": 0.4002720295346352,
+      "grad_norm": 0.2170870155096054,
+      "learning_rate": 0.00012028026469443363,
+      "loss": 1.0319,
+      "step": 1030
+    },
+    {
+      "epoch": 0.4006606431555426,
+      "grad_norm": 0.20869679749011993,
+      "learning_rate": 0.00012020241339042428,
+      "loss": 1.055,
+      "step": 1031
+    },
+    {
+      "epoch": 0.40104925677645004,
+      "grad_norm": 0.18850640952587128,
+      "learning_rate": 0.00012012456208641496,
+      "loss": 0.9993,
+      "step": 1032
+    },
+    {
+      "epoch": 0.40143787039735745,
+      "grad_norm": 0.21462580561637878,
+      "learning_rate": 0.00012004671078240561,
+      "loss": 1.0115,
+      "step": 1033
+    },
+    {
+      "epoch": 0.4018264840182648,
+      "grad_norm": 0.2008499950170517,
+      "learning_rate": 0.00011996885947839626,
+      "loss": 1.0229,
+      "step": 1034
+    },
+    {
+      "epoch": 0.40221509763917224,
+      "grad_norm": 0.20063354074954987,
+      "learning_rate": 0.00011989100817438692,
+      "loss": 1.0295,
+      "step": 1035
+    },
+    {
+      "epoch": 0.40260371126007966,
+      "grad_norm": 0.20655786991119385,
+      "learning_rate": 0.00011981315687037757,
+      "loss": 1.0044,
+      "step": 1036
+    },
+    {
+      "epoch": 0.4029923248809871,
+      "grad_norm": 0.1985999196767807,
+      "learning_rate": 0.00011973530556636825,
+      "loss": 1.0063,
+      "step": 1037
+    },
+    {
+      "epoch": 0.4033809385018945,
+      "grad_norm": 0.2039060890674591,
+      "learning_rate": 0.0001196574542623589,
+      "loss": 1.044,
+      "step": 1038
+    },
+    {
+      "epoch": 0.4037695521228019,
+      "grad_norm": 0.21838189661502838,
+      "learning_rate": 0.00011957960295834955,
+      "loss": 1.1101,
+      "step": 1039
+    },
+    {
+      "epoch": 0.40415816574370933,
+      "grad_norm": 0.21508415043354034,
+      "learning_rate": 0.00011950175165434022,
+      "loss": 1.0764,
+      "step": 1040
+    },
+    {
+      "epoch": 0.40454677936461675,
+      "grad_norm": 0.2089119255542755,
+      "learning_rate": 0.00011942390035033087,
+      "loss": 0.9986,
+      "step": 1041
+    },
+    {
+      "epoch": 0.40493539298552417,
+      "grad_norm": 0.19859452545642853,
+      "learning_rate": 0.00011934604904632153,
+      "loss": 1.0122,
+      "step": 1042
+    },
+    {
+      "epoch": 0.40532400660643153,
+      "grad_norm": 0.2018653154373169,
+      "learning_rate": 0.00011926819774231219,
+      "loss": 1.0187,
+      "step": 1043
+    },
+    {
+      "epoch": 0.40571262022733895,
+      "grad_norm": 0.19892063736915588,
+      "learning_rate": 0.00011919034643830285,
+      "loss": 1.0029,
+      "step": 1044
+    },
+    {
+      "epoch": 0.40610123384824637,
+      "grad_norm": 0.20355650782585144,
+      "learning_rate": 0.0001191124951342935,
+      "loss": 1.0484,
+      "step": 1045
+    },
+    {
+      "epoch": 0.4064898474691538,
+      "grad_norm": 0.2033994495868683,
+      "learning_rate": 0.00011903464383028416,
+      "loss": 1.087,
+      "step": 1046
+    },
+    {
+      "epoch": 0.4068784610900612,
+      "grad_norm": 0.2047330141067505,
+      "learning_rate": 0.00011895679252627484,
+      "loss": 1.0774,
+      "step": 1047
+    },
+    {
+      "epoch": 0.4072670747109686,
+      "grad_norm": 0.21420112252235413,
+      "learning_rate": 0.00011887894122226549,
+      "loss": 1.0252,
+      "step": 1048
+    },
+    {
+      "epoch": 0.40765568833187604,
+      "grad_norm": 0.2030097395181656,
+      "learning_rate": 0.00011880108991825614,
+      "loss": 1.0501,
+      "step": 1049
+    },
+    {
+      "epoch": 0.40804430195278346,
+      "grad_norm": 0.2128026783466339,
+      "learning_rate": 0.00011872323861424679,
+      "loss": 1.1031,
+      "step": 1050
+    },
+    {
+      "epoch": 0.4084329155736909,
+      "grad_norm": 0.20724938809871674,
+      "learning_rate": 0.00011864538731023744,
+      "loss": 1.0327,
+      "step": 1051
+    },
+    {
+      "epoch": 0.40882152919459824,
+      "grad_norm": 0.20344072580337524,
+      "learning_rate": 0.00011856753600622812,
+      "loss": 1.0719,
+      "step": 1052
+    },
+    {
+      "epoch": 0.40921014281550566,
+      "grad_norm": 0.2145012468099594,
+      "learning_rate": 0.00011848968470221877,
+      "loss": 1.0582,
+      "step": 1053
+    },
+    {
+      "epoch": 0.4095987564364131,
+      "grad_norm": 0.220048725605011,
+      "learning_rate": 0.00011841183339820943,
+      "loss": 1.0825,
+      "step": 1054
+    },
+    {
+      "epoch": 0.4099873700573205,
+      "grad_norm": 0.19074465334415436,
+      "learning_rate": 0.00011833398209420008,
+      "loss": 0.9657,
+      "step": 1055
+    },
+    {
+      "epoch": 0.4103759836782279,
+      "grad_norm": 0.1958267241716385,
+      "learning_rate": 0.00011825613079019073,
+      "loss": 0.9864,
+      "step": 1056
+    },
+    {
+      "epoch": 0.41076459729913534,
+      "grad_norm": 0.21768233180046082,
+      "learning_rate": 0.00011817827948618141,
+      "loss": 0.9997,
+      "step": 1057
+    },
+    {
+      "epoch": 0.41115321092004276,
+      "grad_norm": 0.20218704640865326,
+      "learning_rate": 0.00011810042818217206,
+      "loss": 1.072,
+      "step": 1058
+    },
+    {
+      "epoch": 0.4115418245409502,
+      "grad_norm": 0.2035023719072342,
+      "learning_rate": 0.00011802257687816271,
+      "loss": 1.0415,
+      "step": 1059
+    },
+    {
+      "epoch": 0.4119304381618576,
+      "grad_norm": 0.22603970766067505,
+      "learning_rate": 0.00011794472557415337,
+      "loss": 1.0751,
+      "step": 1060
+    },
+    {
+      "epoch": 0.412319051782765,
+      "grad_norm": 0.2125842273235321,
+      "learning_rate": 0.00011786687427014402,
+      "loss": 1.0727,
+      "step": 1061
+    },
+    {
+      "epoch": 0.4127076654036724,
+      "grad_norm": 0.2005981206893921,
+      "learning_rate": 0.0001177890229661347,
+      "loss": 1.0191,
+      "step": 1062
+    },
+    {
+      "epoch": 0.4130962790245798,
+      "grad_norm": 0.22252701222896576,
+      "learning_rate": 0.00011771117166212535,
+      "loss": 1.0591,
+      "step": 1063
+    },
+    {
+      "epoch": 0.4134848926454872,
+      "grad_norm": 0.22205251455307007,
+      "learning_rate": 0.000117633320358116,
+      "loss": 1.1198,
+      "step": 1064
+    },
+    {
+      "epoch": 0.41387350626639463,
+      "grad_norm": 0.20037783682346344,
+      "learning_rate": 0.00011755546905410665,
+      "loss": 1.0548,
+      "step": 1065
+    },
+    {
+      "epoch": 0.41426211988730205,
+      "grad_norm": 0.21737834811210632,
+      "learning_rate": 0.00011747761775009732,
+      "loss": 1.0922,
+      "step": 1066
+    },
+    {
+      "epoch": 0.41465073350820947,
+      "grad_norm": 0.19312533736228943,
+      "learning_rate": 0.00011739976644608798,
+      "loss": 0.9836,
+      "step": 1067
+    },
+    {
+      "epoch": 0.4150393471291169,
+      "grad_norm": 0.22055000066757202,
+      "learning_rate": 0.00011732191514207864,
+      "loss": 1.0383,
+      "step": 1068
+    },
+    {
+      "epoch": 0.4154279607500243,
+      "grad_norm": 0.22623857855796814,
+      "learning_rate": 0.0001172440638380693,
+      "loss": 1.0704,
+      "step": 1069
+    },
+    {
+      "epoch": 0.4158165743709317,
+      "grad_norm": 0.21481367945671082,
+      "learning_rate": 0.00011716621253405995,
+      "loss": 1.052,
+      "step": 1070
+    },
+    {
+      "epoch": 0.4162051879918391,
+      "grad_norm": 0.21022087335586548,
+      "learning_rate": 0.0001170883612300506,
+      "loss": 1.1021,
+      "step": 1071
+    },
+    {
+      "epoch": 0.4165938016127465,
+      "grad_norm": 0.2154620885848999,
+      "learning_rate": 0.00011701050992604126,
+      "loss": 1.0128,
+      "step": 1072
+    },
+    {
+      "epoch": 0.4169824152336539,
+      "grad_norm": 0.20545578002929688,
+      "learning_rate": 0.00011693265862203194,
+      "loss": 1.0058,
+      "step": 1073
+    },
+    {
+      "epoch": 0.41737102885456134,
+      "grad_norm": 0.21726195514202118,
+      "learning_rate": 0.00011685480731802259,
+      "loss": 1.0753,
+      "step": 1074
+    },
+    {
+      "epoch": 0.41775964247546876,
+      "grad_norm": 0.2067115604877472,
+      "learning_rate": 0.00011677695601401324,
+      "loss": 1.0594,
+      "step": 1075
+    },
+    {
+      "epoch": 0.4181482560963762,
+      "grad_norm": 0.23024648427963257,
+      "learning_rate": 0.0001166991047100039,
+      "loss": 1.1039,
+      "step": 1076
+    },
+    {
+      "epoch": 0.4185368697172836,
+      "grad_norm": 0.20692144334316254,
+      "learning_rate": 0.00011662125340599455,
+      "loss": 1.0598,
+      "step": 1077
+    },
+    {
+      "epoch": 0.418925483338191,
+      "grad_norm": 0.19839999079704285,
+      "learning_rate": 0.00011654340210198522,
+      "loss": 1.054,
+      "step": 1078
+    },
+    {
+      "epoch": 0.41931409695909844,
+      "grad_norm": 0.19227825105190277,
+      "learning_rate": 0.00011646555079797588,
+      "loss": 0.9453,
+      "step": 1079
+    },
+    {
+      "epoch": 0.41970271058000586,
+      "grad_norm": 0.2112567275762558,
+      "learning_rate": 0.00011638769949396653,
+      "loss": 1.023,
+      "step": 1080
+    },
+    {
+      "epoch": 0.4200913242009132,
+      "grad_norm": 0.185299351811409,
+      "learning_rate": 0.00011630984818995718,
+      "loss": 0.9752,
+      "step": 1081
+    },
+    {
+      "epoch": 0.42047993782182064,
+      "grad_norm": 0.20148858428001404,
+      "learning_rate": 0.00011623199688594783,
+      "loss": 1.0659,
+      "step": 1082
+    },
+    {
+      "epoch": 0.42086855144272806,
+      "grad_norm": 0.1935974359512329,
+      "learning_rate": 0.00011615414558193851,
+      "loss": 1.0116,
+      "step": 1083
+    },
+    {
+      "epoch": 0.4212571650636355,
+      "grad_norm": 0.20433953404426575,
+      "learning_rate": 0.00011607629427792916,
+      "loss": 1.0671,
+      "step": 1084
+    },
+    {
+      "epoch": 0.4216457786845429,
+      "grad_norm": 0.20729799568653107,
+      "learning_rate": 0.00011599844297391982,
+      "loss": 1.0341,
+      "step": 1085
+    },
+    {
+      "epoch": 0.4220343923054503,
+      "grad_norm": 0.2126002460718155,
+      "learning_rate": 0.00011592059166991047,
+      "loss": 1.0188,
+      "step": 1086
+    },
+    {
+      "epoch": 0.42242300592635773,
+      "grad_norm": 0.19453707337379456,
+      "learning_rate": 0.00011584274036590112,
+      "loss": 1.0331,
+      "step": 1087
+    },
+    {
+      "epoch": 0.42281161954726515,
+      "grad_norm": 0.20909856259822845,
+      "learning_rate": 0.0001157648890618918,
+      "loss": 0.9984,
+      "step": 1088
+    },
+    {
+      "epoch": 0.42320023316817257,
+      "grad_norm": 0.19596272706985474,
+      "learning_rate": 0.00011568703775788245,
+      "loss": 1.0121,
+      "step": 1089
+    },
+    {
+      "epoch": 0.42358884678907993,
+      "grad_norm": 0.22045716643333435,
+      "learning_rate": 0.0001156091864538731,
+      "loss": 1.0591,
+      "step": 1090
+    },
+    {
+      "epoch": 0.42397746040998735,
+      "grad_norm": 0.22624897956848145,
+      "learning_rate": 0.00011553133514986376,
+      "loss": 1.0565,
+      "step": 1091
+    },
+    {
+      "epoch": 0.42436607403089477,
+      "grad_norm": 0.20263417065143585,
+      "learning_rate": 0.00011545348384585442,
+      "loss": 1.024,
+      "step": 1092
+    },
+    {
+      "epoch": 0.4247546876518022,
+      "grad_norm": 0.20179417729377747,
+      "learning_rate": 0.00011537563254184509,
+      "loss": 0.9806,
+      "step": 1093
+    },
+    {
+      "epoch": 0.4251433012727096,
+      "grad_norm": 0.30221593379974365,
+      "learning_rate": 0.00011529778123783574,
+      "loss": 1.0683,
+      "step": 1094
+    },
+    {
+      "epoch": 0.425531914893617,
+      "grad_norm": 0.21195146441459656,
+      "learning_rate": 0.0001152199299338264,
+      "loss": 1.1283,
+      "step": 1095
+    },
+    {
+      "epoch": 0.42592052851452444,
+      "grad_norm": 0.21860192716121674,
+      "learning_rate": 0.00011514207862981706,
+      "loss": 1.0046,
+      "step": 1096
+    },
+    {
+      "epoch": 0.42630914213543186,
+      "grad_norm": 0.2234150469303131,
+      "learning_rate": 0.00011506422732580771,
+      "loss": 1.0461,
+      "step": 1097
+    },
+    {
+      "epoch": 0.4266977557563393,
+      "grad_norm": 0.21535125374794006,
+      "learning_rate": 0.00011498637602179837,
+      "loss": 1.0593,
+      "step": 1098
+    },
+    {
+      "epoch": 0.4270863693772467,
+      "grad_norm": 0.19313789904117584,
+      "learning_rate": 0.00011490852471778904,
+      "loss": 1.0357,
+      "step": 1099
+    },
+    {
+      "epoch": 0.42747498299815406,
+      "grad_norm": 0.19886989891529083,
+      "learning_rate": 0.00011483067341377969,
+      "loss": 0.9946,
+      "step": 1100
+    },
+    {
+      "epoch": 0.4278635966190615,
+      "grad_norm": 0.21028490364551544,
+      "learning_rate": 0.00011475282210977034,
+      "loss": 1.0765,
+      "step": 1101
+    },
+    {
+      "epoch": 0.4282522102399689,
+      "grad_norm": 0.2066621333360672,
+      "learning_rate": 0.000114674970805761,
+      "loss": 1.0405,
+      "step": 1102
+    },
+    {
+      "epoch": 0.4286408238608763,
+      "grad_norm": 0.18400220572948456,
+      "learning_rate": 0.00011459711950175168,
+      "loss": 0.9404,
+      "step": 1103
+    },
+    {
+      "epoch": 0.42902943748178374,
+      "grad_norm": 0.2058599591255188,
+      "learning_rate": 0.00011451926819774233,
+      "loss": 1.0505,
+      "step": 1104
+    },
+    {
+      "epoch": 0.42941805110269116,
+      "grad_norm": 0.19696786999702454,
+      "learning_rate": 0.00011444141689373298,
+      "loss": 1.032,
+      "step": 1105
+    },
+    {
+      "epoch": 0.4298066647235986,
+      "grad_norm": 0.2082854062318802,
+      "learning_rate": 0.00011436356558972363,
+      "loss": 1.0914,
+      "step": 1106
+    },
+    {
+      "epoch": 0.430195278344506,
+      "grad_norm": 0.20155015587806702,
+      "learning_rate": 0.00011428571428571428,
+      "loss": 1.0541,
+      "step": 1107
+    },
+    {
+      "epoch": 0.4305838919654134,
+      "grad_norm": 0.23419982194900513,
+      "learning_rate": 0.00011420786298170494,
+      "loss": 1.0684,
+      "step": 1108
+    },
+    {
+      "epoch": 0.4309725055863208,
+      "grad_norm": 0.23493975400924683,
+      "learning_rate": 0.00011413001167769561,
+      "loss": 1.0509,
+      "step": 1109
+    },
+    {
+      "epoch": 0.4313611192072282,
+      "grad_norm": 0.2089843600988388,
+      "learning_rate": 0.00011405216037368627,
+      "loss": 1.0479,
+      "step": 1110
+    },
+    {
+      "epoch": 0.4317497328281356,
+      "grad_norm": 0.21076850593090057,
+      "learning_rate": 0.00011397430906967692,
+      "loss": 1.064,
+      "step": 1111
+    },
+    {
+      "epoch": 0.43213834644904303,
+      "grad_norm": 0.20307987928390503,
+      "learning_rate": 0.00011389645776566757,
+      "loss": 1.0416,
+      "step": 1112
+    },
+    {
+      "epoch": 0.43252696006995045,
+      "grad_norm": 0.20955562591552734,
+      "learning_rate": 0.00011381860646165822,
+      "loss": 1.0158,
+      "step": 1113
+    },
+    {
+      "epoch": 0.43291557369085787,
+      "grad_norm": 0.2074531465768814,
+      "learning_rate": 0.0001137407551576489,
+      "loss": 1.0486,
+      "step": 1114
+    },
+    {
+      "epoch": 0.4333041873117653,
+      "grad_norm": 0.20907235145568848,
+      "learning_rate": 0.00011366290385363955,
+      "loss": 1.0352,
+      "step": 1115
+    },
+    {
+      "epoch": 0.4336928009326727,
+      "grad_norm": 0.21726477146148682,
+      "learning_rate": 0.0001135850525496302,
+      "loss": 1.0068,
+      "step": 1116
+    },
+    {
+      "epoch": 0.4340814145535801,
+      "grad_norm": 0.20231984555721283,
+      "learning_rate": 0.00011350720124562086,
+      "loss": 0.9757,
+      "step": 1117
+    },
+    {
+      "epoch": 0.4344700281744875,
+      "grad_norm": 0.23485834896564484,
+      "learning_rate": 0.00011342934994161152,
+      "loss": 1.0681,
+      "step": 1118
+    },
+    {
+      "epoch": 0.4348586417953949,
+      "grad_norm": 0.21286556124687195,
+      "learning_rate": 0.00011335149863760219,
+      "loss": 1.0399,
+      "step": 1119
+    },
+    {
+      "epoch": 0.4352472554163023,
+      "grad_norm": 0.2097872495651245,
+      "learning_rate": 0.00011327364733359284,
+      "loss": 1.0435,
+      "step": 1120
+    },
+    {
+      "epoch": 0.43563586903720974,
+      "grad_norm": 0.2224377542734146,
+      "learning_rate": 0.00011319579602958351,
+      "loss": 1.1664,
+      "step": 1121
+    },
+    {
+      "epoch": 0.43602448265811716,
+      "grad_norm": 0.19213411211967468,
+      "learning_rate": 0.00011311794472557416,
+      "loss": 1.0424,
+      "step": 1122
+    },
+    {
+      "epoch": 0.4364130962790246,
+      "grad_norm": 0.20974959433078766,
+      "learning_rate": 0.00011304009342156481,
+      "loss": 1.0943,
+      "step": 1123
+    },
+    {
+      "epoch": 0.436801709899932,
+      "grad_norm": 0.19943708181381226,
+      "learning_rate": 0.00011296224211755549,
+      "loss": 1.0652,
+      "step": 1124
+    },
+    {
+      "epoch": 0.4371903235208394,
+      "grad_norm": 0.1832750141620636,
+      "learning_rate": 0.00011288439081354614,
+      "loss": 0.9883,
+      "step": 1125
+    },
+    {
+      "epoch": 0.43757893714174684,
+      "grad_norm": 0.2205052226781845,
+      "learning_rate": 0.0001128065395095368,
+      "loss": 1.0733,
+      "step": 1126
+    },
+    {
+      "epoch": 0.43796755076265426,
+      "grad_norm": 0.2082854062318802,
+      "learning_rate": 0.00011272868820552745,
+      "loss": 1.0141,
+      "step": 1127
+    },
+    {
+      "epoch": 0.4383561643835616,
+      "grad_norm": 0.22755026817321777,
+      "learning_rate": 0.0001126508369015181,
+      "loss": 1.0942,
+      "step": 1128
+    },
+    {
+      "epoch": 0.43874477800446904,
+      "grad_norm": 0.2098863571882248,
+      "learning_rate": 0.00011257298559750878,
+      "loss": 0.9987,
+      "step": 1129
+    },
+    {
+      "epoch": 0.43913339162537646,
+      "grad_norm": 0.20559263229370117,
+      "learning_rate": 0.00011249513429349943,
+      "loss": 1.0345,
+      "step": 1130
+    },
+    {
+      "epoch": 0.4395220052462839,
+      "grad_norm": 0.21955084800720215,
+      "learning_rate": 0.00011241728298949008,
+      "loss": 1.1068,
+      "step": 1131
+    },
+    {
+      "epoch": 0.4399106188671913,
+      "grad_norm": 0.21353478729724884,
+      "learning_rate": 0.00011233943168548073,
+      "loss": 1.0094,
+      "step": 1132
+    },
+    {
+      "epoch": 0.4402992324880987,
+      "grad_norm": 0.19822491705417633,
+      "learning_rate": 0.00011226158038147139,
+      "loss": 0.9758,
+      "step": 1133
+    },
+    {
+      "epoch": 0.44068784610900613,
+      "grad_norm": 0.20079441368579865,
+      "learning_rate": 0.00011218372907746206,
+      "loss": 1.0202,
+      "step": 1134
+    },
+    {
+      "epoch": 0.44107645972991355,
+      "grad_norm": 0.2261926829814911,
+      "learning_rate": 0.00011210587777345272,
+      "loss": 0.9877,
+      "step": 1135
+    },
+    {
+      "epoch": 0.44146507335082097,
+      "grad_norm": 0.2264915257692337,
+      "learning_rate": 0.00011202802646944337,
+      "loss": 0.9887,
+      "step": 1136
+    },
+    {
+      "epoch": 0.44185368697172833,
+      "grad_norm": 0.21853779256343842,
+      "learning_rate": 0.00011195017516543402,
+      "loss": 1.0535,
+      "step": 1137
+    },
+    {
+      "epoch": 0.44224230059263575,
+      "grad_norm": 0.21332694590091705,
+      "learning_rate": 0.00011187232386142467,
+      "loss": 1.0824,
+      "step": 1138
+    },
+    {
+      "epoch": 0.44263091421354317,
+      "grad_norm": 0.21350236237049103,
+      "learning_rate": 0.00011179447255741535,
+      "loss": 1.0758,
+      "step": 1139
+    },
+    {
+      "epoch": 0.4430195278344506,
+      "grad_norm": 0.21305765211582184,
+      "learning_rate": 0.000111716621253406,
+      "loss": 1.035,
+      "step": 1140
+    },
+    {
+      "epoch": 0.443408141455358,
+      "grad_norm": 0.20486389100551605,
+      "learning_rate": 0.00011163876994939666,
+      "loss": 1.0413,
+      "step": 1141
+    },
+    {
+      "epoch": 0.4437967550762654,
+      "grad_norm": 0.19255472719669342,
+      "learning_rate": 0.00011156091864538731,
+      "loss": 0.9583,
+      "step": 1142
+    },
+    {
+      "epoch": 0.44418536869717284,
+      "grad_norm": 0.19824008643627167,
+      "learning_rate": 0.00011148306734137796,
+      "loss": 1.0331,
+      "step": 1143
+    },
+    {
+      "epoch": 0.44457398231808026,
+      "grad_norm": 0.20308080315589905,
+      "learning_rate": 0.00011140521603736863,
+      "loss": 1.0399,
+      "step": 1144
+    },
+    {
+      "epoch": 0.4449625959389877,
+      "grad_norm": 0.2193964123725891,
+      "learning_rate": 0.00011132736473335929,
+      "loss": 1.063,
+      "step": 1145
+    },
+    {
+      "epoch": 0.4453512095598951,
+      "grad_norm": 0.2151576578617096,
+      "learning_rate": 0.00011124951342934994,
+      "loss": 1.0795,
+      "step": 1146
+    },
+    {
+      "epoch": 0.44573982318080246,
+      "grad_norm": 0.23056697845458984,
+      "learning_rate": 0.00011117166212534061,
+      "loss": 1.0351,
+      "step": 1147
+    },
+    {
+      "epoch": 0.4461284368017099,
+      "grad_norm": 0.1973094493150711,
+      "learning_rate": 0.00011109381082133126,
+      "loss": 0.9866,
+      "step": 1148
+    },
+    {
+      "epoch": 0.4465170504226173,
+      "grad_norm": 0.2119562178850174,
+      "learning_rate": 0.00011101595951732191,
+      "loss": 1.0591,
+      "step": 1149
+    },
+    {
+      "epoch": 0.4469056640435247,
+      "grad_norm": 0.20407763123512268,
+      "learning_rate": 0.00011093810821331259,
+      "loss": 0.988,
+      "step": 1150
+    },
+    {
+      "epoch": 0.44729427766443214,
+      "grad_norm": 0.19474107027053833,
+      "learning_rate": 0.00011086025690930324,
+      "loss": 0.9729,
+      "step": 1151
+    },
+    {
+      "epoch": 0.44768289128533956,
+      "grad_norm": 0.2179928421974182,
+      "learning_rate": 0.0001107824056052939,
+      "loss": 1.0558,
+      "step": 1152
+    },
+    {
+      "epoch": 0.448071504906247,
+      "grad_norm": 0.44306451082229614,
+      "learning_rate": 0.00011070455430128455,
+      "loss": 1.0901,
+      "step": 1153
+    },
+    {
+      "epoch": 0.4484601185271544,
+      "grad_norm": 0.22060540318489075,
+      "learning_rate": 0.0001106267029972752,
+      "loss": 1.0009,
+      "step": 1154
+    },
+    {
+      "epoch": 0.4488487321480618,
+      "grad_norm": 0.20534972846508026,
+      "learning_rate": 0.00011054885169326588,
+      "loss": 0.9741,
+      "step": 1155
+    },
+    {
+      "epoch": 0.4492373457689692,
+      "grad_norm": 0.19488993287086487,
+      "learning_rate": 0.00011047100038925653,
+      "loss": 1.0,
+      "step": 1156
+    },
+    {
+      "epoch": 0.4496259593898766,
+      "grad_norm": 0.20462395250797272,
+      "learning_rate": 0.00011039314908524718,
+      "loss": 1.0309,
+      "step": 1157
+    },
+    {
+      "epoch": 0.450014573010784,
+      "grad_norm": 0.2170749306678772,
+      "learning_rate": 0.00011031529778123784,
+      "loss": 1.0726,
+      "step": 1158
+    },
+    {
+      "epoch": 0.45040318663169143,
+      "grad_norm": 0.2066730111837387,
+      "learning_rate": 0.00011023744647722849,
+      "loss": 1.0227,
+      "step": 1159
+    },
+    {
+      "epoch": 0.45079180025259885,
+      "grad_norm": 0.20625676214694977,
+      "learning_rate": 0.00011015959517321917,
+      "loss": 1.0287,
+      "step": 1160
+    },
+    {
+      "epoch": 0.45118041387350627,
+      "grad_norm": 0.19483047723770142,
+      "learning_rate": 0.00011008174386920982,
+      "loss": 0.9639,
+      "step": 1161
+    },
+    {
+      "epoch": 0.4515690274944137,
+      "grad_norm": 0.24705417454242706,
+      "learning_rate": 0.00011000389256520047,
+      "loss": 0.9903,
+      "step": 1162
+    },
+    {
+      "epoch": 0.4519576411153211,
+      "grad_norm": 0.2109205424785614,
+      "learning_rate": 0.00010992604126119112,
+      "loss": 1.054,
+      "step": 1163
+    },
+    {
+      "epoch": 0.4523462547362285,
+      "grad_norm": 0.20904991030693054,
+      "learning_rate": 0.00010984818995718178,
+      "loss": 1.0416,
+      "step": 1164
+    },
+    {
+      "epoch": 0.45273486835713594,
+      "grad_norm": 0.19841328263282776,
+      "learning_rate": 0.00010977033865317245,
+      "loss": 0.9986,
+      "step": 1165
+    },
+    {
+      "epoch": 0.4531234819780433,
+      "grad_norm": 0.20545506477355957,
+      "learning_rate": 0.0001096924873491631,
+      "loss": 1.0337,
+      "step": 1166
+    },
+    {
+      "epoch": 0.4535120955989507,
+      "grad_norm": 0.208644837141037,
+      "learning_rate": 0.00010961463604515376,
+      "loss": 1.0304,
+      "step": 1167
+    },
+    {
+      "epoch": 0.45390070921985815,
+      "grad_norm": 0.2111911028623581,
+      "learning_rate": 0.00010953678474114441,
+      "loss": 1.0398,
+      "step": 1168
+    },
+    {
+      "epoch": 0.45428932284076556,
+      "grad_norm": 0.2600184381008148,
+      "learning_rate": 0.00010945893343713506,
+      "loss": 1.0509,
+      "step": 1169
+    },
+    {
+      "epoch": 0.454677936461673,
+      "grad_norm": 0.2059030532836914,
+      "learning_rate": 0.00010938108213312574,
+      "loss": 0.9347,
+      "step": 1170
+    },
+    {
+      "epoch": 0.4550665500825804,
+      "grad_norm": 0.19232551753520966,
+      "learning_rate": 0.0001093032308291164,
+      "loss": 1.0162,
+      "step": 1171
+    },
+    {
+      "epoch": 0.4554551637034878,
+      "grad_norm": 0.19147330522537231,
+      "learning_rate": 0.00010922537952510705,
+      "loss": 0.9872,
+      "step": 1172
+    },
+    {
+      "epoch": 0.45584377732439524,
+      "grad_norm": 0.2599676251411438,
+      "learning_rate": 0.00010914752822109771,
+      "loss": 1.0402,
+      "step": 1173
+    },
+    {
+      "epoch": 0.45623239094530266,
+      "grad_norm": 0.2159397304058075,
+      "learning_rate": 0.00010906967691708836,
+      "loss": 1.0411,
+      "step": 1174
+    },
+    {
+      "epoch": 0.45662100456621,
+      "grad_norm": 0.23864266276359558,
+      "learning_rate": 0.00010899182561307903,
+      "loss": 1.054,
+      "step": 1175
+    },
+    {
+      "epoch": 0.45700961818711744,
+      "grad_norm": 0.2027217596769333,
+      "learning_rate": 0.0001089139743090697,
+      "loss": 0.9713,
+      "step": 1176
+    },
+    {
+      "epoch": 0.45739823180802486,
+      "grad_norm": 0.1837588995695114,
+      "learning_rate": 0.00010883612300506035,
+      "loss": 0.9698,
+      "step": 1177
+    },
+    {
+      "epoch": 0.4577868454289323,
+      "grad_norm": 0.20038527250289917,
+      "learning_rate": 0.000108758271701051,
+      "loss": 1.0456,
+      "step": 1178
+    },
+    {
+      "epoch": 0.4581754590498397,
+      "grad_norm": 0.21525044739246368,
+      "learning_rate": 0.00010868042039704165,
+      "loss": 1.021,
+      "step": 1179
+    },
+    {
+      "epoch": 0.4585640726707471,
+      "grad_norm": 0.18813730776309967,
+      "learning_rate": 0.0001086025690930323,
+      "loss": 0.9673,
+      "step": 1180
+    },
+    {
+      "epoch": 0.45895268629165453,
+      "grad_norm": 0.2056179642677307,
+      "learning_rate": 0.00010852471778902298,
+      "loss": 1.0119,
+      "step": 1181
+    },
+    {
+      "epoch": 0.45934129991256195,
+      "grad_norm": 0.21599683165550232,
+      "learning_rate": 0.00010844686648501363,
+      "loss": 1.0537,
+      "step": 1182
+    },
+    {
+      "epoch": 0.45972991353346937,
+      "grad_norm": 0.19750265777111053,
+      "learning_rate": 0.00010836901518100429,
+      "loss": 1.0203,
+      "step": 1183
+    },
+    {
+      "epoch": 0.4601185271543768,
+      "grad_norm": 0.22186161577701569,
+      "learning_rate": 0.00010829116387699494,
+      "loss": 1.0583,
+      "step": 1184
+    },
+    {
+      "epoch": 0.46050714077528415,
+      "grad_norm": 0.2109905481338501,
+      "learning_rate": 0.00010821331257298559,
+      "loss": 1.0022,
+      "step": 1185
+    },
+    {
+      "epoch": 0.46089575439619157,
+      "grad_norm": 0.2032858431339264,
+      "learning_rate": 0.00010813546126897627,
+      "loss": 0.9774,
+      "step": 1186
+    },
+    {
+      "epoch": 0.461284368017099,
+      "grad_norm": 0.20381197333335876,
+      "learning_rate": 0.00010805760996496692,
+      "loss": 0.9768,
+      "step": 1187
+    },
+    {
+      "epoch": 0.4616729816380064,
+      "grad_norm": 0.20488987863063812,
+      "learning_rate": 0.00010797975866095757,
+      "loss": 1.0448,
+      "step": 1188
+    },
+    {
+      "epoch": 0.4620615952589138,
+      "grad_norm": 0.20257477462291718,
+      "learning_rate": 0.00010790190735694823,
+      "loss": 1.0157,
+      "step": 1189
+    },
+    {
+      "epoch": 0.46245020887982125,
+      "grad_norm": 0.20761239528656006,
+      "learning_rate": 0.00010782405605293888,
+      "loss": 1.0328,
+      "step": 1190
+    },
+    {
+      "epoch": 0.46283882250072866,
+      "grad_norm": 0.22062581777572632,
+      "learning_rate": 0.00010774620474892956,
+      "loss": 1.0362,
+      "step": 1191
+    },
+    {
+      "epoch": 0.4632274361216361,
+      "grad_norm": 0.19970272481441498,
+      "learning_rate": 0.00010766835344492021,
+      "loss": 1.0783,
+      "step": 1192
+    },
+    {
+      "epoch": 0.4636160497425435,
+      "grad_norm": 0.2221893072128296,
+      "learning_rate": 0.00010759050214091086,
+      "loss": 1.0136,
+      "step": 1193
+    },
+    {
+      "epoch": 0.46400466336345086,
+      "grad_norm": 0.2124665081501007,
+      "learning_rate": 0.00010751265083690151,
+      "loss": 1.0528,
+      "step": 1194
+    },
+    {
+      "epoch": 0.4643932769843583,
+      "grad_norm": 0.2001204937696457,
+      "learning_rate": 0.00010743479953289218,
+      "loss": 1.0495,
+      "step": 1195
+    },
+    {
+      "epoch": 0.4647818906052657,
+      "grad_norm": 0.20979635417461395,
+      "learning_rate": 0.00010735694822888284,
+      "loss": 1.0664,
+      "step": 1196
+    },
+    {
+      "epoch": 0.4651705042261731,
+      "grad_norm": 0.190982848405838,
+      "learning_rate": 0.0001072790969248735,
+      "loss": 1.0256,
+      "step": 1197
+    },
+    {
+      "epoch": 0.46555911784708054,
+      "grad_norm": 0.19910745322704315,
+      "learning_rate": 0.00010720124562086415,
+      "loss": 1.0263,
+      "step": 1198
+    },
+    {
+      "epoch": 0.46594773146798796,
+      "grad_norm": 0.21624085307121277,
+      "learning_rate": 0.00010712339431685481,
+      "loss": 1.0768,
+      "step": 1199
+    },
+    {
+      "epoch": 0.4663363450888954,
+      "grad_norm": 0.20857703685760498,
+      "learning_rate": 0.00010704554301284547,
+      "loss": 1.0892,
+      "step": 1200
+    },
+    {
+      "epoch": 0.4667249587098028,
+      "grad_norm": 0.21897061169147491,
+      "learning_rate": 0.00010696769170883613,
+      "loss": 1.0873,
+      "step": 1201
+    },
+    {
+      "epoch": 0.4671135723307102,
+      "grad_norm": 0.1943386346101761,
+      "learning_rate": 0.0001068898404048268,
+      "loss": 1.0116,
+      "step": 1202
+    },
+    {
+      "epoch": 0.4675021859516176,
+      "grad_norm": 0.22607874870300293,
+      "learning_rate": 0.00010681198910081745,
+      "loss": 1.0328,
+      "step": 1203
+    },
+    {
+      "epoch": 0.467890799572525,
+      "grad_norm": 0.1898999959230423,
+      "learning_rate": 0.0001067341377968081,
+      "loss": 0.9791,
+      "step": 1204
+    },
+    {
+      "epoch": 0.4682794131934324,
+      "grad_norm": 0.2193334400653839,
+      "learning_rate": 0.00010665628649279875,
+      "loss": 1.0742,
+      "step": 1205
+    },
+    {
+      "epoch": 0.46866802681433983,
+      "grad_norm": 0.2096349149942398,
+      "learning_rate": 0.00010657843518878943,
+      "loss": 1.0683,
+      "step": 1206
+    },
+    {
+      "epoch": 0.46905664043524725,
+      "grad_norm": 0.2040576934814453,
+      "learning_rate": 0.00010650058388478008,
+      "loss": 1.0516,
+      "step": 1207
+    },
+    {
+      "epoch": 0.46944525405615467,
+      "grad_norm": 0.20619645714759827,
+      "learning_rate": 0.00010642273258077074,
+      "loss": 1.0429,
+      "step": 1208
+    },
+    {
+      "epoch": 0.4698338676770621,
+      "grad_norm": 0.19753660261631012,
+      "learning_rate": 0.00010634488127676139,
+      "loss": 1.0268,
+      "step": 1209
+    },
+    {
+      "epoch": 0.4702224812979695,
+      "grad_norm": 0.2201426476240158,
+      "learning_rate": 0.00010626702997275204,
+      "loss": 1.0879,
+      "step": 1210
+    },
+    {
+      "epoch": 0.4706110949188769,
+      "grad_norm": 0.21307805180549622,
+      "learning_rate": 0.00010618917866874272,
+      "loss": 1.0186,
+      "step": 1211
+    },
+    {
+      "epoch": 0.47099970853978435,
+      "grad_norm": 0.21142373979091644,
+      "learning_rate": 0.00010611132736473337,
+      "loss": 1.0417,
+      "step": 1212
+    },
+    {
+      "epoch": 0.4713883221606917,
+      "grad_norm": 0.20523706078529358,
+      "learning_rate": 0.00010603347606072402,
+      "loss": 1.0372,
+      "step": 1213
+    },
+    {
+      "epoch": 0.4717769357815991,
+      "grad_norm": 0.19843094050884247,
+      "learning_rate": 0.00010595562475671468,
+      "loss": 1.0062,
+      "step": 1214
+    },
+    {
+      "epoch": 0.47216554940250655,
+      "grad_norm": 0.2146739959716797,
+      "learning_rate": 0.00010587777345270533,
+      "loss": 1.0528,
+      "step": 1215
+    },
+    {
+      "epoch": 0.47255416302341396,
+      "grad_norm": 0.2136303037405014,
+      "learning_rate": 0.00010579992214869601,
+      "loss": 1.0521,
+      "step": 1216
+    },
+    {
+      "epoch": 0.4729427766443214,
+      "grad_norm": 0.21379397809505463,
+      "learning_rate": 0.00010572207084468666,
+      "loss": 1.0362,
+      "step": 1217
+    },
+    {
+      "epoch": 0.4733313902652288,
+      "grad_norm": 0.20459088683128357,
+      "learning_rate": 0.00010564421954067731,
+      "loss": 1.0455,
+      "step": 1218
+    },
+    {
+      "epoch": 0.4737200038861362,
+      "grad_norm": 0.20667988061904907,
+      "learning_rate": 0.00010556636823666796,
+      "loss": 1.0284,
+      "step": 1219
+    },
+    {
+      "epoch": 0.47410861750704364,
+      "grad_norm": 0.21820449829101562,
+      "learning_rate": 0.00010548851693265862,
+      "loss": 1.0584,
+      "step": 1220
+    },
+    {
+      "epoch": 0.47449723112795106,
+      "grad_norm": 0.19705156981945038,
+      "learning_rate": 0.00010541066562864928,
+      "loss": 1.004,
+      "step": 1221
+    },
+    {
+      "epoch": 0.4748858447488584,
+      "grad_norm": 0.19806528091430664,
+      "learning_rate": 0.00010533281432463995,
+      "loss": 1.0519,
+      "step": 1222
+    },
+    {
+      "epoch": 0.47527445836976584,
+      "grad_norm": 0.2006833702325821,
+      "learning_rate": 0.0001052549630206306,
+      "loss": 1.0119,
+      "step": 1223
+    },
+    {
+      "epoch": 0.47566307199067326,
+      "grad_norm": 0.21757058799266815,
+      "learning_rate": 0.00010517711171662125,
+      "loss": 1.0961,
+      "step": 1224
+    },
+    {
+      "epoch": 0.4760516856115807,
+      "grad_norm": 0.2015775889158249,
+      "learning_rate": 0.00010509926041261192,
+      "loss": 1.0419,
+      "step": 1225
+    },
+    {
+      "epoch": 0.4764402992324881,
+      "grad_norm": 0.19691923260688782,
+      "learning_rate": 0.00010502140910860257,
+      "loss": 1.0555,
+      "step": 1226
+    },
+    {
+      "epoch": 0.4768289128533955,
+      "grad_norm": 0.19924800097942352,
+      "learning_rate": 0.00010494355780459323,
+      "loss": 1.0106,
+      "step": 1227
+    },
+    {
+      "epoch": 0.47721752647430293,
+      "grad_norm": 0.21416346728801727,
+      "learning_rate": 0.0001048657065005839,
+      "loss": 1.0741,
+      "step": 1228
+    },
+    {
+      "epoch": 0.47760614009521035,
+      "grad_norm": 0.21823547780513763,
+      "learning_rate": 0.00010478785519657455,
+      "loss": 1.023,
+      "step": 1229
+    },
+    {
+      "epoch": 0.47799475371611777,
+      "grad_norm": 0.2083735466003418,
+      "learning_rate": 0.0001047100038925652,
+      "loss": 1.0424,
+      "step": 1230
+    },
+    {
+      "epoch": 0.4783833673370252,
+      "grad_norm": 0.2219141572713852,
+      "learning_rate": 0.00010463215258855586,
+      "loss": 1.0839,
+      "step": 1231
+    },
+    {
+      "epoch": 0.47877198095793255,
+      "grad_norm": 0.21334600448608398,
+      "learning_rate": 0.00010455430128454653,
+      "loss": 0.9888,
+      "step": 1232
+    },
+    {
+      "epoch": 0.47916059457883997,
+      "grad_norm": 0.2140086442232132,
+      "learning_rate": 0.00010447644998053719,
+      "loss": 1.0119,
+      "step": 1233
+    },
+    {
+      "epoch": 0.4795492081997474,
+      "grad_norm": 0.25360551476478577,
+      "learning_rate": 0.00010439859867652784,
+      "loss": 1.0026,
+      "step": 1234
+    },
+    {
+      "epoch": 0.4799378218206548,
+      "grad_norm": 0.20200380682945251,
+      "learning_rate": 0.00010432074737251849,
+      "loss": 1.0,
+      "step": 1235
+    },
+    {
+      "epoch": 0.4803264354415622,
+      "grad_norm": 0.22641289234161377,
+      "learning_rate": 0.00010424289606850914,
+      "loss": 1.1022,
+      "step": 1236
+    },
+    {
+      "epoch": 0.48071504906246965,
+      "grad_norm": 0.20538561046123505,
+      "learning_rate": 0.00010416504476449982,
+      "loss": 0.9847,
+      "step": 1237
+    },
+    {
+      "epoch": 0.48110366268337706,
+      "grad_norm": 0.206883504986763,
+      "learning_rate": 0.00010408719346049047,
+      "loss": 1.0152,
+      "step": 1238
+    },
+    {
+      "epoch": 0.4814922763042845,
+      "grad_norm": 0.21584320068359375,
+      "learning_rate": 0.00010400934215648113,
+      "loss": 1.0361,
+      "step": 1239
+    },
+    {
+      "epoch": 0.4818808899251919,
+      "grad_norm": 0.20963703095912933,
+      "learning_rate": 0.00010393149085247178,
+      "loss": 1.0814,
+      "step": 1240
+    },
+    {
+      "epoch": 0.48226950354609927,
+      "grad_norm": 0.1965872198343277,
+      "learning_rate": 0.00010385363954846243,
+      "loss": 1.0365,
+      "step": 1241
+    },
+    {
+      "epoch": 0.4826581171670067,
+      "grad_norm": 0.2030191719532013,
+      "learning_rate": 0.00010377578824445311,
+      "loss": 1.0374,
+      "step": 1242
+    },
+    {
+      "epoch": 0.4830467307879141,
+      "grad_norm": 0.21448804438114166,
+      "learning_rate": 0.00010369793694044376,
+      "loss": 0.9686,
+      "step": 1243
+    },
+    {
+      "epoch": 0.4834353444088215,
+      "grad_norm": 0.2181752622127533,
+      "learning_rate": 0.00010362008563643441,
+      "loss": 1.0812,
+      "step": 1244
+    },
+    {
+      "epoch": 0.48382395802972894,
+      "grad_norm": 0.19887101650238037,
+      "learning_rate": 0.00010354223433242507,
+      "loss": 1.036,
+      "step": 1245
+    },
+    {
+      "epoch": 0.48421257165063636,
+      "grad_norm": 0.19007287919521332,
+      "learning_rate": 0.00010346438302841572,
+      "loss": 1.0292,
+      "step": 1246
+    },
+    {
+      "epoch": 0.4846011852715438,
+      "grad_norm": 0.21390347182750702,
+      "learning_rate": 0.0001033865317244064,
+      "loss": 1.0284,
+      "step": 1247
+    },
+    {
+      "epoch": 0.4849897988924512,
+      "grad_norm": 0.23822663724422455,
+      "learning_rate": 0.00010330868042039705,
+      "loss": 1.1044,
+      "step": 1248
+    },
+    {
+      "epoch": 0.4853784125133586,
+      "grad_norm": 0.20779070258140564,
+      "learning_rate": 0.0001032308291163877,
+      "loss": 1.0475,
+      "step": 1249
+    },
+    {
+      "epoch": 0.48576702613426603,
+      "grad_norm": 0.19232134521007538,
+      "learning_rate": 0.00010315297781237835,
+      "loss": 0.9945,
+      "step": 1250
+    },
+    {
+      "epoch": 0.4861556397551734,
+      "grad_norm": 0.22378556430339813,
+      "learning_rate": 0.00010307512650836902,
+      "loss": 1.0462,
+      "step": 1251
+    },
+    {
+      "epoch": 0.4865442533760808,
+      "grad_norm": 0.22156798839569092,
+      "learning_rate": 0.00010299727520435968,
+      "loss": 1.051,
+      "step": 1252
+    },
+    {
+      "epoch": 0.48693286699698823,
+      "grad_norm": 0.19885733723640442,
+      "learning_rate": 0.00010291942390035034,
+      "loss": 1.0593,
+      "step": 1253
+    },
+    {
+      "epoch": 0.48732148061789565,
+      "grad_norm": 0.2172418236732483,
+      "learning_rate": 0.000102841572596341,
+      "loss": 1.0513,
+      "step": 1254
+    },
+    {
+      "epoch": 0.48771009423880307,
+      "grad_norm": 0.22136956453323364,
+      "learning_rate": 0.00010276372129233165,
+      "loss": 1.0438,
+      "step": 1255
+    },
+    {
+      "epoch": 0.4880987078597105,
+      "grad_norm": 0.21337302029132843,
+      "learning_rate": 0.0001026858699883223,
+      "loss": 1.0551,
+      "step": 1256
+    },
+    {
+      "epoch": 0.4884873214806179,
+      "grad_norm": 0.21376267075538635,
+      "learning_rate": 0.00010260801868431296,
+      "loss": 1.054,
+      "step": 1257
+    },
+    {
+      "epoch": 0.4888759351015253,
+      "grad_norm": 0.19498860836029053,
+      "learning_rate": 0.00010253016738030364,
+      "loss": 1.0045,
+      "step": 1258
+    },
+    {
+      "epoch": 0.48926454872243275,
+      "grad_norm": 0.22354961931705475,
+      "learning_rate": 0.00010245231607629429,
+      "loss": 1.096,
+      "step": 1259
+    },
+    {
+      "epoch": 0.4896531623433401,
+      "grad_norm": 0.2078939527273178,
+      "learning_rate": 0.00010237446477228494,
+      "loss": 1.0102,
+      "step": 1260
+    },
+    {
+      "epoch": 0.49004177596424753,
+      "grad_norm": 0.20992495119571686,
+      "learning_rate": 0.00010229661346827559,
+      "loss": 0.9814,
+      "step": 1261
+    },
+    {
+      "epoch": 0.49043038958515495,
+      "grad_norm": 0.2178875207901001,
+      "learning_rate": 0.00010221876216426625,
+      "loss": 1.0489,
+      "step": 1262
+    },
+    {
+      "epoch": 0.49081900320606237,
+      "grad_norm": 0.22152946889400482,
+      "learning_rate": 0.00010214091086025692,
+      "loss": 1.0808,
+      "step": 1263
+    },
+    {
+      "epoch": 0.4912076168269698,
+      "grad_norm": 0.21179009974002838,
+      "learning_rate": 0.00010206305955624758,
+      "loss": 1.0323,
+      "step": 1264
+    },
+    {
+      "epoch": 0.4915962304478772,
+      "grad_norm": 0.2126997411251068,
+      "learning_rate": 0.00010198520825223823,
+      "loss": 1.0093,
+      "step": 1265
+    },
+    {
+      "epoch": 0.4919848440687846,
+      "grad_norm": 0.20912809669971466,
+      "learning_rate": 0.00010190735694822888,
+      "loss": 1.0343,
+      "step": 1266
+    },
+    {
+      "epoch": 0.49237345768969204,
+      "grad_norm": 0.2231636494398117,
+      "learning_rate": 0.00010182950564421953,
+      "loss": 1.0587,
+      "step": 1267
+    },
+    {
+      "epoch": 0.49276207131059946,
+      "grad_norm": 0.1954583376646042,
+      "learning_rate": 0.00010175165434021021,
+      "loss": 0.9566,
+      "step": 1268
+    },
+    {
+      "epoch": 0.4931506849315068,
+      "grad_norm": 0.20520909130573273,
+      "learning_rate": 0.00010167380303620086,
+      "loss": 1.024,
+      "step": 1269
+    },
+    {
+      "epoch": 0.49353929855241424,
+      "grad_norm": 0.21736180782318115,
+      "learning_rate": 0.00010159595173219152,
+      "loss": 1.0434,
+      "step": 1270
+    },
+    {
+      "epoch": 0.49392791217332166,
+      "grad_norm": 0.2360561490058899,
+      "learning_rate": 0.00010151810042818217,
+      "loss": 1.114,
+      "step": 1271
+    },
+    {
+      "epoch": 0.4943165257942291,
+      "grad_norm": 0.20595967769622803,
+      "learning_rate": 0.00010144024912417282,
+      "loss": 0.9909,
+      "step": 1272
+    },
+    {
+      "epoch": 0.4947051394151365,
+      "grad_norm": 0.2161860466003418,
+      "learning_rate": 0.0001013623978201635,
+      "loss": 1.0536,
+      "step": 1273
+    },
+    {
+      "epoch": 0.4950937530360439,
+      "grad_norm": 0.19852355122566223,
+      "learning_rate": 0.00010128454651615415,
+      "loss": 1.0001,
+      "step": 1274
+    },
+    {
+      "epoch": 0.49548236665695133,
+      "grad_norm": 0.21081402897834778,
+      "learning_rate": 0.0001012066952121448,
+      "loss": 1.0151,
+      "step": 1275
+    },
+    {
+      "epoch": 0.49587098027785875,
+      "grad_norm": 0.2053362876176834,
+      "learning_rate": 0.00010112884390813547,
+      "loss": 1.018,
+      "step": 1276
+    },
+    {
+      "epoch": 0.49625959389876617,
+      "grad_norm": 0.21205593645572662,
+      "learning_rate": 0.00010105099260412612,
+      "loss": 0.9912,
+      "step": 1277
+    },
+    {
+      "epoch": 0.4966482075196736,
+      "grad_norm": 0.2005016952753067,
+      "learning_rate": 0.00010097314130011679,
+      "loss": 1.0069,
+      "step": 1278
+    },
+    {
+      "epoch": 0.49703682114058095,
+      "grad_norm": 0.21688181161880493,
+      "learning_rate": 0.00010089528999610744,
+      "loss": 1.0364,
+      "step": 1279
+    },
+    {
+      "epoch": 0.49742543476148837,
+      "grad_norm": 0.20582237839698792,
+      "learning_rate": 0.0001008174386920981,
+      "loss": 1.0138,
+      "step": 1280
+    },
+    {
+      "epoch": 0.4978140483823958,
+      "grad_norm": 0.20824448764324188,
+      "learning_rate": 0.00010073958738808876,
+      "loss": 0.9941,
+      "step": 1281
+    },
+    {
+      "epoch": 0.4982026620033032,
+      "grad_norm": 0.20749075710773468,
+      "learning_rate": 0.00010066173608407941,
+      "loss": 1.0478,
+      "step": 1282
+    },
+    {
+      "epoch": 0.49859127562421063,
+      "grad_norm": 0.20012183487415314,
+      "learning_rate": 0.00010058388478007009,
+      "loss": 0.995,
+      "step": 1283
+    },
+    {
+      "epoch": 0.49897988924511805,
+      "grad_norm": 0.20275959372520447,
+      "learning_rate": 0.00010050603347606074,
+      "loss": 1.097,
+      "step": 1284
+    },
+    {
+      "epoch": 0.49936850286602547,
+      "grad_norm": 0.19588243961334229,
+      "learning_rate": 0.00010042818217205139,
+      "loss": 1.0,
+      "step": 1285
+    },
+    {
+      "epoch": 0.4997571164869329,
+      "grad_norm": 0.20693185925483704,
+      "learning_rate": 0.00010035033086804204,
+      "loss": 1.0527,
+      "step": 1286
+    },
+    {
+      "epoch": 0.5001457301078402,
+      "grad_norm": 0.20330573618412018,
+      "learning_rate": 0.0001002724795640327,
+      "loss": 1.0137,
+      "step": 1287
+    },
+    {
+      "epoch": 0.5005343437287477,
+      "grad_norm": 0.19123876094818115,
+      "learning_rate": 0.00010019462826002337,
+      "loss": 0.9688,
+      "step": 1288
+    },
+    {
+      "epoch": 0.5009229573496551,
+      "grad_norm": 0.2184276431798935,
+      "learning_rate": 0.00010011677695601403,
+      "loss": 1.0367,
+      "step": 1289
+    },
+    {
+      "epoch": 0.5013115709705626,
+      "grad_norm": 0.21642108261585236,
+      "learning_rate": 0.00010003892565200468,
+      "loss": 1.102,
+      "step": 1290
+    },
+    {
+      "epoch": 0.5017001845914699,
+      "grad_norm": 0.20351074635982513,
+      "learning_rate": 9.996107434799533e-05,
+      "loss": 1.0327,
+      "step": 1291
+    },
+    {
+      "epoch": 0.5020887982123774,
+      "grad_norm": 0.22771553695201874,
+      "learning_rate": 9.9883223043986e-05,
+      "loss": 1.104,
+      "step": 1292
+    },
+    {
+      "epoch": 0.5024774118332848,
+      "grad_norm": 0.2271403968334198,
+      "learning_rate": 9.980537173997665e-05,
+      "loss": 1.1313,
+      "step": 1293
+    },
+    {
+      "epoch": 0.5028660254541921,
+      "grad_norm": 0.2157830148935318,
+      "learning_rate": 9.97275204359673e-05,
+      "loss": 1.0203,
+      "step": 1294
+    },
+    {
+      "epoch": 0.5032546390750996,
+      "grad_norm": 0.19555307924747467,
+      "learning_rate": 9.964966913195797e-05,
+      "loss": 1.0194,
+      "step": 1295
+    },
+    {
+      "epoch": 0.503643252696007,
+      "grad_norm": 0.1898549199104309,
+      "learning_rate": 9.957181782794862e-05,
+      "loss": 1.0034,
+      "step": 1296
+    },
+    {
+      "epoch": 0.5040318663169144,
+      "grad_norm": 0.23555906116962433,
+      "learning_rate": 9.949396652393928e-05,
+      "loss": 1.0298,
+      "step": 1297
+    },
+    {
+      "epoch": 0.5044204799378218,
+      "grad_norm": 0.20434850454330444,
+      "learning_rate": 9.941611521992994e-05,
+      "loss": 0.9999,
+      "step": 1298
+    },
+    {
+      "epoch": 0.5048090935587293,
+      "grad_norm": 0.21015289425849915,
+      "learning_rate": 9.933826391592059e-05,
+      "loss": 1.006,
+      "step": 1299
+    },
+    {
+      "epoch": 0.5051977071796366,
+      "grad_norm": 0.21147851645946503,
+      "learning_rate": 9.926041261191125e-05,
+      "loss": 1.0854,
+      "step": 1300
+    },
+    {
+      "epoch": 0.5055863208005441,
+      "grad_norm": 0.19666944444179535,
+      "learning_rate": 9.91825613079019e-05,
+      "loss": 1.0057,
+      "step": 1301
+    },
+    {
+      "epoch": 0.5059749344214515,
+      "grad_norm": 0.21233728528022766,
+      "learning_rate": 9.910471000389257e-05,
+      "loss": 1.0675,
+      "step": 1302
+    },
+    {
+      "epoch": 0.5063635480423588,
+      "grad_norm": 0.21905581653118134,
+      "learning_rate": 9.902685869988322e-05,
+      "loss": 1.0054,
+      "step": 1303
+    },
+    {
+      "epoch": 0.5067521616632663,
+      "grad_norm": 0.23434993624687195,
+      "learning_rate": 9.894900739587389e-05,
+      "loss": 0.9915,
+      "step": 1304
+    },
+    {
+      "epoch": 0.5071407752841737,
+      "grad_norm": 0.21684227883815765,
+      "learning_rate": 9.887115609186454e-05,
+      "loss": 1.1131,
+      "step": 1305
+    },
+    {
+      "epoch": 0.5075293889050811,
+      "grad_norm": 0.21699552237987518,
+      "learning_rate": 9.87933047878552e-05,
+      "loss": 1.0782,
+      "step": 1306
+    },
+    {
+      "epoch": 0.5079180025259885,
+      "grad_norm": 0.2218221127986908,
+      "learning_rate": 9.871545348384586e-05,
+      "loss": 1.0388,
+      "step": 1307
+    },
+    {
+      "epoch": 0.508306616146896,
+      "grad_norm": 0.20104359090328217,
+      "learning_rate": 9.863760217983652e-05,
+      "loss": 1.0336,
+      "step": 1308
+    },
+    {
+      "epoch": 0.5086952297678033,
+      "grad_norm": 0.21907050907611847,
+      "learning_rate": 9.855975087582718e-05,
+      "loss": 1.0587,
+      "step": 1309
+    },
+    {
+      "epoch": 0.5090838433887108,
+      "grad_norm": 0.2140391767024994,
+      "learning_rate": 9.848189957181784e-05,
+      "loss": 1.0351,
+      "step": 1310
+    },
+    {
+      "epoch": 0.5094724570096182,
+      "grad_norm": 0.33287563920021057,
+      "learning_rate": 9.84040482678085e-05,
+      "loss": 0.9908,
+      "step": 1311
+    },
+    {
+      "epoch": 0.5098610706305255,
+      "grad_norm": 0.2706705927848816,
+      "learning_rate": 9.832619696379915e-05,
+      "loss": 1.0078,
+      "step": 1312
+    },
+    {
+      "epoch": 0.510249684251433,
+      "grad_norm": 0.20216278731822968,
+      "learning_rate": 9.824834565978981e-05,
+      "loss": 1.0253,
+      "step": 1313
+    },
+    {
+      "epoch": 0.5106382978723404,
+      "grad_norm": 0.20736576616764069,
+      "learning_rate": 9.817049435578046e-05,
+      "loss": 1.0217,
+      "step": 1314
+    },
+    {
+      "epoch": 0.5110269114932479,
+      "grad_norm": 0.2275344580411911,
+      "learning_rate": 9.809264305177113e-05,
+      "loss": 1.0139,
+      "step": 1315
+    },
+    {
+      "epoch": 0.5114155251141552,
+      "grad_norm": 0.22243620455265045,
+      "learning_rate": 9.801479174776178e-05,
+      "loss": 1.0427,
+      "step": 1316
+    },
+    {
+      "epoch": 0.5118041387350627,
+      "grad_norm": 0.198841854929924,
+      "learning_rate": 9.793694044375243e-05,
+      "loss": 1.0231,
+      "step": 1317
+    },
+    {
+      "epoch": 0.5121927523559701,
+      "grad_norm": 0.2031068503856659,
+      "learning_rate": 9.78590891397431e-05,
+      "loss": 1.0184,
+      "step": 1318
+    },
+    {
+      "epoch": 0.5125813659768775,
+      "grad_norm": 0.21712587773799896,
+      "learning_rate": 9.778123783573375e-05,
+      "loss": 1.0205,
+      "step": 1319
+    },
+    {
+      "epoch": 0.5129699795977849,
+      "grad_norm": 0.19366060197353363,
+      "learning_rate": 9.77033865317244e-05,
+      "loss": 0.9623,
+      "step": 1320
+    },
+    {
+      "epoch": 0.5133585932186923,
+      "grad_norm": 0.19845952093601227,
+      "learning_rate": 9.762553522771507e-05,
+      "loss": 1.0209,
+      "step": 1321
+    },
+    {
+      "epoch": 0.5137472068395997,
+      "grad_norm": 0.19700276851654053,
+      "learning_rate": 9.754768392370572e-05,
+      "loss": 0.9506,
+      "step": 1322
+    },
+    {
+      "epoch": 0.5141358204605071,
+      "grad_norm": 0.19797460734844208,
+      "learning_rate": 9.746983261969639e-05,
+      "loss": 1.0928,
+      "step": 1323
+    },
+    {
+      "epoch": 0.5145244340814146,
+      "grad_norm": 0.20470699667930603,
+      "learning_rate": 9.739198131568704e-05,
+      "loss": 1.0835,
+      "step": 1324
+    },
+    {
+      "epoch": 0.5149130477023219,
+      "grad_norm": 0.19121742248535156,
+      "learning_rate": 9.731413001167769e-05,
+      "loss": 0.9877,
+      "step": 1325
+    },
+    {
+      "epoch": 0.5153016613232294,
+      "grad_norm": 0.20026616752147675,
+      "learning_rate": 9.723627870766836e-05,
+      "loss": 1.0094,
+      "step": 1326
+    },
+    {
+      "epoch": 0.5156902749441368,
+      "grad_norm": 0.2214539796113968,
+      "learning_rate": 9.715842740365901e-05,
+      "loss": 0.9867,
+      "step": 1327
+    },
+    {
+      "epoch": 0.5160788885650442,
+      "grad_norm": 0.22674603760242462,
+      "learning_rate": 9.708057609964967e-05,
+      "loss": 1.0738,
+      "step": 1328
+    },
+    {
+      "epoch": 0.5164675021859516,
+      "grad_norm": 0.21274834871292114,
+      "learning_rate": 9.700272479564033e-05,
+      "loss": 1.0458,
+      "step": 1329
+    },
+    {
+      "epoch": 0.5168561158068591,
+      "grad_norm": 0.20305052399635315,
+      "learning_rate": 9.692487349163099e-05,
+      "loss": 1.0041,
+      "step": 1330
+    },
+    {
+      "epoch": 0.5172447294277664,
+      "grad_norm": 0.1840772181749344,
+      "learning_rate": 9.684702218762166e-05,
+      "loss": 0.9498,
+      "step": 1331
+    },
+    {
+      "epoch": 0.5176333430486738,
+      "grad_norm": 0.2055782824754715,
+      "learning_rate": 9.676917088361231e-05,
+      "loss": 1.0223,
+      "step": 1332
+    },
+    {
+      "epoch": 0.5180219566695813,
+      "grad_norm": 0.21826402842998505,
+      "learning_rate": 9.669131957960297e-05,
+      "loss": 1.1068,
+      "step": 1333
+    },
+    {
+      "epoch": 0.5184105702904886,
+      "grad_norm": 0.22516922652721405,
+      "learning_rate": 9.661346827559363e-05,
+      "loss": 1.0957,
+      "step": 1334
+    },
+    {
+      "epoch": 0.5187991839113961,
+      "grad_norm": 0.21044284105300903,
+      "learning_rate": 9.653561697158428e-05,
+      "loss": 1.0384,
+      "step": 1335
+    },
+    {
+      "epoch": 0.5191877975323035,
+      "grad_norm": 0.20275571942329407,
+      "learning_rate": 9.645776566757494e-05,
+      "loss": 0.9978,
+      "step": 1336
+    },
+    {
+      "epoch": 0.519576411153211,
+      "grad_norm": 0.2077122926712036,
+      "learning_rate": 9.63799143635656e-05,
+      "loss": 1.0418,
+      "step": 1337
+    },
+    {
+      "epoch": 0.5199650247741183,
+      "grad_norm": 0.19158867001533508,
+      "learning_rate": 9.630206305955625e-05,
+      "loss": 1.0527,
+      "step": 1338
+    },
+    {
+      "epoch": 0.5203536383950258,
+      "grad_norm": 0.1932496577501297,
+      "learning_rate": 9.622421175554691e-05,
+      "loss": 1.0039,
+      "step": 1339
+    },
+    {
+      "epoch": 0.5207422520159332,
+      "grad_norm": 0.21937766671180725,
+      "learning_rate": 9.614636045153757e-05,
+      "loss": 1.0373,
+      "step": 1340
+    },
+    {
+      "epoch": 0.5211308656368405,
+      "grad_norm": 0.2268432229757309,
+      "learning_rate": 9.606850914752823e-05,
+      "loss": 1.0815,
+      "step": 1341
+    },
+    {
+      "epoch": 0.521519479257748,
+      "grad_norm": 0.2147454470396042,
+      "learning_rate": 9.599065784351888e-05,
+      "loss": 1.0331,
+      "step": 1342
+    },
+    {
+      "epoch": 0.5219080928786554,
+      "grad_norm": 0.19899709522724152,
+      "learning_rate": 9.591280653950954e-05,
+      "loss": 1.032,
+      "step": 1343
+    },
+    {
+      "epoch": 0.5222967064995628,
+      "grad_norm": 0.19646069407463074,
+      "learning_rate": 9.58349552355002e-05,
+      "loss": 0.9788,
+      "step": 1344
+    },
+    {
+      "epoch": 0.5226853201204702,
+      "grad_norm": 0.2146075963973999,
+      "learning_rate": 9.575710393149085e-05,
+      "loss": 1.0201,
+      "step": 1345
+    },
+    {
+      "epoch": 0.5230739337413777,
+      "grad_norm": 0.1968650370836258,
+      "learning_rate": 9.567925262748152e-05,
+      "loss": 0.9894,
+      "step": 1346
+    },
+    {
+      "epoch": 0.523462547362285,
+      "grad_norm": 0.21111296117305756,
+      "learning_rate": 9.560140132347217e-05,
+      "loss": 1.0961,
+      "step": 1347
+    },
+    {
+      "epoch": 0.5238511609831925,
+      "grad_norm": 0.20917272567749023,
+      "learning_rate": 9.552355001946282e-05,
+      "loss": 1.0435,
+      "step": 1348
+    },
+    {
+      "epoch": 0.5242397746040999,
+      "grad_norm": 0.2029752880334854,
+      "learning_rate": 9.544569871545349e-05,
+      "loss": 1.0328,
+      "step": 1349
+    },
+    {
+      "epoch": 0.5246283882250072,
+      "grad_norm": 0.20726613700389862,
+      "learning_rate": 9.536784741144414e-05,
+      "loss": 1.0465,
+      "step": 1350
+    },
+    {
+      "epoch": 0.5250170018459147,
+      "grad_norm": 0.19778740406036377,
+      "learning_rate": 9.52899961074348e-05,
+      "loss": 1.0058,
+      "step": 1351
+    },
+    {
+      "epoch": 0.5254056154668221,
+      "grad_norm": 0.19958540797233582,
+      "learning_rate": 9.521214480342546e-05,
+      "loss": 1.0164,
+      "step": 1352
+    },
+    {
+      "epoch": 0.5257942290877295,
+      "grad_norm": 0.2151395082473755,
+      "learning_rate": 9.513429349941611e-05,
+      "loss": 1.0703,
+      "step": 1353
+    },
+    {
+      "epoch": 0.5261828427086369,
+      "grad_norm": 0.2366979569196701,
+      "learning_rate": 9.505644219540678e-05,
+      "loss": 0.9832,
+      "step": 1354
+    },
+    {
+      "epoch": 0.5265714563295444,
+      "grad_norm": 0.22064165771007538,
+      "learning_rate": 9.497859089139743e-05,
+      "loss": 1.0181,
+      "step": 1355
+    },
+    {
+      "epoch": 0.5269600699504517,
+      "grad_norm": 0.20221936702728271,
+      "learning_rate": 9.49007395873881e-05,
+      "loss": 1.0424,
+      "step": 1356
+    },
+    {
+      "epoch": 0.5273486835713592,
+      "grad_norm": 0.19608759880065918,
+      "learning_rate": 9.482288828337876e-05,
+      "loss": 1.0074,
+      "step": 1357
+    },
+    {
+      "epoch": 0.5277372971922666,
+      "grad_norm": 0.20686689019203186,
+      "learning_rate": 9.474503697936941e-05,
+      "loss": 1.0213,
+      "step": 1358
+    },
+    {
+      "epoch": 0.528125910813174,
+      "grad_norm": 0.223610520362854,
+      "learning_rate": 9.466718567536008e-05,
+      "loss": 1.05,
+      "step": 1359
+    },
+    {
+      "epoch": 0.5285145244340814,
+      "grad_norm": 0.2135966569185257,
+      "learning_rate": 9.458933437135073e-05,
+      "loss": 1.034,
+      "step": 1360
+    },
+    {
+      "epoch": 0.5289031380549888,
+      "grad_norm": 0.1933239996433258,
+      "learning_rate": 9.451148306734138e-05,
+      "loss": 0.9883,
+      "step": 1361
+    },
+    {
+      "epoch": 0.5292917516758963,
+      "grad_norm": 0.20794694125652313,
+      "learning_rate": 9.443363176333205e-05,
+      "loss": 1.0103,
+      "step": 1362
+    },
+    {
+      "epoch": 0.5296803652968036,
+      "grad_norm": 0.20128493010997772,
+      "learning_rate": 9.43557804593227e-05,
+      "loss": 1.015,
+      "step": 1363
+    },
+    {
+      "epoch": 0.5300689789177111,
+      "grad_norm": 0.2128933072090149,
+      "learning_rate": 9.427792915531336e-05,
+      "loss": 1.0038,
+      "step": 1364
+    },
+    {
+      "epoch": 0.5304575925386185,
+      "grad_norm": 0.2046983689069748,
+      "learning_rate": 9.420007785130402e-05,
+      "loss": 0.9948,
+      "step": 1365
+    },
+    {
+      "epoch": 0.5308462061595259,
+      "grad_norm": 0.20909680426120758,
+      "learning_rate": 9.412222654729467e-05,
+      "loss": 1.0308,
+      "step": 1366
+    },
+    {
+      "epoch": 0.5312348197804333,
+      "grad_norm": 0.2182164192199707,
+      "learning_rate": 9.404437524328533e-05,
+      "loss": 1.0018,
+      "step": 1367
+    },
+    {
+      "epoch": 0.5316234334013407,
+      "grad_norm": 0.2107028216123581,
+      "learning_rate": 9.396652393927599e-05,
+      "loss": 1.0419,
+      "step": 1368
+    },
+    {
+      "epoch": 0.5320120470222481,
+      "grad_norm": 0.24631445109844208,
+      "learning_rate": 9.388867263526665e-05,
+      "loss": 1.0171,
+      "step": 1369
+    },
+    {
+      "epoch": 0.5324006606431555,
+      "grad_norm": 0.20331013202667236,
+      "learning_rate": 9.38108213312573e-05,
+      "loss": 1.0592,
+      "step": 1370
+    },
+    {
+      "epoch": 0.532789274264063,
+      "grad_norm": 0.19266058504581451,
+      "learning_rate": 9.373297002724796e-05,
+      "loss": 0.9912,
+      "step": 1371
+    },
+    {
+      "epoch": 0.5331778878849703,
+      "grad_norm": 0.22874227166175842,
+      "learning_rate": 9.365511872323862e-05,
+      "loss": 1.0533,
+      "step": 1372
+    },
+    {
+      "epoch": 0.5335665015058778,
+      "grad_norm": 0.2088235765695572,
+      "learning_rate": 9.357726741922927e-05,
+      "loss": 1.0464,
+      "step": 1373
+    },
+    {
+      "epoch": 0.5339551151267852,
+      "grad_norm": 0.2112397700548172,
+      "learning_rate": 9.349941611521994e-05,
+      "loss": 1.0503,
+      "step": 1374
+    },
+    {
+      "epoch": 0.5343437287476926,
+      "grad_norm": 0.20712170004844666,
+      "learning_rate": 9.342156481121059e-05,
+      "loss": 1.0237,
+      "step": 1375
+    },
+    {
+      "epoch": 0.5347323423686,
+      "grad_norm": 0.20077116787433624,
+      "learning_rate": 9.334371350720124e-05,
+      "loss": 1.0467,
+      "step": 1376
+    },
+    {
+      "epoch": 0.5351209559895075,
+      "grad_norm": 0.20394501090049744,
+      "learning_rate": 9.326586220319191e-05,
+      "loss": 1.0054,
+      "step": 1377
+    },
+    {
+      "epoch": 0.5355095696104148,
+      "grad_norm": 0.19459395110607147,
+      "learning_rate": 9.318801089918256e-05,
+      "loss": 0.9792,
+      "step": 1378
+    },
+    {
+      "epoch": 0.5358981832313222,
+      "grad_norm": 0.2116049826145172,
+      "learning_rate": 9.311015959517321e-05,
+      "loss": 1.0345,
+      "step": 1379
+    },
+    {
+      "epoch": 0.5362867968522297,
+      "grad_norm": 0.21672269701957703,
+      "learning_rate": 9.303230829116388e-05,
+      "loss": 1.0709,
+      "step": 1380
+    },
+    {
+      "epoch": 0.536675410473137,
+      "grad_norm": 0.20358407497406006,
+      "learning_rate": 9.295445698715453e-05,
+      "loss": 1.0534,
+      "step": 1381
+    },
+    {
+      "epoch": 0.5370640240940445,
+      "grad_norm": 0.19512853026390076,
+      "learning_rate": 9.28766056831452e-05,
+      "loss": 0.9397,
+      "step": 1382
+    },
+    {
+      "epoch": 0.5374526377149519,
+      "grad_norm": 0.2140122503042221,
+      "learning_rate": 9.279875437913586e-05,
+      "loss": 1.0164,
+      "step": 1383
+    },
+    {
+      "epoch": 0.5378412513358594,
+      "grad_norm": 0.20486049354076385,
+      "learning_rate": 9.272090307512651e-05,
+      "loss": 0.9892,
+      "step": 1384
+    },
+    {
+      "epoch": 0.5382298649567667,
+      "grad_norm": 0.20023222267627716,
+      "learning_rate": 9.264305177111718e-05,
+      "loss": 1.0019,
+      "step": 1385
+    },
+    {
+      "epoch": 0.5386184785776742,
+      "grad_norm": 0.20024439692497253,
+      "learning_rate": 9.256520046710783e-05,
+      "loss": 0.9717,
+      "step": 1386
+    },
+    {
+      "epoch": 0.5390070921985816,
+      "grad_norm": 0.21021386981010437,
+      "learning_rate": 9.24873491630985e-05,
+      "loss": 1.028,
+      "step": 1387
+    },
+    {
+      "epoch": 0.5393957058194889,
+      "grad_norm": 0.18508704006671906,
+      "learning_rate": 9.240949785908915e-05,
+      "loss": 1.0008,
+      "step": 1388
+    },
+    {
+      "epoch": 0.5397843194403964,
+      "grad_norm": 0.19351208209991455,
+      "learning_rate": 9.23316465550798e-05,
+      "loss": 0.9898,
+      "step": 1389
+    },
+    {
+      "epoch": 0.5401729330613038,
+      "grad_norm": 0.20341919362545013,
+      "learning_rate": 9.225379525107047e-05,
+      "loss": 1.0203,
+      "step": 1390
+    },
+    {
+      "epoch": 0.5405615466822112,
+      "grad_norm": 0.1942797303199768,
+      "learning_rate": 9.217594394706112e-05,
+      "loss": 1.003,
+      "step": 1391
+    },
+    {
+      "epoch": 0.5409501603031186,
+      "grad_norm": 0.2056138813495636,
+      "learning_rate": 9.209809264305178e-05,
+      "loss": 1.0149,
+      "step": 1392
+    },
+    {
+      "epoch": 0.5413387739240261,
+      "grad_norm": 0.21572062373161316,
+      "learning_rate": 9.202024133904244e-05,
+      "loss": 0.9808,
+      "step": 1393
+    },
+    {
+      "epoch": 0.5417273875449334,
+      "grad_norm": 0.19841499626636505,
+      "learning_rate": 9.194239003503309e-05,
+      "loss": 1.0467,
+      "step": 1394
+    },
+    {
+      "epoch": 0.5421160011658409,
+      "grad_norm": 0.20452147722244263,
+      "learning_rate": 9.186453873102375e-05,
+      "loss": 1.0378,
+      "step": 1395
+    },
+    {
+      "epoch": 0.5425046147867483,
+      "grad_norm": 0.2090451419353485,
+      "learning_rate": 9.17866874270144e-05,
+      "loss": 1.0823,
+      "step": 1396
+    },
+    {
+      "epoch": 0.5428932284076556,
+      "grad_norm": 0.215814009308815,
+      "learning_rate": 9.170883612300506e-05,
+      "loss": 1.0994,
+      "step": 1397
+    },
+    {
+      "epoch": 0.5432818420285631,
+      "grad_norm": 0.19924724102020264,
+      "learning_rate": 9.163098481899572e-05,
+      "loss": 1.0099,
+      "step": 1398
+    },
+    {
+      "epoch": 0.5436704556494705,
+      "grad_norm": 0.20074865221977234,
+      "learning_rate": 9.155313351498638e-05,
+      "loss": 1.0163,
+      "step": 1399
+    },
+    {
+      "epoch": 0.544059069270378,
+      "grad_norm": 0.21737203001976013,
+      "learning_rate": 9.147528221097704e-05,
+      "loss": 1.0527,
+      "step": 1400
+    },
+    {
+      "epoch": 0.5444476828912853,
+      "grad_norm": 0.2036885768175125,
+      "learning_rate": 9.139743090696769e-05,
+      "loss": 1.0208,
+      "step": 1401
+    },
+    {
+      "epoch": 0.5448362965121928,
+      "grad_norm": 0.20861585438251495,
+      "learning_rate": 9.131957960295835e-05,
+      "loss": 1.0175,
+      "step": 1402
+    },
+    {
+      "epoch": 0.5452249101331001,
+      "grad_norm": 0.23425570130348206,
+      "learning_rate": 9.124172829894901e-05,
+      "loss": 1.053,
+      "step": 1403
+    },
+    {
+      "epoch": 0.5456135237540076,
+      "grad_norm": 0.20389291644096375,
+      "learning_rate": 9.116387699493966e-05,
+      "loss": 1.0479,
+      "step": 1404
+    },
+    {
+      "epoch": 0.546002137374915,
+      "grad_norm": 0.20166678726673126,
+      "learning_rate": 9.108602569093033e-05,
+      "loss": 1.0064,
+      "step": 1405
+    },
+    {
+      "epoch": 0.5463907509958223,
+      "grad_norm": 0.21419203281402588,
+      "learning_rate": 9.100817438692098e-05,
+      "loss": 1.0122,
+      "step": 1406
+    },
+    {
+      "epoch": 0.5467793646167298,
+      "grad_norm": 0.20541758835315704,
+      "learning_rate": 9.093032308291165e-05,
+      "loss": 1.0355,
+      "step": 1407
+    },
+    {
+      "epoch": 0.5471679782376372,
+      "grad_norm": 0.21865367889404297,
+      "learning_rate": 9.08524717789023e-05,
+      "loss": 1.0201,
+      "step": 1408
+    },
+    {
+      "epoch": 0.5475565918585447,
+      "grad_norm": 0.21181468665599823,
+      "learning_rate": 9.077462047489296e-05,
+      "loss": 1.0501,
+      "step": 1409
+    },
+    {
+      "epoch": 0.547945205479452,
+      "grad_norm": 0.21016767621040344,
+      "learning_rate": 9.069676917088362e-05,
+      "loss": 1.0452,
+      "step": 1410
+    },
+    {
+      "epoch": 0.5483338191003595,
+      "grad_norm": 0.21119755506515503,
+      "learning_rate": 9.061891786687428e-05,
+      "loss": 1.0935,
+      "step": 1411
+    },
+    {
+      "epoch": 0.5487224327212669,
+      "grad_norm": 0.20688095688819885,
+      "learning_rate": 9.054106656286493e-05,
+      "loss": 1.0526,
+      "step": 1412
+    },
+    {
+      "epoch": 0.5491110463421743,
+      "grad_norm": 0.21857528388500214,
+      "learning_rate": 9.04632152588556e-05,
+      "loss": 1.0067,
+      "step": 1413
+    },
+    {
+      "epoch": 0.5494996599630817,
+      "grad_norm": 0.2196548581123352,
+      "learning_rate": 9.038536395484625e-05,
+      "loss": 1.0263,
+      "step": 1414
+    },
+    {
+      "epoch": 0.5498882735839892,
+      "grad_norm": 0.21952040493488312,
+      "learning_rate": 9.03075126508369e-05,
+      "loss": 1.0009,
+      "step": 1415
+    },
+    {
+      "epoch": 0.5502768872048965,
+      "grad_norm": 0.20059294998645782,
+      "learning_rate": 9.022966134682757e-05,
+      "loss": 1.0481,
+      "step": 1416
+    },
+    {
+      "epoch": 0.5506655008258039,
+      "grad_norm": 0.1960824728012085,
+      "learning_rate": 9.015181004281822e-05,
+      "loss": 1.0003,
+      "step": 1417
+    },
+    {
+      "epoch": 0.5510541144467114,
+      "grad_norm": 0.19051724672317505,
+      "learning_rate": 9.007395873880889e-05,
+      "loss": 0.9556,
+      "step": 1418
+    },
+    {
+      "epoch": 0.5514427280676187,
+      "grad_norm": 0.21008028090000153,
+      "learning_rate": 8.999610743479954e-05,
+      "loss": 1.0457,
+      "step": 1419
+    },
+    {
+      "epoch": 0.5518313416885262,
+      "grad_norm": 0.21465444564819336,
+      "learning_rate": 8.991825613079019e-05,
+      "loss": 1.0196,
+      "step": 1420
+    },
+    {
+      "epoch": 0.5522199553094336,
+      "grad_norm": 0.2062770277261734,
+      "learning_rate": 8.984040482678086e-05,
+      "loss": 1.0501,
+      "step": 1421
+    },
+    {
+      "epoch": 0.552608568930341,
+      "grad_norm": 0.21400012075901031,
+      "learning_rate": 8.976255352277151e-05,
+      "loss": 1.0711,
+      "step": 1422
+    },
+    {
+      "epoch": 0.5529971825512484,
+      "grad_norm": 0.19617624580860138,
+      "learning_rate": 8.968470221876217e-05,
+      "loss": 0.9858,
+      "step": 1423
+    },
+    {
+      "epoch": 0.5533857961721559,
+      "grad_norm": 0.20835624635219574,
+      "learning_rate": 8.960685091475283e-05,
+      "loss": 1.0122,
+      "step": 1424
+    },
+    {
+      "epoch": 0.5537744097930632,
+      "grad_norm": 0.21708111464977264,
+      "learning_rate": 8.952899961074348e-05,
+      "loss": 1.0108,
+      "step": 1425
+    },
+    {
+      "epoch": 0.5541630234139706,
+      "grad_norm": 0.20877864956855774,
+      "learning_rate": 8.945114830673414e-05,
+      "loss": 1.0389,
+      "step": 1426
+    },
+    {
+      "epoch": 0.5545516370348781,
+      "grad_norm": 0.1924441158771515,
+      "learning_rate": 8.93732970027248e-05,
+      "loss": 1.0088,
+      "step": 1427
+    },
+    {
+      "epoch": 0.5549402506557854,
+      "grad_norm": 0.20288826525211334,
+      "learning_rate": 8.929544569871546e-05,
+      "loss": 1.0296,
+      "step": 1428
+    },
+    {
+      "epoch": 0.5553288642766929,
+      "grad_norm": 0.2008143663406372,
+      "learning_rate": 8.921759439470611e-05,
+      "loss": 1.0521,
+      "step": 1429
+    },
+    {
+      "epoch": 0.5557174778976003,
+      "grad_norm": 0.24407047033309937,
+      "learning_rate": 8.913974309069677e-05,
+      "loss": 1.1038,
+      "step": 1430
+    },
+    {
+      "epoch": 0.5561060915185078,
+      "grad_norm": 0.2172536998987198,
+      "learning_rate": 8.906189178668743e-05,
+      "loss": 1.0811,
+      "step": 1431
+    },
+    {
+      "epoch": 0.5564947051394151,
+      "grad_norm": 0.21712054312229156,
+      "learning_rate": 8.898404048267808e-05,
+      "loss": 1.0642,
+      "step": 1432
+    },
+    {
+      "epoch": 0.5568833187603226,
+      "grad_norm": 0.22482797503471375,
+      "learning_rate": 8.890618917866875e-05,
+      "loss": 1.0742,
+      "step": 1433
+    },
+    {
+      "epoch": 0.55727193238123,
+      "grad_norm": 0.1974876970052719,
+      "learning_rate": 8.88283378746594e-05,
+      "loss": 0.9954,
+      "step": 1434
+    },
+    {
+      "epoch": 0.5576605460021373,
+      "grad_norm": 0.19162166118621826,
+      "learning_rate": 8.875048657065007e-05,
+      "loss": 1.0074,
+      "step": 1435
+    },
+    {
+      "epoch": 0.5580491596230448,
+      "grad_norm": 0.20439045131206512,
+      "learning_rate": 8.867263526664072e-05,
+      "loss": 1.026,
+      "step": 1436
+    },
+    {
+      "epoch": 0.5584377732439522,
+      "grad_norm": 0.1947651207447052,
+      "learning_rate": 8.859478396263138e-05,
+      "loss": 0.9848,
+      "step": 1437
+    },
+    {
+      "epoch": 0.5588263868648596,
+      "grad_norm": 0.21434316039085388,
+      "learning_rate": 8.851693265862204e-05,
+      "loss": 1.0843,
+      "step": 1438
+    },
+    {
+      "epoch": 0.559215000485767,
+      "grad_norm": 1.3314417600631714,
+      "learning_rate": 8.84390813546127e-05,
+      "loss": 1.0356,
+      "step": 1439
+    },
+    {
+      "epoch": 0.5596036141066745,
+      "grad_norm": 0.20131289958953857,
+      "learning_rate": 8.836123005060335e-05,
+      "loss": 1.0214,
+      "step": 1440
+    },
+    {
+      "epoch": 0.5599922277275818,
+      "grad_norm": 0.21596461534500122,
+      "learning_rate": 8.828337874659402e-05,
+      "loss": 1.0962,
+      "step": 1441
+    },
+    {
+      "epoch": 0.5603808413484893,
+      "grad_norm": 0.20477193593978882,
+      "learning_rate": 8.820552744258467e-05,
+      "loss": 1.0643,
+      "step": 1442
+    },
+    {
+      "epoch": 0.5607694549693967,
+      "grad_norm": 0.1978107988834381,
+      "learning_rate": 8.812767613857532e-05,
+      "loss": 1.0054,
+      "step": 1443
+    },
+    {
+      "epoch": 0.561158068590304,
+      "grad_norm": 0.219422847032547,
+      "learning_rate": 8.804982483456599e-05,
+      "loss": 1.0009,
+      "step": 1444
+    },
+    {
+      "epoch": 0.5615466822112115,
+      "grad_norm": 0.21489015221595764,
+      "learning_rate": 8.797197353055664e-05,
+      "loss": 1.052,
+      "step": 1445
+    },
+    {
+      "epoch": 0.5619352958321189,
+      "grad_norm": 0.2235930860042572,
+      "learning_rate": 8.78941222265473e-05,
+      "loss": 1.037,
+      "step": 1446
+    },
+    {
+      "epoch": 0.5623239094530263,
+      "grad_norm": 0.19922038912773132,
+      "learning_rate": 8.781627092253796e-05,
+      "loss": 1.0006,
+      "step": 1447
+    },
+    {
+      "epoch": 0.5627125230739337,
+      "grad_norm": 0.24740247428417206,
+      "learning_rate": 8.773841961852861e-05,
+      "loss": 1.0753,
+      "step": 1448
+    },
+    {
+      "epoch": 0.5631011366948412,
+      "grad_norm": 0.2148803174495697,
+      "learning_rate": 8.766056831451928e-05,
+      "loss": 1.0712,
+      "step": 1449
+    },
+    {
+      "epoch": 0.5634897503157485,
+      "grad_norm": 0.19838745892047882,
+      "learning_rate": 8.758271701050993e-05,
+      "loss": 1.027,
+      "step": 1450
+    },
+    {
+      "epoch": 0.563878363936656,
+      "grad_norm": 0.20328201353549957,
+      "learning_rate": 8.750486570650058e-05,
+      "loss": 1.0117,
+      "step": 1451
+    },
+    {
+      "epoch": 0.5642669775575634,
+      "grad_norm": 0.21230114996433258,
+      "learning_rate": 8.742701440249125e-05,
+      "loss": 1.0658,
+      "step": 1452
+    },
+    {
+      "epoch": 0.5646555911784708,
+      "grad_norm": 0.2030259519815445,
+      "learning_rate": 8.73491630984819e-05,
+      "loss": 1.0002,
+      "step": 1453
+    },
+    {
+      "epoch": 0.5650442047993782,
+      "grad_norm": 0.21404659748077393,
+      "learning_rate": 8.727131179447256e-05,
+      "loss": 1.0572,
+      "step": 1454
+    },
+    {
+      "epoch": 0.5654328184202856,
+      "grad_norm": 0.2148464322090149,
+      "learning_rate": 8.719346049046322e-05,
+      "loss": 1.0164,
+      "step": 1455
+    },
+    {
+      "epoch": 0.5658214320411931,
+      "grad_norm": 0.22083118557929993,
+      "learning_rate": 8.711560918645387e-05,
+      "loss": 0.9704,
+      "step": 1456
+    },
+    {
+      "epoch": 0.5662100456621004,
+      "grad_norm": 0.19305935502052307,
+      "learning_rate": 8.703775788244453e-05,
+      "loss": 1.0034,
+      "step": 1457
+    },
+    {
+      "epoch": 0.5665986592830079,
+      "grad_norm": 0.2100098729133606,
+      "learning_rate": 8.695990657843518e-05,
+      "loss": 1.0907,
+      "step": 1458
+    },
+    {
+      "epoch": 0.5669872729039153,
+      "grad_norm": 0.18947799503803253,
+      "learning_rate": 8.688205527442585e-05,
+      "loss": 0.9664,
+      "step": 1459
+    },
+    {
+      "epoch": 0.5673758865248227,
+      "grad_norm": 0.22341710329055786,
+      "learning_rate": 8.68042039704165e-05,
+      "loss": 1.0551,
+      "step": 1460
+    },
+    {
+      "epoch": 0.5677645001457301,
+      "grad_norm": 0.219679057598114,
+      "learning_rate": 8.672635266640717e-05,
+      "loss": 1.0398,
+      "step": 1461
+    },
+    {
+      "epoch": 0.5681531137666376,
+      "grad_norm": 0.22389841079711914,
+      "learning_rate": 8.664850136239782e-05,
+      "loss": 1.0472,
+      "step": 1462
+    },
+    {
+      "epoch": 0.5685417273875449,
+      "grad_norm": 0.21402975916862488,
+      "learning_rate": 8.657065005838849e-05,
+      "loss": 1.0224,
+      "step": 1463
+    },
+    {
+      "epoch": 0.5689303410084523,
+      "grad_norm": 0.20917154848575592,
+      "learning_rate": 8.649279875437915e-05,
+      "loss": 1.0526,
+      "step": 1464
+    },
+    {
+      "epoch": 0.5693189546293598,
+      "grad_norm": 0.2252056896686554,
+      "learning_rate": 8.64149474503698e-05,
+      "loss": 1.1064,
+      "step": 1465
+    },
+    {
+      "epoch": 0.5697075682502671,
+      "grad_norm": 0.21834802627563477,
+      "learning_rate": 8.633709614636046e-05,
+      "loss": 1.0318,
+      "step": 1466
+    },
+    {
+      "epoch": 0.5700961818711746,
+      "grad_norm": 0.21882353723049164,
+      "learning_rate": 8.625924484235112e-05,
+      "loss": 1.0285,
+      "step": 1467
+    },
+    {
+      "epoch": 0.570484795492082,
+      "grad_norm": 0.2028426229953766,
+      "learning_rate": 8.618139353834177e-05,
+      "loss": 1.0356,
+      "step": 1468
+    },
+    {
+      "epoch": 0.5708734091129894,
+      "grad_norm": 0.22297166287899017,
+      "learning_rate": 8.610354223433243e-05,
+      "loss": 1.0804,
+      "step": 1469
+    },
+    {
+      "epoch": 0.5712620227338968,
+      "grad_norm": 0.21775268018245697,
+      "learning_rate": 8.602569093032309e-05,
+      "loss": 0.9978,
+      "step": 1470
+    },
+    {
+      "epoch": 0.5716506363548043,
+      "grad_norm": 0.20362353324890137,
+      "learning_rate": 8.594783962631374e-05,
+      "loss": 0.9982,
+      "step": 1471
+    },
+    {
+      "epoch": 0.5720392499757117,
+      "grad_norm": 0.21854591369628906,
+      "learning_rate": 8.586998832230441e-05,
+      "loss": 1.0465,
+      "step": 1472
+    },
+    {
+      "epoch": 0.572427863596619,
+      "grad_norm": 0.20501428842544556,
+      "learning_rate": 8.579213701829506e-05,
+      "loss": 1.0468,
+      "step": 1473
+    },
+    {
+      "epoch": 0.5728164772175265,
+      "grad_norm": 0.21606214344501495,
+      "learning_rate": 8.571428571428571e-05,
+      "loss": 1.0477,
+      "step": 1474
+    },
+    {
+      "epoch": 0.5732050908384339,
+      "grad_norm": 0.2100660502910614,
+      "learning_rate": 8.563643441027638e-05,
+      "loss": 1.0071,
+      "step": 1475
+    },
+    {
+      "epoch": 0.5735937044593413,
+      "grad_norm": 0.21008896827697754,
+      "learning_rate": 8.555858310626703e-05,
+      "loss": 0.9914,
+      "step": 1476
+    },
+    {
+      "epoch": 0.5739823180802487,
+      "grad_norm": 0.22192159295082092,
+      "learning_rate": 8.54807318022577e-05,
+      "loss": 1.0385,
+      "step": 1477
+    },
+    {
+      "epoch": 0.5743709317011562,
+      "grad_norm": 0.20123356580734253,
+      "learning_rate": 8.540288049824835e-05,
+      "loss": 1.0062,
+      "step": 1478
+    },
+    {
+      "epoch": 0.5747595453220635,
+      "grad_norm": 0.201947420835495,
+      "learning_rate": 8.5325029194239e-05,
+      "loss": 1.0218,
+      "step": 1479
+    },
+    {
+      "epoch": 0.575148158942971,
+      "grad_norm": 0.22804415225982666,
+      "learning_rate": 8.524717789022967e-05,
+      "loss": 1.0445,
+      "step": 1480
+    },
+    {
+      "epoch": 0.5755367725638784,
+      "grad_norm": 0.20527036488056183,
+      "learning_rate": 8.516932658622032e-05,
+      "loss": 0.9972,
+      "step": 1481
+    },
+    {
+      "epoch": 0.5759253861847857,
+      "grad_norm": 0.20298773050308228,
+      "learning_rate": 8.509147528221098e-05,
+      "loss": 1.0272,
+      "step": 1482
+    },
+    {
+      "epoch": 0.5763139998056932,
+      "grad_norm": 0.22500957548618317,
+      "learning_rate": 8.501362397820164e-05,
+      "loss": 1.0982,
+      "step": 1483
+    },
+    {
+      "epoch": 0.5767026134266006,
+      "grad_norm": 0.1950521320104599,
+      "learning_rate": 8.493577267419229e-05,
+      "loss": 0.9848,
+      "step": 1484
+    },
+    {
+      "epoch": 0.577091227047508,
+      "grad_norm": 0.21087585389614105,
+      "learning_rate": 8.485792137018295e-05,
+      "loss": 1.0125,
+      "step": 1485
+    },
+    {
+      "epoch": 0.5774798406684154,
+      "grad_norm": 0.20122238993644714,
+      "learning_rate": 8.47800700661736e-05,
+      "loss": 1.0533,
+      "step": 1486
+    },
+    {
+      "epoch": 0.5778684542893229,
+      "grad_norm": 0.20149008929729462,
+      "learning_rate": 8.470221876216427e-05,
+      "loss": 1.0719,
+      "step": 1487
+    },
+    {
+      "epoch": 0.5782570679102302,
+      "grad_norm": 0.21307213604450226,
+      "learning_rate": 8.462436745815494e-05,
+      "loss": 1.0522,
+      "step": 1488
+    },
+    {
+      "epoch": 0.5786456815311377,
+      "grad_norm": 0.21828554570674896,
+      "learning_rate": 8.454651615414559e-05,
+      "loss": 1.0184,
+      "step": 1489
+    },
+    {
+      "epoch": 0.5790342951520451,
+      "grad_norm": 0.22002705931663513,
+      "learning_rate": 8.446866485013625e-05,
+      "loss": 1.0101,
+      "step": 1490
+    },
+    {
+      "epoch": 0.5794229087729524,
+      "grad_norm": 0.19479142129421234,
+      "learning_rate": 8.43908135461269e-05,
+      "loss": 0.9889,
+      "step": 1491
+    },
+    {
+      "epoch": 0.5798115223938599,
+      "grad_norm": 0.21346086263656616,
+      "learning_rate": 8.431296224211756e-05,
+      "loss": 1.0373,
+      "step": 1492
+    },
+    {
+      "epoch": 0.5802001360147673,
+      "grad_norm": 0.20177558064460754,
+      "learning_rate": 8.423511093810822e-05,
+      "loss": 1.0215,
+      "step": 1493
+    },
+    {
+      "epoch": 0.5805887496356748,
+      "grad_norm": 0.2117915153503418,
+      "learning_rate": 8.415725963409888e-05,
+      "loss": 1.0321,
+      "step": 1494
+    },
+    {
+      "epoch": 0.5809773632565821,
+      "grad_norm": 0.21304374933242798,
+      "learning_rate": 8.407940833008954e-05,
+      "loss": 1.0123,
+      "step": 1495
+    },
+    {
+      "epoch": 0.5813659768774896,
+      "grad_norm": 0.21173715591430664,
+      "learning_rate": 8.400155702608019e-05,
+      "loss": 1.0696,
+      "step": 1496
+    },
+    {
+      "epoch": 0.581754590498397,
+      "grad_norm": 0.20407019555568695,
+      "learning_rate": 8.392370572207085e-05,
+      "loss": 1.0086,
+      "step": 1497
+    },
+    {
+      "epoch": 0.5821432041193044,
+      "grad_norm": 0.209481880068779,
+      "learning_rate": 8.384585441806151e-05,
+      "loss": 0.9975,
+      "step": 1498
+    },
+    {
+      "epoch": 0.5825318177402118,
+      "grad_norm": 0.22184531390666962,
+      "learning_rate": 8.376800311405216e-05,
+      "loss": 1.0956,
+      "step": 1499
+    },
+    {
+      "epoch": 0.5829204313611193,
+      "grad_norm": 0.21344684064388275,
+      "learning_rate": 8.369015181004283e-05,
+      "loss": 1.0685,
+      "step": 1500
+    },
+    {
+      "epoch": 0.5833090449820266,
+      "grad_norm": 0.19837221503257751,
+      "learning_rate": 8.361230050603348e-05,
+      "loss": 1.0149,
+      "step": 1501
+    },
+    {
+      "epoch": 0.583697658602934,
+      "grad_norm": 0.2133672833442688,
+      "learning_rate": 8.353444920202413e-05,
+      "loss": 1.0453,
+      "step": 1502
+    },
+    {
+      "epoch": 0.5840862722238415,
+      "grad_norm": 0.21944090723991394,
+      "learning_rate": 8.34565978980148e-05,
+      "loss": 1.04,
+      "step": 1503
+    },
+    {
+      "epoch": 0.5844748858447488,
+      "grad_norm": 0.1983667016029358,
+      "learning_rate": 8.337874659400545e-05,
+      "loss": 0.9919,
+      "step": 1504
+    },
+    {
+      "epoch": 0.5848634994656563,
+      "grad_norm": 0.2025303989648819,
+      "learning_rate": 8.33008952899961e-05,
+      "loss": 1.0021,
+      "step": 1505
+    },
+    {
+      "epoch": 0.5852521130865637,
+      "grad_norm": 0.2015170007944107,
+      "learning_rate": 8.322304398598677e-05,
+      "loss": 0.9945,
+      "step": 1506
+    },
+    {
+      "epoch": 0.5856407267074711,
+      "grad_norm": 0.20768272876739502,
+      "learning_rate": 8.314519268197742e-05,
+      "loss": 1.0465,
+      "step": 1507
+    },
+    {
+      "epoch": 0.5860293403283785,
+      "grad_norm": 0.20513412356376648,
+      "learning_rate": 8.306734137796809e-05,
+      "loss": 1.0124,
+      "step": 1508
+    },
+    {
+      "epoch": 0.586417953949286,
+      "grad_norm": 0.20268471539020538,
+      "learning_rate": 8.298949007395874e-05,
+      "loss": 1.0586,
+      "step": 1509
+    },
+    {
+      "epoch": 0.5868065675701933,
+      "grad_norm": 0.20915938913822174,
+      "learning_rate": 8.291163876994939e-05,
+      "loss": 1.0047,
+      "step": 1510
+    },
+    {
+      "epoch": 0.5871951811911007,
+      "grad_norm": 0.2161451131105423,
+      "learning_rate": 8.283378746594006e-05,
+      "loss": 1.0184,
+      "step": 1511
+    },
+    {
+      "epoch": 0.5875837948120082,
+      "grad_norm": 0.1915571093559265,
+      "learning_rate": 8.275593616193071e-05,
+      "loss": 1.0187,
+      "step": 1512
+    },
+    {
+      "epoch": 0.5879724084329155,
+      "grad_norm": 0.20907992124557495,
+      "learning_rate": 8.267808485792137e-05,
+      "loss": 1.0212,
+      "step": 1513
+    },
+    {
+      "epoch": 0.588361022053823,
+      "grad_norm": 0.20140786468982697,
+      "learning_rate": 8.260023355391204e-05,
+      "loss": 1.014,
+      "step": 1514
+    },
+    {
+      "epoch": 0.5887496356747304,
+      "grad_norm": 0.208252415060997,
+      "learning_rate": 8.252238224990269e-05,
+      "loss": 1.0806,
+      "step": 1515
+    },
+    {
+      "epoch": 0.5891382492956379,
+      "grad_norm": 0.20596125721931458,
+      "learning_rate": 8.244453094589336e-05,
+      "loss": 0.9823,
+      "step": 1516
+    },
+    {
+      "epoch": 0.5895268629165452,
+      "grad_norm": 0.18832452595233917,
+      "learning_rate": 8.236667964188401e-05,
+      "loss": 0.9925,
+      "step": 1517
+    },
+    {
+      "epoch": 0.5899154765374527,
+      "grad_norm": 0.2078334391117096,
+      "learning_rate": 8.228882833787467e-05,
+      "loss": 1.0587,
+      "step": 1518
+    },
+    {
+      "epoch": 0.59030409015836,
+      "grad_norm": 0.20121365785598755,
+      "learning_rate": 8.221097703386533e-05,
+      "loss": 1.0607,
+      "step": 1519
+    },
+    {
+      "epoch": 0.5906927037792674,
+      "grad_norm": 0.19666099548339844,
+      "learning_rate": 8.213312572985598e-05,
+      "loss": 1.0124,
+      "step": 1520
+    },
+    {
+      "epoch": 0.5910813174001749,
+      "grad_norm": 0.20176006853580475,
+      "learning_rate": 8.205527442584664e-05,
+      "loss": 1.0297,
+      "step": 1521
+    },
+    {
+      "epoch": 0.5914699310210823,
+      "grad_norm": 0.2038574516773224,
+      "learning_rate": 8.19774231218373e-05,
+      "loss": 1.0311,
+      "step": 1522
+    },
+    {
+      "epoch": 0.5918585446419897,
+      "grad_norm": 0.19517424702644348,
+      "learning_rate": 8.189957181782796e-05,
+      "loss": 0.9945,
+      "step": 1523
+    },
+    {
+      "epoch": 0.5922471582628971,
+      "grad_norm": 0.19599094986915588,
+      "learning_rate": 8.182172051381861e-05,
+      "loss": 1.0255,
+      "step": 1524
+    },
+    {
+      "epoch": 0.5926357718838046,
+      "grad_norm": 0.21409402787685394,
+      "learning_rate": 8.174386920980927e-05,
+      "loss": 1.0868,
+      "step": 1525
+    },
+    {
+      "epoch": 0.5930243855047119,
+      "grad_norm": 0.19567830860614777,
+      "learning_rate": 8.166601790579993e-05,
+      "loss": 0.9654,
+      "step": 1526
+    },
+    {
+      "epoch": 0.5934129991256194,
+      "grad_norm": 0.2275007963180542,
+      "learning_rate": 8.158816660179058e-05,
+      "loss": 1.0867,
+      "step": 1527
+    },
+    {
+      "epoch": 0.5938016127465268,
+      "grad_norm": 0.19826427102088928,
+      "learning_rate": 8.151031529778123e-05,
+      "loss": 1.0301,
+      "step": 1528
+    },
+    {
+      "epoch": 0.5941902263674341,
+      "grad_norm": 0.2051352709531784,
+      "learning_rate": 8.14324639937719e-05,
+      "loss": 1.023,
+      "step": 1529
+    },
+    {
+      "epoch": 0.5945788399883416,
+      "grad_norm": 0.19492043554782867,
+      "learning_rate": 8.135461268976255e-05,
+      "loss": 0.9608,
+      "step": 1530
+    },
+    {
+      "epoch": 0.594967453609249,
+      "grad_norm": 0.21521608531475067,
+      "learning_rate": 8.127676138575322e-05,
+      "loss": 1.0612,
+      "step": 1531
+    },
+    {
+      "epoch": 0.5953560672301564,
+      "grad_norm": 0.22739367187023163,
+      "learning_rate": 8.119891008174387e-05,
+      "loss": 1.0603,
+      "step": 1532
+    },
+    {
+      "epoch": 0.5957446808510638,
+      "grad_norm": 0.20334595441818237,
+      "learning_rate": 8.112105877773452e-05,
+      "loss": 1.0191,
+      "step": 1533
+    },
+    {
+      "epoch": 0.5961332944719713,
+      "grad_norm": 0.20985397696495056,
+      "learning_rate": 8.104320747372519e-05,
+      "loss": 1.0721,
+      "step": 1534
+    },
+    {
+      "epoch": 0.5965219080928786,
+      "grad_norm": 0.20472954213619232,
+      "learning_rate": 8.096535616971584e-05,
+      "loss": 1.0556,
+      "step": 1535
+    },
+    {
+      "epoch": 0.5969105217137861,
+      "grad_norm": 0.2112964689731598,
+      "learning_rate": 8.08875048657065e-05,
+      "loss": 1.0016,
+      "step": 1536
+    },
+    {
+      "epoch": 0.5972991353346935,
+      "grad_norm": 0.21330617368221283,
+      "learning_rate": 8.080965356169716e-05,
+      "loss": 1.0783,
+      "step": 1537
+    },
+    {
+      "epoch": 0.5976877489556008,
+      "grad_norm": 0.20907814800739288,
+      "learning_rate": 8.073180225768782e-05,
+      "loss": 1.071,
+      "step": 1538
+    },
+    {
+      "epoch": 0.5980763625765083,
+      "grad_norm": 0.2038964033126831,
+      "learning_rate": 8.065395095367848e-05,
+      "loss": 1.0039,
+      "step": 1539
+    },
+    {
+      "epoch": 0.5984649761974157,
+      "grad_norm": 0.2175542712211609,
+      "learning_rate": 8.057609964966914e-05,
+      "loss": 1.0015,
+      "step": 1540
+    },
+    {
+      "epoch": 0.5988535898183232,
+      "grad_norm": 0.21474529802799225,
+      "learning_rate": 8.049824834565979e-05,
+      "loss": 1.0273,
+      "step": 1541
+    },
+    {
+      "epoch": 0.5992422034392305,
+      "grad_norm": 0.21428482234477997,
+      "learning_rate": 8.042039704165046e-05,
+      "loss": 1.0767,
+      "step": 1542
+    },
+    {
+      "epoch": 0.599630817060138,
+      "grad_norm": 0.20287524163722992,
+      "learning_rate": 8.034254573764111e-05,
+      "loss": 1.064,
+      "step": 1543
+    },
+    {
+      "epoch": 0.6000194306810454,
+      "grad_norm": 0.20689848065376282,
+      "learning_rate": 8.026469443363178e-05,
+      "loss": 1.0084,
+      "step": 1544
+    },
+    {
+      "epoch": 0.6004080443019528,
+      "grad_norm": 0.22451332211494446,
+      "learning_rate": 8.018684312962243e-05,
+      "loss": 1.1039,
+      "step": 1545
+    },
+    {
+      "epoch": 0.6007966579228602,
+      "grad_norm": 0.21381956338882446,
+      "learning_rate": 8.010899182561308e-05,
+      "loss": 1.0551,
+      "step": 1546
+    },
+    {
+      "epoch": 0.6011852715437677,
+      "grad_norm": 0.20108483731746674,
+      "learning_rate": 8.003114052160375e-05,
+      "loss": 1.0326,
+      "step": 1547
+    },
+    {
+      "epoch": 0.601573885164675,
+      "grad_norm": 0.19739678502082825,
+      "learning_rate": 7.99532892175944e-05,
+      "loss": 1.0319,
+      "step": 1548
+    },
+    {
+      "epoch": 0.6019624987855824,
+      "grad_norm": 0.21635359525680542,
+      "learning_rate": 7.987543791358506e-05,
+      "loss": 1.0465,
+      "step": 1549
+    },
+    {
+      "epoch": 0.6023511124064899,
+      "grad_norm": 0.1949319988489151,
+      "learning_rate": 7.979758660957572e-05,
+      "loss": 1.0026,
+      "step": 1550
+    },
+    {
+      "epoch": 0.6027397260273972,
+      "grad_norm": 0.1989699900150299,
+      "learning_rate": 7.971973530556637e-05,
+      "loss": 1.021,
+      "step": 1551
+    },
+    {
+      "epoch": 0.6031283396483047,
+      "grad_norm": 0.24031391739845276,
+      "learning_rate": 7.964188400155703e-05,
+      "loss": 1.0293,
+      "step": 1552
+    },
+    {
+      "epoch": 0.6035169532692121,
+      "grad_norm": 0.21247251331806183,
+      "learning_rate": 7.956403269754769e-05,
+      "loss": 1.023,
+      "step": 1553
+    },
+    {
+      "epoch": 0.6039055668901195,
+      "grad_norm": 0.21565628051757812,
+      "learning_rate": 7.948618139353835e-05,
+      "loss": 1.1027,
+      "step": 1554
+    },
+    {
+      "epoch": 0.6042941805110269,
+      "grad_norm": 0.21207931637763977,
+      "learning_rate": 7.9408330089529e-05,
+      "loss": 1.0634,
+      "step": 1555
+    },
+    {
+      "epoch": 0.6046827941319344,
+      "grad_norm": 0.21354155242443085,
+      "learning_rate": 7.933047878551965e-05,
+      "loss": 1.0433,
+      "step": 1556
+    },
+    {
+      "epoch": 0.6050714077528417,
+      "grad_norm": 0.21708370745182037,
+      "learning_rate": 7.925262748151032e-05,
+      "loss": 1.0499,
+      "step": 1557
+    },
+    {
+      "epoch": 0.6054600213737491,
+      "grad_norm": 0.2051447182893753,
+      "learning_rate": 7.917477617750097e-05,
+      "loss": 1.0042,
+      "step": 1558
+    },
+    {
+      "epoch": 0.6058486349946566,
+      "grad_norm": 0.18768000602722168,
+      "learning_rate": 7.909692487349164e-05,
+      "loss": 1.009,
+      "step": 1559
+    },
+    {
+      "epoch": 0.6062372486155639,
+      "grad_norm": 0.2142931967973709,
+      "learning_rate": 7.901907356948229e-05,
+      "loss": 1.0458,
+      "step": 1560
+    },
+    {
+      "epoch": 0.6066258622364714,
+      "grad_norm": 0.21006444096565247,
+      "learning_rate": 7.894122226547294e-05,
+      "loss": 1.0286,
+      "step": 1561
+    },
+    {
+      "epoch": 0.6070144758573788,
+      "grad_norm": 0.2187039703130722,
+      "learning_rate": 7.886337096146361e-05,
+      "loss": 1.0103,
+      "step": 1562
+    },
+    {
+      "epoch": 0.6074030894782863,
+      "grad_norm": 0.19863669574260712,
+      "learning_rate": 7.878551965745426e-05,
+      "loss": 0.9925,
+      "step": 1563
+    },
+    {
+      "epoch": 0.6077917030991936,
+      "grad_norm": 0.21771976351737976,
+      "learning_rate": 7.870766835344493e-05,
+      "loss": 0.9853,
+      "step": 1564
+    },
+    {
+      "epoch": 0.6081803167201011,
+      "grad_norm": 0.21714983880519867,
+      "learning_rate": 7.862981704943558e-05,
+      "loss": 1.0123,
+      "step": 1565
+    },
+    {
+      "epoch": 0.6085689303410085,
+      "grad_norm": 0.2251398265361786,
+      "learning_rate": 7.855196574542624e-05,
+      "loss": 1.0265,
+      "step": 1566
+    },
+    {
+      "epoch": 0.6089575439619158,
+      "grad_norm": 0.22089716792106628,
+      "learning_rate": 7.84741144414169e-05,
+      "loss": 1.0689,
+      "step": 1567
+    },
+    {
+      "epoch": 0.6093461575828233,
+      "grad_norm": 0.2453841269016266,
+      "learning_rate": 7.839626313740756e-05,
+      "loss": 1.0185,
+      "step": 1568
+    },
+    {
+      "epoch": 0.6097347712037307,
+      "grad_norm": 0.21866528689861298,
+      "learning_rate": 7.831841183339821e-05,
+      "loss": 1.0361,
+      "step": 1569
+    },
+    {
+      "epoch": 0.6101233848246381,
+      "grad_norm": 0.22421486675739288,
+      "learning_rate": 7.824056052938888e-05,
+      "loss": 1.024,
+      "step": 1570
+    },
+    {
+      "epoch": 0.6105119984455455,
+      "grad_norm": 0.21107137203216553,
+      "learning_rate": 7.816270922537953e-05,
+      "loss": 1.0335,
+      "step": 1571
+    },
+    {
+      "epoch": 0.610900612066453,
+      "grad_norm": 0.20731772482395172,
+      "learning_rate": 7.80848579213702e-05,
+      "loss": 1.0563,
+      "step": 1572
+    },
+    {
+      "epoch": 0.6112892256873603,
+      "grad_norm": 0.19535884261131287,
+      "learning_rate": 7.800700661736085e-05,
+      "loss": 0.9698,
+      "step": 1573
+    },
+    {
+      "epoch": 0.6116778393082678,
+      "grad_norm": 0.20449021458625793,
+      "learning_rate": 7.79291553133515e-05,
+      "loss": 1.0125,
+      "step": 1574
+    },
+    {
+      "epoch": 0.6120664529291752,
+      "grad_norm": 0.19576509296894073,
+      "learning_rate": 7.785130400934217e-05,
+      "loss": 0.9326,
+      "step": 1575
+    },
+    {
+      "epoch": 0.6124550665500825,
+      "grad_norm": 0.18914124369621277,
+      "learning_rate": 7.777345270533282e-05,
+      "loss": 0.9939,
+      "step": 1576
+    },
+    {
+      "epoch": 0.61284368017099,
+      "grad_norm": 0.21239091455936432,
+      "learning_rate": 7.769560140132348e-05,
+      "loss": 1.0271,
+      "step": 1577
+    },
+    {
+      "epoch": 0.6132322937918974,
+      "grad_norm": 0.22204811871051788,
+      "learning_rate": 7.761775009731414e-05,
+      "loss": 1.0524,
+      "step": 1578
+    },
+    {
+      "epoch": 0.6136209074128048,
+      "grad_norm": 0.20047850906848907,
+      "learning_rate": 7.753989879330479e-05,
+      "loss": 1.0076,
+      "step": 1579
+    },
+    {
+      "epoch": 0.6140095210337122,
+      "grad_norm": 0.22619746625423431,
+      "learning_rate": 7.746204748929545e-05,
+      "loss": 1.0611,
+      "step": 1580
+    },
+    {
+      "epoch": 0.6143981346546197,
+      "grad_norm": 0.2500879466533661,
+      "learning_rate": 7.73841961852861e-05,
+      "loss": 1.0364,
+      "step": 1581
+    },
+    {
+      "epoch": 0.614786748275527,
+      "grad_norm": 0.23486928641796112,
+      "learning_rate": 7.730634488127676e-05,
+      "loss": 1.0472,
+      "step": 1582
+    },
+    {
+      "epoch": 0.6151753618964345,
+      "grad_norm": 0.19849038124084473,
+      "learning_rate": 7.722849357726742e-05,
+      "loss": 0.9847,
+      "step": 1583
+    },
+    {
+      "epoch": 0.6155639755173419,
+      "grad_norm": 0.21516263484954834,
+      "learning_rate": 7.715064227325807e-05,
+      "loss": 1.0351,
+      "step": 1584
+    },
+    {
+      "epoch": 0.6159525891382492,
+      "grad_norm": 0.20137760043144226,
+      "learning_rate": 7.707279096924874e-05,
+      "loss": 0.9879,
+      "step": 1585
+    },
+    {
+      "epoch": 0.6163412027591567,
+      "grad_norm": 0.2146228402853012,
+      "learning_rate": 7.699493966523939e-05,
+      "loss": 1.0792,
+      "step": 1586
+    },
+    {
+      "epoch": 0.6167298163800641,
+      "grad_norm": 0.19929760694503784,
+      "learning_rate": 7.691708836123004e-05,
+      "loss": 1.0313,
+      "step": 1587
+    },
+    {
+      "epoch": 0.6171184300009716,
+      "grad_norm": 0.201123908162117,
+      "learning_rate": 7.683923705722071e-05,
+      "loss": 1.0279,
+      "step": 1588
+    },
+    {
+      "epoch": 0.6175070436218789,
+      "grad_norm": 0.2154105007648468,
+      "learning_rate": 7.676138575321136e-05,
+      "loss": 1.075,
+      "step": 1589
+    },
+    {
+      "epoch": 0.6178956572427864,
+      "grad_norm": 0.2028442770242691,
+      "learning_rate": 7.668353444920203e-05,
+      "loss": 0.9771,
+      "step": 1590
+    },
+    {
+      "epoch": 0.6182842708636938,
+      "grad_norm": 0.18003074824810028,
+      "learning_rate": 7.660568314519268e-05,
+      "loss": 0.9677,
+      "step": 1591
+    },
+    {
+      "epoch": 0.6186728844846012,
+      "grad_norm": 0.23250891268253326,
+      "learning_rate": 7.652783184118335e-05,
+      "loss": 1.015,
+      "step": 1592
+    },
+    {
+      "epoch": 0.6190614981055086,
+      "grad_norm": 0.2047244906425476,
+      "learning_rate": 7.6449980537174e-05,
+      "loss": 1.0044,
+      "step": 1593
+    },
+    {
+      "epoch": 0.6194501117264161,
+      "grad_norm": 0.20011259615421295,
+      "learning_rate": 7.637212923316466e-05,
+      "loss": 1.0089,
+      "step": 1594
+    },
+    {
+      "epoch": 0.6198387253473234,
+      "grad_norm": 0.2212608903646469,
+      "learning_rate": 7.629427792915533e-05,
+      "loss": 1.0457,
+      "step": 1595
+    },
+    {
+      "epoch": 0.6202273389682308,
+      "grad_norm": 0.22725115716457367,
+      "learning_rate": 7.621642662514598e-05,
+      "loss": 1.1198,
+      "step": 1596
+    },
+    {
+      "epoch": 0.6206159525891383,
+      "grad_norm": 0.2065306007862091,
+      "learning_rate": 7.613857532113663e-05,
+      "loss": 1.0572,
+      "step": 1597
+    },
+    {
+      "epoch": 0.6210045662100456,
+      "grad_norm": 0.2132783830165863,
+      "learning_rate": 7.60607240171273e-05,
+      "loss": 1.0332,
+      "step": 1598
+    },
+    {
+      "epoch": 0.6213931798309531,
+      "grad_norm": 0.20527103543281555,
+      "learning_rate": 7.598287271311795e-05,
+      "loss": 1.0156,
+      "step": 1599
+    },
+    {
+      "epoch": 0.6217817934518605,
+      "grad_norm": 0.23608024418354034,
+      "learning_rate": 7.59050214091086e-05,
+      "loss": 1.0379,
+      "step": 1600
+    },
+    {
+      "epoch": 0.6221704070727679,
+      "grad_norm": 0.22227297723293304,
+      "learning_rate": 7.582717010509927e-05,
+      "loss": 1.0507,
+      "step": 1601
+    },
+    {
+      "epoch": 0.6225590206936753,
+      "grad_norm": 0.22359615564346313,
+      "learning_rate": 7.574931880108992e-05,
+      "loss": 1.0705,
+      "step": 1602
+    },
+    {
+      "epoch": 0.6229476343145828,
+      "grad_norm": 0.20478755235671997,
+      "learning_rate": 7.567146749708059e-05,
+      "loss": 1.0309,
+      "step": 1603
+    },
+    {
+      "epoch": 0.6233362479354901,
+      "grad_norm": 0.2223423272371292,
+      "learning_rate": 7.559361619307124e-05,
+      "loss": 1.0386,
+      "step": 1604
+    },
+    {
+      "epoch": 0.6237248615563975,
+      "grad_norm": 0.21232105791568756,
+      "learning_rate": 7.551576488906189e-05,
+      "loss": 1.0353,
+      "step": 1605
+    },
+    {
+      "epoch": 0.624113475177305,
+      "grad_norm": 0.22431129217147827,
+      "learning_rate": 7.543791358505256e-05,
+      "loss": 1.1017,
+      "step": 1606
+    },
+    {
+      "epoch": 0.6245020887982123,
+      "grad_norm": 0.20826031267642975,
+      "learning_rate": 7.536006228104321e-05,
+      "loss": 1.0172,
+      "step": 1607
+    },
+    {
+      "epoch": 0.6248907024191198,
+      "grad_norm": 0.2803161144256592,
+      "learning_rate": 7.528221097703387e-05,
+      "loss": 1.0554,
+      "step": 1608
+    },
+    {
+      "epoch": 0.6252793160400272,
+      "grad_norm": 0.2185174971818924,
+      "learning_rate": 7.520435967302453e-05,
+      "loss": 0.9842,
+      "step": 1609
+    },
+    {
+      "epoch": 0.6256679296609347,
+      "grad_norm": 0.2091478854417801,
+      "learning_rate": 7.512650836901518e-05,
+      "loss": 0.9783,
+      "step": 1610
+    },
+    {
+      "epoch": 0.626056543281842,
+      "grad_norm": 0.22342967987060547,
+      "learning_rate": 7.504865706500584e-05,
+      "loss": 0.9891,
+      "step": 1611
+    },
+    {
+      "epoch": 0.6264451569027495,
+      "grad_norm": 0.195283442735672,
+      "learning_rate": 7.49708057609965e-05,
+      "loss": 0.9654,
+      "step": 1612
+    },
+    {
+      "epoch": 0.6268337705236569,
+      "grad_norm": 0.21048255264759064,
+      "learning_rate": 7.489295445698716e-05,
+      "loss": 1.0112,
+      "step": 1613
+    },
+    {
+      "epoch": 0.6272223841445642,
+      "grad_norm": 0.21405541896820068,
+      "learning_rate": 7.481510315297781e-05,
+      "loss": 1.0498,
+      "step": 1614
+    },
+    {
+      "epoch": 0.6276109977654717,
+      "grad_norm": 0.2144453227519989,
+      "learning_rate": 7.473725184896846e-05,
+      "loss": 1.0487,
+      "step": 1615
+    },
+    {
+      "epoch": 0.627999611386379,
+      "grad_norm": 0.21963326632976532,
+      "learning_rate": 7.465940054495913e-05,
+      "loss": 1.0634,
+      "step": 1616
+    },
+    {
+      "epoch": 0.6283882250072865,
+      "grad_norm": 0.20100601017475128,
+      "learning_rate": 7.458154924094978e-05,
+      "loss": 1.0407,
+      "step": 1617
+    },
+    {
+      "epoch": 0.6287768386281939,
+      "grad_norm": 0.19469478726387024,
+      "learning_rate": 7.450369793694045e-05,
+      "loss": 0.9923,
+      "step": 1618
+    },
+    {
+      "epoch": 0.6291654522491014,
+      "grad_norm": 0.2114047408103943,
+      "learning_rate": 7.442584663293111e-05,
+      "loss": 1.0263,
+      "step": 1619
+    },
+    {
+      "epoch": 0.6295540658700087,
+      "grad_norm": 0.21080389618873596,
+      "learning_rate": 7.434799532892177e-05,
+      "loss": 1.0012,
+      "step": 1620
+    },
+    {
+      "epoch": 0.6299426794909162,
+      "grad_norm": 0.20366831123828888,
+      "learning_rate": 7.427014402491243e-05,
+      "loss": 1.0254,
+      "step": 1621
+    },
+    {
+      "epoch": 0.6303312931118236,
+      "grad_norm": 0.209821879863739,
+      "learning_rate": 7.419229272090308e-05,
+      "loss": 0.9416,
+      "step": 1622
+    },
+    {
+      "epoch": 0.6307199067327309,
+      "grad_norm": 0.2228868007659912,
+      "learning_rate": 7.411444141689374e-05,
+      "loss": 1.0128,
+      "step": 1623
+    },
+    {
+      "epoch": 0.6311085203536384,
+      "grad_norm": 0.19673995673656464,
+      "learning_rate": 7.40365901128844e-05,
+      "loss": 0.9709,
+      "step": 1624
+    },
+    {
+      "epoch": 0.6314971339745458,
+      "grad_norm": 0.21590839326381683,
+      "learning_rate": 7.395873880887505e-05,
+      "loss": 1.0251,
+      "step": 1625
+    },
+    {
+      "epoch": 0.6318857475954532,
+      "grad_norm": 0.20200593769550323,
+      "learning_rate": 7.388088750486572e-05,
+      "loss": 1.0307,
+      "step": 1626
+    },
+    {
+      "epoch": 0.6322743612163606,
+      "grad_norm": 0.19623909890651703,
+      "learning_rate": 7.380303620085637e-05,
+      "loss": 1.0375,
+      "step": 1627
+    },
+    {
+      "epoch": 0.6326629748372681,
+      "grad_norm": 0.19878128170967102,
+      "learning_rate": 7.372518489684702e-05,
+      "loss": 0.9844,
+      "step": 1628
+    },
+    {
+      "epoch": 0.6330515884581754,
+      "grad_norm": 0.21292422711849213,
+      "learning_rate": 7.364733359283769e-05,
+      "loss": 1.0228,
+      "step": 1629
+    },
+    {
+      "epoch": 0.6334402020790829,
+      "grad_norm": 0.1915559619665146,
+      "learning_rate": 7.356948228882834e-05,
+      "loss": 0.9818,
+      "step": 1630
+    },
+    {
+      "epoch": 0.6338288156999903,
+      "grad_norm": 0.2264430969953537,
+      "learning_rate": 7.3491630984819e-05,
+      "loss": 1.146,
+      "step": 1631
+    },
+    {
+      "epoch": 0.6342174293208978,
+      "grad_norm": 0.19332270324230194,
+      "learning_rate": 7.341377968080966e-05,
+      "loss": 1.0007,
+      "step": 1632
+    },
+    {
+      "epoch": 0.6346060429418051,
+      "grad_norm": 0.217147096991539,
+      "learning_rate": 7.333592837680031e-05,
+      "loss": 1.0498,
+      "step": 1633
+    },
+    {
+      "epoch": 0.6349946565627125,
+      "grad_norm": 0.22200679779052734,
+      "learning_rate": 7.325807707279098e-05,
+      "loss": 1.0358,
+      "step": 1634
+    },
+    {
+      "epoch": 0.63538327018362,
+      "grad_norm": 0.19485117495059967,
+      "learning_rate": 7.318022576878163e-05,
+      "loss": 0.9717,
+      "step": 1635
+    },
+    {
+      "epoch": 0.6357718838045273,
+      "grad_norm": 0.20595680177211761,
+      "learning_rate": 7.310237446477228e-05,
+      "loss": 1.0195,
+      "step": 1636
+    },
+    {
+      "epoch": 0.6361604974254348,
+      "grad_norm": 0.21184709668159485,
+      "learning_rate": 7.302452316076294e-05,
+      "loss": 1.0354,
+      "step": 1637
+    },
+    {
+      "epoch": 0.6365491110463422,
+      "grad_norm": 0.22607794404029846,
+      "learning_rate": 7.29466718567536e-05,
+      "loss": 1.0217,
+      "step": 1638
+    },
+    {
+      "epoch": 0.6369377246672496,
+      "grad_norm": 0.20236065983772278,
+      "learning_rate": 7.286882055274426e-05,
+      "loss": 1.0441,
+      "step": 1639
+    },
+    {
+      "epoch": 0.637326338288157,
+      "grad_norm": 0.19979622960090637,
+      "learning_rate": 7.279096924873491e-05,
+      "loss": 1.0105,
+      "step": 1640
+    },
+    {
+      "epoch": 0.6377149519090645,
+      "grad_norm": 0.2655459940433502,
+      "learning_rate": 7.271311794472557e-05,
+      "loss": 1.0726,
+      "step": 1641
+    },
+    {
+      "epoch": 0.6381035655299718,
+      "grad_norm": 0.25107496976852417,
+      "learning_rate": 7.263526664071623e-05,
+      "loss": 1.037,
+      "step": 1642
+    },
+    {
+      "epoch": 0.6384921791508792,
+      "grad_norm": 0.19250229001045227,
+      "learning_rate": 7.255741533670688e-05,
+      "loss": 0.9741,
+      "step": 1643
+    },
+    {
+      "epoch": 0.6388807927717867,
+      "grad_norm": 0.19324181973934174,
+      "learning_rate": 7.247956403269755e-05,
+      "loss": 1.0333,
+      "step": 1644
+    },
+    {
+      "epoch": 0.639269406392694,
+      "grad_norm": 0.22267483174800873,
+      "learning_rate": 7.240171272868822e-05,
+      "loss": 1.0313,
+      "step": 1645
+    },
+    {
+      "epoch": 0.6396580200136015,
+      "grad_norm": 0.2775348722934723,
+      "learning_rate": 7.232386142467887e-05,
+      "loss": 1.0686,
+      "step": 1646
+    },
+    {
+      "epoch": 0.6400466336345089,
+      "grad_norm": 0.1886623501777649,
+      "learning_rate": 7.224601012066953e-05,
+      "loss": 1.0029,
+      "step": 1647
+    },
+    {
+      "epoch": 0.6404352472554163,
+      "grad_norm": 0.20303374528884888,
+      "learning_rate": 7.216815881666019e-05,
+      "loss": 1.0346,
+      "step": 1648
+    },
+    {
+      "epoch": 0.6408238608763237,
+      "grad_norm": 0.20815756916999817,
+      "learning_rate": 7.209030751265085e-05,
+      "loss": 1.0258,
+      "step": 1649
+    },
+    {
+      "epoch": 0.6412124744972312,
+      "grad_norm": 0.22055703401565552,
+      "learning_rate": 7.20124562086415e-05,
+      "loss": 1.0215,
+      "step": 1650
+    },
+    {
+      "epoch": 0.6416010881181385,
+      "grad_norm": 0.20248562097549438,
+      "learning_rate": 7.193460490463215e-05,
+      "loss": 0.9979,
+      "step": 1651
+    },
+    {
+      "epoch": 0.6419897017390459,
+      "grad_norm": 0.2093247026205063,
+      "learning_rate": 7.185675360062282e-05,
+      "loss": 1.0605,
+      "step": 1652
+    },
+    {
+      "epoch": 0.6423783153599534,
+      "grad_norm": 0.22276204824447632,
+      "learning_rate": 7.177890229661347e-05,
+      "loss": 1.0788,
+      "step": 1653
+    },
+    {
+      "epoch": 0.6427669289808607,
+      "grad_norm": 0.19959624111652374,
+      "learning_rate": 7.170105099260412e-05,
+      "loss": 0.9954,
+      "step": 1654
+    },
+    {
+      "epoch": 0.6431555426017682,
+      "grad_norm": 0.20173248648643494,
+      "learning_rate": 7.162319968859479e-05,
+      "loss": 1.003,
+      "step": 1655
+    },
+    {
+      "epoch": 0.6435441562226756,
+      "grad_norm": 0.207533061504364,
+      "learning_rate": 7.154534838458544e-05,
+      "loss": 1.043,
+      "step": 1656
+    },
+    {
+      "epoch": 0.643932769843583,
+      "grad_norm": 0.21928350627422333,
+      "learning_rate": 7.146749708057611e-05,
+      "loss": 1.0472,
+      "step": 1657
+    },
+    {
+      "epoch": 0.6443213834644904,
+      "grad_norm": 0.2567078173160553,
+      "learning_rate": 7.138964577656676e-05,
+      "loss": 1.0946,
+      "step": 1658
+    },
+    {
+      "epoch": 0.6447099970853979,
+      "grad_norm": 0.19454176723957062,
+      "learning_rate": 7.131179447255741e-05,
+      "loss": 0.9437,
+      "step": 1659
+    },
+    {
+      "epoch": 0.6450986107063053,
+      "grad_norm": 0.19198423624038696,
+      "learning_rate": 7.123394316854808e-05,
+      "loss": 0.9976,
+      "step": 1660
+    },
+    {
+      "epoch": 0.6454872243272126,
+      "grad_norm": 0.1929445117712021,
+      "learning_rate": 7.115609186453873e-05,
+      "loss": 1.0279,
+      "step": 1661
+    },
+    {
+      "epoch": 0.6458758379481201,
+      "grad_norm": 0.2041027694940567,
+      "learning_rate": 7.10782405605294e-05,
+      "loss": 1.0458,
+      "step": 1662
+    },
+    {
+      "epoch": 0.6462644515690275,
+      "grad_norm": 0.23750995099544525,
+      "learning_rate": 7.100038925652005e-05,
+      "loss": 1.0916,
+      "step": 1663
+    },
+    {
+      "epoch": 0.6466530651899349,
+      "grad_norm": 0.1971994787454605,
+      "learning_rate": 7.09225379525107e-05,
+      "loss": 0.951,
+      "step": 1664
+    },
+    {
+      "epoch": 0.6470416788108423,
+      "grad_norm": 0.20459246635437012,
+      "learning_rate": 7.084468664850136e-05,
+      "loss": 0.9653,
+      "step": 1665
+    },
+    {
+      "epoch": 0.6474302924317498,
+      "grad_norm": 0.2137187272310257,
+      "learning_rate": 7.076683534449202e-05,
+      "loss": 1.0291,
+      "step": 1666
+    },
+    {
+      "epoch": 0.6478189060526571,
+      "grad_norm": 0.21235258877277374,
+      "learning_rate": 7.068898404048268e-05,
+      "loss": 1.0104,
+      "step": 1667
+    },
+    {
+      "epoch": 0.6482075196735646,
+      "grad_norm": 0.23120944201946259,
+      "learning_rate": 7.061113273647333e-05,
+      "loss": 1.0693,
+      "step": 1668
+    },
+    {
+      "epoch": 0.648596133294472,
+      "grad_norm": 1.38257896900177,
+      "learning_rate": 7.053328143246399e-05,
+      "loss": 1.0339,
+      "step": 1669
+    },
+    {
+      "epoch": 0.6489847469153793,
+      "grad_norm": 0.20898790657520294,
+      "learning_rate": 7.045543012845465e-05,
+      "loss": 1.004,
+      "step": 1670
+    },
+    {
+      "epoch": 0.6493733605362868,
+      "grad_norm": 0.20251236855983734,
+      "learning_rate": 7.037757882444532e-05,
+      "loss": 0.9992,
+      "step": 1671
+    },
+    {
+      "epoch": 0.6497619741571942,
+      "grad_norm": 0.2358030527830124,
+      "learning_rate": 7.029972752043597e-05,
+      "loss": 0.9854,
+      "step": 1672
+    },
+    {
+      "epoch": 0.6501505877781016,
+      "grad_norm": 0.18945704400539398,
+      "learning_rate": 7.022187621642664e-05,
+      "loss": 0.9677,
+      "step": 1673
+    },
+    {
+      "epoch": 0.650539201399009,
+      "grad_norm": 0.1965213567018509,
+      "learning_rate": 7.014402491241729e-05,
+      "loss": 1.0118,
+      "step": 1674
+    },
+    {
+      "epoch": 0.6509278150199165,
+      "grad_norm": 0.2340148687362671,
+      "learning_rate": 7.006617360840795e-05,
+      "loss": 1.0312,
+      "step": 1675
+    },
+    {
+      "epoch": 0.6513164286408238,
+      "grad_norm": 0.1992296278476715,
+      "learning_rate": 6.99883223043986e-05,
+      "loss": 1.0155,
+      "step": 1676
+    },
+    {
+      "epoch": 0.6517050422617313,
+      "grad_norm": 0.20410223305225372,
+      "learning_rate": 6.991047100038926e-05,
+      "loss": 1.0646,
+      "step": 1677
+    },
+    {
+      "epoch": 0.6520936558826387,
+      "grad_norm": 0.19254536926746368,
+      "learning_rate": 6.983261969637992e-05,
+      "loss": 0.9538,
+      "step": 1678
+    },
+    {
+      "epoch": 0.6524822695035462,
+      "grad_norm": 0.19980847835540771,
+      "learning_rate": 6.975476839237057e-05,
+      "loss": 0.9912,
+      "step": 1679
+    },
+    {
+      "epoch": 0.6528708831244535,
+      "grad_norm": 0.19503261148929596,
+      "learning_rate": 6.967691708836124e-05,
+      "loss": 0.9844,
+      "step": 1680
+    },
+    {
+      "epoch": 0.6532594967453609,
+      "grad_norm": 0.22375883162021637,
+      "learning_rate": 6.959906578435189e-05,
+      "loss": 1.1266,
+      "step": 1681
+    },
+    {
+      "epoch": 0.6536481103662684,
+      "grad_norm": 0.21456514298915863,
+      "learning_rate": 6.952121448034254e-05,
+      "loss": 1.0902,
+      "step": 1682
+    },
+    {
+      "epoch": 0.6540367239871757,
+      "grad_norm": 0.20348122715950012,
+      "learning_rate": 6.944336317633321e-05,
+      "loss": 1.0228,
+      "step": 1683
+    },
+    {
+      "epoch": 0.6544253376080832,
+      "grad_norm": 0.21647393703460693,
+      "learning_rate": 6.936551187232386e-05,
+      "loss": 1.0653,
+      "step": 1684
+    },
+    {
+      "epoch": 0.6548139512289906,
+      "grad_norm": 0.20160923898220062,
+      "learning_rate": 6.928766056831453e-05,
+      "loss": 1.0249,
+      "step": 1685
+    },
+    {
+      "epoch": 0.655202564849898,
+      "grad_norm": 0.20070499181747437,
+      "learning_rate": 6.920980926430518e-05,
+      "loss": 1.0585,
+      "step": 1686
+    },
+    {
+      "epoch": 0.6555911784708054,
+      "grad_norm": 0.2656902074813843,
+      "learning_rate": 6.913195796029583e-05,
+      "loss": 1.0042,
+      "step": 1687
+    },
+    {
+      "epoch": 0.6559797920917129,
+      "grad_norm": 0.1934545785188675,
+      "learning_rate": 6.90541066562865e-05,
+      "loss": 0.9831,
+      "step": 1688
+    },
+    {
+      "epoch": 0.6563684057126202,
+      "grad_norm": 0.21719245612621307,
+      "learning_rate": 6.897625535227715e-05,
+      "loss": 0.9934,
+      "step": 1689
+    },
+    {
+      "epoch": 0.6567570193335276,
+      "grad_norm": 0.20906969904899597,
+      "learning_rate": 6.889840404826782e-05,
+      "loss": 1.023,
+      "step": 1690
+    },
+    {
+      "epoch": 0.6571456329544351,
+      "grad_norm": 0.225227490067482,
+      "learning_rate": 6.882055274425847e-05,
+      "loss": 1.0265,
+      "step": 1691
+    },
+    {
+      "epoch": 0.6575342465753424,
+      "grad_norm": 0.22766710817813873,
+      "learning_rate": 6.874270144024912e-05,
+      "loss": 1.0306,
+      "step": 1692
+    },
+    {
+      "epoch": 0.6579228601962499,
+      "grad_norm": 0.20964065194129944,
+      "learning_rate": 6.866485013623978e-05,
+      "loss": 0.9431,
+      "step": 1693
+    },
+    {
+      "epoch": 0.6583114738171573,
+      "grad_norm": 0.19821231067180634,
+      "learning_rate": 6.858699883223044e-05,
+      "loss": 0.9959,
+      "step": 1694
+    },
+    {
+      "epoch": 0.6587000874380647,
+      "grad_norm": 0.2071307748556137,
+      "learning_rate": 6.85091475282211e-05,
+      "loss": 1.0332,
+      "step": 1695
+    },
+    {
+      "epoch": 0.6590887010589721,
+      "grad_norm": 0.27962490916252136,
+      "learning_rate": 6.843129622421175e-05,
+      "loss": 0.9755,
+      "step": 1696
+    },
+    {
+      "epoch": 0.6594773146798796,
+      "grad_norm": 0.21582698822021484,
+      "learning_rate": 6.835344492020242e-05,
+      "loss": 1.0305,
+      "step": 1697
+    },
+    {
+      "epoch": 0.6598659283007869,
+      "grad_norm": 0.1872921586036682,
+      "learning_rate": 6.827559361619307e-05,
+      "loss": 0.9693,
+      "step": 1698
+    },
+    {
+      "epoch": 0.6602545419216943,
+      "grad_norm": 0.27033379673957825,
+      "learning_rate": 6.819774231218374e-05,
+      "loss": 1.0756,
+      "step": 1699
+    },
+    {
+      "epoch": 0.6606431555426018,
+      "grad_norm": 0.2010008543729782,
+      "learning_rate": 6.811989100817439e-05,
+      "loss": 1.0077,
+      "step": 1700
+    },
+    {
+      "epoch": 0.6610317691635091,
+      "grad_norm": 0.20637495815753937,
+      "learning_rate": 6.804203970416506e-05,
+      "loss": 1.0208,
+      "step": 1701
+    },
+    {
+      "epoch": 0.6614203827844166,
+      "grad_norm": 0.21331818401813507,
+      "learning_rate": 6.796418840015571e-05,
+      "loss": 1.0242,
+      "step": 1702
+    },
+    {
+      "epoch": 0.661808996405324,
+      "grad_norm": 0.2092941552400589,
+      "learning_rate": 6.788633709614637e-05,
+      "loss": 1.0949,
+      "step": 1703
+    },
+    {
+      "epoch": 0.6621976100262315,
+      "grad_norm": 0.22332265973091125,
+      "learning_rate": 6.780848579213703e-05,
+      "loss": 1.1068,
+      "step": 1704
+    },
+    {
+      "epoch": 0.6625862236471388,
+      "grad_norm": 0.20077067613601685,
+      "learning_rate": 6.773063448812768e-05,
+      "loss": 0.9801,
+      "step": 1705
+    },
+    {
+      "epoch": 0.6629748372680463,
+      "grad_norm": 0.2057008296251297,
+      "learning_rate": 6.765278318411834e-05,
+      "loss": 1.0058,
+      "step": 1706
+    },
+    {
+      "epoch": 0.6633634508889537,
+      "grad_norm": 0.20337353646755219,
+      "learning_rate": 6.7574931880109e-05,
+      "loss": 1.0141,
+      "step": 1707
+    },
+    {
+      "epoch": 0.663752064509861,
+      "grad_norm": 0.22756130993366241,
+      "learning_rate": 6.749708057609966e-05,
+      "loss": 1.0287,
+      "step": 1708
+    },
+    {
+      "epoch": 0.6641406781307685,
+      "grad_norm": 0.2052423506975174,
+      "learning_rate": 6.741922927209031e-05,
+      "loss": 1.0069,
+      "step": 1709
+    },
+    {
+      "epoch": 0.6645292917516759,
+      "grad_norm": 0.1988023817539215,
+      "learning_rate": 6.734137796808096e-05,
+      "loss": 0.9761,
+      "step": 1710
+    },
+    {
+      "epoch": 0.6649179053725833,
+      "grad_norm": 0.20491188764572144,
+      "learning_rate": 6.726352666407163e-05,
+      "loss": 0.9767,
+      "step": 1711
+    },
+    {
+      "epoch": 0.6653065189934907,
+      "grad_norm": 0.18790274858474731,
+      "learning_rate": 6.718567536006228e-05,
+      "loss": 0.9944,
+      "step": 1712
+    },
+    {
+      "epoch": 0.6656951326143982,
+      "grad_norm": 0.19979891180992126,
+      "learning_rate": 6.710782405605293e-05,
+      "loss": 1.0842,
+      "step": 1713
+    },
+    {
+      "epoch": 0.6660837462353055,
+      "grad_norm": 0.22204813361167908,
+      "learning_rate": 6.70299727520436e-05,
+      "loss": 1.0561,
+      "step": 1714
+    },
+    {
+      "epoch": 0.666472359856213,
+      "grad_norm": 0.20182965695858002,
+      "learning_rate": 6.695212144803425e-05,
+      "loss": 1.0015,
+      "step": 1715
+    },
+    {
+      "epoch": 0.6668609734771204,
+      "grad_norm": 0.20719997584819794,
+      "learning_rate": 6.687427014402492e-05,
+      "loss": 1.0144,
+      "step": 1716
+    },
+    {
+      "epoch": 0.6672495870980278,
+      "grad_norm": 0.1944626122713089,
+      "learning_rate": 6.679641884001557e-05,
+      "loss": 1.0083,
+      "step": 1717
+    },
+    {
+      "epoch": 0.6676382007189352,
+      "grad_norm": 0.2072264701128006,
+      "learning_rate": 6.671856753600622e-05,
+      "loss": 1.0246,
+      "step": 1718
+    },
+    {
+      "epoch": 0.6680268143398426,
+      "grad_norm": 0.2134973257780075,
+      "learning_rate": 6.664071623199689e-05,
+      "loss": 1.0926,
+      "step": 1719
+    },
+    {
+      "epoch": 0.66841542796075,
+      "grad_norm": 0.2119186669588089,
+      "learning_rate": 6.656286492798754e-05,
+      "loss": 1.0129,
+      "step": 1720
+    },
+    {
+      "epoch": 0.6688040415816574,
+      "grad_norm": 0.21205540001392365,
+      "learning_rate": 6.64850136239782e-05,
+      "loss": 1.0611,
+      "step": 1721
+    },
+    {
+      "epoch": 0.6691926552025649,
+      "grad_norm": 0.21632088720798492,
+      "learning_rate": 6.640716231996886e-05,
+      "loss": 1.0821,
+      "step": 1722
+    },
+    {
+      "epoch": 0.6695812688234722,
+      "grad_norm": 0.21734434366226196,
+      "learning_rate": 6.632931101595952e-05,
+      "loss": 1.0821,
+      "step": 1723
+    },
+    {
+      "epoch": 0.6699698824443797,
+      "grad_norm": 0.2030603289604187,
+      "learning_rate": 6.625145971195017e-05,
+      "loss": 0.9976,
+      "step": 1724
+    },
+    {
+      "epoch": 0.6703584960652871,
+      "grad_norm": 0.19921456277370453,
+      "learning_rate": 6.617360840794084e-05,
+      "loss": 0.9187,
+      "step": 1725
+    },
+    {
+      "epoch": 0.6707471096861946,
+      "grad_norm": 0.20548826456069946,
+      "learning_rate": 6.60957571039315e-05,
+      "loss": 1.0486,
+      "step": 1726
+    },
+    {
+      "epoch": 0.6711357233071019,
+      "grad_norm": 0.21784676611423492,
+      "learning_rate": 6.601790579992216e-05,
+      "loss": 1.1089,
+      "step": 1727
+    },
+    {
+      "epoch": 0.6715243369280093,
+      "grad_norm": 0.2137753963470459,
+      "learning_rate": 6.594005449591281e-05,
+      "loss": 1.0075,
+      "step": 1728
+    },
+    {
+      "epoch": 0.6719129505489168,
+      "grad_norm": 0.20200639963150024,
+      "learning_rate": 6.586220319190348e-05,
+      "loss": 0.9915,
+      "step": 1729
+    },
+    {
+      "epoch": 0.6723015641698241,
+      "grad_norm": 0.20898796617984772,
+      "learning_rate": 6.578435188789413e-05,
+      "loss": 1.0292,
+      "step": 1730
+    },
+    {
+      "epoch": 0.6726901777907316,
+      "grad_norm": 0.22515977919101715,
+      "learning_rate": 6.570650058388478e-05,
+      "loss": 1.0118,
+      "step": 1731
+    },
+    {
+      "epoch": 0.673078791411639,
+      "grad_norm": 0.2132793813943863,
+      "learning_rate": 6.562864927987545e-05,
+      "loss": 1.1097,
+      "step": 1732
+    },
+    {
+      "epoch": 0.6734674050325464,
+      "grad_norm": 0.20358797907829285,
+      "learning_rate": 6.55507979758661e-05,
+      "loss": 1.0241,
+      "step": 1733
+    },
+    {
+      "epoch": 0.6738560186534538,
+      "grad_norm": 0.21155016124248505,
+      "learning_rate": 6.547294667185676e-05,
+      "loss": 1.0235,
+      "step": 1734
+    },
+    {
+      "epoch": 0.6742446322743613,
+      "grad_norm": 0.198009192943573,
+      "learning_rate": 6.539509536784741e-05,
+      "loss": 0.9542,
+      "step": 1735
+    },
+    {
+      "epoch": 0.6746332458952686,
+      "grad_norm": 0.20318005979061127,
+      "learning_rate": 6.531724406383807e-05,
+      "loss": 0.9993,
+      "step": 1736
+    },
+    {
+      "epoch": 0.675021859516176,
+      "grad_norm": 0.21384860575199127,
+      "learning_rate": 6.523939275982873e-05,
+      "loss": 1.1188,
+      "step": 1737
+    },
+    {
+      "epoch": 0.6754104731370835,
+      "grad_norm": 0.18736955523490906,
+      "learning_rate": 6.516154145581938e-05,
+      "loss": 0.9832,
+      "step": 1738
+    },
+    {
+      "epoch": 0.6757990867579908,
+      "grad_norm": 0.2002391368150711,
+      "learning_rate": 6.508369015181005e-05,
+      "loss": 1.0288,
+      "step": 1739
+    },
+    {
+      "epoch": 0.6761877003788983,
+      "grad_norm": 0.20011006295681,
+      "learning_rate": 6.50058388478007e-05,
+      "loss": 0.9588,
+      "step": 1740
+    },
+    {
+      "epoch": 0.6765763139998057,
+      "grad_norm": 0.20782291889190674,
+      "learning_rate": 6.492798754379135e-05,
+      "loss": 1.0033,
+      "step": 1741
+    },
+    {
+      "epoch": 0.6769649276207131,
+      "grad_norm": 0.2056814581155777,
+      "learning_rate": 6.485013623978202e-05,
+      "loss": 1.0648,
+      "step": 1742
+    },
+    {
+      "epoch": 0.6773535412416205,
+      "grad_norm": 0.2207457572221756,
+      "learning_rate": 6.477228493577267e-05,
+      "loss": 1.0758,
+      "step": 1743
+    },
+    {
+      "epoch": 0.677742154862528,
+      "grad_norm": 0.20437198877334595,
+      "learning_rate": 6.469443363176334e-05,
+      "loss": 1.0253,
+      "step": 1744
+    },
+    {
+      "epoch": 0.6781307684834353,
+      "grad_norm": 0.198721781373024,
+      "learning_rate": 6.461658232775399e-05,
+      "loss": 1.0087,
+      "step": 1745
+    },
+    {
+      "epoch": 0.6785193821043427,
+      "grad_norm": 0.22781015932559967,
+      "learning_rate": 6.453873102374464e-05,
+      "loss": 1.0692,
+      "step": 1746
+    },
+    {
+      "epoch": 0.6789079957252502,
+      "grad_norm": 0.21826857328414917,
+      "learning_rate": 6.446087971973531e-05,
+      "loss": 1.0232,
+      "step": 1747
+    },
+    {
+      "epoch": 0.6792966093461575,
+      "grad_norm": 0.2156928926706314,
+      "learning_rate": 6.438302841572596e-05,
+      "loss": 1.0686,
+      "step": 1748
+    },
+    {
+      "epoch": 0.679685222967065,
+      "grad_norm": 0.2161693125963211,
+      "learning_rate": 6.430517711171662e-05,
+      "loss": 1.0298,
+      "step": 1749
+    },
+    {
+      "epoch": 0.6800738365879724,
+      "grad_norm": 0.19139425456523895,
+      "learning_rate": 6.422732580770729e-05,
+      "loss": 0.9545,
+      "step": 1750
+    },
+    {
+      "epoch": 0.6804624502088799,
+      "grad_norm": 0.22626161575317383,
+      "learning_rate": 6.414947450369794e-05,
+      "loss": 1.0669,
+      "step": 1751
+    },
+    {
+      "epoch": 0.6808510638297872,
+      "grad_norm": 0.2135801464319229,
+      "learning_rate": 6.407162319968861e-05,
+      "loss": 1.0187,
+      "step": 1752
+    },
+    {
+      "epoch": 0.6812396774506947,
+      "grad_norm": 0.20803681015968323,
+      "learning_rate": 6.399377189567926e-05,
+      "loss": 1.0856,
+      "step": 1753
+    },
+    {
+      "epoch": 0.681628291071602,
+      "grad_norm": 0.21317154169082642,
+      "learning_rate": 6.391592059166991e-05,
+      "loss": 1.1018,
+      "step": 1754
+    },
+    {
+      "epoch": 0.6820169046925094,
+      "grad_norm": 0.20877891778945923,
+      "learning_rate": 6.383806928766058e-05,
+      "loss": 1.0383,
+      "step": 1755
+    },
+    {
+      "epoch": 0.6824055183134169,
+      "grad_norm": 0.20769146084785461,
+      "learning_rate": 6.376021798365123e-05,
+      "loss": 1.0852,
+      "step": 1756
+    },
+    {
+      "epoch": 0.6827941319343243,
+      "grad_norm": 0.2252657413482666,
+      "learning_rate": 6.36823666796419e-05,
+      "loss": 1.0749,
+      "step": 1757
+    },
+    {
+      "epoch": 0.6831827455552317,
+      "grad_norm": 0.24453257024288177,
+      "learning_rate": 6.360451537563255e-05,
+      "loss": 1.1042,
+      "step": 1758
+    },
+    {
+      "epoch": 0.6835713591761391,
+      "grad_norm": 0.2082965075969696,
+      "learning_rate": 6.35266640716232e-05,
+      "loss": 1.0729,
+      "step": 1759
+    },
+    {
+      "epoch": 0.6839599727970466,
+      "grad_norm": 0.20121856033802032,
+      "learning_rate": 6.344881276761387e-05,
+      "loss": 1.038,
+      "step": 1760
+    },
+    {
+      "epoch": 0.6843485864179539,
+      "grad_norm": 0.20096386969089508,
+      "learning_rate": 6.337096146360452e-05,
+      "loss": 0.9655,
+      "step": 1761
+    },
+    {
+      "epoch": 0.6847372000388614,
+      "grad_norm": 0.20015959441661835,
+      "learning_rate": 6.329311015959518e-05,
+      "loss": 1.0187,
+      "step": 1762
+    },
+    {
+      "epoch": 0.6851258136597688,
+      "grad_norm": 0.21056395769119263,
+      "learning_rate": 6.321525885558583e-05,
+      "loss": 1.0567,
+      "step": 1763
+    },
+    {
+      "epoch": 0.6855144272806762,
+      "grad_norm": 0.2211030125617981,
+      "learning_rate": 6.313740755157649e-05,
+      "loss": 1.0588,
+      "step": 1764
+    },
+    {
+      "epoch": 0.6859030409015836,
+      "grad_norm": 0.20809797942638397,
+      "learning_rate": 6.305955624756715e-05,
+      "loss": 0.9488,
+      "step": 1765
+    },
+    {
+      "epoch": 0.686291654522491,
+      "grad_norm": 0.2331530600786209,
+      "learning_rate": 6.29817049435578e-05,
+      "loss": 1.0789,
+      "step": 1766
+    },
+    {
+      "epoch": 0.6866802681433984,
+      "grad_norm": 0.21708674728870392,
+      "learning_rate": 6.290385363954846e-05,
+      "loss": 1.0518,
+      "step": 1767
+    },
+    {
+      "epoch": 0.6870688817643058,
+      "grad_norm": 0.2088184356689453,
+      "learning_rate": 6.282600233553912e-05,
+      "loss": 1.0178,
+      "step": 1768
+    },
+    {
+      "epoch": 0.6874574953852133,
+      "grad_norm": 0.20285943150520325,
+      "learning_rate": 6.274815103152977e-05,
+      "loss": 1.018,
+      "step": 1769
+    },
+    {
+      "epoch": 0.6878461090061206,
+      "grad_norm": 0.211436927318573,
+      "learning_rate": 6.267029972752044e-05,
+      "loss": 1.0572,
+      "step": 1770
+    },
+    {
+      "epoch": 0.6882347226270281,
+      "grad_norm": 0.21108384430408478,
+      "learning_rate": 6.259244842351109e-05,
+      "loss": 1.0227,
+      "step": 1771
+    },
+    {
+      "epoch": 0.6886233362479355,
+      "grad_norm": 0.2060437649488449,
+      "learning_rate": 6.251459711950174e-05,
+      "loss": 1.0251,
+      "step": 1772
+    },
+    {
+      "epoch": 0.689011949868843,
+      "grad_norm": 0.20819245278835297,
+      "learning_rate": 6.243674581549241e-05,
+      "loss": 1.0643,
+      "step": 1773
+    },
+    {
+      "epoch": 0.6894005634897503,
+      "grad_norm": 0.2172113060951233,
+      "learning_rate": 6.235889451148306e-05,
+      "loss": 1.0869,
+      "step": 1774
+    },
+    {
+      "epoch": 0.6897891771106577,
+      "grad_norm": 0.2087356299161911,
+      "learning_rate": 6.228104320747373e-05,
+      "loss": 1.0622,
+      "step": 1775
+    },
+    {
+      "epoch": 0.6901777907315652,
+      "grad_norm": 0.1958473175764084,
+      "learning_rate": 6.220319190346439e-05,
+      "loss": 0.9542,
+      "step": 1776
+    },
+    {
+      "epoch": 0.6905664043524725,
+      "grad_norm": 0.23630915582180023,
+      "learning_rate": 6.212534059945504e-05,
+      "loss": 1.0535,
+      "step": 1777
+    },
+    {
+      "epoch": 0.69095501797338,
+      "grad_norm": 0.2127649188041687,
+      "learning_rate": 6.204748929544571e-05,
+      "loss": 0.972,
+      "step": 1778
+    },
+    {
+      "epoch": 0.6913436315942874,
+      "grad_norm": 0.19873055815696716,
+      "learning_rate": 6.196963799143636e-05,
+      "loss": 0.9969,
+      "step": 1779
+    },
+    {
+      "epoch": 0.6917322452151948,
+      "grad_norm": 0.2013067901134491,
+      "learning_rate": 6.189178668742703e-05,
+      "loss": 1.0399,
+      "step": 1780
+    },
+    {
+      "epoch": 0.6921208588361022,
+      "grad_norm": 0.21300987899303436,
+      "learning_rate": 6.181393538341768e-05,
+      "loss": 1.0377,
+      "step": 1781
+    },
+    {
+      "epoch": 0.6925094724570097,
+      "grad_norm": 0.21665994822978973,
+      "learning_rate": 6.173608407940833e-05,
+      "loss": 1.008,
+      "step": 1782
+    },
+    {
+      "epoch": 0.692898086077917,
+      "grad_norm": 0.21622590720653534,
+      "learning_rate": 6.1658232775399e-05,
+      "loss": 1.1128,
+      "step": 1783
+    },
+    {
+      "epoch": 0.6932866996988244,
+      "grad_norm": 0.2000272423028946,
+      "learning_rate": 6.158038147138965e-05,
+      "loss": 1.0115,
+      "step": 1784
+    },
+    {
+      "epoch": 0.6936753133197319,
+      "grad_norm": 0.20774856209754944,
+      "learning_rate": 6.15025301673803e-05,
+      "loss": 1.066,
+      "step": 1785
+    },
+    {
+      "epoch": 0.6940639269406392,
+      "grad_norm": 0.18497461080551147,
+      "learning_rate": 6.142467886337097e-05,
+      "loss": 0.9608,
+      "step": 1786
+    },
+    {
+      "epoch": 0.6944525405615467,
+      "grad_norm": 0.19819007813930511,
+      "learning_rate": 6.134682755936162e-05,
+      "loss": 1.0114,
+      "step": 1787
+    },
+    {
+      "epoch": 0.6948411541824541,
+      "grad_norm": 0.22013314068317413,
+      "learning_rate": 6.126897625535229e-05,
+      "loss": 0.976,
+      "step": 1788
+    },
+    {
+      "epoch": 0.6952297678033615,
+      "grad_norm": 0.2066160887479782,
+      "learning_rate": 6.119112495134294e-05,
+      "loss": 1.0585,
+      "step": 1789
+    },
+    {
+      "epoch": 0.6956183814242689,
+      "grad_norm": 0.21364475786685944,
+      "learning_rate": 6.111327364733359e-05,
+      "loss": 1.0842,
+      "step": 1790
+    },
+    {
+      "epoch": 0.6960069950451764,
+      "grad_norm": 0.19731444120407104,
+      "learning_rate": 6.103542234332425e-05,
+      "loss": 0.9936,
+      "step": 1791
+    },
+    {
+      "epoch": 0.6963956086660837,
+      "grad_norm": 0.2162671983242035,
+      "learning_rate": 6.095757103931491e-05,
+      "loss": 1.0446,
+      "step": 1792
+    },
+    {
+      "epoch": 0.6967842222869911,
+      "grad_norm": 0.21486608684062958,
+      "learning_rate": 6.087971973530557e-05,
+      "loss": 1.0441,
+      "step": 1793
+    },
+    {
+      "epoch": 0.6971728359078986,
+      "grad_norm": 0.20850563049316406,
+      "learning_rate": 6.0801868431296224e-05,
+      "loss": 1.0431,
+      "step": 1794
+    },
+    {
+      "epoch": 0.6975614495288059,
+      "grad_norm": 0.20492027699947357,
+      "learning_rate": 6.072401712728688e-05,
+      "loss": 0.9845,
+      "step": 1795
+    },
+    {
+      "epoch": 0.6979500631497134,
+      "grad_norm": 0.1986648142337799,
+      "learning_rate": 6.064616582327754e-05,
+      "loss": 0.9855,
+      "step": 1796
+    },
+    {
+      "epoch": 0.6983386767706208,
+      "grad_norm": 0.20606310665607452,
+      "learning_rate": 6.05683145192682e-05,
+      "loss": 1.0608,
+      "step": 1797
+    },
+    {
+      "epoch": 0.6987272903915283,
+      "grad_norm": 0.20496073365211487,
+      "learning_rate": 6.0490463215258867e-05,
+      "loss": 1.0311,
+      "step": 1798
+    },
+    {
+      "epoch": 0.6991159040124356,
+      "grad_norm": 0.2153409719467163,
+      "learning_rate": 6.041261191124952e-05,
+      "loss": 1.0394,
+      "step": 1799
+    },
+    {
+      "epoch": 0.6995045176333431,
+      "grad_norm": 0.21410655975341797,
+      "learning_rate": 6.033476060724017e-05,
+      "loss": 1.0229,
+      "step": 1800
+    },
+    {
+      "epoch": 0.6998931312542505,
+      "grad_norm": 0.20418782532215118,
+      "learning_rate": 6.0256909303230836e-05,
+      "loss": 1.0382,
+      "step": 1801
+    },
+    {
+      "epoch": 0.7002817448751578,
+      "grad_norm": 0.19154146313667297,
+      "learning_rate": 6.017905799922149e-05,
+      "loss": 0.9891,
+      "step": 1802
+    },
+    {
+      "epoch": 0.7006703584960653,
+      "grad_norm": 0.19138328731060028,
+      "learning_rate": 6.010120669521214e-05,
+      "loss": 0.9638,
+      "step": 1803
+    },
+    {
+      "epoch": 0.7010589721169727,
+      "grad_norm": 0.19704872369766235,
+      "learning_rate": 6.0023355391202806e-05,
+      "loss": 0.9835,
+      "step": 1804
+    },
+    {
+      "epoch": 0.7014475857378801,
+      "grad_norm": 0.2175600379705429,
+      "learning_rate": 5.994550408719346e-05,
+      "loss": 1.1192,
+      "step": 1805
+    },
+    {
+      "epoch": 0.7018361993587875,
+      "grad_norm": 0.21614274382591248,
+      "learning_rate": 5.9867652783184124e-05,
+      "loss": 1.0877,
+      "step": 1806
+    },
+    {
+      "epoch": 0.702224812979695,
+      "grad_norm": 0.20461414754390717,
+      "learning_rate": 5.9789801479174776e-05,
+      "loss": 0.9706,
+      "step": 1807
+    },
+    {
+      "epoch": 0.7026134266006023,
+      "grad_norm": 0.1989748477935791,
+      "learning_rate": 5.9711950175165434e-05,
+      "loss": 1.0004,
+      "step": 1808
+    },
+    {
+      "epoch": 0.7030020402215098,
+      "grad_norm": 0.21304792165756226,
+      "learning_rate": 5.963409887115609e-05,
+      "loss": 1.0177,
+      "step": 1809
+    },
+    {
+      "epoch": 0.7033906538424172,
+      "grad_norm": 0.19023855030536652,
+      "learning_rate": 5.955624756714675e-05,
+      "loss": 0.9759,
+      "step": 1810
+    },
+    {
+      "epoch": 0.7037792674633246,
+      "grad_norm": 0.21915188431739807,
+      "learning_rate": 5.947839626313742e-05,
+      "loss": 1.0621,
+      "step": 1811
+    },
+    {
+      "epoch": 0.704167881084232,
+      "grad_norm": 0.21626822650432587,
+      "learning_rate": 5.940054495912807e-05,
+      "loss": 1.0144,
+      "step": 1812
+    },
+    {
+      "epoch": 0.7045564947051394,
+      "grad_norm": 0.20742040872573853,
+      "learning_rate": 5.932269365511872e-05,
+      "loss": 0.9778,
+      "step": 1813
+    },
+    {
+      "epoch": 0.7049451083260468,
+      "grad_norm": 0.2172158658504486,
+      "learning_rate": 5.924484235110939e-05,
+      "loss": 1.0416,
+      "step": 1814
+    },
+    {
+      "epoch": 0.7053337219469542,
+      "grad_norm": 0.209465891122818,
+      "learning_rate": 5.916699104710004e-05,
+      "loss": 1.0378,
+      "step": 1815
+    },
+    {
+      "epoch": 0.7057223355678617,
+      "grad_norm": 0.2097882628440857,
+      "learning_rate": 5.9089139743090705e-05,
+      "loss": 1.0166,
+      "step": 1816
+    },
+    {
+      "epoch": 0.706110949188769,
+      "grad_norm": 0.2251904308795929,
+      "learning_rate": 5.901128843908136e-05,
+      "loss": 1.0783,
+      "step": 1817
+    },
+    {
+      "epoch": 0.7064995628096765,
+      "grad_norm": 0.1952916979789734,
+      "learning_rate": 5.893343713507201e-05,
+      "loss": 0.993,
+      "step": 1818
+    },
+    {
+      "epoch": 0.7068881764305839,
+      "grad_norm": 0.20997455716133118,
+      "learning_rate": 5.8855585831062675e-05,
+      "loss": 1.0448,
+      "step": 1819
+    },
+    {
+      "epoch": 0.7072767900514914,
+      "grad_norm": 0.20070020854473114,
+      "learning_rate": 5.877773452705333e-05,
+      "loss": 0.9603,
+      "step": 1820
+    },
+    {
+      "epoch": 0.7076654036723987,
+      "grad_norm": 0.25765034556388855,
+      "learning_rate": 5.869988322304399e-05,
+      "loss": 1.0361,
+      "step": 1821
+    },
+    {
+      "epoch": 0.7080540172933061,
+      "grad_norm": 0.21948982775211334,
+      "learning_rate": 5.862203191903465e-05,
+      "loss": 1.0668,
+      "step": 1822
+    },
+    {
+      "epoch": 0.7084426309142136,
+      "grad_norm": 0.1867108792066574,
+      "learning_rate": 5.85441806150253e-05,
+      "loss": 0.9372,
+      "step": 1823
+    },
+    {
+      "epoch": 0.7088312445351209,
+      "grad_norm": 0.2037520408630371,
+      "learning_rate": 5.846632931101597e-05,
+      "loss": 0.9905,
+      "step": 1824
+    },
+    {
+      "epoch": 0.7092198581560284,
+      "grad_norm": 0.21352072060108185,
+      "learning_rate": 5.838847800700662e-05,
+      "loss": 1.0514,
+      "step": 1825
+    },
+    {
+      "epoch": 0.7096084717769358,
+      "grad_norm": 0.1949845850467682,
+      "learning_rate": 5.831062670299727e-05,
+      "loss": 0.9636,
+      "step": 1826
+    },
+    {
+      "epoch": 0.7099970853978432,
+      "grad_norm": 0.2092294692993164,
+      "learning_rate": 5.823277539898794e-05,
+      "loss": 1.0361,
+      "step": 1827
+    },
+    {
+      "epoch": 0.7103856990187506,
+      "grad_norm": 0.20054267346858978,
+      "learning_rate": 5.815492409497859e-05,
+      "loss": 1.0195,
+      "step": 1828
+    },
+    {
+      "epoch": 0.7107743126396581,
+      "grad_norm": 0.2202107012271881,
+      "learning_rate": 5.8077072790969256e-05,
+      "loss": 1.0918,
+      "step": 1829
+    },
+    {
+      "epoch": 0.7111629262605654,
+      "grad_norm": 0.2001042366027832,
+      "learning_rate": 5.799922148695991e-05,
+      "loss": 1.0142,
+      "step": 1830
+    },
+    {
+      "epoch": 0.7115515398814728,
+      "grad_norm": 0.2102631777524948,
+      "learning_rate": 5.792137018295056e-05,
+      "loss": 1.0231,
+      "step": 1831
+    },
+    {
+      "epoch": 0.7119401535023803,
+      "grad_norm": 0.21717461943626404,
+      "learning_rate": 5.7843518878941226e-05,
+      "loss": 1.0295,
+      "step": 1832
+    },
+    {
+      "epoch": 0.7123287671232876,
+      "grad_norm": 0.2001933753490448,
+      "learning_rate": 5.776566757493188e-05,
+      "loss": 1.022,
+      "step": 1833
+    },
+    {
+      "epoch": 0.7127173807441951,
+      "grad_norm": 0.2218201756477356,
+      "learning_rate": 5.7687816270922544e-05,
+      "loss": 1.0762,
+      "step": 1834
+    },
+    {
+      "epoch": 0.7131059943651025,
+      "grad_norm": 0.20680001378059387,
+      "learning_rate": 5.76099649669132e-05,
+      "loss": 1.0017,
+      "step": 1835
+    },
+    {
+      "epoch": 0.7134946079860099,
+      "grad_norm": 0.21511508524417877,
+      "learning_rate": 5.7532113662903854e-05,
+      "loss": 1.048,
+      "step": 1836
+    },
+    {
+      "epoch": 0.7138832216069173,
+      "grad_norm": 0.19720061123371124,
+      "learning_rate": 5.745426235889452e-05,
+      "loss": 0.9983,
+      "step": 1837
+    },
+    {
+      "epoch": 0.7142718352278248,
+      "grad_norm": 0.2005409449338913,
+      "learning_rate": 5.737641105488517e-05,
+      "loss": 0.9941,
+      "step": 1838
+    },
+    {
+      "epoch": 0.7146604488487321,
+      "grad_norm": 0.2222924679517746,
+      "learning_rate": 5.729855975087584e-05,
+      "loss": 1.0476,
+      "step": 1839
+    },
+    {
+      "epoch": 0.7150490624696395,
+      "grad_norm": 0.21131208539009094,
+      "learning_rate": 5.722070844686649e-05,
+      "loss": 1.0208,
+      "step": 1840
+    },
+    {
+      "epoch": 0.715437676090547,
+      "grad_norm": 0.2307305932044983,
+      "learning_rate": 5.714285714285714e-05,
+      "loss": 0.9867,
+      "step": 1841
+    },
+    {
+      "epoch": 0.7158262897114543,
+      "grad_norm": 0.1974973827600479,
+      "learning_rate": 5.706500583884781e-05,
+      "loss": 1.0285,
+      "step": 1842
+    },
+    {
+      "epoch": 0.7162149033323618,
+      "grad_norm": 0.2006559520959854,
+      "learning_rate": 5.698715453483846e-05,
+      "loss": 1.024,
+      "step": 1843
+    },
+    {
+      "epoch": 0.7166035169532692,
+      "grad_norm": 0.21160584688186646,
+      "learning_rate": 5.690930323082911e-05,
+      "loss": 1.0256,
+      "step": 1844
+    },
+    {
+      "epoch": 0.7169921305741767,
+      "grad_norm": 0.28184664249420166,
+      "learning_rate": 5.683145192681978e-05,
+      "loss": 1.0443,
+      "step": 1845
+    },
+    {
+      "epoch": 0.717380744195084,
+      "grad_norm": 0.2206653356552124,
+      "learning_rate": 5.675360062281043e-05,
+      "loss": 1.0458,
+      "step": 1846
+    },
+    {
+      "epoch": 0.7177693578159915,
+      "grad_norm": 0.21346066892147064,
+      "learning_rate": 5.6675749318801095e-05,
+      "loss": 1.0106,
+      "step": 1847
+    },
+    {
+      "epoch": 0.7181579714368989,
+      "grad_norm": 0.20931747555732727,
+      "learning_rate": 5.6597898014791753e-05,
+      "loss": 0.9831,
+      "step": 1848
+    },
+    {
+      "epoch": 0.7185465850578063,
+      "grad_norm": 0.2026771456003189,
+      "learning_rate": 5.6520046710782406e-05,
+      "loss": 1.0162,
+      "step": 1849
+    },
+    {
+      "epoch": 0.7189351986787137,
+      "grad_norm": 0.21388716995716095,
+      "learning_rate": 5.644219540677307e-05,
+      "loss": 1.0867,
+      "step": 1850
+    },
+    {
+      "epoch": 0.7193238122996211,
+      "grad_norm": 0.2039308398962021,
+      "learning_rate": 5.636434410276372e-05,
+      "loss": 1.0325,
+      "step": 1851
+    },
+    {
+      "epoch": 0.7197124259205285,
+      "grad_norm": 0.21741114556789398,
+      "learning_rate": 5.628649279875439e-05,
+      "loss": 1.0251,
+      "step": 1852
+    },
+    {
+      "epoch": 0.7201010395414359,
+      "grad_norm": 0.21343208849430084,
+      "learning_rate": 5.620864149474504e-05,
+      "loss": 1.0766,
+      "step": 1853
+    },
+    {
+      "epoch": 0.7204896531623434,
+      "grad_norm": 0.21712560951709747,
+      "learning_rate": 5.613079019073569e-05,
+      "loss": 1.0643,
+      "step": 1854
+    },
+    {
+      "epoch": 0.7208782667832507,
+      "grad_norm": 0.2176978886127472,
+      "learning_rate": 5.605293888672636e-05,
+      "loss": 1.0375,
+      "step": 1855
+    },
+    {
+      "epoch": 0.7212668804041582,
+      "grad_norm": 0.2065533846616745,
+      "learning_rate": 5.597508758271701e-05,
+      "loss": 1.0385,
+      "step": 1856
+    },
+    {
+      "epoch": 0.7216554940250656,
+      "grad_norm": 0.2169170081615448,
+      "learning_rate": 5.5897236278707676e-05,
+      "loss": 1.0197,
+      "step": 1857
+    },
+    {
+      "epoch": 0.722044107645973,
+      "grad_norm": 0.2047201544046402,
+      "learning_rate": 5.581938497469833e-05,
+      "loss": 0.9794,
+      "step": 1858
+    },
+    {
+      "epoch": 0.7224327212668804,
+      "grad_norm": 0.20898981392383575,
+      "learning_rate": 5.574153367068898e-05,
+      "loss": 1.032,
+      "step": 1859
+    },
+    {
+      "epoch": 0.7228213348877878,
+      "grad_norm": 0.2090533971786499,
+      "learning_rate": 5.5663682366679646e-05,
+      "loss": 1.0694,
+      "step": 1860
+    },
+    {
+      "epoch": 0.7232099485086952,
+      "grad_norm": 0.21963149309158325,
+      "learning_rate": 5.5585831062670305e-05,
+      "loss": 1.0367,
+      "step": 1861
+    },
+    {
+      "epoch": 0.7235985621296026,
+      "grad_norm": 0.1974373459815979,
+      "learning_rate": 5.550797975866096e-05,
+      "loss": 1.0402,
+      "step": 1862
+    },
+    {
+      "epoch": 0.7239871757505101,
+      "grad_norm": 0.1924194097518921,
+      "learning_rate": 5.543012845465162e-05,
+      "loss": 0.9647,
+      "step": 1863
+    },
+    {
+      "epoch": 0.7243757893714174,
+      "grad_norm": 0.21366077661514282,
+      "learning_rate": 5.5352277150642274e-05,
+      "loss": 1.0139,
+      "step": 1864
+    },
+    {
+      "epoch": 0.7247644029923249,
+      "grad_norm": 0.21722929179668427,
+      "learning_rate": 5.527442584663294e-05,
+      "loss": 1.0366,
+      "step": 1865
+    },
+    {
+      "epoch": 0.7251530166132323,
+      "grad_norm": 0.20646587014198303,
+      "learning_rate": 5.519657454262359e-05,
+      "loss": 1.0465,
+      "step": 1866
+    },
+    {
+      "epoch": 0.7255416302341398,
+      "grad_norm": 0.19144394993782043,
+      "learning_rate": 5.5118723238614244e-05,
+      "loss": 0.9645,
+      "step": 1867
+    },
+    {
+      "epoch": 0.7259302438550471,
+      "grad_norm": 0.19553838670253754,
+      "learning_rate": 5.504087193460491e-05,
+      "loss": 0.98,
+      "step": 1868
+    },
+    {
+      "epoch": 0.7263188574759545,
+      "grad_norm": 0.21739792823791504,
+      "learning_rate": 5.496302063059556e-05,
+      "loss": 1.002,
+      "step": 1869
+    },
+    {
+      "epoch": 0.726707471096862,
+      "grad_norm": 0.1910562962293625,
+      "learning_rate": 5.488516932658623e-05,
+      "loss": 0.985,
+      "step": 1870
+    },
+    {
+      "epoch": 0.7270960847177693,
+      "grad_norm": 0.2133384346961975,
+      "learning_rate": 5.480731802257688e-05,
+      "loss": 1.0325,
+      "step": 1871
+    },
+    {
+      "epoch": 0.7274846983386768,
+      "grad_norm": 0.21884119510650635,
+      "learning_rate": 5.472946671856753e-05,
+      "loss": 1.0412,
+      "step": 1872
+    },
+    {
+      "epoch": 0.7278733119595842,
+      "grad_norm": 0.21069306135177612,
+      "learning_rate": 5.46516154145582e-05,
+      "loss": 1.0474,
+      "step": 1873
+    },
+    {
+      "epoch": 0.7282619255804916,
+      "grad_norm": 0.19266243278980255,
+      "learning_rate": 5.4573764110548856e-05,
+      "loss": 0.9941,
+      "step": 1874
+    },
+    {
+      "epoch": 0.728650539201399,
+      "grad_norm": 0.21255099773406982,
+      "learning_rate": 5.4495912806539515e-05,
+      "loss": 1.0211,
+      "step": 1875
+    },
+    {
+      "epoch": 0.7290391528223065,
+      "grad_norm": 0.1924402117729187,
+      "learning_rate": 5.4418061502530173e-05,
+      "loss": 1.0117,
+      "step": 1876
+    },
+    {
+      "epoch": 0.7294277664432138,
+      "grad_norm": 0.2019895315170288,
+      "learning_rate": 5.4340210198520825e-05,
+      "loss": 0.9921,
+      "step": 1877
+    },
+    {
+      "epoch": 0.7298163800641212,
+      "grad_norm": 0.20398026704788208,
+      "learning_rate": 5.426235889451149e-05,
+      "loss": 1.0423,
+      "step": 1878
+    },
+    {
+      "epoch": 0.7302049936850287,
+      "grad_norm": 0.20153217017650604,
+      "learning_rate": 5.418450759050214e-05,
+      "loss": 1.0333,
+      "step": 1879
+    },
+    {
+      "epoch": 0.730593607305936,
+      "grad_norm": 0.21259640157222748,
+      "learning_rate": 5.4106656286492795e-05,
+      "loss": 1.0689,
+      "step": 1880
+    },
+    {
+      "epoch": 0.7309822209268435,
+      "grad_norm": 0.2037276029586792,
+      "learning_rate": 5.402880498248346e-05,
+      "loss": 1.0203,
+      "step": 1881
+    },
+    {
+      "epoch": 0.7313708345477509,
+      "grad_norm": 0.19976729154586792,
+      "learning_rate": 5.395095367847411e-05,
+      "loss": 1.0173,
+      "step": 1882
+    },
+    {
+      "epoch": 0.7317594481686583,
+      "grad_norm": 0.20481806993484497,
+      "learning_rate": 5.387310237446478e-05,
+      "loss": 0.9864,
+      "step": 1883
+    },
+    {
+      "epoch": 0.7321480617895657,
+      "grad_norm": 0.21900932490825653,
+      "learning_rate": 5.379525107045543e-05,
+      "loss": 1.0519,
+      "step": 1884
+    },
+    {
+      "epoch": 0.7325366754104732,
+      "grad_norm": 0.200319305062294,
+      "learning_rate": 5.371739976644609e-05,
+      "loss": 1.0834,
+      "step": 1885
+    },
+    {
+      "epoch": 0.7329252890313805,
+      "grad_norm": 0.19662296772003174,
+      "learning_rate": 5.363954846243675e-05,
+      "loss": 0.9794,
+      "step": 1886
+    },
+    {
+      "epoch": 0.7333139026522879,
+      "grad_norm": 0.2113952785730362,
+      "learning_rate": 5.356169715842741e-05,
+      "loss": 1.0763,
+      "step": 1887
+    },
+    {
+      "epoch": 0.7337025162731954,
+      "grad_norm": 0.21348755061626434,
+      "learning_rate": 5.3483845854418066e-05,
+      "loss": 1.0781,
+      "step": 1888
+    },
+    {
+      "epoch": 0.7340911298941027,
+      "grad_norm": 0.20673702657222748,
+      "learning_rate": 5.3405994550408725e-05,
+      "loss": 1.0513,
+      "step": 1889
+    },
+    {
+      "epoch": 0.7344797435150102,
+      "grad_norm": 0.210855171084404,
+      "learning_rate": 5.332814324639938e-05,
+      "loss": 0.9972,
+      "step": 1890
+    },
+    {
+      "epoch": 0.7348683571359176,
+      "grad_norm": 0.2136204093694687,
+      "learning_rate": 5.325029194239004e-05,
+      "loss": 1.03,
+      "step": 1891
+    },
+    {
+      "epoch": 0.7352569707568251,
+      "grad_norm": 0.20035260915756226,
+      "learning_rate": 5.3172440638380694e-05,
+      "loss": 0.9739,
+      "step": 1892
+    },
+    {
+      "epoch": 0.7356455843777324,
+      "grad_norm": 0.1943352371454239,
+      "learning_rate": 5.309458933437136e-05,
+      "loss": 0.9411,
+      "step": 1893
+    },
+    {
+      "epoch": 0.7360341979986399,
+      "grad_norm": 0.3994326889514923,
+      "learning_rate": 5.301673803036201e-05,
+      "loss": 1.0714,
+      "step": 1894
+    },
+    {
+      "epoch": 0.7364228116195473,
+      "grad_norm": 0.21691356599330902,
+      "learning_rate": 5.2938886726352664e-05,
+      "loss": 1.0648,
+      "step": 1895
+    },
+    {
+      "epoch": 0.7368114252404547,
+      "grad_norm": 0.19853095710277557,
+      "learning_rate": 5.286103542234333e-05,
+      "loss": 0.983,
+      "step": 1896
+    },
+    {
+      "epoch": 0.7372000388613621,
+      "grad_norm": 0.21836897730827332,
+      "learning_rate": 5.278318411833398e-05,
+      "loss": 1.0396,
+      "step": 1897
+    },
+    {
+      "epoch": 0.7375886524822695,
+      "grad_norm": 0.19596605002880096,
+      "learning_rate": 5.270533281432464e-05,
+      "loss": 0.9593,
+      "step": 1898
+    },
+    {
+      "epoch": 0.7379772661031769,
+      "grad_norm": 0.2141752541065216,
+      "learning_rate": 5.26274815103153e-05,
+      "loss": 1.0373,
+      "step": 1899
+    },
+    {
+      "epoch": 0.7383658797240843,
+      "grad_norm": 0.20552939176559448,
+      "learning_rate": 5.254963020630596e-05,
+      "loss": 1.0352,
+      "step": 1900
+    },
+    {
+      "epoch": 0.7387544933449918,
+      "grad_norm": 0.2095794975757599,
+      "learning_rate": 5.247177890229662e-05,
+      "loss": 1.0632,
+      "step": 1901
+    },
+    {
+      "epoch": 0.7391431069658991,
+      "grad_norm": 0.19894710183143616,
+      "learning_rate": 5.2393927598287276e-05,
+      "loss": 0.9886,
+      "step": 1902
+    },
+    {
+      "epoch": 0.7395317205868066,
+      "grad_norm": 0.22996319830417633,
+      "learning_rate": 5.231607629427793e-05,
+      "loss": 1.0826,
+      "step": 1903
+    },
+    {
+      "epoch": 0.739920334207714,
+      "grad_norm": 0.21416957676410675,
+      "learning_rate": 5.2238224990268593e-05,
+      "loss": 1.0161,
+      "step": 1904
+    },
+    {
+      "epoch": 0.7403089478286214,
+      "grad_norm": 0.21819345653057098,
+      "learning_rate": 5.2160373686259245e-05,
+      "loss": 1.0458,
+      "step": 1905
+    },
+    {
+      "epoch": 0.7406975614495288,
+      "grad_norm": 0.21327044069766998,
+      "learning_rate": 5.208252238224991e-05,
+      "loss": 1.0721,
+      "step": 1906
+    },
+    {
+      "epoch": 0.7410861750704362,
+      "grad_norm": 0.21436645090579987,
+      "learning_rate": 5.200467107824056e-05,
+      "loss": 1.0743,
+      "step": 1907
+    },
+    {
+      "epoch": 0.7414747886913436,
+      "grad_norm": 0.215640127658844,
+      "learning_rate": 5.1926819774231215e-05,
+      "loss": 1.0274,
+      "step": 1908
+    },
+    {
+      "epoch": 0.741863402312251,
+      "grad_norm": 0.2043589949607849,
+      "learning_rate": 5.184896847022188e-05,
+      "loss": 1.0618,
+      "step": 1909
+    },
+    {
+      "epoch": 0.7422520159331585,
+      "grad_norm": 0.2014230340719223,
+      "learning_rate": 5.177111716621253e-05,
+      "loss": 0.9892,
+      "step": 1910
+    },
+    {
+      "epoch": 0.7426406295540658,
+      "grad_norm": 0.19954468309879303,
+      "learning_rate": 5.16932658622032e-05,
+      "loss": 0.9815,
+      "step": 1911
+    },
+    {
+      "epoch": 0.7430292431749733,
+      "grad_norm": 0.23119708895683289,
+      "learning_rate": 5.161541455819385e-05,
+      "loss": 1.0783,
+      "step": 1912
+    },
+    {
+      "epoch": 0.7434178567958807,
+      "grad_norm": 0.20650482177734375,
+      "learning_rate": 5.153756325418451e-05,
+      "loss": 1.0162,
+      "step": 1913
+    },
+    {
+      "epoch": 0.7438064704167882,
+      "grad_norm": 0.20021970570087433,
+      "learning_rate": 5.145971195017517e-05,
+      "loss": 1.0062,
+      "step": 1914
+    },
+    {
+      "epoch": 0.7441950840376955,
+      "grad_norm": 0.23300811648368835,
+      "learning_rate": 5.138186064616583e-05,
+      "loss": 1.0049,
+      "step": 1915
+    },
+    {
+      "epoch": 0.7445836976586029,
+      "grad_norm": 0.23268327116966248,
+      "learning_rate": 5.130400934215648e-05,
+      "loss": 1.0138,
+      "step": 1916
+    },
+    {
+      "epoch": 0.7449723112795104,
+      "grad_norm": 0.20413407683372498,
+      "learning_rate": 5.1226158038147145e-05,
+      "loss": 0.9903,
+      "step": 1917
+    },
+    {
+      "epoch": 0.7453609249004177,
+      "grad_norm": 0.20714978873729706,
+      "learning_rate": 5.1148306734137797e-05,
+      "loss": 1.0374,
+      "step": 1918
+    },
+    {
+      "epoch": 0.7457495385213252,
+      "grad_norm": 0.2000850886106491,
+      "learning_rate": 5.107045543012846e-05,
+      "loss": 0.9885,
+      "step": 1919
+    },
+    {
+      "epoch": 0.7461381521422326,
+      "grad_norm": 0.2054719179868698,
+      "learning_rate": 5.0992604126119114e-05,
+      "loss": 1.0551,
+      "step": 1920
+    },
+    {
+      "epoch": 0.74652676576314,
+      "grad_norm": 0.2351357489824295,
+      "learning_rate": 5.0914752822109766e-05,
+      "loss": 1.0693,
+      "step": 1921
+    },
+    {
+      "epoch": 0.7469153793840474,
+      "grad_norm": 0.22370338439941406,
+      "learning_rate": 5.083690151810043e-05,
+      "loss": 0.9781,
+      "step": 1922
+    },
+    {
+      "epoch": 0.7473039930049549,
+      "grad_norm": 0.18734332919120789,
+      "learning_rate": 5.0759050214091084e-05,
+      "loss": 0.9329,
+      "step": 1923
+    },
+    {
+      "epoch": 0.7476926066258622,
+      "grad_norm": 0.22099906206130981,
+      "learning_rate": 5.068119891008175e-05,
+      "loss": 1.0498,
+      "step": 1924
+    },
+    {
+      "epoch": 0.7480812202467696,
+      "grad_norm": 0.20144490897655487,
+      "learning_rate": 5.06033476060724e-05,
+      "loss": 0.9865,
+      "step": 1925
+    },
+    {
+      "epoch": 0.7484698338676771,
+      "grad_norm": 0.21770039200782776,
+      "learning_rate": 5.052549630206306e-05,
+      "loss": 1.0867,
+      "step": 1926
+    },
+    {
+      "epoch": 0.7488584474885844,
+      "grad_norm": 0.19649921357631683,
+      "learning_rate": 5.044764499805372e-05,
+      "loss": 0.9887,
+      "step": 1927
+    },
+    {
+      "epoch": 0.7492470611094919,
+      "grad_norm": 0.1940620392560959,
+      "learning_rate": 5.036979369404438e-05,
+      "loss": 1.0073,
+      "step": 1928
+    },
+    {
+      "epoch": 0.7496356747303993,
+      "grad_norm": 0.20987650752067566,
+      "learning_rate": 5.0291942390035044e-05,
+      "loss": 1.046,
+      "step": 1929
+    },
+    {
+      "epoch": 0.7500242883513067,
+      "grad_norm": 0.2116398960351944,
+      "learning_rate": 5.0214091086025696e-05,
+      "loss": 1.0423,
+      "step": 1930
+    },
+    {
+      "epoch": 0.7504129019722141,
+      "grad_norm": 0.18996965885162354,
+      "learning_rate": 5.013623978201635e-05,
+      "loss": 0.9822,
+      "step": 1931
+    },
+    {
+      "epoch": 0.7508015155931216,
+      "grad_norm": 0.20942547917366028,
+      "learning_rate": 5.005838847800701e-05,
+      "loss": 1.0472,
+      "step": 1932
+    },
+    {
+      "epoch": 0.751190129214029,
+      "grad_norm": 0.19006839394569397,
+      "learning_rate": 4.9980537173997665e-05,
+      "loss": 0.993,
+      "step": 1933
+    },
+    {
+      "epoch": 0.7515787428349364,
+      "grad_norm": 0.21508941054344177,
+      "learning_rate": 4.9902685869988324e-05,
+      "loss": 1.0406,
+      "step": 1934
+    },
+    {
+      "epoch": 0.7519673564558438,
+      "grad_norm": 0.1989334225654602,
+      "learning_rate": 4.982483456597898e-05,
+      "loss": 0.9997,
+      "step": 1935
+    },
+    {
+      "epoch": 0.7523559700767511,
+      "grad_norm": 0.19993600249290466,
+      "learning_rate": 4.974698326196964e-05,
+      "loss": 1.0139,
+      "step": 1936
+    },
+    {
+      "epoch": 0.7527445836976586,
+      "grad_norm": 0.20927831530570984,
+      "learning_rate": 4.9669131957960294e-05,
+      "loss": 0.995,
+      "step": 1937
+    },
+    {
+      "epoch": 0.753133197318566,
+      "grad_norm": 0.20963850617408752,
+      "learning_rate": 4.959128065395095e-05,
+      "loss": 1.0678,
+      "step": 1938
+    },
+    {
+      "epoch": 0.7535218109394735,
+      "grad_norm": 0.19523034989833832,
+      "learning_rate": 4.951342934994161e-05,
+      "loss": 0.9883,
+      "step": 1939
+    },
+    {
+      "epoch": 0.7539104245603808,
+      "grad_norm": 0.21588142216205597,
+      "learning_rate": 4.943557804593227e-05,
+      "loss": 1.0398,
+      "step": 1940
+    },
+    {
+      "epoch": 0.7542990381812883,
+      "grad_norm": 0.19894704222679138,
+      "learning_rate": 4.935772674192293e-05,
+      "loss": 1.0125,
+      "step": 1941
+    },
+    {
+      "epoch": 0.7546876518021957,
+      "grad_norm": 0.2155168056488037,
+      "learning_rate": 4.927987543791359e-05,
+      "loss": 1.0447,
+      "step": 1942
+    },
+    {
+      "epoch": 0.7550762654231031,
+      "grad_norm": 0.212605819106102,
+      "learning_rate": 4.920202413390425e-05,
+      "loss": 1.077,
+      "step": 1943
+    },
+    {
+      "epoch": 0.7554648790440105,
+      "grad_norm": 0.2168148010969162,
+      "learning_rate": 4.9124172829894906e-05,
+      "loss": 1.0029,
+      "step": 1944
+    },
+    {
+      "epoch": 0.7558534926649179,
+      "grad_norm": 0.2020149528980255,
+      "learning_rate": 4.9046321525885565e-05,
+      "loss": 1.0684,
+      "step": 1945
+    },
+    {
+      "epoch": 0.7562421062858253,
+      "grad_norm": 0.21063408255577087,
+      "learning_rate": 4.8968470221876217e-05,
+      "loss": 1.0147,
+      "step": 1946
+    },
+    {
+      "epoch": 0.7566307199067327,
+      "grad_norm": 0.19599388539791107,
+      "learning_rate": 4.8890618917866875e-05,
+      "loss": 0.9719,
+      "step": 1947
+    },
+    {
+      "epoch": 0.7570193335276402,
+      "grad_norm": 0.2158602923154831,
+      "learning_rate": 4.8812767613857534e-05,
+      "loss": 1.0439,
+      "step": 1948
+    },
+    {
+      "epoch": 0.7574079471485475,
+      "grad_norm": 0.21013815701007843,
+      "learning_rate": 4.873491630984819e-05,
+      "loss": 1.0319,
+      "step": 1949
+    },
+    {
+      "epoch": 0.757796560769455,
+      "grad_norm": 0.2020798772573471,
+      "learning_rate": 4.8657065005838845e-05,
+      "loss": 1.0037,
+      "step": 1950
+    },
+    {
+      "epoch": 0.7581851743903624,
+      "grad_norm": 0.21202047169208527,
+      "learning_rate": 4.8579213701829504e-05,
+      "loss": 0.9823,
+      "step": 1951
+    },
+    {
+      "epoch": 0.7585737880112698,
+      "grad_norm": 0.20750083029270172,
+      "learning_rate": 4.850136239782016e-05,
+      "loss": 1.0073,
+      "step": 1952
+    },
+    {
+      "epoch": 0.7589624016321772,
+      "grad_norm": 0.20938372611999512,
+      "learning_rate": 4.842351109381083e-05,
+      "loss": 1.0326,
+      "step": 1953
+    },
+    {
+      "epoch": 0.7593510152530846,
+      "grad_norm": 0.21984544396400452,
+      "learning_rate": 4.834565978980149e-05,
+      "loss": 1.0363,
+      "step": 1954
+    },
+    {
+      "epoch": 0.759739628873992,
+      "grad_norm": 0.20306189358234406,
+      "learning_rate": 4.826780848579214e-05,
+      "loss": 1.0374,
+      "step": 1955
+    },
+    {
+      "epoch": 0.7601282424948994,
+      "grad_norm": 0.20631705224514008,
+      "learning_rate": 4.81899571817828e-05,
+      "loss": 1.0985,
+      "step": 1956
+    },
+    {
+      "epoch": 0.7605168561158069,
+      "grad_norm": 0.22092190384864807,
+      "learning_rate": 4.811210587777346e-05,
+      "loss": 1.0216,
+      "step": 1957
+    },
+    {
+      "epoch": 0.7609054697367142,
+      "grad_norm": 0.21419481933116913,
+      "learning_rate": 4.8034254573764116e-05,
+      "loss": 1.0327,
+      "step": 1958
+    },
+    {
+      "epoch": 0.7612940833576217,
+      "grad_norm": 0.1954476237297058,
+      "learning_rate": 4.795640326975477e-05,
+      "loss": 1.0139,
+      "step": 1959
+    },
+    {
+      "epoch": 0.7616826969785291,
+      "grad_norm": 0.21092113852500916,
+      "learning_rate": 4.7878551965745427e-05,
+      "loss": 1.0934,
+      "step": 1960
+    },
+    {
+      "epoch": 0.7620713105994366,
+      "grad_norm": 0.1998988837003708,
+      "learning_rate": 4.7800700661736085e-05,
+      "loss": 0.9782,
+      "step": 1961
+    },
+    {
+      "epoch": 0.7624599242203439,
+      "grad_norm": 0.20410674810409546,
+      "learning_rate": 4.7722849357726744e-05,
+      "loss": 1.0186,
+      "step": 1962
+    },
+    {
+      "epoch": 0.7628485378412513,
+      "grad_norm": 0.25312289595603943,
+      "learning_rate": 4.76449980537174e-05,
+      "loss": 1.0103,
+      "step": 1963
+    },
+    {
+      "epoch": 0.7632371514621588,
+      "grad_norm": 0.20648318529129028,
+      "learning_rate": 4.7567146749708055e-05,
+      "loss": 1.0314,
+      "step": 1964
+    },
+    {
+      "epoch": 0.7636257650830661,
+      "grad_norm": 0.20513702929019928,
+      "learning_rate": 4.7489295445698714e-05,
+      "loss": 0.981,
+      "step": 1965
+    },
+    {
+      "epoch": 0.7640143787039736,
+      "grad_norm": 0.20063039660453796,
+      "learning_rate": 4.741144414168938e-05,
+      "loss": 1.0218,
+      "step": 1966
+    },
+    {
+      "epoch": 0.764402992324881,
+      "grad_norm": 0.20328521728515625,
+      "learning_rate": 4.733359283768004e-05,
+      "loss": 1.0614,
+      "step": 1967
+    },
+    {
+      "epoch": 0.7647916059457884,
+      "grad_norm": 0.2209623008966446,
+      "learning_rate": 4.725574153367069e-05,
+      "loss": 1.0478,
+      "step": 1968
+    },
+    {
+      "epoch": 0.7651802195666958,
+      "grad_norm": 0.2023559957742691,
+      "learning_rate": 4.717789022966135e-05,
+      "loss": 1.0455,
+      "step": 1969
+    },
+    {
+      "epoch": 0.7655688331876033,
+      "grad_norm": 0.20461297035217285,
+      "learning_rate": 4.710003892565201e-05,
+      "loss": 0.9427,
+      "step": 1970
+    },
+    {
+      "epoch": 0.7659574468085106,
+      "grad_norm": 0.2108335793018341,
+      "learning_rate": 4.702218762164267e-05,
+      "loss": 1.0344,
+      "step": 1971
+    },
+    {
+      "epoch": 0.766346060429418,
+      "grad_norm": 0.20883473753929138,
+      "learning_rate": 4.6944336317633326e-05,
+      "loss": 1.0336,
+      "step": 1972
+    },
+    {
+      "epoch": 0.7667346740503255,
+      "grad_norm": 0.20144741237163544,
+      "learning_rate": 4.686648501362398e-05,
+      "loss": 1.0101,
+      "step": 1973
+    },
+    {
+      "epoch": 0.7671232876712328,
+      "grad_norm": 0.21269328892230988,
+      "learning_rate": 4.6788633709614637e-05,
+      "loss": 0.9989,
+      "step": 1974
+    },
+    {
+      "epoch": 0.7675119012921403,
+      "grad_norm": 0.20673738420009613,
+      "learning_rate": 4.6710782405605295e-05,
+      "loss": 1.0235,
+      "step": 1975
+    },
+    {
+      "epoch": 0.7679005149130477,
+      "grad_norm": 0.1966594159603119,
+      "learning_rate": 4.6632931101595954e-05,
+      "loss": 1.0081,
+      "step": 1976
+    },
+    {
+      "epoch": 0.7682891285339551,
+      "grad_norm": 0.22186829149723053,
+      "learning_rate": 4.6555079797586606e-05,
+      "loss": 1.0081,
+      "step": 1977
+    },
+    {
+      "epoch": 0.7686777421548625,
+      "grad_norm": 0.20602557063102722,
+      "learning_rate": 4.6477228493577265e-05,
+      "loss": 1.0381,
+      "step": 1978
+    },
+    {
+      "epoch": 0.76906635577577,
+      "grad_norm": 0.19581305980682373,
+      "learning_rate": 4.639937718956793e-05,
+      "loss": 1.0196,
+      "step": 1979
+    },
+    {
+      "epoch": 0.7694549693966773,
+      "grad_norm": 0.20162086188793182,
+      "learning_rate": 4.632152588555859e-05,
+      "loss": 1.0168,
+      "step": 1980
+    },
+    {
+      "epoch": 0.7698435830175848,
+      "grad_norm": 0.21967145800590515,
+      "learning_rate": 4.624367458154925e-05,
+      "loss": 1.0339,
+      "step": 1981
+    },
+    {
+      "epoch": 0.7702321966384922,
+      "grad_norm": 0.20245851576328278,
+      "learning_rate": 4.61658232775399e-05,
+      "loss": 1.0349,
+      "step": 1982
+    },
+    {
+      "epoch": 0.7706208102593995,
+      "grad_norm": 0.20409934222698212,
+      "learning_rate": 4.608797197353056e-05,
+      "loss": 1.0296,
+      "step": 1983
+    },
+    {
+      "epoch": 0.771009423880307,
+      "grad_norm": 0.19757163524627686,
+      "learning_rate": 4.601012066952122e-05,
+      "loss": 1.0443,
+      "step": 1984
+    },
+    {
+      "epoch": 0.7713980375012144,
+      "grad_norm": 0.20038221776485443,
+      "learning_rate": 4.593226936551188e-05,
+      "loss": 1.0431,
+      "step": 1985
+    },
+    {
+      "epoch": 0.7717866511221219,
+      "grad_norm": 0.2112458199262619,
+      "learning_rate": 4.585441806150253e-05,
+      "loss": 1.0553,
+      "step": 1986
+    },
+    {
+      "epoch": 0.7721752647430292,
+      "grad_norm": 0.21868042647838593,
+      "learning_rate": 4.577656675749319e-05,
+      "loss": 1.0061,
+      "step": 1987
+    },
+    {
+      "epoch": 0.7725638783639367,
+      "grad_norm": 0.22484582662582397,
+      "learning_rate": 4.5698715453483846e-05,
+      "loss": 1.0831,
+      "step": 1988
+    },
+    {
+      "epoch": 0.7729524919848441,
+      "grad_norm": 0.20265011489391327,
+      "learning_rate": 4.5620864149474505e-05,
+      "loss": 1.0206,
+      "step": 1989
+    },
+    {
+      "epoch": 0.7733411056057515,
+      "grad_norm": 0.2052810937166214,
+      "learning_rate": 4.5543012845465164e-05,
+      "loss": 1.0366,
+      "step": 1990
+    },
+    {
+      "epoch": 0.7737297192266589,
+      "grad_norm": 0.21016088128089905,
+      "learning_rate": 4.546516154145582e-05,
+      "loss": 0.9963,
+      "step": 1991
+    },
+    {
+      "epoch": 0.7741183328475663,
+      "grad_norm": 0.19719412922859192,
+      "learning_rate": 4.538731023744648e-05,
+      "loss": 0.9853,
+      "step": 1992
+    },
+    {
+      "epoch": 0.7745069464684737,
+      "grad_norm": 0.20447245240211487,
+      "learning_rate": 4.530945893343714e-05,
+      "loss": 0.9977,
+      "step": 1993
+    },
+    {
+      "epoch": 0.7748955600893811,
+      "grad_norm": 0.21796588599681854,
+      "learning_rate": 4.52316076294278e-05,
+      "loss": 1.0949,
+      "step": 1994
+    },
+    {
+      "epoch": 0.7752841737102886,
+      "grad_norm": 0.2041284590959549,
+      "learning_rate": 4.515375632541845e-05,
+      "loss": 1.0034,
+      "step": 1995
+    },
+    {
+      "epoch": 0.7756727873311959,
+      "grad_norm": 0.21134726703166962,
+      "learning_rate": 4.507590502140911e-05,
+      "loss": 1.0076,
+      "step": 1996
+    },
+    {
+      "epoch": 0.7760614009521034,
+      "grad_norm": 0.20730996131896973,
+      "learning_rate": 4.499805371739977e-05,
+      "loss": 1.0456,
+      "step": 1997
+    },
+    {
+      "epoch": 0.7764500145730108,
+      "grad_norm": 0.22316931188106537,
+      "learning_rate": 4.492020241339043e-05,
+      "loss": 0.9418,
+      "step": 1998
+    },
+    {
+      "epoch": 0.7768386281939182,
+      "grad_norm": 0.21494819223880768,
+      "learning_rate": 4.484235110938109e-05,
+      "loss": 1.0597,
+      "step": 1999
+    },
+    {
+      "epoch": 0.7772272418148256,
+      "grad_norm": 0.20344491302967072,
+      "learning_rate": 4.476449980537174e-05,
+      "loss": 0.9749,
+      "step": 2000
+    },
+    {
+      "epoch": 0.777615855435733,
+      "grad_norm": 0.20816263556480408,
+      "learning_rate": 4.46866485013624e-05,
+      "loss": 1.0526,
+      "step": 2001
+    },
+    {
+      "epoch": 0.7780044690566404,
+      "grad_norm": 0.21490095555782318,
+      "learning_rate": 4.4608797197353056e-05,
+      "loss": 1.0311,
+      "step": 2002
+    },
+    {
+      "epoch": 0.7783930826775478,
+      "grad_norm": 0.2043679803609848,
+      "learning_rate": 4.4530945893343715e-05,
+      "loss": 1.0176,
+      "step": 2003
+    },
+    {
+      "epoch": 0.7787816962984553,
+      "grad_norm": 0.2015836238861084,
+      "learning_rate": 4.4453094589334374e-05,
+      "loss": 1.015,
+      "step": 2004
+    },
+    {
+      "epoch": 0.7791703099193626,
+      "grad_norm": 0.21843332052230835,
+      "learning_rate": 4.437524328532503e-05,
+      "loss": 1.0577,
+      "step": 2005
+    },
+    {
+      "epoch": 0.7795589235402701,
+      "grad_norm": 0.20447933673858643,
+      "learning_rate": 4.429739198131569e-05,
+      "loss": 1.0549,
+      "step": 2006
+    },
+    {
+      "epoch": 0.7799475371611775,
+      "grad_norm": 0.20317135751247406,
+      "learning_rate": 4.421954067730635e-05,
+      "loss": 1.0419,
+      "step": 2007
+    },
+    {
+      "epoch": 0.780336150782085,
+      "grad_norm": 0.20233985781669617,
+      "learning_rate": 4.414168937329701e-05,
+      "loss": 0.9743,
+      "step": 2008
+    },
+    {
+      "epoch": 0.7807247644029923,
+      "grad_norm": 0.1957770437002182,
+      "learning_rate": 4.406383806928766e-05,
+      "loss": 1.0306,
+      "step": 2009
+    },
+    {
+      "epoch": 0.7811133780238997,
+      "grad_norm": 0.2055465579032898,
+      "learning_rate": 4.398598676527832e-05,
+      "loss": 0.9917,
+      "step": 2010
+    },
+    {
+      "epoch": 0.7815019916448072,
+      "grad_norm": 0.1980140060186386,
+      "learning_rate": 4.390813546126898e-05,
+      "loss": 1.0002,
+      "step": 2011
+    },
+    {
+      "epoch": 0.7818906052657145,
+      "grad_norm": 0.21538390219211578,
+      "learning_rate": 4.383028415725964e-05,
+      "loss": 0.9784,
+      "step": 2012
+    },
+    {
+      "epoch": 0.782279218886622,
+      "grad_norm": 0.20209911465644836,
+      "learning_rate": 4.375243285325029e-05,
+      "loss": 1.0403,
+      "step": 2013
+    },
+    {
+      "epoch": 0.7826678325075294,
+      "grad_norm": 0.22064533829689026,
+      "learning_rate": 4.367458154924095e-05,
+      "loss": 1.0816,
+      "step": 2014
+    },
+    {
+      "epoch": 0.7830564461284368,
+      "grad_norm": 0.21721522510051727,
+      "learning_rate": 4.359673024523161e-05,
+      "loss": 1.0215,
+      "step": 2015
+    },
+    {
+      "epoch": 0.7834450597493442,
+      "grad_norm": 0.21042165160179138,
+      "learning_rate": 4.3518878941222266e-05,
+      "loss": 0.9993,
+      "step": 2016
+    },
+    {
+      "epoch": 0.7838336733702517,
+      "grad_norm": 0.2821733355522156,
+      "learning_rate": 4.3441027637212925e-05,
+      "loss": 1.0337,
+      "step": 2017
+    },
+    {
+      "epoch": 0.784222286991159,
+      "grad_norm": 0.1997404247522354,
+      "learning_rate": 4.3363176333203584e-05,
+      "loss": 0.9635,
+      "step": 2018
+    },
+    {
+      "epoch": 0.7846109006120664,
+      "grad_norm": 0.21088410913944244,
+      "learning_rate": 4.328532502919424e-05,
+      "loss": 1.0809,
+      "step": 2019
+    },
+    {
+      "epoch": 0.7849995142329739,
+      "grad_norm": 0.22041834890842438,
+      "learning_rate": 4.32074737251849e-05,
+      "loss": 1.0553,
+      "step": 2020
+    },
+    {
+      "epoch": 0.7853881278538812,
+      "grad_norm": 0.21541887521743774,
+      "learning_rate": 4.312962242117556e-05,
+      "loss": 1.0348,
+      "step": 2021
+    },
+    {
+      "epoch": 0.7857767414747887,
+      "grad_norm": 0.19423037767410278,
+      "learning_rate": 4.305177111716621e-05,
+      "loss": 0.9566,
+      "step": 2022
+    },
+    {
+      "epoch": 0.7861653550956961,
+      "grad_norm": 0.20975807309150696,
+      "learning_rate": 4.297391981315687e-05,
+      "loss": 0.9946,
+      "step": 2023
+    },
+    {
+      "epoch": 0.7865539687166035,
+      "grad_norm": 0.1911199390888214,
+      "learning_rate": 4.289606850914753e-05,
+      "loss": 0.9582,
+      "step": 2024
+    },
+    {
+      "epoch": 0.7869425823375109,
+      "grad_norm": 0.20895734429359436,
+      "learning_rate": 4.281821720513819e-05,
+      "loss": 1.02,
+      "step": 2025
+    },
+    {
+      "epoch": 0.7873311959584184,
+      "grad_norm": 0.19652803242206573,
+      "learning_rate": 4.274036590112885e-05,
+      "loss": 0.9919,
+      "step": 2026
+    },
+    {
+      "epoch": 0.7877198095793257,
+      "grad_norm": 0.21050991117954254,
+      "learning_rate": 4.26625145971195e-05,
+      "loss": 1.0363,
+      "step": 2027
+    },
+    {
+      "epoch": 0.7881084232002332,
+      "grad_norm": 0.18776053190231323,
+      "learning_rate": 4.258466329311016e-05,
+      "loss": 0.9747,
+      "step": 2028
+    },
+    {
+      "epoch": 0.7884970368211406,
+      "grad_norm": 0.20973272621631622,
+      "learning_rate": 4.250681198910082e-05,
+      "loss": 1.0457,
+      "step": 2029
+    },
+    {
+      "epoch": 0.788885650442048,
+      "grad_norm": 0.22028960287570953,
+      "learning_rate": 4.2428960685091476e-05,
+      "loss": 1.0769,
+      "step": 2030
+    },
+    {
+      "epoch": 0.7892742640629554,
+      "grad_norm": 0.20541588962078094,
+      "learning_rate": 4.2351109381082135e-05,
+      "loss": 1.0456,
+      "step": 2031
+    },
+    {
+      "epoch": 0.7896628776838628,
+      "grad_norm": 0.19365350902080536,
+      "learning_rate": 4.2273258077072794e-05,
+      "loss": 0.9708,
+      "step": 2032
+    },
+    {
+      "epoch": 0.7900514913047703,
+      "grad_norm": 0.21286098659038544,
+      "learning_rate": 4.219540677306345e-05,
+      "loss": 1.0443,
+      "step": 2033
+    },
+    {
+      "epoch": 0.7904401049256776,
+      "grad_norm": 0.20527319610118866,
+      "learning_rate": 4.211755546905411e-05,
+      "loss": 1.0165,
+      "step": 2034
+    },
+    {
+      "epoch": 0.7908287185465851,
+      "grad_norm": 0.20962440967559814,
+      "learning_rate": 4.203970416504477e-05,
+      "loss": 1.0723,
+      "step": 2035
+    },
+    {
+      "epoch": 0.7912173321674925,
+      "grad_norm": 0.21032460033893585,
+      "learning_rate": 4.196185286103542e-05,
+      "loss": 1.0384,
+      "step": 2036
+    },
+    {
+      "epoch": 0.7916059457883999,
+      "grad_norm": 0.22122742235660553,
+      "learning_rate": 4.188400155702608e-05,
+      "loss": 1.0239,
+      "step": 2037
+    },
+    {
+      "epoch": 0.7919945594093073,
+      "grad_norm": 0.21430088579654694,
+      "learning_rate": 4.180615025301674e-05,
+      "loss": 1.0421,
+      "step": 2038
+    },
+    {
+      "epoch": 0.7923831730302147,
+      "grad_norm": 0.200826957821846,
+      "learning_rate": 4.17282989490074e-05,
+      "loss": 1.0403,
+      "step": 2039
+    },
+    {
+      "epoch": 0.7927717866511221,
+      "grad_norm": 0.1936146765947342,
+      "learning_rate": 4.165044764499805e-05,
+      "loss": 0.9901,
+      "step": 2040
+    },
+    {
+      "epoch": 0.7931604002720295,
+      "grad_norm": 0.21162614226341248,
+      "learning_rate": 4.157259634098871e-05,
+      "loss": 1.0809,
+      "step": 2041
+    },
+    {
+      "epoch": 0.793549013892937,
+      "grad_norm": 0.1934708207845688,
+      "learning_rate": 4.149474503697937e-05,
+      "loss": 0.996,
+      "step": 2042
+    },
+    {
+      "epoch": 0.7939376275138443,
+      "grad_norm": 0.19730836153030396,
+      "learning_rate": 4.141689373297003e-05,
+      "loss": 1.0116,
+      "step": 2043
+    },
+    {
+      "epoch": 0.7943262411347518,
+      "grad_norm": 0.19641950726509094,
+      "learning_rate": 4.1339042428960686e-05,
+      "loss": 1.0554,
+      "step": 2044
+    },
+    {
+      "epoch": 0.7947148547556592,
+      "grad_norm": 0.1926102489233017,
+      "learning_rate": 4.1261191124951345e-05,
+      "loss": 0.9244,
+      "step": 2045
+    },
+    {
+      "epoch": 0.7951034683765666,
+      "grad_norm": 0.20683708786964417,
+      "learning_rate": 4.1183339820942004e-05,
+      "loss": 1.0247,
+      "step": 2046
+    },
+    {
+      "epoch": 0.795492081997474,
+      "grad_norm": 0.21519975364208221,
+      "learning_rate": 4.110548851693266e-05,
+      "loss": 1.0364,
+      "step": 2047
+    },
+    {
+      "epoch": 0.7958806956183814,
+      "grad_norm": 0.19510744512081146,
+      "learning_rate": 4.102763721292332e-05,
+      "loss": 0.9807,
+      "step": 2048
+    },
+    {
+      "epoch": 0.7962693092392888,
+      "grad_norm": 0.21060147881507874,
+      "learning_rate": 4.094978590891398e-05,
+      "loss": 1.0007,
+      "step": 2049
+    },
+    {
+      "epoch": 0.7966579228601962,
+      "grad_norm": 0.19922667741775513,
+      "learning_rate": 4.087193460490463e-05,
+      "loss": 0.9953,
+      "step": 2050
+    },
+    {
+      "epoch": 0.7970465364811037,
+      "grad_norm": 0.2217833250761032,
+      "learning_rate": 4.079408330089529e-05,
+      "loss": 1.0359,
+      "step": 2051
+    },
+    {
+      "epoch": 0.797435150102011,
+      "grad_norm": 0.2138615995645523,
+      "learning_rate": 4.071623199688595e-05,
+      "loss": 1.0473,
+      "step": 2052
+    },
+    {
+      "epoch": 0.7978237637229185,
+      "grad_norm": 0.20814841985702515,
+      "learning_rate": 4.063838069287661e-05,
+      "loss": 1.042,
+      "step": 2053
+    },
+    {
+      "epoch": 0.7982123773438259,
+      "grad_norm": 0.21378004550933838,
+      "learning_rate": 4.056052938886726e-05,
+      "loss": 1.0616,
+      "step": 2054
+    },
+    {
+      "epoch": 0.7986009909647334,
+      "grad_norm": 0.22064481675624847,
+      "learning_rate": 4.048267808485792e-05,
+      "loss": 1.063,
+      "step": 2055
+    },
+    {
+      "epoch": 0.7989896045856407,
+      "grad_norm": 0.21143454313278198,
+      "learning_rate": 4.040482678084858e-05,
+      "loss": 0.9999,
+      "step": 2056
+    },
+    {
+      "epoch": 0.7993782182065481,
+      "grad_norm": 0.2092997431755066,
+      "learning_rate": 4.032697547683924e-05,
+      "loss": 0.9958,
+      "step": 2057
+    },
+    {
+      "epoch": 0.7997668318274556,
+      "grad_norm": 0.2715415954589844,
+      "learning_rate": 4.0249124172829896e-05,
+      "loss": 0.9981,
+      "step": 2058
+    },
+    {
+      "epoch": 0.8001554454483629,
+      "grad_norm": 0.20481626689434052,
+      "learning_rate": 4.0171272868820555e-05,
+      "loss": 1.0187,
+      "step": 2059
+    },
+    {
+      "epoch": 0.8005440590692704,
+      "grad_norm": 0.2076139748096466,
+      "learning_rate": 4.0093421564811214e-05,
+      "loss": 1.0147,
+      "step": 2060
+    },
+    {
+      "epoch": 0.8009326726901778,
+      "grad_norm": 0.21985560655593872,
+      "learning_rate": 4.001557026080187e-05,
+      "loss": 1.0436,
+      "step": 2061
+    },
+    {
+      "epoch": 0.8013212863110852,
+      "grad_norm": 0.2088089883327484,
+      "learning_rate": 3.993771895679253e-05,
+      "loss": 1.067,
+      "step": 2062
+    },
+    {
+      "epoch": 0.8017098999319926,
+      "grad_norm": 0.23079900443553925,
+      "learning_rate": 3.9859867652783184e-05,
+      "loss": 1.0208,
+      "step": 2063
+    },
+    {
+      "epoch": 0.8020985135529001,
+      "grad_norm": 0.20904935896396637,
+      "learning_rate": 3.978201634877384e-05,
+      "loss": 1.0417,
+      "step": 2064
+    },
+    {
+      "epoch": 0.8024871271738074,
+      "grad_norm": 0.2027217298746109,
+      "learning_rate": 3.97041650447645e-05,
+      "loss": 1.0466,
+      "step": 2065
+    },
+    {
+      "epoch": 0.8028757407947149,
+      "grad_norm": 0.2080574333667755,
+      "learning_rate": 3.962631374075516e-05,
+      "loss": 1.0004,
+      "step": 2066
+    },
+    {
+      "epoch": 0.8032643544156223,
+      "grad_norm": 0.2076699584722519,
+      "learning_rate": 3.954846243674582e-05,
+      "loss": 1.0288,
+      "step": 2067
+    },
+    {
+      "epoch": 0.8036529680365296,
+      "grad_norm": 0.20526565611362457,
+      "learning_rate": 3.947061113273647e-05,
+      "loss": 0.9627,
+      "step": 2068
+    },
+    {
+      "epoch": 0.8040415816574371,
+      "grad_norm": 0.2086559236049652,
+      "learning_rate": 3.939275982872713e-05,
+      "loss": 1.057,
+      "step": 2069
+    },
+    {
+      "epoch": 0.8044301952783445,
+      "grad_norm": 0.21741564571857452,
+      "learning_rate": 3.931490852471779e-05,
+      "loss": 1.0575,
+      "step": 2070
+    },
+    {
+      "epoch": 0.804818808899252,
+      "grad_norm": 0.19239796698093414,
+      "learning_rate": 3.923705722070845e-05,
+      "loss": 1.0028,
+      "step": 2071
+    },
+    {
+      "epoch": 0.8052074225201593,
+      "grad_norm": 0.20606793463230133,
+      "learning_rate": 3.9159205916699106e-05,
+      "loss": 1.0305,
+      "step": 2072
+    },
+    {
+      "epoch": 0.8055960361410668,
+      "grad_norm": 0.2197132408618927,
+      "learning_rate": 3.9081354612689765e-05,
+      "loss": 1.0669,
+      "step": 2073
+    },
+    {
+      "epoch": 0.8059846497619741,
+      "grad_norm": 0.19510973989963531,
+      "learning_rate": 3.9003503308680424e-05,
+      "loss": 0.984,
+      "step": 2074
+    },
+    {
+      "epoch": 0.8063732633828816,
+      "grad_norm": 0.20135273039340973,
+      "learning_rate": 3.892565200467108e-05,
+      "loss": 1.0528,
+      "step": 2075
+    },
+    {
+      "epoch": 0.806761877003789,
+      "grad_norm": 0.20280520617961884,
+      "learning_rate": 3.884780070066174e-05,
+      "loss": 1.0185,
+      "step": 2076
+    },
+    {
+      "epoch": 0.8071504906246963,
+      "grad_norm": 0.21787187457084656,
+      "learning_rate": 3.8769949396652394e-05,
+      "loss": 1.0875,
+      "step": 2077
+    },
+    {
+      "epoch": 0.8075391042456038,
+      "grad_norm": 0.21521267294883728,
+      "learning_rate": 3.869209809264305e-05,
+      "loss": 1.0352,
+      "step": 2078
+    },
+    {
+      "epoch": 0.8079277178665112,
+      "grad_norm": 0.21675272285938263,
+      "learning_rate": 3.861424678863371e-05,
+      "loss": 1.0178,
+      "step": 2079
+    },
+    {
+      "epoch": 0.8083163314874187,
+      "grad_norm": 0.20301300287246704,
+      "learning_rate": 3.853639548462437e-05,
+      "loss": 1.042,
+      "step": 2080
+    },
+    {
+      "epoch": 0.808704945108326,
+      "grad_norm": 0.2025609016418457,
+      "learning_rate": 3.845854418061502e-05,
+      "loss": 1.0224,
+      "step": 2081
+    },
+    {
+      "epoch": 0.8090935587292335,
+      "grad_norm": 0.23724251985549927,
+      "learning_rate": 3.838069287660568e-05,
+      "loss": 1.0051,
+      "step": 2082
+    },
+    {
+      "epoch": 0.8094821723501409,
+      "grad_norm": 0.17473214864730835,
+      "learning_rate": 3.830284157259634e-05,
+      "loss": 0.9183,
+      "step": 2083
+    },
+    {
+      "epoch": 0.8098707859710483,
+      "grad_norm": 0.20575867593288422,
+      "learning_rate": 3.8224990268587e-05,
+      "loss": 1.0018,
+      "step": 2084
+    },
+    {
+      "epoch": 0.8102593995919557,
+      "grad_norm": 0.2054753601551056,
+      "learning_rate": 3.8147138964577664e-05,
+      "loss": 1.0326,
+      "step": 2085
+    },
+    {
+      "epoch": 0.8106480132128631,
+      "grad_norm": 0.22283188998699188,
+      "learning_rate": 3.8069287660568316e-05,
+      "loss": 1.0878,
+      "step": 2086
+    },
+    {
+      "epoch": 0.8110366268337705,
+      "grad_norm": 0.20678454637527466,
+      "learning_rate": 3.7991436356558975e-05,
+      "loss": 1.0382,
+      "step": 2087
+    },
+    {
+      "epoch": 0.8114252404546779,
+      "grad_norm": 0.22482691705226898,
+      "learning_rate": 3.7913585052549634e-05,
+      "loss": 1.0441,
+      "step": 2088
+    },
+    {
+      "epoch": 0.8118138540755854,
+      "grad_norm": 0.19913192093372345,
+      "learning_rate": 3.783573374854029e-05,
+      "loss": 0.9093,
+      "step": 2089
+    },
+    {
+      "epoch": 0.8122024676964927,
+      "grad_norm": 0.21512696146965027,
+      "learning_rate": 3.7757882444530945e-05,
+      "loss": 1.0589,
+      "step": 2090
+    },
+    {
+      "epoch": 0.8125910813174002,
+      "grad_norm": 0.20883330702781677,
+      "learning_rate": 3.7680031140521604e-05,
+      "loss": 0.9773,
+      "step": 2091
+    },
+    {
+      "epoch": 0.8129796949383076,
+      "grad_norm": 0.20254108309745789,
+      "learning_rate": 3.760217983651226e-05,
+      "loss": 1.0111,
+      "step": 2092
+    },
+    {
+      "epoch": 0.813368308559215,
+      "grad_norm": 0.22513622045516968,
+      "learning_rate": 3.752432853250292e-05,
+      "loss": 1.0471,
+      "step": 2093
+    },
+    {
+      "epoch": 0.8137569221801224,
+      "grad_norm": 0.20943938195705414,
+      "learning_rate": 3.744647722849358e-05,
+      "loss": 1.0261,
+      "step": 2094
+    },
+    {
+      "epoch": 0.8141455358010298,
+      "grad_norm": 0.19357722997665405,
+      "learning_rate": 3.736862592448423e-05,
+      "loss": 0.9891,
+      "step": 2095
+    },
+    {
+      "epoch": 0.8145341494219372,
+      "grad_norm": 0.20199090242385864,
+      "learning_rate": 3.729077462047489e-05,
+      "loss": 1.0017,
+      "step": 2096
+    },
+    {
+      "epoch": 0.8149227630428446,
+      "grad_norm": 0.22087882459163666,
+      "learning_rate": 3.721292331646556e-05,
+      "loss": 1.0176,
+      "step": 2097
+    },
+    {
+      "epoch": 0.8153113766637521,
+      "grad_norm": 0.19757211208343506,
+      "learning_rate": 3.7135072012456215e-05,
+      "loss": 0.9993,
+      "step": 2098
+    },
+    {
+      "epoch": 0.8156999902846594,
+      "grad_norm": 0.21485236287117004,
+      "learning_rate": 3.705722070844687e-05,
+      "loss": 1.0129,
+      "step": 2099
+    },
+    {
+      "epoch": 0.8160886039055669,
+      "grad_norm": 0.2095671445131302,
+      "learning_rate": 3.6979369404437526e-05,
+      "loss": 1.0576,
+      "step": 2100
+    },
+    {
+      "epoch": 0.8164772175264743,
+      "grad_norm": 0.21392807364463806,
+      "learning_rate": 3.6901518100428185e-05,
+      "loss": 1.0666,
+      "step": 2101
+    },
+    {
+      "epoch": 0.8168658311473818,
+      "grad_norm": 0.23267820477485657,
+      "learning_rate": 3.6823666796418844e-05,
+      "loss": 1.0691,
+      "step": 2102
+    },
+    {
+      "epoch": 0.8172544447682891,
+      "grad_norm": 0.3778455853462219,
+      "learning_rate": 3.67458154924095e-05,
+      "loss": 1.057,
+      "step": 2103
+    },
+    {
+      "epoch": 0.8176430583891965,
+      "grad_norm": 0.21719984710216522,
+      "learning_rate": 3.6667964188400155e-05,
+      "loss": 1.0564,
+      "step": 2104
+    },
+    {
+      "epoch": 0.818031672010104,
+      "grad_norm": 0.19418101012706757,
+      "learning_rate": 3.6590112884390814e-05,
+      "loss": 1.0156,
+      "step": 2105
+    },
+    {
+      "epoch": 0.8184202856310113,
+      "grad_norm": 0.20592990517616272,
+      "learning_rate": 3.651226158038147e-05,
+      "loss": 1.026,
+      "step": 2106
+    },
+    {
+      "epoch": 0.8188088992519188,
+      "grad_norm": 0.21999908983707428,
+      "learning_rate": 3.643441027637213e-05,
+      "loss": 1.0575,
+      "step": 2107
+    },
+    {
+      "epoch": 0.8191975128728262,
+      "grad_norm": 0.2080504447221756,
+      "learning_rate": 3.635655897236278e-05,
+      "loss": 1.0236,
+      "step": 2108
+    },
+    {
+      "epoch": 0.8195861264937336,
+      "grad_norm": 0.20104867219924927,
+      "learning_rate": 3.627870766835344e-05,
+      "loss": 0.9626,
+      "step": 2109
+    },
+    {
+      "epoch": 0.819974740114641,
+      "grad_norm": 0.18993836641311646,
+      "learning_rate": 3.620085636434411e-05,
+      "loss": 0.983,
+      "step": 2110
+    },
+    {
+      "epoch": 0.8203633537355485,
+      "grad_norm": 0.18710492551326752,
+      "learning_rate": 3.6123005060334767e-05,
+      "loss": 0.9674,
+      "step": 2111
+    },
+    {
+      "epoch": 0.8207519673564558,
+      "grad_norm": 0.2117459774017334,
+      "learning_rate": 3.6045153756325425e-05,
+      "loss": 1.0263,
+      "step": 2112
+    },
+    {
+      "epoch": 0.8211405809773633,
+      "grad_norm": 0.2005959451198578,
+      "learning_rate": 3.596730245231608e-05,
+      "loss": 1.0405,
+      "step": 2113
+    },
+    {
+      "epoch": 0.8215291945982707,
+      "grad_norm": 0.21586982905864716,
+      "learning_rate": 3.5889451148306736e-05,
+      "loss": 0.9715,
+      "step": 2114
+    },
+    {
+      "epoch": 0.821917808219178,
+      "grad_norm": 0.2229696810245514,
+      "learning_rate": 3.5811599844297395e-05,
+      "loss": 1.0427,
+      "step": 2115
+    },
+    {
+      "epoch": 0.8223064218400855,
+      "grad_norm": 0.22296395897865295,
+      "learning_rate": 3.5733748540288054e-05,
+      "loss": 1.093,
+      "step": 2116
+    },
+    {
+      "epoch": 0.8226950354609929,
+      "grad_norm": 0.22912591695785522,
+      "learning_rate": 3.5655897236278706e-05,
+      "loss": 1.0821,
+      "step": 2117
+    },
+    {
+      "epoch": 0.8230836490819003,
+      "grad_norm": 0.19285057485103607,
+      "learning_rate": 3.5578045932269365e-05,
+      "loss": 0.9694,
+      "step": 2118
+    },
+    {
+      "epoch": 0.8234722627028077,
+      "grad_norm": 0.2150295525789261,
+      "learning_rate": 3.5500194628260024e-05,
+      "loss": 1.0277,
+      "step": 2119
+    },
+    {
+      "epoch": 0.8238608763237152,
+      "grad_norm": 0.20686036348342896,
+      "learning_rate": 3.542234332425068e-05,
+      "loss": 0.9946,
+      "step": 2120
+    },
+    {
+      "epoch": 0.8242494899446225,
+      "grad_norm": 0.21742792427539825,
+      "learning_rate": 3.534449202024134e-05,
+      "loss": 1.0233,
+      "step": 2121
+    },
+    {
+      "epoch": 0.82463810356553,
+      "grad_norm": 0.2077355682849884,
+      "learning_rate": 3.526664071623199e-05,
+      "loss": 0.9918,
+      "step": 2122
+    },
+    {
+      "epoch": 0.8250267171864374,
+      "grad_norm": 0.2552899122238159,
+      "learning_rate": 3.518878941222266e-05,
+      "loss": 0.9648,
+      "step": 2123
+    },
+    {
+      "epoch": 0.8254153308073447,
+      "grad_norm": 0.21043844521045685,
+      "learning_rate": 3.511093810821332e-05,
+      "loss": 1.023,
+      "step": 2124
+    },
+    {
+      "epoch": 0.8258039444282522,
+      "grad_norm": 0.22360606491565704,
+      "learning_rate": 3.5033086804203977e-05,
+      "loss": 1.0862,
+      "step": 2125
+    },
+    {
+      "epoch": 0.8261925580491596,
+      "grad_norm": 0.20735731720924377,
+      "learning_rate": 3.495523550019463e-05,
+      "loss": 1.017,
+      "step": 2126
+    },
+    {
+      "epoch": 0.8265811716700671,
+      "grad_norm": 0.21998152136802673,
+      "learning_rate": 3.487738419618529e-05,
+      "loss": 1.0273,
+      "step": 2127
+    },
+    {
+      "epoch": 0.8269697852909744,
+      "grad_norm": 0.23547297716140747,
+      "learning_rate": 3.4799532892175946e-05,
+      "loss": 1.0353,
+      "step": 2128
+    },
+    {
+      "epoch": 0.8273583989118819,
+      "grad_norm": 0.20162945985794067,
+      "learning_rate": 3.4721681588166605e-05,
+      "loss": 1.0289,
+      "step": 2129
+    },
+    {
+      "epoch": 0.8277470125327893,
+      "grad_norm": 0.1959386169910431,
+      "learning_rate": 3.4643830284157264e-05,
+      "loss": 1.012,
+      "step": 2130
+    },
+    {
+      "epoch": 0.8281356261536967,
+      "grad_norm": 0.21625256538391113,
+      "learning_rate": 3.4565978980147916e-05,
+      "loss": 1.0718,
+      "step": 2131
+    },
+    {
+      "epoch": 0.8285242397746041,
+      "grad_norm": 0.2094646692276001,
+      "learning_rate": 3.4488127676138575e-05,
+      "loss": 1.0157,
+      "step": 2132
+    },
+    {
+      "epoch": 0.8289128533955115,
+      "grad_norm": 0.19329530000686646,
+      "learning_rate": 3.4410276372129234e-05,
+      "loss": 0.9652,
+      "step": 2133
+    },
+    {
+      "epoch": 0.8293014670164189,
+      "grad_norm": 0.19125741720199585,
+      "learning_rate": 3.433242506811989e-05,
+      "loss": 0.9964,
+      "step": 2134
+    },
+    {
+      "epoch": 0.8296900806373263,
+      "grad_norm": 0.1942203938961029,
+      "learning_rate": 3.425457376411055e-05,
+      "loss": 0.9795,
+      "step": 2135
+    },
+    {
+      "epoch": 0.8300786942582338,
+      "grad_norm": 0.2229314148426056,
+      "learning_rate": 3.417672246010121e-05,
+      "loss": 1.1052,
+      "step": 2136
+    },
+    {
+      "epoch": 0.8304673078791411,
+      "grad_norm": 0.2160118967294693,
+      "learning_rate": 3.409887115609187e-05,
+      "loss": 1.0263,
+      "step": 2137
+    },
+    {
+      "epoch": 0.8308559215000486,
+      "grad_norm": 0.2106090933084488,
+      "learning_rate": 3.402101985208253e-05,
+      "loss": 1.0151,
+      "step": 2138
+    },
+    {
+      "epoch": 0.831244535120956,
+      "grad_norm": 0.31897667050361633,
+      "learning_rate": 3.3943168548073187e-05,
+      "loss": 1.0122,
+      "step": 2139
+    },
+    {
+      "epoch": 0.8316331487418634,
+      "grad_norm": 0.20475897192955017,
+      "learning_rate": 3.386531724406384e-05,
+      "loss": 1.0239,
+      "step": 2140
+    },
+    {
+      "epoch": 0.8320217623627708,
+      "grad_norm": 0.21326549351215363,
+      "learning_rate": 3.37874659400545e-05,
+      "loss": 1.05,
+      "step": 2141
+    },
+    {
+      "epoch": 0.8324103759836782,
+      "grad_norm": 0.2130986452102661,
+      "learning_rate": 3.3709614636045156e-05,
+      "loss": 0.9979,
+      "step": 2142
+    },
+    {
+      "epoch": 0.8327989896045856,
+      "grad_norm": 0.20519514381885529,
+      "learning_rate": 3.3631763332035815e-05,
+      "loss": 1.035,
+      "step": 2143
+    },
+    {
+      "epoch": 0.833187603225493,
+      "grad_norm": 0.21058332920074463,
+      "learning_rate": 3.355391202802647e-05,
+      "loss": 1.0509,
+      "step": 2144
+    },
+    {
+      "epoch": 0.8335762168464005,
+      "grad_norm": 0.20692919194698334,
+      "learning_rate": 3.3476060724017126e-05,
+      "loss": 1.0262,
+      "step": 2145
+    },
+    {
+      "epoch": 0.8339648304673078,
+      "grad_norm": 0.20325800776481628,
+      "learning_rate": 3.3398209420007785e-05,
+      "loss": 1.0352,
+      "step": 2146
+    },
+    {
+      "epoch": 0.8343534440882153,
+      "grad_norm": 0.18956026434898376,
+      "learning_rate": 3.3320358115998444e-05,
+      "loss": 0.9618,
+      "step": 2147
+    },
+    {
+      "epoch": 0.8347420577091227,
+      "grad_norm": 0.24605980515480042,
+      "learning_rate": 3.32425068119891e-05,
+      "loss": 0.9785,
+      "step": 2148
+    },
+    {
+      "epoch": 0.8351306713300302,
+      "grad_norm": 0.20649299025535583,
+      "learning_rate": 3.316465550797976e-05,
+      "loss": 1.0051,
+      "step": 2149
+    },
+    {
+      "epoch": 0.8355192849509375,
+      "grad_norm": 0.21091307699680328,
+      "learning_rate": 3.308680420397042e-05,
+      "loss": 1.0321,
+      "step": 2150
+    },
+    {
+      "epoch": 0.835907898571845,
+      "grad_norm": 0.20463331043720245,
+      "learning_rate": 3.300895289996108e-05,
+      "loss": 1.0103,
+      "step": 2151
+    },
+    {
+      "epoch": 0.8362965121927524,
+      "grad_norm": 0.1851118803024292,
+      "learning_rate": 3.293110159595174e-05,
+      "loss": 0.9193,
+      "step": 2152
+    },
+    {
+      "epoch": 0.8366851258136597,
+      "grad_norm": 0.22127285599708557,
+      "learning_rate": 3.285325029194239e-05,
+      "loss": 1.0593,
+      "step": 2153
+    },
+    {
+      "epoch": 0.8370737394345672,
+      "grad_norm": 0.2060239166021347,
+      "learning_rate": 3.277539898793305e-05,
+      "loss": 1.1002,
+      "step": 2154
+    },
+    {
+      "epoch": 0.8374623530554746,
+      "grad_norm": 0.20628675818443298,
+      "learning_rate": 3.269754768392371e-05,
+      "loss": 1.0449,
+      "step": 2155
+    },
+    {
+      "epoch": 0.837850966676382,
+      "grad_norm": 0.2015877068042755,
+      "learning_rate": 3.2619696379914366e-05,
+      "loss": 1.0007,
+      "step": 2156
+    },
+    {
+      "epoch": 0.8382395802972894,
+      "grad_norm": 0.26001277565956116,
+      "learning_rate": 3.2541845075905025e-05,
+      "loss": 1.0593,
+      "step": 2157
+    },
+    {
+      "epoch": 0.8386281939181969,
+      "grad_norm": 0.21557845175266266,
+      "learning_rate": 3.246399377189568e-05,
+      "loss": 1.0206,
+      "step": 2158
+    },
+    {
+      "epoch": 0.8390168075391042,
+      "grad_norm": 0.21529968082904816,
+      "learning_rate": 3.2386142467886336e-05,
+      "loss": 1.0648,
+      "step": 2159
+    },
+    {
+      "epoch": 0.8394054211600117,
+      "grad_norm": 0.22108668088912964,
+      "learning_rate": 3.2308291163876995e-05,
+      "loss": 1.0192,
+      "step": 2160
+    },
+    {
+      "epoch": 0.8397940347809191,
+      "grad_norm": 0.20087426900863647,
+      "learning_rate": 3.2230439859867654e-05,
+      "loss": 0.9972,
+      "step": 2161
+    },
+    {
+      "epoch": 0.8401826484018264,
+      "grad_norm": 0.2194579839706421,
+      "learning_rate": 3.215258855585831e-05,
+      "loss": 1.0222,
+      "step": 2162
+    },
+    {
+      "epoch": 0.8405712620227339,
+      "grad_norm": 0.2581467926502228,
+      "learning_rate": 3.207473725184897e-05,
+      "loss": 1.0369,
+      "step": 2163
+    },
+    {
+      "epoch": 0.8409598756436413,
+      "grad_norm": 0.20566490292549133,
+      "learning_rate": 3.199688594783963e-05,
+      "loss": 1.0453,
+      "step": 2164
+    },
+    {
+      "epoch": 0.8413484892645487,
+      "grad_norm": 0.20137596130371094,
+      "learning_rate": 3.191903464383029e-05,
+      "loss": 1.0404,
+      "step": 2165
+    },
+    {
+      "epoch": 0.8417371028854561,
+      "grad_norm": 0.2136070281267166,
+      "learning_rate": 3.184118333982095e-05,
+      "loss": 0.998,
+      "step": 2166
+    },
+    {
+      "epoch": 0.8421257165063636,
+      "grad_norm": 0.2082609087228775,
+      "learning_rate": 3.17633320358116e-05,
+      "loss": 1.0617,
+      "step": 2167
+    },
+    {
+      "epoch": 0.842514330127271,
+      "grad_norm": 0.20818866789340973,
+      "learning_rate": 3.168548073180226e-05,
+      "loss": 0.9739,
+      "step": 2168
+    },
+    {
+      "epoch": 0.8429029437481784,
+      "grad_norm": 0.1998904049396515,
+      "learning_rate": 3.160762942779292e-05,
+      "loss": 0.9984,
+      "step": 2169
+    },
+    {
+      "epoch": 0.8432915573690858,
+      "grad_norm": 0.2000143975019455,
+      "learning_rate": 3.1529778123783576e-05,
+      "loss": 0.9975,
+      "step": 2170
+    },
+    {
+      "epoch": 0.8436801709899932,
+      "grad_norm": 0.20654286444187164,
+      "learning_rate": 3.145192681977423e-05,
+      "loss": 1.0403,
+      "step": 2171
+    },
+    {
+      "epoch": 0.8440687846109006,
+      "grad_norm": 0.20888234674930573,
+      "learning_rate": 3.137407551576489e-05,
+      "loss": 1.0072,
+      "step": 2172
+    },
+    {
+      "epoch": 0.844457398231808,
+      "grad_norm": 0.20207738876342773,
+      "learning_rate": 3.1296224211755546e-05,
+      "loss": 1.0361,
+      "step": 2173
+    },
+    {
+      "epoch": 0.8448460118527155,
+      "grad_norm": 0.2032788097858429,
+      "learning_rate": 3.1218372907746205e-05,
+      "loss": 1.0179,
+      "step": 2174
+    },
+    {
+      "epoch": 0.8452346254736228,
+      "grad_norm": 0.22794555127620697,
+      "learning_rate": 3.1140521603736864e-05,
+      "loss": 1.0337,
+      "step": 2175
+    },
+    {
+      "epoch": 0.8456232390945303,
+      "grad_norm": 0.20593926310539246,
+      "learning_rate": 3.106267029972752e-05,
+      "loss": 1.0336,
+      "step": 2176
+    },
+    {
+      "epoch": 0.8460118527154377,
+      "grad_norm": 0.20535798370838165,
+      "learning_rate": 3.098481899571818e-05,
+      "loss": 1.0465,
+      "step": 2177
+    },
+    {
+      "epoch": 0.8464004663363451,
+      "grad_norm": 0.2055482417345047,
+      "learning_rate": 3.090696769170884e-05,
+      "loss": 1.0073,
+      "step": 2178
+    },
+    {
+      "epoch": 0.8467890799572525,
+      "grad_norm": 0.20908206701278687,
+      "learning_rate": 3.08291163876995e-05,
+      "loss": 1.0478,
+      "step": 2179
+    },
+    {
+      "epoch": 0.8471776935781599,
+      "grad_norm": 0.20747126638889313,
+      "learning_rate": 3.075126508369015e-05,
+      "loss": 1.0621,
+      "step": 2180
+    },
+    {
+      "epoch": 0.8475663071990673,
+      "grad_norm": 0.28445661067962646,
+      "learning_rate": 3.067341377968081e-05,
+      "loss": 1.0546,
+      "step": 2181
+    },
+    {
+      "epoch": 0.8479549208199747,
+      "grad_norm": 0.1851411610841751,
+      "learning_rate": 3.059556247567147e-05,
+      "loss": 0.9759,
+      "step": 2182
+    },
+    {
+      "epoch": 0.8483435344408822,
+      "grad_norm": 0.1998148262500763,
+      "learning_rate": 3.051771117166213e-05,
+      "loss": 1.0138,
+      "step": 2183
+    },
+    {
+      "epoch": 0.8487321480617895,
+      "grad_norm": 0.20033158361911774,
+      "learning_rate": 3.0439859867652786e-05,
+      "loss": 1.0346,
+      "step": 2184
+    },
+    {
+      "epoch": 0.849120761682697,
+      "grad_norm": 0.1972794383764267,
+      "learning_rate": 3.036200856364344e-05,
+      "loss": 1.0476,
+      "step": 2185
+    },
+    {
+      "epoch": 0.8495093753036044,
+      "grad_norm": 0.23393818736076355,
+      "learning_rate": 3.02841572596341e-05,
+      "loss": 0.9738,
+      "step": 2186
+    },
+    {
+      "epoch": 0.8498979889245118,
+      "grad_norm": 0.1907467097043991,
+      "learning_rate": 3.020630595562476e-05,
+      "loss": 0.966,
+      "step": 2187
+    },
+    {
+      "epoch": 0.8502866025454192,
+      "grad_norm": 0.19281136989593506,
+      "learning_rate": 3.0128454651615418e-05,
+      "loss": 1.0016,
+      "step": 2188
+    },
+    {
+      "epoch": 0.8506752161663266,
+      "grad_norm": 0.2053443342447281,
+      "learning_rate": 3.005060334760607e-05,
+      "loss": 1.0659,
+      "step": 2189
+    },
+    {
+      "epoch": 0.851063829787234,
+      "grad_norm": 0.2173933982849121,
+      "learning_rate": 2.997275204359673e-05,
+      "loss": 1.0137,
+      "step": 2190
+    },
+    {
+      "epoch": 0.8514524434081414,
+      "grad_norm": 0.22902634739875793,
+      "learning_rate": 2.9894900739587388e-05,
+      "loss": 1.0408,
+      "step": 2191
+    },
+    {
+      "epoch": 0.8518410570290489,
+      "grad_norm": 0.2113914042711258,
+      "learning_rate": 2.9817049435578047e-05,
+      "loss": 1.0769,
+      "step": 2192
+    },
+    {
+      "epoch": 0.8522296706499563,
+      "grad_norm": 0.20389114320278168,
+      "learning_rate": 2.973919813156871e-05,
+      "loss": 0.9835,
+      "step": 2193
+    },
+    {
+      "epoch": 0.8526182842708637,
+      "grad_norm": 0.2062385231256485,
+      "learning_rate": 2.966134682755936e-05,
+      "loss": 1.0397,
+      "step": 2194
+    },
+    {
+      "epoch": 0.8530068978917711,
+      "grad_norm": 0.20552967488765717,
+      "learning_rate": 2.958349552355002e-05,
+      "loss": 0.9949,
+      "step": 2195
+    },
+    {
+      "epoch": 0.8533955115126786,
+      "grad_norm": 0.1985877901315689,
+      "learning_rate": 2.950564421954068e-05,
+      "loss": 0.9909,
+      "step": 2196
+    },
+    {
+      "epoch": 0.8537841251335859,
+      "grad_norm": 0.20005984604358673,
+      "learning_rate": 2.9427792915531337e-05,
+      "loss": 0.9603,
+      "step": 2197
+    },
+    {
+      "epoch": 0.8541727387544934,
+      "grad_norm": 0.20039033889770508,
+      "learning_rate": 2.9349941611521996e-05,
+      "loss": 0.9832,
+      "step": 2198
+    },
+    {
+      "epoch": 0.8545613523754008,
+      "grad_norm": 0.19540533423423767,
+      "learning_rate": 2.927209030751265e-05,
+      "loss": 0.9563,
+      "step": 2199
+    },
+    {
+      "epoch": 0.8549499659963081,
+      "grad_norm": 0.21219204366207123,
+      "learning_rate": 2.919423900350331e-05,
+      "loss": 1.0914,
+      "step": 2200
+    },
+    {
+      "epoch": 0.8553385796172156,
+      "grad_norm": 0.1871120035648346,
+      "learning_rate": 2.911638769949397e-05,
+      "loss": 0.9683,
+      "step": 2201
+    },
+    {
+      "epoch": 0.855727193238123,
+      "grad_norm": 0.2022469937801361,
+      "learning_rate": 2.9038536395484628e-05,
+      "loss": 1.0552,
+      "step": 2202
+    },
+    {
+      "epoch": 0.8561158068590304,
+      "grad_norm": 0.21184539794921875,
+      "learning_rate": 2.896068509147528e-05,
+      "loss": 1.0544,
+      "step": 2203
+    },
+    {
+      "epoch": 0.8565044204799378,
+      "grad_norm": 0.21650457382202148,
+      "learning_rate": 2.888283378746594e-05,
+      "loss": 1.0683,
+      "step": 2204
+    },
+    {
+      "epoch": 0.8568930341008453,
+      "grad_norm": 0.19166558980941772,
+      "learning_rate": 2.88049824834566e-05,
+      "loss": 0.9317,
+      "step": 2205
+    },
+    {
+      "epoch": 0.8572816477217526,
+      "grad_norm": 0.21191413700580597,
+      "learning_rate": 2.872713117944726e-05,
+      "loss": 0.9775,
+      "step": 2206
+    },
+    {
+      "epoch": 0.8576702613426601,
+      "grad_norm": 0.1949252486228943,
+      "learning_rate": 2.864927987543792e-05,
+      "loss": 0.9771,
+      "step": 2207
+    },
+    {
+      "epoch": 0.8580588749635675,
+      "grad_norm": 0.18980230391025543,
+      "learning_rate": 2.857142857142857e-05,
+      "loss": 0.9816,
+      "step": 2208
+    },
+    {
+      "epoch": 0.8584474885844748,
+      "grad_norm": 0.20371113717556,
+      "learning_rate": 2.849357726741923e-05,
+      "loss": 1.0269,
+      "step": 2209
+    },
+    {
+      "epoch": 0.8588361022053823,
+      "grad_norm": 0.2025761753320694,
+      "learning_rate": 2.841572596340989e-05,
+      "loss": 0.9169,
+      "step": 2210
+    },
+    {
+      "epoch": 0.8592247158262897,
+      "grad_norm": 0.20668815076351166,
+      "learning_rate": 2.8337874659400547e-05,
+      "loss": 1.0409,
+      "step": 2211
+    },
+    {
+      "epoch": 0.8596133294471971,
+      "grad_norm": 0.19602157175540924,
+      "learning_rate": 2.8260023355391203e-05,
+      "loss": 0.9752,
+      "step": 2212
+    },
+    {
+      "epoch": 0.8600019430681045,
+      "grad_norm": 0.19047275185585022,
+      "learning_rate": 2.818217205138186e-05,
+      "loss": 0.9862,
+      "step": 2213
+    },
+    {
+      "epoch": 0.860390556689012,
+      "grad_norm": 0.20148906111717224,
+      "learning_rate": 2.810432074737252e-05,
+      "loss": 1.0339,
+      "step": 2214
+    },
+    {
+      "epoch": 0.8607791703099194,
+      "grad_norm": 0.19507504999637604,
+      "learning_rate": 2.802646944336318e-05,
+      "loss": 1.0452,
+      "step": 2215
+    },
+    {
+      "epoch": 0.8611677839308268,
+      "grad_norm": 0.22428153455257416,
+      "learning_rate": 2.7948618139353838e-05,
+      "loss": 1.0652,
+      "step": 2216
+    },
+    {
+      "epoch": 0.8615563975517342,
+      "grad_norm": 0.19588248431682587,
+      "learning_rate": 2.787076683534449e-05,
+      "loss": 0.9816,
+      "step": 2217
+    },
+    {
+      "epoch": 0.8619450111726416,
+      "grad_norm": 0.20823241770267487,
+      "learning_rate": 2.7792915531335152e-05,
+      "loss": 1.0239,
+      "step": 2218
+    },
+    {
+      "epoch": 0.862333624793549,
+      "grad_norm": 0.20268678665161133,
+      "learning_rate": 2.771506422732581e-05,
+      "loss": 1.0057,
+      "step": 2219
+    },
+    {
+      "epoch": 0.8627222384144564,
+      "grad_norm": 0.22147025167942047,
+      "learning_rate": 2.763721292331647e-05,
+      "loss": 1.0296,
+      "step": 2220
+    },
+    {
+      "epoch": 0.8631108520353639,
+      "grad_norm": 0.2015751451253891,
+      "learning_rate": 2.7559361619307122e-05,
+      "loss": 0.9884,
+      "step": 2221
+    },
+    {
+      "epoch": 0.8634994656562712,
+      "grad_norm": 0.20846128463745117,
+      "learning_rate": 2.748151031529778e-05,
+      "loss": 1.032,
+      "step": 2222
+    },
+    {
+      "epoch": 0.8638880792771787,
+      "grad_norm": 0.212540403008461,
+      "learning_rate": 2.740365901128844e-05,
+      "loss": 1.0432,
+      "step": 2223
+    },
+    {
+      "epoch": 0.8642766928980861,
+      "grad_norm": 0.19588392972946167,
+      "learning_rate": 2.73258077072791e-05,
+      "loss": 1.0203,
+      "step": 2224
+    },
+    {
+      "epoch": 0.8646653065189935,
+      "grad_norm": 0.2195088416337967,
+      "learning_rate": 2.7247956403269757e-05,
+      "loss": 1.0415,
+      "step": 2225
+    },
+    {
+      "epoch": 0.8650539201399009,
+      "grad_norm": 0.20950359106063843,
+      "learning_rate": 2.7170105099260413e-05,
+      "loss": 1.0114,
+      "step": 2226
+    },
+    {
+      "epoch": 0.8654425337608083,
+      "grad_norm": 0.23009665310382843,
+      "learning_rate": 2.709225379525107e-05,
+      "loss": 1.0018,
+      "step": 2227
+    },
+    {
+      "epoch": 0.8658311473817157,
+      "grad_norm": 0.19696195423603058,
+      "learning_rate": 2.701440249124173e-05,
+      "loss": 1.0135,
+      "step": 2228
+    },
+    {
+      "epoch": 0.8662197610026231,
+      "grad_norm": 0.2212006151676178,
+      "learning_rate": 2.693655118723239e-05,
+      "loss": 1.0557,
+      "step": 2229
+    },
+    {
+      "epoch": 0.8666083746235306,
+      "grad_norm": 0.21312370896339417,
+      "learning_rate": 2.6858699883223045e-05,
+      "loss": 1.0758,
+      "step": 2230
+    },
+    {
+      "epoch": 0.8669969882444379,
+      "grad_norm": 0.21425843238830566,
+      "learning_rate": 2.6780848579213703e-05,
+      "loss": 1.0234,
+      "step": 2231
+    },
+    {
+      "epoch": 0.8673856018653454,
+      "grad_norm": 0.2145942598581314,
+      "learning_rate": 2.6702997275204362e-05,
+      "loss": 0.9831,
+      "step": 2232
+    },
+    {
+      "epoch": 0.8677742154862528,
+      "grad_norm": 0.20881056785583496,
+      "learning_rate": 2.662514597119502e-05,
+      "loss": 1.0255,
+      "step": 2233
+    },
+    {
+      "epoch": 0.8681628291071602,
+      "grad_norm": 0.19835254549980164,
+      "learning_rate": 2.654729466718568e-05,
+      "loss": 0.9868,
+      "step": 2234
+    },
+    {
+      "epoch": 0.8685514427280676,
+      "grad_norm": 0.21160255372524261,
+      "learning_rate": 2.6469443363176332e-05,
+      "loss": 1.0024,
+      "step": 2235
+    },
+    {
+      "epoch": 0.868940056348975,
+      "grad_norm": 0.2119852900505066,
+      "learning_rate": 2.639159205916699e-05,
+      "loss": 0.9886,
+      "step": 2236
+    },
+    {
+      "epoch": 0.8693286699698825,
+      "grad_norm": 0.2107681930065155,
+      "learning_rate": 2.631374075515765e-05,
+      "loss": 1.0311,
+      "step": 2237
+    },
+    {
+      "epoch": 0.8697172835907898,
+      "grad_norm": 0.2076905369758606,
+      "learning_rate": 2.623588945114831e-05,
+      "loss": 1.0217,
+      "step": 2238
+    },
+    {
+      "epoch": 0.8701058972116973,
+      "grad_norm": 0.20869198441505432,
+      "learning_rate": 2.6158038147138964e-05,
+      "loss": 0.9488,
+      "step": 2239
+    },
+    {
+      "epoch": 0.8704945108326047,
+      "grad_norm": 0.1986512839794159,
+      "learning_rate": 2.6080186843129623e-05,
+      "loss": 1.0216,
+      "step": 2240
+    },
+    {
+      "epoch": 0.8708831244535121,
+      "grad_norm": 0.19954320788383484,
+      "learning_rate": 2.600233553912028e-05,
+      "loss": 0.988,
+      "step": 2241
+    },
+    {
+      "epoch": 0.8712717380744195,
+      "grad_norm": 0.22843138873577118,
+      "learning_rate": 2.592448423511094e-05,
+      "loss": 1.0979,
+      "step": 2242
+    },
+    {
+      "epoch": 0.871660351695327,
+      "grad_norm": 0.21942777931690216,
+      "learning_rate": 2.58466329311016e-05,
+      "loss": 1.0378,
+      "step": 2243
+    },
+    {
+      "epoch": 0.8720489653162343,
+      "grad_norm": 0.21504725515842438,
+      "learning_rate": 2.5768781627092255e-05,
+      "loss": 1.0628,
+      "step": 2244
+    },
+    {
+      "epoch": 0.8724375789371418,
+      "grad_norm": 0.21556456387043,
+      "learning_rate": 2.5690930323082913e-05,
+      "loss": 0.9943,
+      "step": 2245
+    },
+    {
+      "epoch": 0.8728261925580492,
+      "grad_norm": 0.2099362164735794,
+      "learning_rate": 2.5613079019073572e-05,
+      "loss": 1.0603,
+      "step": 2246
+    },
+    {
+      "epoch": 0.8732148061789565,
+      "grad_norm": 0.2027025669813156,
+      "learning_rate": 2.553522771506423e-05,
+      "loss": 1.0028,
+      "step": 2247
+    },
+    {
+      "epoch": 0.873603419799864,
+      "grad_norm": 0.2144668847322464,
+      "learning_rate": 2.5457376411054883e-05,
+      "loss": 1.0462,
+      "step": 2248
+    },
+    {
+      "epoch": 0.8739920334207714,
+      "grad_norm": 0.20712412893772125,
+      "learning_rate": 2.5379525107045542e-05,
+      "loss": 0.9842,
+      "step": 2249
+    },
+    {
+      "epoch": 0.8743806470416788,
+      "grad_norm": 0.19471199810504913,
+      "learning_rate": 2.53016738030362e-05,
+      "loss": 1.0171,
+      "step": 2250
+    },
+    {
+      "epoch": 0.8747692606625862,
+      "grad_norm": 0.19841787219047546,
+      "learning_rate": 2.522382249902686e-05,
+      "loss": 0.9034,
+      "step": 2251
+    },
+    {
+      "epoch": 0.8751578742834937,
+      "grad_norm": 0.20370744168758392,
+      "learning_rate": 2.5145971195017522e-05,
+      "loss": 1.0249,
+      "step": 2252
+    },
+    {
+      "epoch": 0.875546487904401,
+      "grad_norm": 0.22168315947055817,
+      "learning_rate": 2.5068119891008174e-05,
+      "loss": 1.0624,
+      "step": 2253
+    },
+    {
+      "epoch": 0.8759351015253085,
+      "grad_norm": 0.200806125998497,
+      "learning_rate": 2.4990268586998833e-05,
+      "loss": 1.0452,
+      "step": 2254
+    },
+    {
+      "epoch": 0.8763237151462159,
+      "grad_norm": 0.19972844421863556,
+      "learning_rate": 2.491241728298949e-05,
+      "loss": 1.0563,
+      "step": 2255
+    },
+    {
+      "epoch": 0.8767123287671232,
+      "grad_norm": 0.19919687509536743,
+      "learning_rate": 2.4834565978980147e-05,
+      "loss": 1.0249,
+      "step": 2256
+    },
+    {
+      "epoch": 0.8771009423880307,
+      "grad_norm": 0.19924059510231018,
+      "learning_rate": 2.4756714674970806e-05,
+      "loss": 1.016,
+      "step": 2257
+    },
+    {
+      "epoch": 0.8774895560089381,
+      "grad_norm": 0.2038920521736145,
+      "learning_rate": 2.4678863370961465e-05,
+      "loss": 1.0116,
+      "step": 2258
+    },
+    {
+      "epoch": 0.8778781696298456,
+      "grad_norm": 0.20609620213508606,
+      "learning_rate": 2.4601012066952123e-05,
+      "loss": 1.0153,
+      "step": 2259
+    },
+    {
+      "epoch": 0.8782667832507529,
+      "grad_norm": 0.20705272257328033,
+      "learning_rate": 2.4523160762942782e-05,
+      "loss": 1.013,
+      "step": 2260
+    },
+    {
+      "epoch": 0.8786553968716604,
+      "grad_norm": 0.19973833858966827,
+      "learning_rate": 2.4445309458933438e-05,
+      "loss": 0.9932,
+      "step": 2261
+    },
+    {
+      "epoch": 0.8790440104925678,
+      "grad_norm": 0.20942817628383636,
+      "learning_rate": 2.4367458154924097e-05,
+      "loss": 1.0091,
+      "step": 2262
+    },
+    {
+      "epoch": 0.8794326241134752,
+      "grad_norm": 0.3686840236186981,
+      "learning_rate": 2.4289606850914752e-05,
+      "loss": 1.0157,
+      "step": 2263
+    },
+    {
+      "epoch": 0.8798212377343826,
+      "grad_norm": 0.20390458405017853,
+      "learning_rate": 2.4211755546905414e-05,
+      "loss": 1.0431,
+      "step": 2264
+    },
+    {
+      "epoch": 0.88020985135529,
+      "grad_norm": 0.2211003601551056,
+      "learning_rate": 2.413390424289607e-05,
+      "loss": 1.089,
+      "step": 2265
+    },
+    {
+      "epoch": 0.8805984649761974,
+      "grad_norm": 0.20558148622512817,
+      "learning_rate": 2.405605293888673e-05,
+      "loss": 0.9798,
+      "step": 2266
+    },
+    {
+      "epoch": 0.8809870785971048,
+      "grad_norm": 0.19347704946994781,
+      "learning_rate": 2.3978201634877384e-05,
+      "loss": 0.97,
+      "step": 2267
+    },
+    {
+      "epoch": 0.8813756922180123,
+      "grad_norm": 0.19454139471054077,
+      "learning_rate": 2.3900350330868043e-05,
+      "loss": 1.0265,
+      "step": 2268
+    },
+    {
+      "epoch": 0.8817643058389196,
+      "grad_norm": 0.19511118531227112,
+      "learning_rate": 2.38224990268587e-05,
+      "loss": 0.994,
+      "step": 2269
+    },
+    {
+      "epoch": 0.8821529194598271,
+      "grad_norm": 0.19948701560497284,
+      "learning_rate": 2.3744647722849357e-05,
+      "loss": 0.9911,
+      "step": 2270
+    },
+    {
+      "epoch": 0.8825415330807345,
+      "grad_norm": 0.21110126376152039,
+      "learning_rate": 2.366679641884002e-05,
+      "loss": 1.0484,
+      "step": 2271
+    },
+    {
+      "epoch": 0.8829301467016419,
+      "grad_norm": 0.20160740613937378,
+      "learning_rate": 2.3588945114830675e-05,
+      "loss": 0.9934,
+      "step": 2272
+    },
+    {
+      "epoch": 0.8833187603225493,
+      "grad_norm": 0.20967216789722443,
+      "learning_rate": 2.3511093810821333e-05,
+      "loss": 1.0081,
+      "step": 2273
+    },
+    {
+      "epoch": 0.8837073739434567,
+      "grad_norm": 0.1981070339679718,
+      "learning_rate": 2.343324250681199e-05,
+      "loss": 1.0093,
+      "step": 2274
+    },
+    {
+      "epoch": 0.8840959875643641,
+      "grad_norm": 0.21609579026699066,
+      "learning_rate": 2.3355391202802648e-05,
+      "loss": 1.0954,
+      "step": 2275
+    },
+    {
+      "epoch": 0.8844846011852715,
+      "grad_norm": 0.18667754530906677,
+      "learning_rate": 2.3277539898793303e-05,
+      "loss": 0.9833,
+      "step": 2276
+    },
+    {
+      "epoch": 0.884873214806179,
+      "grad_norm": 0.2127734273672104,
+      "learning_rate": 2.3199688594783965e-05,
+      "loss": 1.0508,
+      "step": 2277
+    },
+    {
+      "epoch": 0.8852618284270863,
+      "grad_norm": 0.2117089331150055,
+      "learning_rate": 2.3121837290774624e-05,
+      "loss": 1.0557,
+      "step": 2278
+    },
+    {
+      "epoch": 0.8856504420479938,
+      "grad_norm": 0.21022644639015198,
+      "learning_rate": 2.304398598676528e-05,
+      "loss": 1.0297,
+      "step": 2279
+    },
+    {
+      "epoch": 0.8860390556689012,
+      "grad_norm": 0.19904713332653046,
+      "learning_rate": 2.296613468275594e-05,
+      "loss": 0.9693,
+      "step": 2280
+    },
+    {
+      "epoch": 0.8864276692898087,
+      "grad_norm": 0.23006491363048553,
+      "learning_rate": 2.2888283378746594e-05,
+      "loss": 1.0409,
+      "step": 2281
+    },
+    {
+      "epoch": 0.886816282910716,
+      "grad_norm": 0.2179296761751175,
+      "learning_rate": 2.2810432074737253e-05,
+      "loss": 1.0433,
+      "step": 2282
+    },
+    {
+      "epoch": 0.8872048965316235,
+      "grad_norm": 0.19764657318592072,
+      "learning_rate": 2.273258077072791e-05,
+      "loss": 0.9807,
+      "step": 2283
+    },
+    {
+      "epoch": 0.8875935101525309,
+      "grad_norm": 0.23379875719547272,
+      "learning_rate": 2.265472946671857e-05,
+      "loss": 1.1025,
+      "step": 2284
+    },
+    {
+      "epoch": 0.8879821237734382,
+      "grad_norm": 0.2069517821073532,
+      "learning_rate": 2.2576878162709226e-05,
+      "loss": 1.0466,
+      "step": 2285
+    },
+    {
+      "epoch": 0.8883707373943457,
+      "grad_norm": 0.22321875393390656,
+      "learning_rate": 2.2499026858699885e-05,
+      "loss": 1.0548,
+      "step": 2286
+    },
+    {
+      "epoch": 0.888759351015253,
+      "grad_norm": 0.2070666253566742,
+      "learning_rate": 2.2421175554690543e-05,
+      "loss": 1.0168,
+      "step": 2287
+    },
+    {
+      "epoch": 0.8891479646361605,
+      "grad_norm": 0.1939924657344818,
+      "learning_rate": 2.23433242506812e-05,
+      "loss": 1.0008,
+      "step": 2288
+    },
+    {
+      "epoch": 0.8895365782570679,
+      "grad_norm": 0.22350658476352692,
+      "learning_rate": 2.2265472946671858e-05,
+      "loss": 1.0469,
+      "step": 2289
+    },
+    {
+      "epoch": 0.8899251918779754,
+      "grad_norm": 0.19934551417827606,
+      "learning_rate": 2.2187621642662516e-05,
+      "loss": 0.977,
+      "step": 2290
+    },
+    {
+      "epoch": 0.8903138054988827,
+      "grad_norm": 0.22848142683506012,
+      "learning_rate": 2.2109770338653175e-05,
+      "loss": 1.0642,
+      "step": 2291
+    },
+    {
+      "epoch": 0.8907024191197902,
+      "grad_norm": 0.20296107232570648,
+      "learning_rate": 2.203191903464383e-05,
+      "loss": 1.0332,
+      "step": 2292
+    },
+    {
+      "epoch": 0.8910910327406976,
+      "grad_norm": 0.19952169060707092,
+      "learning_rate": 2.195406773063449e-05,
+      "loss": 1.0249,
+      "step": 2293
+    },
+    {
+      "epoch": 0.8914796463616049,
+      "grad_norm": 0.22449292242527008,
+      "learning_rate": 2.1876216426625145e-05,
+      "loss": 1.0572,
+      "step": 2294
+    },
+    {
+      "epoch": 0.8918682599825124,
+      "grad_norm": 0.20287659764289856,
+      "learning_rate": 2.1798365122615804e-05,
+      "loss": 1.0331,
+      "step": 2295
+    },
+    {
+      "epoch": 0.8922568736034198,
+      "grad_norm": 0.2029801905155182,
+      "learning_rate": 2.1720513818606463e-05,
+      "loss": 1.0326,
+      "step": 2296
+    },
+    {
+      "epoch": 0.8926454872243272,
+      "grad_norm": 0.21909672021865845,
+      "learning_rate": 2.164266251459712e-05,
+      "loss": 1.0903,
+      "step": 2297
+    },
+    {
+      "epoch": 0.8930341008452346,
+      "grad_norm": 0.21067824959754944,
+      "learning_rate": 2.156481121058778e-05,
+      "loss": 1.0425,
+      "step": 2298
+    },
+    {
+      "epoch": 0.8934227144661421,
+      "grad_norm": 0.20612956583499908,
+      "learning_rate": 2.1486959906578436e-05,
+      "loss": 1.0269,
+      "step": 2299
+    },
+    {
+      "epoch": 0.8938113280870494,
+      "grad_norm": 0.22750885784626007,
+      "learning_rate": 2.1409108602569095e-05,
+      "loss": 1.081,
+      "step": 2300
+    },
+    {
+      "epoch": 0.8941999417079569,
+      "grad_norm": 0.2192569077014923,
+      "learning_rate": 2.133125729855975e-05,
+      "loss": 1.0305,
+      "step": 2301
+    },
+    {
+      "epoch": 0.8945885553288643,
+      "grad_norm": 0.2150728702545166,
+      "learning_rate": 2.125340599455041e-05,
+      "loss": 1.0369,
+      "step": 2302
+    },
+    {
+      "epoch": 0.8949771689497716,
+      "grad_norm": 0.2095833718776703,
+      "learning_rate": 2.1175554690541068e-05,
+      "loss": 1.0392,
+      "step": 2303
+    },
+    {
+      "epoch": 0.8953657825706791,
+      "grad_norm": 0.2074289619922638,
+      "learning_rate": 2.1097703386531726e-05,
+      "loss": 0.9893,
+      "step": 2304
+    },
+    {
+      "epoch": 0.8957543961915865,
+      "grad_norm": 0.20826508104801178,
+      "learning_rate": 2.1019852082522385e-05,
+      "loss": 1.0737,
+      "step": 2305
+    },
+    {
+      "epoch": 0.896143009812494,
+      "grad_norm": 0.20254862308502197,
+      "learning_rate": 2.094200077851304e-05,
+      "loss": 1.0251,
+      "step": 2306
+    },
+    {
+      "epoch": 0.8965316234334013,
+      "grad_norm": 0.20950356125831604,
+      "learning_rate": 2.08641494745037e-05,
+      "loss": 1.026,
+      "step": 2307
+    },
+    {
+      "epoch": 0.8969202370543088,
+      "grad_norm": 0.20761284232139587,
+      "learning_rate": 2.0786298170494355e-05,
+      "loss": 1.0556,
+      "step": 2308
+    },
+    {
+      "epoch": 0.8973088506752162,
+      "grad_norm": 0.1943255513906479,
+      "learning_rate": 2.0708446866485014e-05,
+      "loss": 0.9745,
+      "step": 2309
+    },
+    {
+      "epoch": 0.8976974642961236,
+      "grad_norm": 0.19723530113697052,
+      "learning_rate": 2.0630595562475673e-05,
+      "loss": 0.9764,
+      "step": 2310
+    },
+    {
+      "epoch": 0.898086077917031,
+      "grad_norm": 0.21135687828063965,
+      "learning_rate": 2.055274425846633e-05,
+      "loss": 1.0289,
+      "step": 2311
+    },
+    {
+      "epoch": 0.8984746915379384,
+      "grad_norm": 0.20867012441158295,
+      "learning_rate": 2.047489295445699e-05,
+      "loss": 1.0659,
+      "step": 2312
+    },
+    {
+      "epoch": 0.8988633051588458,
+      "grad_norm": 0.1999632567167282,
+      "learning_rate": 2.0397041650447646e-05,
+      "loss": 0.9699,
+      "step": 2313
+    },
+    {
+      "epoch": 0.8992519187797532,
+      "grad_norm": 0.2080952674150467,
+      "learning_rate": 2.0319190346438305e-05,
+      "loss": 1.0097,
+      "step": 2314
+    },
+    {
+      "epoch": 0.8996405324006607,
+      "grad_norm": 0.20419847965240479,
+      "learning_rate": 2.024133904242896e-05,
+      "loss": 1.0272,
+      "step": 2315
+    },
+    {
+      "epoch": 0.900029146021568,
+      "grad_norm": 0.19433575868606567,
+      "learning_rate": 2.016348773841962e-05,
+      "loss": 0.9892,
+      "step": 2316
+    },
+    {
+      "epoch": 0.9004177596424755,
+      "grad_norm": 0.20644325017929077,
+      "learning_rate": 2.0085636434410278e-05,
+      "loss": 0.9978,
+      "step": 2317
+    },
+    {
+      "epoch": 0.9008063732633829,
+      "grad_norm": 0.2145605981349945,
+      "learning_rate": 2.0007785130400936e-05,
+      "loss": 1.0569,
+      "step": 2318
+    },
+    {
+      "epoch": 0.9011949868842903,
+      "grad_norm": 0.2073410153388977,
+      "learning_rate": 1.9929933826391592e-05,
+      "loss": 1.0937,
+      "step": 2319
+    },
+    {
+      "epoch": 0.9015836005051977,
+      "grad_norm": 0.2169773280620575,
+      "learning_rate": 1.985208252238225e-05,
+      "loss": 1.0559,
+      "step": 2320
+    },
+    {
+      "epoch": 0.9019722141261051,
+      "grad_norm": 0.2153279334306717,
+      "learning_rate": 1.977423121837291e-05,
+      "loss": 1.074,
+      "step": 2321
+    },
+    {
+      "epoch": 0.9023608277470125,
+      "grad_norm": 0.2089853584766388,
+      "learning_rate": 1.9696379914363565e-05,
+      "loss": 0.9971,
+      "step": 2322
+    },
+    {
+      "epoch": 0.9027494413679199,
+      "grad_norm": 0.21813471615314484,
+      "learning_rate": 1.9618528610354224e-05,
+      "loss": 1.0408,
+      "step": 2323
+    },
+    {
+      "epoch": 0.9031380549888274,
+      "grad_norm": 0.19753578305244446,
+      "learning_rate": 1.9540677306344883e-05,
+      "loss": 0.9429,
+      "step": 2324
+    },
+    {
+      "epoch": 0.9035266686097347,
+      "grad_norm": 0.19760333001613617,
+      "learning_rate": 1.946282600233554e-05,
+      "loss": 1.0127,
+      "step": 2325
+    },
+    {
+      "epoch": 0.9039152822306422,
+      "grad_norm": 0.21375150978565216,
+      "learning_rate": 1.9384974698326197e-05,
+      "loss": 1.0166,
+      "step": 2326
+    },
+    {
+      "epoch": 0.9043038958515496,
+      "grad_norm": 0.21019572019577026,
+      "learning_rate": 1.9307123394316856e-05,
+      "loss": 0.9897,
+      "step": 2327
+    },
+    {
+      "epoch": 0.904692509472457,
+      "grad_norm": 0.20336006581783295,
+      "learning_rate": 1.922927209030751e-05,
+      "loss": 0.9788,
+      "step": 2328
+    },
+    {
+      "epoch": 0.9050811230933644,
+      "grad_norm": 0.20877422392368317,
+      "learning_rate": 1.915142078629817e-05,
+      "loss": 1.0257,
+      "step": 2329
+    },
+    {
+      "epoch": 0.9054697367142719,
+      "grad_norm": 0.21499283611774445,
+      "learning_rate": 1.9073569482288832e-05,
+      "loss": 1.0628,
+      "step": 2330
+    },
+    {
+      "epoch": 0.9058583503351793,
+      "grad_norm": 0.2943152189254761,
+      "learning_rate": 1.8995718178279488e-05,
+      "loss": 1.0859,
+      "step": 2331
+    },
+    {
+      "epoch": 0.9062469639560866,
+      "grad_norm": 0.20630142092704773,
+      "learning_rate": 1.8917866874270146e-05,
+      "loss": 1.0625,
+      "step": 2332
+    },
+    {
+      "epoch": 0.9066355775769941,
+      "grad_norm": 0.19609740376472473,
+      "learning_rate": 1.8840015570260802e-05,
+      "loss": 1.0043,
+      "step": 2333
+    },
+    {
+      "epoch": 0.9070241911979015,
+      "grad_norm": 0.21231451630592346,
+      "learning_rate": 1.876216426625146e-05,
+      "loss": 1.0534,
+      "step": 2334
+    },
+    {
+      "epoch": 0.9074128048188089,
+      "grad_norm": 0.2212425172328949,
+      "learning_rate": 1.8684312962242116e-05,
+      "loss": 0.99,
+      "step": 2335
+    },
+    {
+      "epoch": 0.9078014184397163,
+      "grad_norm": 0.21141575276851654,
+      "learning_rate": 1.860646165823278e-05,
+      "loss": 1.0442,
+      "step": 2336
+    },
+    {
+      "epoch": 0.9081900320606238,
+      "grad_norm": 0.20657780766487122,
+      "learning_rate": 1.8528610354223434e-05,
+      "loss": 1.0363,
+      "step": 2337
+    },
+    {
+      "epoch": 0.9085786456815311,
+      "grad_norm": 0.1973218023777008,
+      "learning_rate": 1.8450759050214093e-05,
+      "loss": 0.9868,
+      "step": 2338
+    },
+    {
+      "epoch": 0.9089672593024386,
+      "grad_norm": 0.19639235734939575,
+      "learning_rate": 1.837290774620475e-05,
+      "loss": 0.9865,
+      "step": 2339
+    },
+    {
+      "epoch": 0.909355872923346,
+      "grad_norm": 0.194901704788208,
+      "learning_rate": 1.8295056442195407e-05,
+      "loss": 0.9776,
+      "step": 2340
+    },
+    {
+      "epoch": 0.9097444865442533,
+      "grad_norm": 0.1907500922679901,
+      "learning_rate": 1.8217205138186066e-05,
+      "loss": 1.0048,
+      "step": 2341
+    },
+    {
+      "epoch": 0.9101331001651608,
+      "grad_norm": 0.20842313766479492,
+      "learning_rate": 1.813935383417672e-05,
+      "loss": 0.9773,
+      "step": 2342
+    },
+    {
+      "epoch": 0.9105217137860682,
+      "grad_norm": 0.2537369132041931,
+      "learning_rate": 1.8061502530167383e-05,
+      "loss": 0.9932,
+      "step": 2343
+    },
+    {
+      "epoch": 0.9109103274069756,
+      "grad_norm": 0.22774042189121246,
+      "learning_rate": 1.798365122615804e-05,
+      "loss": 1.1521,
+      "step": 2344
+    },
+    {
+      "epoch": 0.911298941027883,
+      "grad_norm": 0.192257359623909,
+      "learning_rate": 1.7905799922148698e-05,
+      "loss": 0.9707,
+      "step": 2345
+    },
+    {
+      "epoch": 0.9116875546487905,
+      "grad_norm": 0.21573100984096527,
+      "learning_rate": 1.7827948618139353e-05,
+      "loss": 1.0355,
+      "step": 2346
+    },
+    {
+      "epoch": 0.9120761682696978,
+      "grad_norm": 0.215474933385849,
+      "learning_rate": 1.7750097314130012e-05,
+      "loss": 1.0408,
+      "step": 2347
+    },
+    {
+      "epoch": 0.9124647818906053,
+      "grad_norm": 0.2031407654285431,
+      "learning_rate": 1.767224601012067e-05,
+      "loss": 1.0429,
+      "step": 2348
+    },
+    {
+      "epoch": 0.9128533955115127,
+      "grad_norm": 0.20461305975914001,
+      "learning_rate": 1.759439470611133e-05,
+      "loss": 1.0033,
+      "step": 2349
+    },
+    {
+      "epoch": 0.91324200913242,
+      "grad_norm": 0.20995965600013733,
+      "learning_rate": 1.7516543402101988e-05,
+      "loss": 1.089,
+      "step": 2350
+    },
+    {
+      "epoch": 0.9136306227533275,
+      "grad_norm": 0.20464631915092468,
+      "learning_rate": 1.7438692098092644e-05,
+      "loss": 1.0438,
+      "step": 2351
+    },
+    {
+      "epoch": 0.9140192363742349,
+      "grad_norm": 0.20657162368297577,
+      "learning_rate": 1.7360840794083303e-05,
+      "loss": 1.0687,
+      "step": 2352
+    },
+    {
+      "epoch": 0.9144078499951424,
+      "grad_norm": 0.20419646799564362,
+      "learning_rate": 1.7282989490073958e-05,
+      "loss": 1.0412,
+      "step": 2353
+    },
+    {
+      "epoch": 0.9147964636160497,
+      "grad_norm": 0.20655421912670135,
+      "learning_rate": 1.7205138186064617e-05,
+      "loss": 1.0343,
+      "step": 2354
+    },
+    {
+      "epoch": 0.9151850772369572,
+      "grad_norm": 0.20393185317516327,
+      "learning_rate": 1.7127286882055276e-05,
+      "loss": 1.0379,
+      "step": 2355
+    },
+    {
+      "epoch": 0.9155736908578646,
+      "grad_norm": 0.20768289268016815,
+      "learning_rate": 1.7049435578045934e-05,
+      "loss": 1.022,
+      "step": 2356
+    },
+    {
+      "epoch": 0.915962304478772,
+      "grad_norm": 0.2257547676563263,
+      "learning_rate": 1.6971584274036593e-05,
+      "loss": 1.1081,
+      "step": 2357
+    },
+    {
+      "epoch": 0.9163509180996794,
+      "grad_norm": 0.1980145126581192,
+      "learning_rate": 1.689373297002725e-05,
+      "loss": 1.0439,
+      "step": 2358
+    },
+    {
+      "epoch": 0.9167395317205868,
+      "grad_norm": 0.20351259410381317,
+      "learning_rate": 1.6815881666017908e-05,
+      "loss": 1.0363,
+      "step": 2359
+    },
+    {
+      "epoch": 0.9171281453414942,
+      "grad_norm": 0.20830631256103516,
+      "learning_rate": 1.6738030362008563e-05,
+      "loss": 1.0467,
+      "step": 2360
+    },
+    {
+      "epoch": 0.9175167589624016,
+      "grad_norm": 0.21225905418395996,
+      "learning_rate": 1.6660179057999222e-05,
+      "loss": 1.0611,
+      "step": 2361
+    },
+    {
+      "epoch": 0.9179053725833091,
+      "grad_norm": 0.20069880783557892,
+      "learning_rate": 1.658232775398988e-05,
+      "loss": 0.9989,
+      "step": 2362
+    },
+    {
+      "epoch": 0.9182939862042164,
+      "grad_norm": 0.21674825251102448,
+      "learning_rate": 1.650447644998054e-05,
+      "loss": 1.0578,
+      "step": 2363
+    },
+    {
+      "epoch": 0.9186825998251239,
+      "grad_norm": 0.20438091456890106,
+      "learning_rate": 1.6426625145971195e-05,
+      "loss": 1.0593,
+      "step": 2364
+    },
+    {
+      "epoch": 0.9190712134460313,
+      "grad_norm": 0.2195381075143814,
+      "learning_rate": 1.6348773841961854e-05,
+      "loss": 1.0354,
+      "step": 2365
+    },
+    {
+      "epoch": 0.9194598270669387,
+      "grad_norm": 0.21371111273765564,
+      "learning_rate": 1.6270922537952513e-05,
+      "loss": 0.9911,
+      "step": 2366
+    },
+    {
+      "epoch": 0.9198484406878461,
+      "grad_norm": 0.22097980976104736,
+      "learning_rate": 1.6193071233943168e-05,
+      "loss": 1.0064,
+      "step": 2367
+    },
+    {
+      "epoch": 0.9202370543087536,
+      "grad_norm": 0.20589159429073334,
+      "learning_rate": 1.6115219929933827e-05,
+      "loss": 1.0173,
+      "step": 2368
+    },
+    {
+      "epoch": 0.9206256679296609,
+      "grad_norm": 0.19218075275421143,
+      "learning_rate": 1.6037368625924486e-05,
+      "loss": 1.0215,
+      "step": 2369
+    },
+    {
+      "epoch": 0.9210142815505683,
+      "grad_norm": 0.2132728099822998,
+      "learning_rate": 1.5959517321915144e-05,
+      "loss": 1.0493,
+      "step": 2370
+    },
+    {
+      "epoch": 0.9214028951714758,
+      "grad_norm": 0.20006981492042542,
+      "learning_rate": 1.58816660179058e-05,
+      "loss": 0.9814,
+      "step": 2371
+    },
+    {
+      "epoch": 0.9217915087923831,
+      "grad_norm": 0.21600167453289032,
+      "learning_rate": 1.580381471389646e-05,
+      "loss": 1.0759,
+      "step": 2372
+    },
+    {
+      "epoch": 0.9221801224132906,
+      "grad_norm": 0.21474605798721313,
+      "learning_rate": 1.5725963409887114e-05,
+      "loss": 1.0411,
+      "step": 2373
+    },
+    {
+      "epoch": 0.922568736034198,
+      "grad_norm": 0.2044600546360016,
+      "learning_rate": 1.5648112105877773e-05,
+      "loss": 1.0236,
+      "step": 2374
+    },
+    {
+      "epoch": 0.9229573496551055,
+      "grad_norm": 0.20302869379520416,
+      "learning_rate": 1.5570260801868432e-05,
+      "loss": 0.9982,
+      "step": 2375
+    },
+    {
+      "epoch": 0.9233459632760128,
+      "grad_norm": 0.21155263483524323,
+      "learning_rate": 1.549240949785909e-05,
+      "loss": 1.0249,
+      "step": 2376
+    },
+    {
+      "epoch": 0.9237345768969203,
+      "grad_norm": 0.20336754620075226,
+      "learning_rate": 1.541455819384975e-05,
+      "loss": 1.0223,
+      "step": 2377
+    },
+    {
+      "epoch": 0.9241231905178277,
+      "grad_norm": 0.20189301669597626,
+      "learning_rate": 1.5336706889840405e-05,
+      "loss": 1.0228,
+      "step": 2378
+    },
+    {
+      "epoch": 0.924511804138735,
+      "grad_norm": 0.1962178647518158,
+      "learning_rate": 1.5258855585831064e-05,
+      "loss": 1.0137,
+      "step": 2379
+    },
+    {
+      "epoch": 0.9249004177596425,
+      "grad_norm": 0.21523639559745789,
+      "learning_rate": 1.518100428182172e-05,
+      "loss": 1.0498,
+      "step": 2380
+    },
+    {
+      "epoch": 0.9252890313805499,
+      "grad_norm": 0.20537924766540527,
+      "learning_rate": 1.510315297781238e-05,
+      "loss": 0.9995,
+      "step": 2381
+    },
+    {
+      "epoch": 0.9256776450014573,
+      "grad_norm": 0.21170039474964142,
+      "learning_rate": 1.5025301673803035e-05,
+      "loss": 1.0953,
+      "step": 2382
+    },
+    {
+      "epoch": 0.9260662586223647,
+      "grad_norm": 0.20737627148628235,
+      "learning_rate": 1.4947450369793694e-05,
+      "loss": 0.9892,
+      "step": 2383
+    },
+    {
+      "epoch": 0.9264548722432722,
+      "grad_norm": 0.20684003829956055,
+      "learning_rate": 1.4869599065784354e-05,
+      "loss": 1.0468,
+      "step": 2384
+    },
+    {
+      "epoch": 0.9268434858641795,
+      "grad_norm": 0.20738738775253296,
+      "learning_rate": 1.479174776177501e-05,
+      "loss": 1.0436,
+      "step": 2385
+    },
+    {
+      "epoch": 0.927232099485087,
+      "grad_norm": 0.19740383327007294,
+      "learning_rate": 1.4713896457765669e-05,
+      "loss": 0.9528,
+      "step": 2386
+    },
+    {
+      "epoch": 0.9276207131059944,
+      "grad_norm": 0.20328152179718018,
+      "learning_rate": 1.4636045153756326e-05,
+      "loss": 1.0272,
+      "step": 2387
+    },
+    {
+      "epoch": 0.9280093267269017,
+      "grad_norm": 0.2008744776248932,
+      "learning_rate": 1.4558193849746985e-05,
+      "loss": 1.0441,
+      "step": 2388
+    },
+    {
+      "epoch": 0.9283979403478092,
+      "grad_norm": 0.19907627999782562,
+      "learning_rate": 1.448034254573764e-05,
+      "loss": 0.9929,
+      "step": 2389
+    },
+    {
+      "epoch": 0.9287865539687166,
+      "grad_norm": 0.20299683511257172,
+      "learning_rate": 1.44024912417283e-05,
+      "loss": 0.9749,
+      "step": 2390
+    },
+    {
+      "epoch": 0.929175167589624,
+      "grad_norm": 0.21035155653953552,
+      "learning_rate": 1.432463993771896e-05,
+      "loss": 1.0356,
+      "step": 2391
+    },
+    {
+      "epoch": 0.9295637812105314,
+      "grad_norm": 0.20862546563148499,
+      "learning_rate": 1.4246788633709615e-05,
+      "loss": 1.0594,
+      "step": 2392
+    },
+    {
+      "epoch": 0.9299523948314389,
+      "grad_norm": 0.20775675773620605,
+      "learning_rate": 1.4168937329700274e-05,
+      "loss": 0.9959,
+      "step": 2393
+    },
+    {
+      "epoch": 0.9303410084523462,
+      "grad_norm": 0.1970052868127823,
+      "learning_rate": 1.409108602569093e-05,
+      "loss": 0.9956,
+      "step": 2394
+    },
+    {
+      "epoch": 0.9307296220732537,
+      "grad_norm": 0.2167968600988388,
+      "learning_rate": 1.401323472168159e-05,
+      "loss": 1.0202,
+      "step": 2395
+    },
+    {
+      "epoch": 0.9311182356941611,
+      "grad_norm": 0.20822198688983917,
+      "learning_rate": 1.3935383417672245e-05,
+      "loss": 1.0067,
+      "step": 2396
+    },
+    {
+      "epoch": 0.9315068493150684,
+      "grad_norm": 0.2004898339509964,
+      "learning_rate": 1.3857532113662906e-05,
+      "loss": 1.0069,
+      "step": 2397
+    },
+    {
+      "epoch": 0.9318954629359759,
+      "grad_norm": 0.22808429598808289,
+      "learning_rate": 1.3779680809653561e-05,
+      "loss": 1.1032,
+      "step": 2398
+    },
+    {
+      "epoch": 0.9322840765568833,
+      "grad_norm": 0.19940750300884247,
+      "learning_rate": 1.370182950564422e-05,
+      "loss": 0.9965,
+      "step": 2399
+    },
+    {
+      "epoch": 0.9326726901777908,
+      "grad_norm": 0.21138110756874084,
+      "learning_rate": 1.3623978201634879e-05,
+      "loss": 0.986,
+      "step": 2400
+    },
+    {
+      "epoch": 0.9330613037986981,
+      "grad_norm": 0.2118709534406662,
+      "learning_rate": 1.3546126897625536e-05,
+      "loss": 1.0672,
+      "step": 2401
+    },
+    {
+      "epoch": 0.9334499174196056,
+      "grad_norm": 0.22121763229370117,
+      "learning_rate": 1.3468275593616195e-05,
+      "loss": 1.0204,
+      "step": 2402
+    },
+    {
+      "epoch": 0.933838531040513,
+      "grad_norm": 0.20541204512119293,
+      "learning_rate": 1.3390424289606852e-05,
+      "loss": 1.0749,
+      "step": 2403
+    },
+    {
+      "epoch": 0.9342271446614204,
+      "grad_norm": 0.19598713517189026,
+      "learning_rate": 1.331257298559751e-05,
+      "loss": 0.9638,
+      "step": 2404
+    },
+    {
+      "epoch": 0.9346157582823278,
+      "grad_norm": 0.2157907783985138,
+      "learning_rate": 1.3234721681588166e-05,
+      "loss": 1.0312,
+      "step": 2405
+    },
+    {
+      "epoch": 0.9350043719032352,
+      "grad_norm": 0.19694723188877106,
+      "learning_rate": 1.3156870377578825e-05,
+      "loss": 1.0001,
+      "step": 2406
+    },
+    {
+      "epoch": 0.9353929855241426,
+      "grad_norm": 0.209597647190094,
+      "learning_rate": 1.3079019073569482e-05,
+      "loss": 0.9808,
+      "step": 2407
+    },
+    {
+      "epoch": 0.93578159914505,
+      "grad_norm": 0.2026679664850235,
+      "learning_rate": 1.300116776956014e-05,
+      "loss": 0.9938,
+      "step": 2408
+    },
+    {
+      "epoch": 0.9361702127659575,
+      "grad_norm": 0.20847374200820923,
+      "learning_rate": 1.29233164655508e-05,
+      "loss": 0.9948,
+      "step": 2409
+    },
+    {
+      "epoch": 0.9365588263868648,
+      "grad_norm": 0.23478667438030243,
+      "learning_rate": 1.2845465161541457e-05,
+      "loss": 1.0549,
+      "step": 2410
+    },
+    {
+      "epoch": 0.9369474400077723,
+      "grad_norm": 0.20954233407974243,
+      "learning_rate": 1.2767613857532116e-05,
+      "loss": 1.0336,
+      "step": 2411
+    },
+    {
+      "epoch": 0.9373360536286797,
+      "grad_norm": 0.2130623608827591,
+      "learning_rate": 1.2689762553522771e-05,
+      "loss": 1.0398,
+      "step": 2412
+    },
+    {
+      "epoch": 0.9377246672495871,
+      "grad_norm": 0.20076791942119598,
+      "learning_rate": 1.261191124951343e-05,
+      "loss": 0.9945,
+      "step": 2413
+    },
+    {
+      "epoch": 0.9381132808704945,
+      "grad_norm": 0.21280889213085175,
+      "learning_rate": 1.2534059945504087e-05,
+      "loss": 1.0538,
+      "step": 2414
+    },
+    {
+      "epoch": 0.938501894491402,
+      "grad_norm": 0.19909800589084625,
+      "learning_rate": 1.2456208641494746e-05,
+      "loss": 0.9783,
+      "step": 2415
+    },
+    {
+      "epoch": 0.9388905081123093,
+      "grad_norm": 0.21449251472949982,
+      "learning_rate": 1.2378357337485403e-05,
+      "loss": 1.0547,
+      "step": 2416
+    },
+    {
+      "epoch": 0.9392791217332167,
+      "grad_norm": 0.20742881298065186,
+      "learning_rate": 1.2300506033476062e-05,
+      "loss": 1.0471,
+      "step": 2417
+    },
+    {
+      "epoch": 0.9396677353541242,
+      "grad_norm": 0.21160250902175903,
+      "learning_rate": 1.2222654729466719e-05,
+      "loss": 1.0089,
+      "step": 2418
+    },
+    {
+      "epoch": 0.9400563489750315,
+      "grad_norm": 0.22055311501026154,
+      "learning_rate": 1.2144803425457376e-05,
+      "loss": 1.0201,
+      "step": 2419
+    },
+    {
+      "epoch": 0.940444962595939,
+      "grad_norm": 0.21073050796985626,
+      "learning_rate": 1.2066952121448035e-05,
+      "loss": 1.0025,
+      "step": 2420
+    },
+    {
+      "epoch": 0.9408335762168464,
+      "grad_norm": 0.19758272171020508,
+      "learning_rate": 1.1989100817438692e-05,
+      "loss": 0.9643,
+      "step": 2421
+    },
+    {
+      "epoch": 0.9412221898377539,
+      "grad_norm": 0.20312103629112244,
+      "learning_rate": 1.191124951342935e-05,
+      "loss": 1.023,
+      "step": 2422
+    },
+    {
+      "epoch": 0.9416108034586612,
+      "grad_norm": 0.19969260692596436,
+      "learning_rate": 1.183339820942001e-05,
+      "loss": 0.9623,
+      "step": 2423
+    },
+    {
+      "epoch": 0.9419994170795687,
+      "grad_norm": 0.21867750585079193,
+      "learning_rate": 1.1755546905410667e-05,
+      "loss": 1.0895,
+      "step": 2424
+    },
+    {
+      "epoch": 0.942388030700476,
+      "grad_norm": 0.19672009348869324,
+      "learning_rate": 1.1677695601401324e-05,
+      "loss": 1.0253,
+      "step": 2425
+    },
+    {
+      "epoch": 0.9427766443213834,
+      "grad_norm": 0.20442704856395721,
+      "learning_rate": 1.1599844297391983e-05,
+      "loss": 1.0515,
+      "step": 2426
+    },
+    {
+      "epoch": 0.9431652579422909,
+      "grad_norm": 0.2008974254131317,
+      "learning_rate": 1.152199299338264e-05,
+      "loss": 0.9934,
+      "step": 2427
+    },
+    {
+      "epoch": 0.9435538715631983,
+      "grad_norm": 0.20074884593486786,
+      "learning_rate": 1.1444141689373297e-05,
+      "loss": 0.9792,
+      "step": 2428
+    },
+    {
+      "epoch": 0.9439424851841057,
+      "grad_norm": 0.1945987194776535,
+      "learning_rate": 1.1366290385363956e-05,
+      "loss": 0.991,
+      "step": 2429
+    },
+    {
+      "epoch": 0.9443310988050131,
+      "grad_norm": 0.2123355269432068,
+      "learning_rate": 1.1288439081354613e-05,
+      "loss": 0.9768,
+      "step": 2430
+    },
+    {
+      "epoch": 0.9447197124259206,
+      "grad_norm": 0.19462116062641144,
+      "learning_rate": 1.1210587777345272e-05,
+      "loss": 1.0221,
+      "step": 2431
+    },
+    {
+      "epoch": 0.9451083260468279,
+      "grad_norm": 0.21487726271152496,
+      "learning_rate": 1.1132736473335929e-05,
+      "loss": 1.0273,
+      "step": 2432
+    },
+    {
+      "epoch": 0.9454969396677354,
+      "grad_norm": 0.2011580765247345,
+      "learning_rate": 1.1054885169326588e-05,
+      "loss": 1.0065,
+      "step": 2433
+    },
+    {
+      "epoch": 0.9458855532886428,
+      "grad_norm": 0.2009819597005844,
+      "learning_rate": 1.0977033865317245e-05,
+      "loss": 1.04,
+      "step": 2434
+    },
+    {
+      "epoch": 0.9462741669095501,
+      "grad_norm": 0.20142634212970734,
+      "learning_rate": 1.0899182561307902e-05,
+      "loss": 1.0101,
+      "step": 2435
+    },
+    {
+      "epoch": 0.9466627805304576,
+      "grad_norm": 0.20323152840137482,
+      "learning_rate": 1.082133125729856e-05,
+      "loss": 1.0039,
+      "step": 2436
+    },
+    {
+      "epoch": 0.947051394151365,
+      "grad_norm": 0.18746018409729004,
+      "learning_rate": 1.0743479953289218e-05,
+      "loss": 0.9876,
+      "step": 2437
+    },
+    {
+      "epoch": 0.9474400077722724,
+      "grad_norm": 0.20016197860240936,
+      "learning_rate": 1.0665628649279875e-05,
+      "loss": 1.0067,
+      "step": 2438
+    },
+    {
+      "epoch": 0.9478286213931798,
+      "grad_norm": 0.19872961938381195,
+      "learning_rate": 1.0587777345270534e-05,
+      "loss": 0.9884,
+      "step": 2439
+    },
+    {
+      "epoch": 0.9482172350140873,
+      "grad_norm": 0.20647788047790527,
+      "learning_rate": 1.0509926041261193e-05,
+      "loss": 1.0088,
+      "step": 2440
+    },
+    {
+      "epoch": 0.9486058486349946,
+      "grad_norm": 0.20790119469165802,
+      "learning_rate": 1.043207473725185e-05,
+      "loss": 1.0201,
+      "step": 2441
+    },
+    {
+      "epoch": 0.9489944622559021,
+      "grad_norm": 0.20318609476089478,
+      "learning_rate": 1.0354223433242507e-05,
+      "loss": 1.0199,
+      "step": 2442
+    },
+    {
+      "epoch": 0.9493830758768095,
+      "grad_norm": 0.21426942944526672,
+      "learning_rate": 1.0276372129233166e-05,
+      "loss": 1.0047,
+      "step": 2443
+    },
+    {
+      "epoch": 0.9497716894977168,
+      "grad_norm": 0.3223714828491211,
+      "learning_rate": 1.0198520825223823e-05,
+      "loss": 1.0532,
+      "step": 2444
+    },
+    {
+      "epoch": 0.9501603031186243,
+      "grad_norm": 0.2070651799440384,
+      "learning_rate": 1.012066952121448e-05,
+      "loss": 1.0576,
+      "step": 2445
+    },
+    {
+      "epoch": 0.9505489167395317,
+      "grad_norm": 0.20618025958538055,
+      "learning_rate": 1.0042818217205139e-05,
+      "loss": 1.061,
+      "step": 2446
+    },
+    {
+      "epoch": 0.9509375303604392,
+      "grad_norm": 0.20535731315612793,
+      "learning_rate": 9.964966913195796e-06,
+      "loss": 0.9923,
+      "step": 2447
+    },
+    {
+      "epoch": 0.9513261439813465,
+      "grad_norm": 0.21038392186164856,
+      "learning_rate": 9.887115609186455e-06,
+      "loss": 1.0257,
+      "step": 2448
+    },
+    {
+      "epoch": 0.951714757602254,
+      "grad_norm": 0.20872676372528076,
+      "learning_rate": 9.809264305177112e-06,
+      "loss": 1.0147,
+      "step": 2449
+    },
+    {
+      "epoch": 0.9521033712231614,
+      "grad_norm": 0.40158966183662415,
+      "learning_rate": 9.73141300116777e-06,
+      "loss": 1.0071,
+      "step": 2450
+    },
+    {
+      "epoch": 0.9524919848440688,
+      "grad_norm": 0.1991165280342102,
+      "learning_rate": 9.653561697158428e-06,
+      "loss": 0.9829,
+      "step": 2451
+    },
+    {
+      "epoch": 0.9528805984649762,
+      "grad_norm": 0.1965460628271103,
+      "learning_rate": 9.575710393149085e-06,
+      "loss": 1.0286,
+      "step": 2452
+    },
+    {
+      "epoch": 0.9532692120858836,
+      "grad_norm": 0.20879510045051575,
+      "learning_rate": 9.497859089139744e-06,
+      "loss": 1.0707,
+      "step": 2453
+    },
+    {
+      "epoch": 0.953657825706791,
+      "grad_norm": 0.19594980776309967,
+      "learning_rate": 9.420007785130401e-06,
+      "loss": 0.9946,
+      "step": 2454
+    },
+    {
+      "epoch": 0.9540464393276984,
+      "grad_norm": 0.19754594564437866,
+      "learning_rate": 9.342156481121058e-06,
+      "loss": 0.9737,
+      "step": 2455
+    },
+    {
+      "epoch": 0.9544350529486059,
+      "grad_norm": 0.21339558064937592,
+      "learning_rate": 9.264305177111717e-06,
+      "loss": 1.0505,
+      "step": 2456
+    },
+    {
+      "epoch": 0.9548236665695132,
+      "grad_norm": 0.20371811091899872,
+      "learning_rate": 9.186453873102376e-06,
+      "loss": 1.0594,
+      "step": 2457
+    },
+    {
+      "epoch": 0.9552122801904207,
+      "grad_norm": 0.20965653657913208,
+      "learning_rate": 9.108602569093033e-06,
+      "loss": 1.0639,
+      "step": 2458
+    },
+    {
+      "epoch": 0.9556008938113281,
+      "grad_norm": 0.20316167175769806,
+      "learning_rate": 9.030751265083692e-06,
+      "loss": 1.0219,
+      "step": 2459
+    },
+    {
+      "epoch": 0.9559895074322355,
+      "grad_norm": 0.19921238720417023,
+      "learning_rate": 8.952899961074349e-06,
+      "loss": 1.0399,
+      "step": 2460
+    },
+    {
+      "epoch": 0.9563781210531429,
+      "grad_norm": 0.196847602725029,
+      "learning_rate": 8.875048657065006e-06,
+      "loss": 0.9678,
+      "step": 2461
+    },
+    {
+      "epoch": 0.9567667346740504,
+      "grad_norm": 0.20746973156929016,
+      "learning_rate": 8.797197353055665e-06,
+      "loss": 1.0365,
+      "step": 2462
+    },
+    {
+      "epoch": 0.9571553482949577,
+      "grad_norm": 0.3297490179538727,
+      "learning_rate": 8.719346049046322e-06,
+      "loss": 1.0028,
+      "step": 2463
+    },
+    {
+      "epoch": 0.9575439619158651,
+      "grad_norm": 0.2101137936115265,
+      "learning_rate": 8.641494745036979e-06,
+      "loss": 1.0627,
+      "step": 2464
+    },
+    {
+      "epoch": 0.9579325755367726,
+      "grad_norm": 0.2444445937871933,
+      "learning_rate": 8.563643441027638e-06,
+      "loss": 0.9866,
+      "step": 2465
+    },
+    {
+      "epoch": 0.9583211891576799,
+      "grad_norm": 0.20323987305164337,
+      "learning_rate": 8.485792137018297e-06,
+      "loss": 1.0123,
+      "step": 2466
+    },
+    {
+      "epoch": 0.9587098027785874,
+      "grad_norm": 0.21334567666053772,
+      "learning_rate": 8.407940833008954e-06,
+      "loss": 1.0492,
+      "step": 2467
+    },
+    {
+      "epoch": 0.9590984163994948,
+      "grad_norm": 0.19852736592292786,
+      "learning_rate": 8.330089528999611e-06,
+      "loss": 1.0303,
+      "step": 2468
+    },
+    {
+      "epoch": 0.9594870300204023,
+      "grad_norm": 0.1995389610528946,
+      "learning_rate": 8.25223822499027e-06,
+      "loss": 0.9758,
+      "step": 2469
+    },
+    {
+      "epoch": 0.9598756436413096,
+      "grad_norm": 0.19799165427684784,
+      "learning_rate": 8.174386920980927e-06,
+      "loss": 0.9541,
+      "step": 2470
+    },
+    {
+      "epoch": 0.9602642572622171,
+      "grad_norm": 0.21066170930862427,
+      "learning_rate": 8.096535616971584e-06,
+      "loss": 1.0389,
+      "step": 2471
+    },
+    {
+      "epoch": 0.9606528708831245,
+      "grad_norm": 0.19671034812927246,
+      "learning_rate": 8.018684312962243e-06,
+      "loss": 0.9791,
+      "step": 2472
+    },
+    {
+      "epoch": 0.9610414845040318,
+      "grad_norm": 0.2106933444738388,
+      "learning_rate": 7.9408330089529e-06,
+      "loss": 0.9479,
+      "step": 2473
+    },
+    {
+      "epoch": 0.9614300981249393,
+      "grad_norm": 0.20396657288074493,
+      "learning_rate": 7.862981704943557e-06,
+      "loss": 1.0068,
+      "step": 2474
+    },
+    {
+      "epoch": 0.9618187117458467,
+      "grad_norm": 0.19684381783008575,
+      "learning_rate": 7.785130400934216e-06,
+      "loss": 1.0347,
+      "step": 2475
+    },
+    {
+      "epoch": 0.9622073253667541,
+      "grad_norm": 0.19494709372520447,
+      "learning_rate": 7.707279096924875e-06,
+      "loss": 0.9997,
+      "step": 2476
+    },
+    {
+      "epoch": 0.9625959389876615,
+      "grad_norm": 0.21996809542179108,
+      "learning_rate": 7.629427792915532e-06,
+      "loss": 1.0517,
+      "step": 2477
+    },
+    {
+      "epoch": 0.962984552608569,
+      "grad_norm": 0.2083420753479004,
+      "learning_rate": 7.55157648890619e-06,
+      "loss": 1.0483,
+      "step": 2478
+    },
+    {
+      "epoch": 0.9633731662294763,
+      "grad_norm": 0.2018081396818161,
+      "learning_rate": 7.473725184896847e-06,
+      "loss": 1.0167,
+      "step": 2479
+    },
+    {
+      "epoch": 0.9637617798503838,
+      "grad_norm": 0.22427868843078613,
+      "learning_rate": 7.395873880887505e-06,
+      "loss": 0.9759,
+      "step": 2480
+    },
+    {
+      "epoch": 0.9641503934712912,
+      "grad_norm": 0.2190699577331543,
+      "learning_rate": 7.318022576878163e-06,
+      "loss": 1.049,
+      "step": 2481
+    },
+    {
+      "epoch": 0.9645390070921985,
+      "grad_norm": 0.2035333812236786,
+      "learning_rate": 7.24017127286882e-06,
+      "loss": 1.0556,
+      "step": 2482
+    },
+    {
+      "epoch": 0.964927620713106,
+      "grad_norm": 0.20165729522705078,
+      "learning_rate": 7.16231996885948e-06,
+      "loss": 0.9958,
+      "step": 2483
+    },
+    {
+      "epoch": 0.9653162343340134,
+      "grad_norm": 0.20284077525138855,
+      "learning_rate": 7.084468664850137e-06,
+      "loss": 1.0146,
+      "step": 2484
+    },
+    {
+      "epoch": 0.9657048479549208,
+      "grad_norm": 0.1984403133392334,
+      "learning_rate": 7.006617360840795e-06,
+      "loss": 0.9797,
+      "step": 2485
+    },
+    {
+      "epoch": 0.9660934615758282,
+      "grad_norm": 0.22276800870895386,
+      "learning_rate": 6.928766056831453e-06,
+      "loss": 1.042,
+      "step": 2486
+    },
+    {
+      "epoch": 0.9664820751967357,
+      "grad_norm": 0.18282116949558258,
+      "learning_rate": 6.85091475282211e-06,
+      "loss": 0.9681,
+      "step": 2487
+    },
+    {
+      "epoch": 0.966870688817643,
+      "grad_norm": 0.19382023811340332,
+      "learning_rate": 6.773063448812768e-06,
+      "loss": 0.9991,
+      "step": 2488
+    },
+    {
+      "epoch": 0.9672593024385505,
+      "grad_norm": 0.2009381204843521,
+      "learning_rate": 6.695212144803426e-06,
+      "loss": 1.0061,
+      "step": 2489
+    },
+    {
+      "epoch": 0.9676479160594579,
+      "grad_norm": 0.2232959270477295,
+      "learning_rate": 6.617360840794083e-06,
+      "loss": 1.0776,
+      "step": 2490
+    },
+    {
+      "epoch": 0.9680365296803652,
+      "grad_norm": 0.2164563238620758,
+      "learning_rate": 6.539509536784741e-06,
+      "loss": 1.0834,
+      "step": 2491
+    },
+    {
+      "epoch": 0.9684251433012727,
+      "grad_norm": 0.2053539901971817,
+      "learning_rate": 6.4616582327754e-06,
+      "loss": 1.0449,
+      "step": 2492
+    },
+    {
+      "epoch": 0.9688137569221801,
+      "grad_norm": 0.23249384760856628,
+      "learning_rate": 6.383806928766058e-06,
+      "loss": 1.0418,
+      "step": 2493
+    },
+    {
+      "epoch": 0.9692023705430876,
+      "grad_norm": 0.18624578416347504,
+      "learning_rate": 6.305955624756715e-06,
+      "loss": 0.9152,
+      "step": 2494
+    },
+    {
+      "epoch": 0.9695909841639949,
+      "grad_norm": 0.2001798450946808,
+      "learning_rate": 6.228104320747373e-06,
+      "loss": 1.0084,
+      "step": 2495
+    },
+    {
+      "epoch": 0.9699795977849024,
+      "grad_norm": 0.2341216653585434,
+      "learning_rate": 6.150253016738031e-06,
+      "loss": 0.9935,
+      "step": 2496
+    },
+    {
+      "epoch": 0.9703682114058098,
+      "grad_norm": 0.21359120309352875,
+      "learning_rate": 6.072401712728688e-06,
+      "loss": 1.0498,
+      "step": 2497
+    },
+    {
+      "epoch": 0.9707568250267172,
+      "grad_norm": 0.21405139565467834,
+      "learning_rate": 5.994550408719346e-06,
+      "loss": 1.096,
+      "step": 2498
+    },
+    {
+      "epoch": 0.9711454386476246,
+      "grad_norm": 0.2035064846277237,
+      "learning_rate": 5.916699104710005e-06,
+      "loss": 1.0351,
+      "step": 2499
+    },
+    {
+      "epoch": 0.9715340522685321,
+      "grad_norm": 0.19452853500843048,
+      "learning_rate": 5.838847800700662e-06,
+      "loss": 0.9994,
+      "step": 2500
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 2574,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.1321276344029348e+19,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/outputs/checkpoint-313/README.md b/outputs/checkpoint-313/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3abf956c074d00f34a12693c8d6da9738211d7c7
--- /dev/null
+++ b/outputs/checkpoint-313/README.md
@@ -0,0 +1,209 @@
+---
+base_model: unsloth/gpt-oss-20b-unsloth-bnb-4bit
+library_name: peft
+tags:
+- base_model:adapter:unsloth/gpt-oss-20b-unsloth-bnb-4bit
+- lora
+- sft
+- transformers
+- trl
+- unsloth
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.17.1
\ No newline at end of file
diff --git a/outputs/checkpoint-313/adapter_config.json b/outputs/checkpoint-313/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..8d4d3c69b9ee90115d6da73d3bfb98e6ac3721d1
--- /dev/null
+++ b/outputs/checkpoint-313/adapter_config.json
@@ -0,0 +1,45 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": {
+    "base_model_class": "GptOssForCausalLM",
+    "parent_library": "transformers.models.gpt_oss.modeling_gpt_oss"
+  },
+  "base_model_name_or_path": "unsloth/gpt-oss-20b-unsloth-bnb-4bit",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "up_proj",
+    "down_proj",
+    "o_proj",
+    "v_proj",
+    "q_proj",
+    "k_proj",
+    "gate_proj"
+  ],
+  "target_parameters": null,
+  "task_type": null,
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/outputs/checkpoint-313/chat_template.jinja b/outputs/checkpoint-313/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..a3650f886e98b2834c25727759c8e0ab8495f316
--- /dev/null
+++ b/outputs/checkpoint-313/chat_template.jinja
@@ -0,0 +1,315 @@
+{# Copyright 2025-present Unsloth. Apache 2.0 License. Unsloth chat template fixes. Edited from ggml-org & OpenAI #}
+{#-
+  In addition to the normal inputs of `messages` and `tools`, this template also accepts the
+  following kwargs:
+  - "builtin_tools": A list, can contain "browser" and/or "python".
+  - "model_identity": A string that optionally describes the model identity.
+  - "reasoning_effort": A string that describes the reasoning effort, defaults to "medium".
+ #}
+
+{#- Tool Definition Rendering ============================================== #}
+{#- render_typescript_type: renders one JSON-Schema-style param_spec as a TypeScript-like type string (arrays, type lists, oneOf unions, string enums, primitives, nested objects; anything else is "any"). required_params is only forwarded on recursion and is_nullable is not read. #}
+{%- macro render_typescript_type(param_spec, required_params, is_nullable=false) -%}
+    {%- if param_spec.type == "array" -%}
+        {%- if param_spec['items'] -%}
+            {%- if param_spec['items']['type'] == "string" -%}
+                {{- "string[]" }}
+            {%- elif param_spec['items']['type'] == "number" -%}
+                {{- "number[]" }}
+            {%- elif param_spec['items']['type'] == "integer" -%}
+                {{- "number[]" }}
+            {%- elif param_spec['items']['type'] == "boolean" -%}
+                {{- "boolean[]" }}
+            {%- else -%}
+                {%- set inner_type = render_typescript_type(param_spec['items'], required_params) -%}
+                {%- if inner_type == "object | object" or inner_type|length > 50 -%}
+                    {{- "any[]" }}
+                {%- else -%}
+                    {{- inner_type + "[]" }}
+                {%- endif -%}
+            {%- endif -%}
+            {%- if param_spec.nullable -%}
+                {{- " | null" }}
+            {%- endif -%}
+        {%- else -%}
+            {{- "any[]" }}
+            {%- if param_spec.nullable -%}
+                {{- " | null" }}
+            {%- endif -%}
+        {%- endif -%}
+    {%- elif param_spec.type is defined and param_spec.type is iterable and param_spec.type is not string and param_spec.type is not mapping and param_spec.type[0] is defined -%}
+        {#- Handle array of types like ["object", "object"] from Union[dict, list] #}
+        {%- if param_spec.type | length > 1 -%}
+            {{- param_spec.type | join(" | ") }}
+        {%- else -%}
+            {{- param_spec.type[0] }}
+        {%- endif -%}
+    {%- elif param_spec.oneOf -%}
+        {#- Handle oneOf schemas - check for complex unions and fallback to any. A namespace carries the flag because a plain set inside the for loop does not survive the loop. #}
+        {%- set oneof_state = namespace(has_object_variants=false) -%}
+        {%- for variant in param_spec.oneOf -%}
+            {%- if variant.type == "object" -%}
+                {%- set oneof_state.has_object_variants = true -%}
+            {%- endif -%}
+        {%- endfor -%}
+        {%- if oneof_state.has_object_variants and param_spec.oneOf|length > 1 -%}
+            {{- "any" }}
+        {%- else -%}
+            {%- for variant in param_spec.oneOf -%}
+                {{- render_typescript_type(variant, required_params) -}}
+                {%- if variant.description %}
+                    {{- "// " + variant.description }}
+                {%- endif -%}
+                {%- if variant.default is defined %}
+                    {{ "// default: " + variant.default|tojson }}
+                {%- endif -%}
+                {%- if not loop.last %}
+                    {{- " | " }}
+                {% endif -%}
+            {%- endfor -%}
+        {%- endif -%}
+    {%- elif param_spec.type == "string" -%}
+        {%- if param_spec.enum -%}
+            {{- '"' + param_spec.enum|join('" | "') + '"' -}}
+        {%- else -%}
+            {{- "string" }}
+            {%- if param_spec.nullable %}
+                {{- " | null" }}
+            {%- endif -%}
+        {%- endif -%}
+    {%- elif param_spec.type == "number" -%}
+        {{- "number" }}
+    {%- elif param_spec.type == "integer" -%}
+        {{- "number" }}
+    {%- elif param_spec.type == "boolean" -%}
+        {{- "boolean" }}
+    {%- elif param_spec.type == "object" -%}
+        {%- if param_spec.properties -%}
+            {{- "{\n" }}
+            {%- for prop_name, prop_spec in param_spec.properties.items() -%}
+                {{- prop_name -}}
+                {%- if prop_name not in (param_spec.required or []) -%}
+                    {{- "?" }}
+                {%- endif -%}
+                {{- ": " }}
+                {{ render_typescript_type(prop_spec, param_spec.required or []) }}
+                {%- if not loop.last -%}
+                    {{-", " }}
+                {%- endif -%}
+            {%- endfor -%}
+            {{- "}" }}
+        {%- else -%}
+            {{- "object" }}
+        {%- endif -%}
+    {%- else -%}
+        {{- "any" }}
+    {%- endif -%}
+{%- endmacro -%}
+
+{#- render_tool_namespace: emits a "## name" heading and a TypeScript-style namespace block containing one "type tool = (...) => any;" line group per entry of tools (each entry's .function is used). #}
+{%- macro render_tool_namespace(namespace_name, tools) -%}
+    {{- "## " + namespace_name + "\n\n" }}
+    {{- "namespace " + namespace_name + " {\n\n" }}
+    {%- for tool in tools %}
+        {%- set tool = tool.function %}
+        {#- Descriptions may be absent or none in hand-written schemas; concatenating those would raise, so only string descriptions are rendered. #}
+        {%- if tool.description is string %}{{- "// " + tool.description + "\n" }}{%- endif %}
+        {{- "type "+ tool.name + " = " }}
+        {%- if tool.parameters and tool.parameters.properties -%}
+            {{- "(_: " }}
+            {{- "{\n" }}
+            {%- for param_name, param_spec in tool.parameters.properties.items() %}
+                {%- if param_spec.description is string %}{{- "// " + param_spec.description + "\n" }}{%- endif %}
+                {{- param_name }}
+                {%- if param_name not in (tool.parameters.required or []) -%}
+                    {{- "?" }}
+                {%- endif -%}
+                {{- ": " }}
+                {{- render_typescript_type(param_spec, tool.parameters.required or []) }}
+                {%- if param_spec.default is defined -%}
+                    {#- enum/oneOf defaults that are strings are printed raw (unquoted) as before; non-string defaults go through tojson because str + non-str concatenation would raise. #}
+                    {%- if param_spec.enum %}
+                        {{- ", // default: " + (param_spec.default if param_spec.default is string else param_spec.default|tojson) }}
+                    {%- elif param_spec.oneOf %}
+                        {{- "// default: " + (param_spec.default if param_spec.default is string else param_spec.default|tojson) }}
+                    {%- else %}
+                        {{- ", // default: " + param_spec.default|tojson }}
+                    {%- endif -%}
+                {%- endif -%}
+                {#- Parameters are separated by ",\n"; the last one is followed by the newline only. #}
+                {{- ",\n" if not loop.last else "\n" }}
+            {%- endfor %}
+            {{- "}) => any;\n\n" }}
+        {%- else -%}
+            {{- "() => any;\n\n" }}
+        {%- endif -%}
+    {%- endfor %}
+    {{- "} // namespace " + namespace_name }}
+{%- endmacro -%}
+
+{#- render_builtin_tools: emits the fixed description text for the built-in "browser" tool when browser_tool is truthy and for the built-in "python" tool when python_tool is truthy; emits nothing when both are falsy. #}
+{%- macro render_builtin_tools(browser_tool, python_tool) -%}
+    {%- if browser_tool %}
+        {{- "## browser\n\n" }}
+        {{- "// Tool for browsing.\n" }}
+        {{- "// The `cursor` appears in brackets before each browsing display: `[{cursor}]`.\n" }}
+        {{- "// Cite information from the tool using the following format:\n" }}
+        {{- "// `【{cursor}†L{line_start}(-L{line_end})?】`, for example: `【6†L9-L11】` or `【8†L3】`.\n" }}
+        {{- "// Do not quote more than 10 words directly from the tool output.\n" }}
+        {{- "// sources=web (default: web)\n" }}
+        {{- "namespace browser {\n\n" }}
+        {{- "// Searches for information related to `query` and displays `topn` results.\n" }}
+        {{- "type search = (_: {\n" }}
+        {{- "query: string,\n" }}
+        {{- "topn?: number, // default: 10\n" }}
+        {{- "source?: string,\n" }}
+        {{- "}) => any;\n\n" }}
+        {{- "// Opens the link `id` from the page indicated by `cursor` starting at line number `loc`, showing `num_lines` lines.\n" }}
+        {{- "// Valid link ids are displayed with the formatting: `【{id}†.*】`.\n" }}
+        {{- "// If `cursor` is not provided, the most recent page is implied.\n" }}
+        {{- "// If `id` is a string, it is treated as a fully qualified URL associated with `source`.\n" }}
+        {{- "// If `loc` is not provided, the viewport will be positioned at the beginning of the document or centered on the most relevant passage, if available.\n" }}
+        {{- "// Use this function without `id` to scroll to a new location of an opened page.\n" }}
+        {{- "type open = (_: {\n" }}
+        {{- "id?: number | string, // default: -1\n" }}
+        {{- "cursor?: number, // default: -1\n" }}
+        {{- "loc?: number, // default: -1\n" }}
+        {{- "num_lines?: number, // default: -1\n" }}
+        {{- "view_source?: boolean, // default: false\n" }}
+        {{- "source?: string,\n" }}
+        {{- "}) => any;\n\n" }}
+        {{- "// Finds exact matches of `pattern` in the current page, or the page given by `cursor`.\n" }}
+        {{- "type find = (_: {\n" }}
+        {{- "pattern: string,\n" }}
+        {{- "cursor?: number, // default: -1\n" }}
+        {{- "}) => any;\n\n" }}
+        {{- "} // namespace browser\n\n" }}
+    {%- endif -%}
+    {%- if python_tool %}
+        {{- "## python\n\n" }}
+        {{- "Use this tool to execute Python code in your chain of thought. The code will not be shown to the user. This tool should be used for internal reasoning, but not for code that is intended to be visible to the user (e.g. when creating plots, tables, or files).\n\n" }}
+        {{- "When you send a message containing Python code to python, it will be executed in a stateful Jupyter notebook environment. python will respond with the output of the execution or time out after 120.0 seconds. The drive at '/mnt/data' can be used to save and persist user files. Internet access for this session is UNKNOWN. Depends on the cluster.\n\n" }}
+    {%- endif -%}
+{%- endmacro -%}
+
+{#- System Message Construction: build_system_message renders the system-message body (identity line, knowledge cutoff, current date via strftime_now, reasoning effort defaulting to "medium", optional builtin tool descriptions, channel rules) ===== #}
+{%- macro build_system_message() -%}
+    {%- if model_identity is not defined %}
+        {{- "You are ChatGPT, a large language model trained by OpenAI.\n" -}}
+    {%- else %}
+        {{- model_identity }}{#- NOTE(review): unlike the default identity above, a caller-supplied model_identity gets no trailing newline here, so it runs straight into "Knowledge cutoff" unless it already ends with one - confirm intended. #}
+    {%- endif %}
+    {{- "Knowledge cutoff: 2024-06\n" }}
+    {{- "Current date: " + strftime_now("%Y-%m-%d") + "\n\n" }}
+    {%- if reasoning_effort is not defined %}
+        {%- set reasoning_effort = "medium" %}
+    {%- endif %}
+    {{- "Reasoning: " + reasoning_effort + "\n\n" }}
+    {%- if builtin_tools is defined %}
+        {{- "# Tools\n\n" }}
+        {%- set available_builtin_tools = namespace(browser=false, python=false) %}
+        {%- for tool in builtin_tools %}
+            {%- if tool == "browser" %}
+                {%- set available_builtin_tools.browser = true %}
+            {%- elif tool == "python" %}
+                {%- set available_builtin_tools.python = true %}
+            {%- endif %}
+        {%- endfor %}
+        {{- render_builtin_tools(available_builtin_tools.browser, available_builtin_tools.python) }}
+    {%- endif -%}
+    {{- "# Valid channels: analysis, commentary, final. Channel must be included for every message." }}
+    {%- if tools is defined -%}{#- NOTE(review): "is defined" also holds when tools is passed as none or an empty list, whereas the developer-message section tests tools for truthiness - confirm this sentence should appear in that case. #}
+        {{- "\nCalls to these tools must go to the commentary channel: 'functions'." }}
+    {%- endif -%}
+{%- endmacro -%}
+
+{#- Main Template Logic ================================================= #}
+{#- No defaults are set at this level; a missing model_identity or reasoning_effort is defaulted inside build_system_message. #}
+
+{#- Render system message: always emitted, wrapped in start/message/end tokens. #}
+{{- "<|start|>system<|message|>" }}
+{{- build_system_message() }}
+{{- "<|end|>" }}
+
+{#- Extract developer message: a leading "developer" or "system" message is lifted out of the conversation and the rest goes to the message loop. NOTE(review): messages[0] is read unconditionally, so this presumably expects a non-empty messages list - confirm. #}
+{%- if messages[0].role == "developer" or messages[0].role == "system" %}
+    {%- set developer_message = messages[0].content %}
+    {%- set loop_messages = messages[1:] %}
+{%- else %}
+    {%- set developer_message = "" %}
+    {%- set loop_messages = messages %}
+{%- endif %}
+
+{#- Render developer message: emitted only when there is developer text or tools; tool signatures are rendered under the "functions" namespace. #}
+{%- if developer_message or tools %}
+    {{- "<|start|>developer<|message|>" }}
+    {%- if developer_message %}
+        {{- "# Instructions\n\n" }}
+        {{- developer_message }}
+    {%- endif %}
+    {%- if tools -%}
+        {{- "\n\n" }}
+        {{- "# Tools\n\n" }}
+        {{- render_tool_namespace("functions", tools) }}
+    {%- endif -%}
+    {{- "<|end|>" }}
+{%- endif %}
+
+{#- Render messages: assistant turns (tool call, with thinking, or plain), tool results attributed to the most recent tool call, and every remaining message as a user turn; then the optional generation prompt. #}
+{%- set last_tool_call = namespace(name=none) %}
+{%- for message in loop_messages -%}
+    {#- At this point only assistant/user/tool messages should remain. NOTE(review): any other role falls into the final else branch and is rendered as a user turn - confirm intended. #}
+    {%- if message.role == 'assistant' -%}
+        {#- Truthiness rather than key presence: a tool_calls key holding none or an empty list now falls through to the plain assistant branches instead of failing on tool_calls[0]. #}
+        {%- if message.tool_calls %}
+            {#- We assume max 1 tool call per message, and so we infer the tool call name #}
+            {#- in "tool" messages from the most recent assistant tool call name #}
+            {%- set tool_call = message.tool_calls[0] %}
+            {%- if tool_call.function %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {%- if message.content %}
+                {{- "<|start|>assistant<|channel|>analysis<|message|>" + message.content + "<|end|>" }}
+            {%- endif %}
+            {{- "<|start|>assistant to=" }}
+            {{- "functions." + tool_call.name + "<|channel|>commentary json<|message|>" }}
+            {{- tool_call.arguments|tojson }}
+            {{- "<|call|>" }}
+            {%- set last_tool_call.name = tool_call.name %}
+        {%- elif "thinking" in message and loop.last and not add_generation_prompt %}
+            {#- Only render the CoT if the final turn is an assistant turn and add_generation_prompt is false #}
+            {#- This is a situation that should only occur in training, never in inference. #}
+            {{- "<|start|>assistant<|channel|>analysis<|message|>" + message.thinking + "<|end|>" }}
+            {#- <|return|> indicates the end of generation, but <|end|> does not #}
+            {#- <|return|> should never be an input to the model, but we include it as the final token #}
+            {#- when training, so the model learns to emit it. #}
+            {{- "<|start|>assistant<|channel|>final<|message|>" + message.content + "<|return|>" }}
+            {%- set last_tool_call.name = none %}
+        {%- elif "thinking" in message %}
+            {#- CoT is dropped during all previous turns, so we never render it for inference #}
+            {{- "<|start|>assistant<|channel|>final<|message|>" + message.content + "<|end|>" }}
+            {%- set last_tool_call.name = none %}
+        {%- elif loop.last and not add_generation_prompt %}
+            {#- <|return|> indicates the end of generation, but <|end|> does not #}
+            {#- <|return|> should never be an input to the model, but we include it as the final token #}
+            {#- when training, so the model learns to emit it. #}
+            {{- "<|start|>assistant<|message|>" + message.content + "<|return|>" }}
+        {%- else %}
+            {{- "<|start|>assistant<|message|>" + message.content + "<|end|>" }}
+            {%- set last_tool_call.name = none %}
+        {%- endif %}
+    {%- elif message.role == 'tool' -%}
+        {%- if last_tool_call.name is none %}
+            {{- raise_exception("Message has tool role, but there was no previous assistant message with a tool call!") }}
+        {%- endif %}
+        {{- "<|start|>functions." + last_tool_call.name }}
+        {{- " to=assistant<|channel|>commentary<|message|>" + message.content|tojson + "<|end|>" }}
+    {%- else -%}
+        {{- "<|start|>user<|message|>" + message.content + "<|end|>" }}
+    {%- endif -%}
+{%- endfor -%}
+{#- Generation prompt #}
+{%- if add_generation_prompt -%}
+<|start|>assistant
+{%- endif -%}
+{# Copyright 2025-present Unsloth. Apache 2.0 License. Unsloth chat template fixes. Edited from ggml-org & OpenAI #}
\ No newline at end of file
diff --git a/outputs/checkpoint-313/special_tokens_map.json b/outputs/checkpoint-313/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..6fba18753f4d09dbb8fcdf1482daff36b963d639
--- /dev/null
+++ b/outputs/checkpoint-313/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+  "bos_token": {
+    "content": "<|startoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|return|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|reserved_200017|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/outputs/checkpoint-313/tokenizer.json b/outputs/checkpoint-313/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..6ec3ef1795cbbda6b7cb7d1f114919cbe3fdd647
--- /dev/null
+++ b/outputs/checkpoint-313/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0614fe83cadab421296e664e1f48f4261fa8fef6e03e63bb75c20f38e37d07d3
+size 27868174
diff --git a/outputs/checkpoint-313/tokenizer_config.json b/outputs/checkpoint-313/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..482ae30d27a74c38d2228e69dd37c529fc485a45
--- /dev/null
+++ b/outputs/checkpoint-313/tokenizer_config.json
@@ -0,0 +1,185 @@
+{
+  "added_tokens_decoder": {
+    "199998": {
+      "content": "<|startoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "199999": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200000": {
+      "content": "<|reserved_200000|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200001": {
+      "content": "<|reserved_200001|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200002": {
+      "content": "<|return|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200003": {
+      "content": "<|constrain|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200004": {
+      "content": "<|reserved_200004|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200005": {
+      "content": "<|channel|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200006": {
+      "content": "<|start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200007": {
+      "content": "<|end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200008": {
+      "content": "<|message|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200009": {
+      "content": "<|reserved_200009|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200010": {
+      "content": "<|reserved_200010|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200011": {
+      "content": "<|reserved_200011|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200012": {
+      "content": "<|call|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200013": {
+      "content": "<|reserved_200013|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200014": {
+      "content": "<|reserved_200014|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200015": {
+      "content": "<|reserved_200015|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200016": {
+      "content": "<|reserved_200016|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200017": {
+      "content": "<|reserved_200017|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200018": {
+      "content": "<|endofprompt|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<|startoftext|>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|return|>",
+  "extra_special_tokens": {},
+  "model_input_names": [
+    "input_ids",
+    "attention_mask"
+  ],
+  "model_max_length": 131072,
+  "pad_token": "<|reserved_200017|>",
+  "padding_side": "right",
+  "tokenizer_class": "PreTrainedTokenizerFast",
+  "unk_token": null
+}
diff --git a/outputs/checkpoint-313/trainer_state.json b/outputs/checkpoint-313/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..0064f4acb6c8622ddcd506f380d076ef7b6f3b67
--- /dev/null
+++ b/outputs/checkpoint-313/trainer_state.json
@@ -0,0 +1,2225 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 313,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0032,
+      "grad_norm": 13.684800148010254,
+      "learning_rate": 0.0,
+      "loss": 2.3276,
+      "step": 1
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 13.660787582397461,
+      "learning_rate": 4e-05,
+      "loss": 2.2792,
+      "step": 2
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 13.35280704498291,
+      "learning_rate": 8e-05,
+      "loss": 2.4151,
+      "step": 3
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 6.15027379989624,
+      "learning_rate": 0.00012,
+      "loss": 1.7812,
+      "step": 4
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 1.3168226480484009,
+      "learning_rate": 0.00016,
+      "loss": 1.4536,
+      "step": 5
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.9872580170631409,
+      "learning_rate": 0.0002,
+      "loss": 1.4171,
+      "step": 6
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.7496100664138794,
+      "learning_rate": 0.00019935064935064936,
+      "loss": 1.4168,
+      "step": 7
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.7376005053520203,
+      "learning_rate": 0.00019870129870129872,
+      "loss": 1.3659,
+      "step": 8
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.5281137824058533,
+      "learning_rate": 0.00019805194805194807,
+      "loss": 1.2566,
+      "step": 9
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.5485746264457703,
+      "learning_rate": 0.00019740259740259742,
+      "loss": 1.3761,
+      "step": 10
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.5506592392921448,
+      "learning_rate": 0.00019675324675324675,
+      "loss": 1.3327,
+      "step": 11
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.49382686614990234,
+      "learning_rate": 0.00019610389610389613,
+      "loss": 1.3727,
+      "step": 12
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.36203011870384216,
+      "learning_rate": 0.00019545454545454548,
+      "loss": 1.1515,
+      "step": 13
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.3528599739074707,
+      "learning_rate": 0.0001948051948051948,
+      "loss": 1.2636,
+      "step": 14
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.31244418025016785,
+      "learning_rate": 0.00019415584415584416,
+      "loss": 1.1873,
+      "step": 15
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.3379523754119873,
+      "learning_rate": 0.00019350649350649354,
+      "loss": 1.2657,
+      "step": 16
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.3025083839893341,
+      "learning_rate": 0.00019285714285714286,
+      "loss": 1.2846,
+      "step": 17
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.2560190260410309,
+      "learning_rate": 0.00019220779220779222,
+      "loss": 1.1587,
+      "step": 18
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.2554129958152771,
+      "learning_rate": 0.00019155844155844157,
+      "loss": 1.2812,
+      "step": 19
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.22662702202796936,
+      "learning_rate": 0.00019090909090909092,
+      "loss": 1.1664,
+      "step": 20
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.2515714168548584,
+      "learning_rate": 0.00019025974025974027,
+      "loss": 1.2177,
+      "step": 21
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.24396637082099915,
+      "learning_rate": 0.00018961038961038963,
+      "loss": 1.2053,
+      "step": 22
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.24488303065299988,
+      "learning_rate": 0.00018896103896103895,
+      "loss": 1.2074,
+      "step": 23
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.2168620079755783,
+      "learning_rate": 0.00018831168831168833,
+      "loss": 1.1284,
+      "step": 24
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.24021224677562714,
+      "learning_rate": 0.00018766233766233769,
+      "loss": 1.2169,
+      "step": 25
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.20057056844234467,
+      "learning_rate": 0.000187012987012987,
+      "loss": 1.1031,
+      "step": 26
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.19900795817375183,
+      "learning_rate": 0.00018636363636363636,
+      "loss": 1.1004,
+      "step": 27
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.2019268423318863,
+      "learning_rate": 0.00018571428571428572,
+      "loss": 1.1476,
+      "step": 28
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.1996479034423828,
+      "learning_rate": 0.00018506493506493507,
+      "loss": 1.1455,
+      "step": 29
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.25262022018432617,
+      "learning_rate": 0.00018441558441558442,
+      "loss": 1.1025,
+      "step": 30
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.225438192486763,
+      "learning_rate": 0.00018376623376623378,
+      "loss": 1.1954,
+      "step": 31
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.17834505438804626,
+      "learning_rate": 0.00018311688311688313,
+      "loss": 1.0934,
+      "step": 32
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.20071206986904144,
+      "learning_rate": 0.00018246753246753248,
+      "loss": 1.0488,
+      "step": 33
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.1920139640569687,
+      "learning_rate": 0.00018181818181818183,
+      "loss": 1.123,
+      "step": 34
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.18714852631092072,
+      "learning_rate": 0.0001811688311688312,
+      "loss": 1.0798,
+      "step": 35
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.18315713107585907,
+      "learning_rate": 0.00018051948051948054,
+      "loss": 1.1107,
+      "step": 36
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.19156870245933533,
+      "learning_rate": 0.00017987012987012987,
+      "loss": 1.1125,
+      "step": 37
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.21527768671512604,
+      "learning_rate": 0.00017922077922077922,
+      "loss": 1.1346,
+      "step": 38
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.1871163249015808,
+      "learning_rate": 0.0001785714285714286,
+      "loss": 1.0742,
+      "step": 39
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.17750784754753113,
+      "learning_rate": 0.00017792207792207792,
+      "loss": 1.1323,
+      "step": 40
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.177419051527977,
+      "learning_rate": 0.00017727272727272728,
+      "loss": 1.1405,
+      "step": 41
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.16714292764663696,
+      "learning_rate": 0.00017662337662337663,
+      "loss": 1.1084,
+      "step": 42
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.1610356718301773,
+      "learning_rate": 0.00017597402597402598,
+      "loss": 1.1125,
+      "step": 43
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.2548656761646271,
+      "learning_rate": 0.00017532467532467534,
+      "loss": 1.1114,
+      "step": 44
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.1731044203042984,
+      "learning_rate": 0.0001746753246753247,
+      "loss": 1.1197,
+      "step": 45
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.1739533394575119,
+      "learning_rate": 0.00017402597402597401,
+      "loss": 1.1777,
+      "step": 46
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.2178352177143097,
+      "learning_rate": 0.0001733766233766234,
+      "loss": 1.1111,
+      "step": 47
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.17247150838375092,
+      "learning_rate": 0.00017272727272727275,
+      "loss": 1.1253,
+      "step": 48
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.18075324594974518,
+      "learning_rate": 0.00017207792207792207,
+      "loss": 1.1358,
+      "step": 49
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.15898071229457855,
+      "learning_rate": 0.00017142857142857143,
+      "loss": 1.0606,
+      "step": 50
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.16518613696098328,
+      "learning_rate": 0.0001707792207792208,
+      "loss": 1.0944,
+      "step": 51
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.16035063564777374,
+      "learning_rate": 0.00017012987012987013,
+      "loss": 1.0554,
+      "step": 52
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.1686483472585678,
+      "learning_rate": 0.00016948051948051948,
+      "loss": 1.0384,
+      "step": 53
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.16575631499290466,
+      "learning_rate": 0.00016883116883116884,
+      "loss": 1.0243,
+      "step": 54
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.16840039193630219,
+      "learning_rate": 0.0001681818181818182,
+      "loss": 1.117,
+      "step": 55
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.17616064846515656,
+      "learning_rate": 0.00016753246753246754,
+      "loss": 1.0743,
+      "step": 56
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.168218195438385,
+      "learning_rate": 0.0001668831168831169,
+      "loss": 1.0627,
+      "step": 57
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.17026656866073608,
+      "learning_rate": 0.00016623376623376625,
+      "loss": 1.0059,
+      "step": 58
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.16454458236694336,
+      "learning_rate": 0.0001655844155844156,
+      "loss": 0.9943,
+      "step": 59
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.17185136675834656,
+      "learning_rate": 0.00016493506493506495,
+      "loss": 1.1545,
+      "step": 60
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.17822986841201782,
+      "learning_rate": 0.00016428571428571428,
+      "loss": 1.073,
+      "step": 61
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.1676608771085739,
+      "learning_rate": 0.00016363636363636366,
+      "loss": 1.0886,
+      "step": 62
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.1727771908044815,
+      "learning_rate": 0.000162987012987013,
+      "loss": 1.0432,
+      "step": 63
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.17827573418617249,
+      "learning_rate": 0.00016233766233766234,
+      "loss": 1.083,
+      "step": 64
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.19807517528533936,
+      "learning_rate": 0.0001616883116883117,
+      "loss": 1.1208,
+      "step": 65
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.17693684995174408,
+      "learning_rate": 0.00016103896103896104,
+      "loss": 1.089,
+      "step": 66
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.15489234030246735,
+      "learning_rate": 0.0001603896103896104,
+      "loss": 0.9707,
+      "step": 67
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.16443990170955658,
+      "learning_rate": 0.00015974025974025975,
+      "loss": 1.0643,
+      "step": 68
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.2051103413105011,
+      "learning_rate": 0.0001590909090909091,
+      "loss": 1.1246,
+      "step": 69
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.18824075162410736,
+      "learning_rate": 0.00015844155844155845,
+      "loss": 1.0855,
+      "step": 70
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.18659448623657227,
+      "learning_rate": 0.0001577922077922078,
+      "loss": 1.1412,
+      "step": 71
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.1854114979505539,
+      "learning_rate": 0.00015714285714285716,
+      "loss": 1.0249,
+      "step": 72
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.1876193732023239,
+      "learning_rate": 0.00015649350649350649,
+      "loss": 1.1029,
+      "step": 73
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.1888684630393982,
+      "learning_rate": 0.00015584415584415587,
+      "loss": 1.0789,
+      "step": 74
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.20240606367588043,
+      "learning_rate": 0.0001551948051948052,
+      "loss": 1.0495,
+      "step": 75
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.232120081782341,
+      "learning_rate": 0.00015454545454545454,
+      "loss": 1.0735,
+      "step": 76
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.16897843778133392,
+      "learning_rate": 0.0001538961038961039,
+      "loss": 1.0164,
+      "step": 77
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.18796634674072266,
+      "learning_rate": 0.00015324675324675325,
+      "loss": 1.0676,
+      "step": 78
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.19574032723903656,
+      "learning_rate": 0.0001525974025974026,
+      "loss": 1.0456,
+      "step": 79
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.18007811903953552,
+      "learning_rate": 0.00015194805194805196,
+      "loss": 1.0894,
+      "step": 80
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.18932929635047913,
+      "learning_rate": 0.0001512987012987013,
+      "loss": 1.0729,
+      "step": 81
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.20614288747310638,
+      "learning_rate": 0.00015064935064935066,
+      "loss": 1.0854,
+      "step": 82
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.19291089475154877,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 1.1217,
+      "step": 83
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.18916529417037964,
+      "learning_rate": 0.00014935064935064934,
+      "loss": 1.0963,
+      "step": 84
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.20306220650672913,
+      "learning_rate": 0.00014870129870129872,
+      "loss": 1.0898,
+      "step": 85
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.17870067059993744,
+      "learning_rate": 0.00014805194805194807,
+      "loss": 1.0213,
+      "step": 86
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.18411923944950104,
+      "learning_rate": 0.0001474025974025974,
+      "loss": 1.0844,
+      "step": 87
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.18788227438926697,
+      "learning_rate": 0.00014675324675324675,
+      "loss": 1.0338,
+      "step": 88
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.23874884843826294,
+      "learning_rate": 0.00014610389610389613,
+      "loss": 1.1118,
+      "step": 89
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.19380499422550201,
+      "learning_rate": 0.00014545454545454546,
+      "loss": 1.0464,
+      "step": 90
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.18968750536441803,
+      "learning_rate": 0.0001448051948051948,
+      "loss": 1.0569,
+      "step": 91
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.19545753300189972,
+      "learning_rate": 0.00014415584415584416,
+      "loss": 1.1225,
+      "step": 92
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.19170494377613068,
+      "learning_rate": 0.00014350649350649352,
+      "loss": 1.0602,
+      "step": 93
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.17953918874263763,
+      "learning_rate": 0.00014285714285714287,
+      "loss": 1.032,
+      "step": 94
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.1822536289691925,
+      "learning_rate": 0.00014220779220779222,
+      "loss": 1.0559,
+      "step": 95
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.18591298162937164,
+      "learning_rate": 0.00014155844155844155,
+      "loss": 1.031,
+      "step": 96
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.2129002958536148,
+      "learning_rate": 0.00014090909090909093,
+      "loss": 1.1391,
+      "step": 97
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.18386681377887726,
+      "learning_rate": 0.00014025974025974028,
+      "loss": 0.9919,
+      "step": 98
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.18314239382743835,
+      "learning_rate": 0.0001396103896103896,
+      "loss": 1.0445,
+      "step": 99
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.1999066174030304,
+      "learning_rate": 0.00013896103896103896,
+      "loss": 1.0538,
+      "step": 100
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.18741188943386078,
+      "learning_rate": 0.00013831168831168834,
+      "loss": 1.0722,
+      "step": 101
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.19351010024547577,
+      "learning_rate": 0.00013766233766233766,
+      "loss": 1.0491,
+      "step": 102
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.18859203159809113,
+      "learning_rate": 0.00013701298701298702,
+      "loss": 1.0593,
+      "step": 103
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.1962767392396927,
+      "learning_rate": 0.00013636363636363637,
+      "loss": 1.1344,
+      "step": 104
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.20819440484046936,
+      "learning_rate": 0.00013571428571428572,
+      "loss": 1.1137,
+      "step": 105
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.19590184092521667,
+      "learning_rate": 0.00013506493506493507,
+      "loss": 1.0624,
+      "step": 106
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.18631424009799957,
+      "learning_rate": 0.00013441558441558443,
+      "loss": 1.0587,
+      "step": 107
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.19572143256664276,
+      "learning_rate": 0.00013376623376623375,
+      "loss": 1.0494,
+      "step": 108
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.1910988837480545,
+      "learning_rate": 0.00013311688311688313,
+      "loss": 1.0481,
+      "step": 109
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.19455869495868683,
+      "learning_rate": 0.00013246753246753249,
+      "loss": 1.029,
+      "step": 110
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.18669827282428741,
+      "learning_rate": 0.0001318181818181818,
+      "loss": 1.0513,
+      "step": 111
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.17523664236068726,
+      "learning_rate": 0.0001311688311688312,
+      "loss": 1.0126,
+      "step": 112
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.17929129302501678,
+      "learning_rate": 0.00013051948051948052,
+      "loss": 1.0717,
+      "step": 113
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.19380168616771698,
+      "learning_rate": 0.00012987012987012987,
+      "loss": 1.0324,
+      "step": 114
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.18090228736400604,
+      "learning_rate": 0.00012922077922077922,
+      "loss": 1.0515,
+      "step": 115
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.2067340910434723,
+      "learning_rate": 0.00012857142857142858,
+      "loss": 1.0939,
+      "step": 116
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.1880485862493515,
+      "learning_rate": 0.00012792207792207793,
+      "loss": 1.0986,
+      "step": 117
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.182168647646904,
+      "learning_rate": 0.00012727272727272728,
+      "loss": 1.0109,
+      "step": 118
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.20187129080295563,
+      "learning_rate": 0.00012662337662337663,
+      "loss": 1.0668,
+      "step": 119
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.2082669734954834,
+      "learning_rate": 0.000125974025974026,
+      "loss": 1.054,
+      "step": 120
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.18294434249401093,
+      "learning_rate": 0.00012532467532467534,
+      "loss": 1.0397,
+      "step": 121
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.20515067875385284,
+      "learning_rate": 0.00012467532467532467,
+      "loss": 1.1092,
+      "step": 122
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.1758790761232376,
+      "learning_rate": 0.00012402597402597402,
+      "loss": 0.9755,
+      "step": 123
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.2170792669057846,
+      "learning_rate": 0.0001233766233766234,
+      "loss": 1.0434,
+      "step": 124
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.202157124876976,
+      "learning_rate": 0.00012272727272727272,
+      "loss": 1.1129,
+      "step": 125
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.18556398153305054,
+      "learning_rate": 0.00012207792207792208,
+      "loss": 1.0665,
+      "step": 126
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.20196087658405304,
+      "learning_rate": 0.00012142857142857143,
+      "loss": 1.1,
+      "step": 127
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.1921566128730774,
+      "learning_rate": 0.0001207792207792208,
+      "loss": 1.0918,
+      "step": 128
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.18866224586963654,
+      "learning_rate": 0.00012012987012987014,
+      "loss": 1.0014,
+      "step": 129
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.207601398229599,
+      "learning_rate": 0.00011948051948051949,
+      "loss": 1.0726,
+      "step": 130
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.21592366695404053,
+      "learning_rate": 0.00011883116883116883,
+      "loss": 1.1379,
+      "step": 131
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.2016124576330185,
+      "learning_rate": 0.0001181818181818182,
+      "loss": 1.1428,
+      "step": 132
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.20478437840938568,
+      "learning_rate": 0.00011753246753246753,
+      "loss": 1.121,
+      "step": 133
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.22730594873428345,
+      "learning_rate": 0.00011688311688311689,
+      "loss": 1.0319,
+      "step": 134
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.22592711448669434,
+      "learning_rate": 0.00011623376623376625,
+      "loss": 1.1264,
+      "step": 135
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.20035041868686676,
+      "learning_rate": 0.00011558441558441559,
+      "loss": 1.0686,
+      "step": 136
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.20648567378520966,
+      "learning_rate": 0.00011493506493506494,
+      "loss": 1.0817,
+      "step": 137
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.21222743391990662,
+      "learning_rate": 0.00011428571428571428,
+      "loss": 1.0678,
+      "step": 138
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.2075391560792923,
+      "learning_rate": 0.00011363636363636365,
+      "loss": 1.0897,
+      "step": 139
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.1964101791381836,
+      "learning_rate": 0.000112987012987013,
+      "loss": 1.0906,
+      "step": 140
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.22406511008739471,
+      "learning_rate": 0.00011233766233766234,
+      "loss": 1.0594,
+      "step": 141
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.23787978291511536,
+      "learning_rate": 0.00011168831168831168,
+      "loss": 1.1053,
+      "step": 142
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.21196185052394867,
+      "learning_rate": 0.00011103896103896105,
+      "loss": 1.0923,
+      "step": 143
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.21042804419994354,
+      "learning_rate": 0.0001103896103896104,
+      "loss": 1.0381,
+      "step": 144
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.2267436534166336,
+      "learning_rate": 0.00010974025974025974,
+      "loss": 1.0818,
+      "step": 145
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.23742735385894775,
+      "learning_rate": 0.00010909090909090909,
+      "loss": 1.0872,
+      "step": 146
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.17787213623523712,
+      "learning_rate": 0.00010844155844155846,
+      "loss": 1.03,
+      "step": 147
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.22422832250595093,
+      "learning_rate": 0.0001077922077922078,
+      "loss": 1.0738,
+      "step": 148
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.22946301102638245,
+      "learning_rate": 0.00010714285714285715,
+      "loss": 1.0274,
+      "step": 149
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.2137996405363083,
+      "learning_rate": 0.00010649350649350649,
+      "loss": 1.0539,
+      "step": 150
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.1748756766319275,
+      "learning_rate": 0.00010584415584415586,
+      "loss": 1.0355,
+      "step": 151
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.22275175154209137,
+      "learning_rate": 0.0001051948051948052,
+      "loss": 1.1696,
+      "step": 152
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.20996077358722687,
+      "learning_rate": 0.00010454545454545455,
+      "loss": 1.0303,
+      "step": 153
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.1945938766002655,
+      "learning_rate": 0.00010389610389610389,
+      "loss": 0.9747,
+      "step": 154
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.1970377266407013,
+      "learning_rate": 0.00010324675324675325,
+      "loss": 1.0358,
+      "step": 155
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.18814732134342194,
+      "learning_rate": 0.00010259740259740261,
+      "loss": 0.9612,
+      "step": 156
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.2153233289718628,
+      "learning_rate": 0.00010194805194805195,
+      "loss": 1.0749,
+      "step": 157
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.21788008511066437,
+      "learning_rate": 0.0001012987012987013,
+      "loss": 1.0883,
+      "step": 158
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.214650496840477,
+      "learning_rate": 0.00010064935064935067,
+      "loss": 1.0539,
+      "step": 159
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.19312834739685059,
+      "learning_rate": 0.0001,
+      "loss": 1.0657,
+      "step": 160
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.19916598498821259,
+      "learning_rate": 9.935064935064936e-05,
+      "loss": 1.0478,
+      "step": 161
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.2057606726884842,
+      "learning_rate": 9.870129870129871e-05,
+      "loss": 1.0094,
+      "step": 162
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.22159607708454132,
+      "learning_rate": 9.805194805194806e-05,
+      "loss": 1.0952,
+      "step": 163
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.18274275958538055,
+      "learning_rate": 9.74025974025974e-05,
+      "loss": 1.0065,
+      "step": 164
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.19835162162780762,
+      "learning_rate": 9.675324675324677e-05,
+      "loss": 1.0742,
+      "step": 165
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.2114904820919037,
+      "learning_rate": 9.610389610389611e-05,
+      "loss": 1.1109,
+      "step": 166
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.21488523483276367,
+      "learning_rate": 9.545454545454546e-05,
+      "loss": 1.0465,
+      "step": 167
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.19870303571224213,
+      "learning_rate": 9.480519480519481e-05,
+      "loss": 1.0318,
+      "step": 168
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.20413029193878174,
+      "learning_rate": 9.415584415584417e-05,
+      "loss": 1.0817,
+      "step": 169
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.1847231239080429,
+      "learning_rate": 9.35064935064935e-05,
+      "loss": 1.0144,
+      "step": 170
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.2715964913368225,
+      "learning_rate": 9.285714285714286e-05,
+      "loss": 0.9832,
+      "step": 171
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.2225002497434616,
+      "learning_rate": 9.220779220779221e-05,
+      "loss": 1.1051,
+      "step": 172
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.22931510210037231,
+      "learning_rate": 9.155844155844156e-05,
+      "loss": 1.1042,
+      "step": 173
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.21848627924919128,
+      "learning_rate": 9.090909090909092e-05,
+      "loss": 1.1151,
+      "step": 174
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.19852259755134583,
+      "learning_rate": 9.025974025974027e-05,
+      "loss": 1.0889,
+      "step": 175
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.2080363780260086,
+      "learning_rate": 8.961038961038961e-05,
+      "loss": 1.0777,
+      "step": 176
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.22391024231910706,
+      "learning_rate": 8.896103896103896e-05,
+      "loss": 1.1092,
+      "step": 177
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.21793846786022186,
+      "learning_rate": 8.831168831168831e-05,
+      "loss": 1.044,
+      "step": 178
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.2009749859571457,
+      "learning_rate": 8.766233766233767e-05,
+      "loss": 1.0198,
+      "step": 179
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.19432318210601807,
+      "learning_rate": 8.701298701298701e-05,
+      "loss": 1.075,
+      "step": 180
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.18634547293186188,
+      "learning_rate": 8.636363636363637e-05,
+      "loss": 0.9964,
+      "step": 181
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.1947103589773178,
+      "learning_rate": 8.571428571428571e-05,
+      "loss": 1.0025,
+      "step": 182
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.23098671436309814,
+      "learning_rate": 8.506493506493507e-05,
+      "loss": 1.0562,
+      "step": 183
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.19686414301395416,
+      "learning_rate": 8.441558441558442e-05,
+      "loss": 1.0285,
+      "step": 184
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.19852428138256073,
+      "learning_rate": 8.376623376623377e-05,
+      "loss": 1.0054,
+      "step": 185
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.21483510732650757,
+      "learning_rate": 8.311688311688312e-05,
+      "loss": 1.108,
+      "step": 186
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.23313644528388977,
+      "learning_rate": 8.246753246753248e-05,
+      "loss": 1.1383,
+      "step": 187
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.21453145146369934,
+      "learning_rate": 8.181818181818183e-05,
+      "loss": 1.0911,
+      "step": 188
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.20268195867538452,
+      "learning_rate": 8.116883116883117e-05,
+      "loss": 1.0145,
+      "step": 189
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.20576398074626923,
+      "learning_rate": 8.051948051948052e-05,
+      "loss": 1.0829,
+      "step": 190
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.21732626855373383,
+      "learning_rate": 7.987012987012987e-05,
+      "loss": 1.0152,
+      "step": 191
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.22046895325183868,
+      "learning_rate": 7.922077922077923e-05,
+      "loss": 1.1311,
+      "step": 192
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.19727715849876404,
+      "learning_rate": 7.857142857142858e-05,
+      "loss": 1.0364,
+      "step": 193
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.20861488580703735,
+      "learning_rate": 7.792207792207793e-05,
+      "loss": 1.0435,
+      "step": 194
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.18545083701610565,
+      "learning_rate": 7.727272727272727e-05,
+      "loss": 1.0299,
+      "step": 195
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.19965052604675293,
+      "learning_rate": 7.662337662337662e-05,
+      "loss": 1.0511,
+      "step": 196
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.23673909902572632,
+      "learning_rate": 7.597402597402598e-05,
+      "loss": 1.081,
+      "step": 197
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.17583179473876953,
+      "learning_rate": 7.532467532467533e-05,
+      "loss": 0.9808,
+      "step": 198
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.2129366099834442,
+      "learning_rate": 7.467532467532467e-05,
+      "loss": 1.0522,
+      "step": 199
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.21679140627384186,
+      "learning_rate": 7.402597402597404e-05,
+      "loss": 1.0567,
+      "step": 200
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.2032000720500946,
+      "learning_rate": 7.337662337662338e-05,
+      "loss": 1.0466,
+      "step": 201
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.1887970268726349,
+      "learning_rate": 7.272727272727273e-05,
+      "loss": 1.0329,
+      "step": 202
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.21060192584991455,
+      "learning_rate": 7.207792207792208e-05,
+      "loss": 1.1021,
+      "step": 203
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.21191425621509552,
+      "learning_rate": 7.142857142857143e-05,
+      "loss": 0.99,
+      "step": 204
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.1995989829301834,
+      "learning_rate": 7.077922077922077e-05,
+      "loss": 1.0526,
+      "step": 205
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.1849513053894043,
+      "learning_rate": 7.012987012987014e-05,
+      "loss": 0.9998,
+      "step": 206
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.1948779672384262,
+      "learning_rate": 6.948051948051948e-05,
+      "loss": 1.075,
+      "step": 207
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.20374052226543427,
+      "learning_rate": 6.883116883116883e-05,
+      "loss": 1.0933,
+      "step": 208
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.2102465033531189,
+      "learning_rate": 6.818181818181818e-05,
+      "loss": 1.1123,
+      "step": 209
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.21376173198223114,
+      "learning_rate": 6.753246753246754e-05,
+      "loss": 1.1233,
+      "step": 210
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.20934203267097473,
+      "learning_rate": 6.688311688311688e-05,
+      "loss": 1.1374,
+      "step": 211
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.18604128062725067,
+      "learning_rate": 6.623376623376624e-05,
+      "loss": 1.0213,
+      "step": 212
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.19644233584403992,
+      "learning_rate": 6.55844155844156e-05,
+      "loss": 1.0046,
+      "step": 213
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.18479463458061218,
+      "learning_rate": 6.493506493506494e-05,
+      "loss": 0.9792,
+      "step": 214
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.1945149153470993,
+      "learning_rate": 6.428571428571429e-05,
+      "loss": 1.0584,
+      "step": 215
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.2070147544145584,
+      "learning_rate": 6.363636363636364e-05,
+      "loss": 1.071,
+      "step": 216
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.19645985960960388,
+      "learning_rate": 6.2987012987013e-05,
+      "loss": 1.0721,
+      "step": 217
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.1960117667913437,
+      "learning_rate": 6.233766233766233e-05,
+      "loss": 1.071,
+      "step": 218
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.20168261229991913,
+      "learning_rate": 6.16883116883117e-05,
+      "loss": 1.0808,
+      "step": 219
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.21254412829875946,
+      "learning_rate": 6.103896103896104e-05,
+      "loss": 1.0287,
+      "step": 220
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.21271063387393951,
+      "learning_rate": 6.03896103896104e-05,
+      "loss": 1.0605,
+      "step": 221
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.2081408053636551,
+      "learning_rate": 5.9740259740259744e-05,
+      "loss": 1.091,
+      "step": 222
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.21113798022270203,
+      "learning_rate": 5.90909090909091e-05,
+      "loss": 1.1323,
+      "step": 223
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.20670844614505768,
+      "learning_rate": 5.844155844155844e-05,
+      "loss": 1.0955,
+      "step": 224
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.2010120451450348,
+      "learning_rate": 5.7792207792207796e-05,
+      "loss": 1.1068,
+      "step": 225
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.20379121601581573,
+      "learning_rate": 5.714285714285714e-05,
+      "loss": 1.0419,
+      "step": 226
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.22799807786941528,
+      "learning_rate": 5.64935064935065e-05,
+      "loss": 1.0904,
+      "step": 227
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.2005995213985443,
+      "learning_rate": 5.584415584415584e-05,
+      "loss": 1.078,
+      "step": 228
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.20329605042934418,
+      "learning_rate": 5.51948051948052e-05,
+      "loss": 1.0245,
+      "step": 229
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.19283504784107208,
+      "learning_rate": 5.4545454545454546e-05,
+      "loss": 1.0367,
+      "step": 230
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.20624355971813202,
+      "learning_rate": 5.38961038961039e-05,
+      "loss": 1.1046,
+      "step": 231
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.21362991631031036,
+      "learning_rate": 5.3246753246753245e-05,
+      "loss": 1.1104,
+      "step": 232
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.20447863638401031,
+      "learning_rate": 5.25974025974026e-05,
+      "loss": 1.0514,
+      "step": 233
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.1974381059408188,
+      "learning_rate": 5.1948051948051944e-05,
+      "loss": 1.0048,
+      "step": 234
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.21237170696258545,
+      "learning_rate": 5.1298701298701304e-05,
+      "loss": 1.1299,
+      "step": 235
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.21224971115589142,
+      "learning_rate": 5.064935064935065e-05,
+      "loss": 1.05,
+      "step": 236
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.19865018129348755,
+      "learning_rate": 5e-05,
+      "loss": 1.0665,
+      "step": 237
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.19199275970458984,
+      "learning_rate": 4.9350649350649355e-05,
+      "loss": 0.9531,
+      "step": 238
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.19573214650154114,
+      "learning_rate": 4.87012987012987e-05,
+      "loss": 1.0318,
+      "step": 239
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.21338805556297302,
+      "learning_rate": 4.8051948051948054e-05,
+      "loss": 1.0343,
+      "step": 240
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.2254691869020462,
+      "learning_rate": 4.740259740259741e-05,
+      "loss": 1.0472,
+      "step": 241
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.18101665377616882,
+      "learning_rate": 4.675324675324675e-05,
+      "loss": 1.017,
+      "step": 242
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.22090592980384827,
+      "learning_rate": 4.6103896103896106e-05,
+      "loss": 1.0389,
+      "step": 243
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.20865507423877716,
+      "learning_rate": 4.545454545454546e-05,
+      "loss": 1.0369,
+      "step": 244
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.21619610488414764,
+      "learning_rate": 4.4805194805194805e-05,
+      "loss": 1.109,
+      "step": 245
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.21694771945476532,
+      "learning_rate": 4.415584415584416e-05,
+      "loss": 1.0525,
+      "step": 246
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.2182662934064865,
+      "learning_rate": 4.3506493506493503e-05,
+      "loss": 1.0331,
+      "step": 247
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.2026486098766327,
+      "learning_rate": 4.2857142857142856e-05,
+      "loss": 1.027,
+      "step": 248
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.19606547057628632,
+      "learning_rate": 4.220779220779221e-05,
+      "loss": 1.0242,
+      "step": 249
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.22107470035552979,
+      "learning_rate": 4.155844155844156e-05,
+      "loss": 1.0924,
+      "step": 250
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.19960008561611176,
+      "learning_rate": 4.0909090909090915e-05,
+      "loss": 1.0384,
+      "step": 251
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.1945488154888153,
+      "learning_rate": 4.025974025974026e-05,
+      "loss": 1.0673,
+      "step": 252
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.22067414224147797,
+      "learning_rate": 3.9610389610389614e-05,
+      "loss": 1.0426,
+      "step": 253
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.19010980427265167,
+      "learning_rate": 3.8961038961038966e-05,
+      "loss": 1.0617,
+      "step": 254
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.18781176209449768,
+      "learning_rate": 3.831168831168831e-05,
+      "loss": 1.0243,
+      "step": 255
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.20388829708099365,
+      "learning_rate": 3.7662337662337665e-05,
+      "loss": 1.0476,
+      "step": 256
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.19911155104637146,
+      "learning_rate": 3.701298701298702e-05,
+      "loss": 1.0324,
+      "step": 257
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.19884039461612701,
+      "learning_rate": 3.6363636363636364e-05,
+      "loss": 1.0242,
+      "step": 258
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.19036105275154114,
+      "learning_rate": 3.571428571428572e-05,
+      "loss": 1.0323,
+      "step": 259
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.20039844512939453,
+      "learning_rate": 3.506493506493507e-05,
+      "loss": 1.0749,
+      "step": 260
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.1899934560060501,
+      "learning_rate": 3.4415584415584416e-05,
+      "loss": 1.0115,
+      "step": 261
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.20019090175628662,
+      "learning_rate": 3.376623376623377e-05,
+      "loss": 1.0782,
+      "step": 262
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.2020583152770996,
+      "learning_rate": 3.311688311688312e-05,
+      "loss": 1.0687,
+      "step": 263
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.21407337486743927,
+      "learning_rate": 3.246753246753247e-05,
+      "loss": 1.1015,
+      "step": 264
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.1871640682220459,
+      "learning_rate": 3.181818181818182e-05,
+      "loss": 0.9637,
+      "step": 265
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.21622811257839203,
+      "learning_rate": 3.1168831168831166e-05,
+      "loss": 1.1222,
+      "step": 266
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.22504661977291107,
+      "learning_rate": 3.051948051948052e-05,
+      "loss": 1.132,
+      "step": 267
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.19177629053592682,
+      "learning_rate": 2.9870129870129872e-05,
+      "loss": 1.0281,
+      "step": 268
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.1970544159412384,
+      "learning_rate": 2.922077922077922e-05,
+      "loss": 1.0393,
+      "step": 269
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.21554522216320038,
+      "learning_rate": 2.857142857142857e-05,
+      "loss": 1.074,
+      "step": 270
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.21131229400634766,
+      "learning_rate": 2.792207792207792e-05,
+      "loss": 1.054,
+      "step": 271
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.19816523790359497,
+      "learning_rate": 2.7272727272727273e-05,
+      "loss": 1.0456,
+      "step": 272
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.21075209975242615,
+      "learning_rate": 2.6623376623376623e-05,
+      "loss": 1.0758,
+      "step": 273
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.2296527624130249,
+      "learning_rate": 2.5974025974025972e-05,
+      "loss": 1.0917,
+      "step": 274
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.19722610712051392,
+      "learning_rate": 2.5324675324675325e-05,
+      "loss": 1.0704,
+      "step": 275
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.18721099197864532,
+      "learning_rate": 2.4675324675324678e-05,
+      "loss": 0.9919,
+      "step": 276
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.20244193077087402,
+      "learning_rate": 2.4025974025974027e-05,
+      "loss": 1.0368,
+      "step": 277
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.19518914818763733,
+      "learning_rate": 2.3376623376623376e-05,
+      "loss": 1.0436,
+      "step": 278
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.19650357961654663,
+      "learning_rate": 2.272727272727273e-05,
+      "loss": 1.0306,
+      "step": 279
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.20320096611976624,
+      "learning_rate": 2.207792207792208e-05,
+      "loss": 1.0941,
+      "step": 280
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.18296951055526733,
+      "learning_rate": 2.1428571428571428e-05,
+      "loss": 0.9802,
+      "step": 281
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.21357610821723938,
+      "learning_rate": 2.077922077922078e-05,
+      "loss": 1.0449,
+      "step": 282
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.193921759724617,
+      "learning_rate": 2.012987012987013e-05,
+      "loss": 1.0116,
+      "step": 283
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.1953902244567871,
+      "learning_rate": 1.9480519480519483e-05,
+      "loss": 1.0105,
+      "step": 284
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.19440975785255432,
+      "learning_rate": 1.8831168831168833e-05,
+      "loss": 0.9952,
+      "step": 285
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.21054105460643768,
+      "learning_rate": 1.8181818181818182e-05,
+      "loss": 1.0701,
+      "step": 286
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.18844804167747498,
+      "learning_rate": 1.7532467532467535e-05,
+      "loss": 1.0146,
+      "step": 287
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.2067311704158783,
+      "learning_rate": 1.6883116883116884e-05,
+      "loss": 1.0781,
+      "step": 288
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.1941213756799698,
+      "learning_rate": 1.6233766233766234e-05,
+      "loss": 0.9814,
+      "step": 289
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.22726193070411682,
+      "learning_rate": 1.5584415584415583e-05,
+      "loss": 1.1431,
+      "step": 290
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.18025581538677216,
+      "learning_rate": 1.4935064935064936e-05,
+      "loss": 0.9649,
+      "step": 291
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.21535000205039978,
+      "learning_rate": 1.4285714285714285e-05,
+      "loss": 1.0441,
+      "step": 292
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.20014546811580658,
+      "learning_rate": 1.3636363636363637e-05,
+      "loss": 1.0166,
+      "step": 293
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.22738787531852722,
+      "learning_rate": 1.2987012987012986e-05,
+      "loss": 1.0564,
+      "step": 294
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.2020861804485321,
+      "learning_rate": 1.2337662337662339e-05,
+      "loss": 1.1241,
+      "step": 295
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.19888809323310852,
+      "learning_rate": 1.1688311688311688e-05,
+      "loss": 1.1114,
+      "step": 296
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.20912377536296844,
+      "learning_rate": 1.103896103896104e-05,
+      "loss": 1.0971,
+      "step": 297
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.21206621825695038,
+      "learning_rate": 1.038961038961039e-05,
+      "loss": 1.0601,
+      "step": 298
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.18667680025100708,
+      "learning_rate": 9.740259740259742e-06,
+      "loss": 1.0291,
+      "step": 299
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.21125559508800507,
+      "learning_rate": 9.090909090909091e-06,
+      "loss": 1.0483,
+      "step": 300
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.21776145696640015,
+      "learning_rate": 8.441558441558442e-06,
+      "loss": 0.9912,
+      "step": 301
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.20144303143024445,
+      "learning_rate": 7.792207792207792e-06,
+      "loss": 1.0357,
+      "step": 302
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.1984029859304428,
+      "learning_rate": 7.142857142857143e-06,
+      "loss": 1.0648,
+      "step": 303
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.17972829937934875,
+      "learning_rate": 6.493506493506493e-06,
+      "loss": 1.0033,
+      "step": 304
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.1818286031484604,
+      "learning_rate": 5.844155844155844e-06,
+      "loss": 0.997,
+      "step": 305
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.19670912623405457,
+      "learning_rate": 5.194805194805195e-06,
+      "loss": 1.0256,
+      "step": 306
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.20527283847332,
+      "learning_rate": 4.5454545454545455e-06,
+      "loss": 1.0348,
+      "step": 307
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.19025909900665283,
+      "learning_rate": 3.896103896103896e-06,
+      "loss": 1.0682,
+      "step": 308
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.19544818997383118,
+      "learning_rate": 3.2467532467532465e-06,
+      "loss": 0.9872,
+      "step": 309
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.22112183272838593,
+      "learning_rate": 2.5974025974025976e-06,
+      "loss": 1.0661,
+      "step": 310
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.23328153789043427,
+      "learning_rate": 1.948051948051948e-06,
+      "loss": 1.0691,
+      "step": 311
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.20181375741958618,
+      "learning_rate": 1.2987012987012988e-06,
+      "loss": 0.9416,
+      "step": 312
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.29312625527381897,
+      "learning_rate": 6.493506493506494e-07,
+      "loss": 1.1216,
+      "step": 313
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 313,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.768425540391928e+18,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/outputs/checkpoint-500/README.md b/outputs/checkpoint-500/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3abf956c074d00f34a12693c8d6da9738211d7c7
--- /dev/null
+++ b/outputs/checkpoint-500/README.md
@@ -0,0 +1,209 @@
+---
+base_model: unsloth/gpt-oss-20b-unsloth-bnb-4bit
+library_name: peft
+tags:
+- base_model:adapter:unsloth/gpt-oss-20b-unsloth-bnb-4bit
+- lora
+- sft
+- transformers
+- trl
+- unsloth
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.17.1
\ No newline at end of file
diff --git a/outputs/checkpoint-500/adapter_config.json b/outputs/checkpoint-500/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..e285b9b6e018b5b9f23736d6699eb1a4267764e7
--- /dev/null
+++ b/outputs/checkpoint-500/adapter_config.json
@@ -0,0 +1,45 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": {
+    "base_model_class": "GptOssForCausalLM",
+    "parent_library": "transformers.models.gpt_oss.modeling_gpt_oss"
+  },
+  "base_model_name_or_path": "unsloth/gpt-oss-20b-unsloth-bnb-4bit",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "o_proj",
+    "v_proj",
+    "up_proj",
+    "down_proj",
+    "gate_proj",
+    "k_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": null,
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/outputs/checkpoint-500/chat_template.jinja b/outputs/checkpoint-500/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..a3650f886e98b2834c25727759c8e0ab8495f316
--- /dev/null
+++ b/outputs/checkpoint-500/chat_template.jinja
@@ -0,0 +1,315 @@
+{# Copyright 2025-present Unsloth. Apache 2.0 License. Unsloth chat template fixes. Edited from ggml-org & OpenAI #}
+{#-
+  In addition to the normal inputs of `messages` and `tools`, this template also accepts the
+  following kwargs:
+  - "builtin_tools": A list, can contain "browser" and/or "python".
+  - "model_identity": A string that optionally describes the model identity.
+  - "reasoning_effort": A string that describes the reasoning effort, defaults to "medium".
+ #}
+
+{#- Tool Definition Rendering ============================================== #}
+{%- macro render_typescript_type(param_spec, required_params, is_nullable=false) -%}
+    {#- Render one JSON-Schema-style parameter spec as a TypeScript-like type expression. #}
+    {#- Branches are tried in order: array, list-valued type, oneOf, string, number/integer, boolean, object; anything else renders as "any". #}
+    {#- required_params is only forwarded to recursive calls and is_nullable is never read; both stay in the signature for existing callers. #}
+    {%- if param_spec.type == "array" -%}
+        {%- if param_spec['items'] -%}
+            {%- if param_spec['items']['type'] == "string" -%}
+                {{- "string[]" }}
+            {%- elif param_spec['items']['type'] in ["number", "integer"] -%}
+                {{- "number[]" }}
+            {%- elif param_spec['items']['type'] == "boolean" -%}
+                {{- "boolean[]" }}
+            {%- else -%}
+                {%- set inner_type = render_typescript_type(param_spec['items'], required_params) -%}
+                {%- if inner_type == "object | object" or inner_type|length > 50 -%}
+                    {{- "any[]" }}
+                {%- else -%}
+                    {{- inner_type + "[]" }}
+                {%- endif -%}
+            {%- endif -%}
+            {%- if param_spec.nullable -%}
+                {{- " | null" }}
+            {%- endif -%}
+        {%- else -%}
+            {{- "any[]" }}
+            {%- if param_spec.nullable -%}
+                {{- " | null" }}
+            {%- endif -%}
+        {%- endif -%}
+    {%- elif param_spec.type is defined and param_spec.type is iterable and param_spec.type is not string and param_spec.type is not mapping and param_spec.type[0] is defined -%}
+        {#- Handle array of types like ["object", "object"] from Union[dict, list] #}
+        {%- if param_spec.type | length > 1 -%}
+            {{- param_spec.type | join(" | ") }}
+        {%- else -%}
+            {{- param_spec.type[0] }}
+        {%- endif -%}
+    {%- elif param_spec.oneOf -%}
+        {#- Handle oneOf schemas - check for complex unions and fallback to any #}
+        {#- The flag lives in a namespace(): a plain `set` inside the for loop is scoped to the loop body and would never reach the check below. #}
+        {%- set oneof_state = namespace(has_object_variants=false) -%}
+        {%- for variant in param_spec.oneOf -%}
+            {%- if variant.type == "object" -%}
+                {%- set oneof_state.has_object_variants = true -%}
+            {%- endif -%}
+        {%- endfor -%}
+        {%- if oneof_state.has_object_variants and param_spec.oneOf|length > 1 -%}
+            {{- "any" }}
+        {%- else -%}
+            {%- for variant in param_spec.oneOf -%}
+                {{- render_typescript_type(variant, required_params) -}}
+                {%- if variant.description %}
+                    {{- "// " + variant.description }}
+                {%- endif -%}
+                {%- if variant.default is defined %}
+                    {{ "// default: " + variant.default|tojson }}
+                {%- endif -%}
+                {%- if not loop.last %}
+                    {{- " | " }}
+                {% endif -%}
+            {%- endfor -%}
+        {%- endif -%}
+    {%- elif param_spec.type == "string" -%}
+        {%- if param_spec.enum -%}
+            {{- '"' + param_spec.enum|join('" | "') + '"' -}}
+        {%- else -%}
+            {{- "string" }}
+            {%- if param_spec.nullable %}
+                {{- " | null" }}
+            {%- endif -%}
+        {%- endif -%}
+    {%- elif param_spec.type in ["number", "integer"] -%}
+        {{- "number" }}
+    {%- elif param_spec.type == "boolean" -%}
+        {{- "boolean" }}
+
+    {%- elif param_spec.type == "object" -%}
+        {%- if param_spec.properties -%}
+            {{- "{\n" }}
+            {%- for prop_name, prop_spec in param_spec.properties.items() -%}
+                {{- prop_name -}}
+                {%- if prop_name not in (param_spec.required or []) -%}
+                    {{- "?" }}
+                {%- endif -%}
+                {{- ": " }}
+                {{ render_typescript_type(prop_spec, param_spec.required or []) }}
+                {%- if not loop.last -%}
+                    {{-", " }}
+                {%- endif -%}
+            {%- endfor -%}
+            {{- "}" }}
+        {%- else -%}
+            {{- "object" }}
+        {%- endif -%}
+    {%- else -%}
+        {{- "any" }}
+    {%- endif -%}
+{%- endmacro -%}
+
+{%- macro render_tool_namespace(namespace_name, tools) -%}
+    {#- Render `tools` as a TypeScript-style namespace named namespace_name, one `type <name> = ...` entry per tool; each tool is read through its `function` attribute. #}
+    {{- "## " + namespace_name + "\n\n" }}
+    {{- "namespace " + namespace_name + " {\n\n" }}
+    {%- for tool in tools %}
+        {%- set tool = tool.function %}
+        {#- Descriptions are optional: the "// ..." line is emitted only for string values, because "+" with an undefined or none operand raises. #}
+        {{- ("// " + tool.description + "\n") if tool.description is string else "" }}
+        {{- "type "+ tool.name + " = " }}
+        {%- if tool.parameters and tool.parameters.properties -%}
+            {{- "(_: " }}
+            {{- "{\n" }}
+            {%- for param_name, param_spec in tool.parameters.properties.items() %}
+                {{- ("// " + param_spec.description + "\n") if param_spec.description is string else "" }}
+                {{- param_name }}
+                {%- if param_name not in (tool.parameters.required or []) -%}
+                    {{- "?" }}
+                {%- endif -%}
+                {{- ": " }}
+                {{- render_typescript_type(param_spec, tool.parameters.required or []) }}
+                {%- if param_spec.default is defined -%}
+                    {#- String defaults are concatenated verbatim as before; any other value goes through tojson so the concatenation cannot raise. #}
+                    {%- if param_spec.enum %}
+                        {{- ", // default: " + (param_spec.default if param_spec.default is string else param_spec.default|tojson) }}
+                    {%- elif param_spec.oneOf %}
+                        {{- "// default: " + (param_spec.default if param_spec.default is string else param_spec.default|tojson) }}
+                    {%- else %}
+                        {{- ", // default: " + param_spec.default|tojson }}
+                    {%- endif -%}
+                {%- endif -%}
+                {#- Separate properties with ",\n"; the last one gets a bare newline. #}
+                {{- ",\n" if not loop.last else "\n" }}
+            {%- endfor %}
+            {{- "}) => any;\n\n" }}
+        {%- else -%}
+            {{- "() => any;\n\n" }}
+        {%- endif -%}
+    {%- endfor %}
+    {{- "} // namespace " + namespace_name }}
+{%- endmacro -%}
+{#- render_builtin_tools: emit the fixed prompt text for the built-in tools. browser_tool truthy -> the "browser" namespace (search / open / find); python_tool truthy -> the "python" tool notes. #}
+{%- macro render_builtin_tools(browser_tool, python_tool) -%}
+    {%- if browser_tool %}
+        {{- "## browser\n\n" }}
+        {{- "// Tool for browsing.\n" }}
+        {{- "// The `cursor` appears in brackets before each browsing display: `[{cursor}]`.\n" }}
+        {{- "// Cite information from the tool using the following format:\n" }}
+        {{- "// `【{cursor}†L{line_start}(-L{line_end})?】`, for example: `【6†L9-L11】` or `【8†L3】`.\n" }}
+        {{- "// Do not quote more than 10 words directly from the tool output.\n" }}
+        {{- "// sources=web (default: web)\n" }}
+        {{- "namespace browser {\n\n" }}
+        {{- "// Searches for information related to `query` and displays `topn` results.\n" }}
+        {{- "type search = (_: {\n" }}
+        {{- "query: string,\n" }}
+        {{- "topn?: number, // default: 10\n" }}
+        {{- "source?: string,\n" }}
+        {{- "}) => any;\n\n" }}
+        {{- "// Opens the link `id` from the page indicated by `cursor` starting at line number `loc`, showing `num_lines` lines.\n" }}
+        {{- "// Valid link ids are displayed with the formatting: `【{id}†.*】`.\n" }}
+        {{- "// If `cursor` is not provided, the most recent page is implied.\n" }}
+        {{- "// If `id` is a string, it is treated as a fully qualified URL associated with `source`.\n" }}
+        {{- "// If `loc` is not provided, the viewport will be positioned at the beginning of the document or centered on the most relevant passage, if available.\n" }}
+        {{- "// Use this function without `id` to scroll to a new location of an opened page.\n" }}
+        {{- "type open = (_: {\n" }}
+        {{- "id?: number | string, // default: -1\n" }}
+        {{- "cursor?: number, // default: -1\n" }}
+        {{- "loc?: number, // default: -1\n" }}
+        {{- "num_lines?: number, // default: -1\n" }}
+        {{- "view_source?: boolean, // default: false\n" }}
+        {{- "source?: string,\n" }}
+        {{- "}) => any;\n\n" }}
+        {{- "// Finds exact matches of `pattern` in the current page, or the page given by `cursor`.\n" }}
+        {{- "type find = (_: {\n" }}
+        {{- "pattern: string,\n" }}
+        {{- "cursor?: number, // default: -1\n" }}
+        {{- "}) => any;\n\n" }}
+        {{- "} // namespace browser\n\n" }}
+    {%- endif -%}
+    {#- The python section is independent of the browser section; both, either or neither may be emitted. #}
+    {%- if python_tool %}
+        {{- "## python\n\n" }}
+        {{- "Use this tool to execute Python code in your chain of thought. The code will not be shown to the user. This tool should be used for internal reasoning, but not for code that is intended to be visible to the user (e.g. when creating plots, tables, or files).\n\n" }}
+        {{- "When you send a message containing Python code to python, it will be executed in a stateful Jupyter notebook environment. python will respond with the output of the execution or time out after 120.0 seconds. The drive at '/mnt/data' can be used to save and persist user files. Internet access for this session is UNKNOWN. Depends on the cluster.\n\n" }}
+    {%- endif -%}
+{%- endmacro -%}
+
+{#- System Message Construction ============================================ #}
+{%- macro build_system_message() -%}
+    {#- Build the body of the system turn: identity, knowledge cutoff, current date, reasoning effort, built-in tool text and channel rules. #}
+    {#- Reads the template-level variables model_identity, reasoning_effort, builtin_tools and tools; takes no arguments. #}
+    {%- set identity = model_identity|default("You are ChatGPT, a large language model trained by OpenAI.\n") %}
+    {#- A non-empty caller-supplied identity gets a trailing newline if it lacks one, so "Knowledge cutoff" always starts on its own line. #}
+    {{- identity if (not identity or identity[-1:] == "\n") else identity + "\n" }}
+    {{- "Knowledge cutoff: 2024-06\n" }}
+    {#- strftime_now is not defined in this template; it has to be supplied by the rendering environment. #}
+    {{- "Current date: " + strftime_now("%Y-%m-%d") + "\n\n" }}
+    {{- "Reasoning: " + (reasoning_effort|default("medium")) + "\n\n" }}
+    {#- Only the entries "browser" and "python" of builtin_tools are recognised; any other entry is ignored. #}
+    {%- if builtin_tools is defined %}
+        {{- "# Tools\n\n" }}
+        {%- set available_builtin_tools = namespace(browser=false, python=false) %}
+        {%- for tool in builtin_tools %}
+            {%- if tool == "browser" %}
+                {%- set available_builtin_tools.browser = true %}
+            {%- elif tool == "python" %}
+                {%- set available_builtin_tools.python = true %}
+            {%- endif %}
+        {%- endfor %}
+        {{- render_builtin_tools(available_builtin_tools.browser, available_builtin_tools.python) }}
+    {%- endif -%}
+    {{- "# Valid channels: analysis, commentary, final. Channel must be included for every message." }}
+    {#- NOTE(review): `is defined` is also true when tools is passed as none or an empty list, unlike the truthiness test used for the developer turn; confirm this is intended. #}
+    {%- if tools is defined -%}
+        {{- "\nCalls to these tools must go to the commentary channel: 'functions'." }}
+    {%- endif -%}
+{%- endmacro -%}
+
+{#- Main Template Logic ================================================= #}
+{#- Set defaults (nothing is set at this level; reasoning_effort gets its default inside build_system_message) #}
+
+{#- Render system message: emitted unconditionally, before any message from the conversation #}
+{{- "<|start|>system<|message|>" }}
+{{- build_system_message() }}
+{{- "<|end|>" }}
+
+{#- Extract developer message: a leading "developer" or "system" message becomes the developer instructions and is left out of the message loop #}
+{%- if messages[0].role == "developer" or messages[0].role == "system" %}
+    {%- set developer_message = messages[0].content %}
+    {%- set loop_messages = messages[1:] %}
+{%- else %}
+    {%- set developer_message = "" %}
+    {%- set loop_messages = messages %}
+{%- endif %}
+
+{#- Render developer message: emitted only when there are instructions and/or tools #}
+{%- if developer_message or tools %}
+    {{- "<|start|>developer<|message|>" }}
+    {%- if developer_message %}
+        {{- "# Instructions\n\n" }}
+        {{- developer_message }}
+    {%- endif %}
+    {%- if tools -%}
+        {{- "\n\n" }}
+        {{- "# Tools\n\n" }}
+        {{- render_tool_namespace("functions", tools) }}
+    {%- endif -%}
+    {{- "<|end|>" }}
+{%- endif %}
+
+{#- Render messages; last_tool_call is a namespace() so the name recorded for one message is still visible when a later "tool" message is rendered #}
+{%- set last_tool_call = namespace(name=none) %}
+{%- for message in loop_messages -%}
+    {#- At this point only assistant/user/tool messages should remain; any role other than assistant or tool is rendered as a user turn #}
+    {%- if message.role == 'assistant' -%}
+        {%- if "tool_calls" in message %}
+            {#- We assume max 1 tool call per message, and so we infer the tool call name #}
+            {#- in "tool" messages from the most recent assistant tool call name #}
+            {%- set tool_call = message.tool_calls[0] %}
+            {%- if tool_call.function %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {%- if message.content %}
+                {{- "<|start|>assistant<|channel|>analysis<|message|>" + message.content + "<|end|>" }}
+            {%- endif %}
+            {{- "<|start|>assistant to=" }}
+            {{- "functions." + tool_call.name + "<|channel|>commentary json<|message|>" }}
+            {{- tool_call.arguments|tojson }}
+            {{- "<|call|>" }}
+            {%- set last_tool_call.name = tool_call.name %}
+        {%- elif "thinking" in message and loop.last and not add_generation_prompt %}
+            {#- Only render the CoT if the final turn is an assistant turn and add_generation_prompt is false #}
+            {#- This is a situation that should only occur in training, never in inference. #}
+            {{- "<|start|>assistant<|channel|>analysis<|message|>" + message.thinking + "<|end|>" }}
+            {#- <|return|> indicates the end of generation, but <|end|> does not #}
+            {#- <|return|> should never be an input to the model, but we include it as the final token #}
+            {#- when training, so the model learns to emit it. #}
+            {{- "<|start|>assistant<|channel|>final<|message|>" + message.content + "<|return|>" }}
+            {%- set last_tool_call.name = none %}
+        {%- elif "thinking" in message %}
+            {#- CoT is dropped during all previous turns, so we never render it for inference #}
+            {{- "<|start|>assistant<|channel|>final<|message|>" + message.content + "<|end|>" }}
+            {%- set last_tool_call.name = none %}
+        {%- elif loop.last and not add_generation_prompt %}
+            {#- <|return|> indicates the end of generation, but <|end|> does not #}
+            {#- <|return|> should never be an input to the model, but we include it as the final token #}
+            {#- when training, so the model learns to emit it. #}
+            {{- "<|start|>assistant<|message|>" + message.content + "<|return|>" }}
+        {%- else %}
+            {{- "<|start|>assistant<|message|>" + message.content + "<|end|>" }}
+            {%- set last_tool_call.name = none %}
+        {%- endif %}
+    {%- elif message.role == 'tool' -%}
+        {%- if last_tool_call.name is none %}
+            {{- raise_exception("Message has tool role, but there was no previous assistant message with a tool call!") }}
+        {%- endif %}
+        {{- "<|start|>functions." + last_tool_call.name }}
+        {{- " to=assistant<|channel|>commentary<|message|>" + message.content|tojson + "<|end|>" }}
+    {%- else -%}
+        {{- "<|start|>user<|message|>" + message.content + "<|end|>" }}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Generation prompt: when add_generation_prompt is set, open an assistant turn for the model to complete #}
+{%- if add_generation_prompt -%}
+<|start|>assistant
+{%- endif -%}
+{# Copyright 2025-present Unsloth. Apache 2.0 License. Unsloth chat template fixes. Edited from ggml-org & OpenAI #}
\ No newline at end of file
diff --git a/outputs/checkpoint-500/optimizer.pt b/outputs/checkpoint-500/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..96d24d2f8ffa32886b30c4b2ceacf177593b485f
--- /dev/null
+++ b/outputs/checkpoint-500/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6e8e01b878a15f489ed7e9f584370716b15783abbbdabd551f242a6101e2133c
+size 16894883
diff --git a/outputs/checkpoint-500/rng_state.pth b/outputs/checkpoint-500/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..3ef66339b9befa098183fd5d69faed6838e526b0
--- /dev/null
+++ b/outputs/checkpoint-500/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f1d565802a8e26c4e8a31328752b7a7fdc186d9401aa008e65697d0ad8c22e33
+size 14645
diff --git a/outputs/checkpoint-500/special_tokens_map.json b/outputs/checkpoint-500/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..6fba18753f4d09dbb8fcdf1482daff36b963d639
--- /dev/null
+++ b/outputs/checkpoint-500/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+  "bos_token": {
+    "content": "<|startoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|return|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|reserved_200017|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/outputs/checkpoint-500/tokenizer.json b/outputs/checkpoint-500/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..6ec3ef1795cbbda6b7cb7d1f114919cbe3fdd647
--- /dev/null
+++ b/outputs/checkpoint-500/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0614fe83cadab421296e664e1f48f4261fa8fef6e03e63bb75c20f38e37d07d3
+size 27868174
diff --git a/outputs/checkpoint-500/tokenizer_config.json b/outputs/checkpoint-500/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..482ae30d27a74c38d2228e69dd37c529fc485a45
--- /dev/null
+++ b/outputs/checkpoint-500/tokenizer_config.json
@@ -0,0 +1,185 @@
+{
+  "added_tokens_decoder": {
+    "199998": {
+      "content": "<|startoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "199999": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200000": {
+      "content": "<|reserved_200000|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200001": {
+      "content": "<|reserved_200001|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200002": {
+      "content": "<|return|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200003": {
+      "content": "<|constrain|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200004": {
+      "content": "<|reserved_200004|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200005": {
+      "content": "<|channel|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200006": {
+      "content": "<|start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200007": {
+      "content": "<|end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200008": {
+      "content": "<|message|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200009": {
+      "content": "<|reserved_200009|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200010": {
+      "content": "<|reserved_200010|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200011": {
+      "content": "<|reserved_200011|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200012": {
+      "content": "<|call|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200013": {
+      "content": "<|reserved_200013|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200014": {
+      "content": "<|reserved_200014|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200015": {
+      "content": "<|reserved_200015|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200016": {
+      "content": "<|reserved_200016|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200017": {
+      "content": "<|reserved_200017|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200018": {
+      "content": "<|endofprompt|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<|startoftext|>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|return|>",
+  "extra_special_tokens": {},
+  "model_input_names": [
+    "input_ids",
+    "attention_mask"
+  ],
+  "model_max_length": 131072,
+  "pad_token": "<|reserved_200017|>",
+  "padding_side": "right",
+  "tokenizer_class": "PreTrainedTokenizerFast",
+  "unk_token": null
+}
diff --git a/outputs/checkpoint-500/trainer_state.json b/outputs/checkpoint-500/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..d32e9608f768bc00df1139575b6bc4e7d475f2d1
--- /dev/null
+++ b/outputs/checkpoint-500/trainer_state.json
@@ -0,0 +1,3534 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.1943068104537064,
+  "eval_steps": 500,
+  "global_step": 500,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0032,
+      "grad_norm": 13.684800148010254,
+      "learning_rate": 0.0,
+      "loss": 2.3276,
+      "step": 1
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 13.660787582397461,
+      "learning_rate": 4e-05,
+      "loss": 2.2792,
+      "step": 2
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 13.35280704498291,
+      "learning_rate": 8e-05,
+      "loss": 2.4151,
+      "step": 3
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 6.15027379989624,
+      "learning_rate": 0.00012,
+      "loss": 1.7812,
+      "step": 4
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 1.3168226480484009,
+      "learning_rate": 0.00016,
+      "loss": 1.4536,
+      "step": 5
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.9872580170631409,
+      "learning_rate": 0.0002,
+      "loss": 1.4171,
+      "step": 6
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.7496100664138794,
+      "learning_rate": 0.00019935064935064936,
+      "loss": 1.4168,
+      "step": 7
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.7376005053520203,
+      "learning_rate": 0.00019870129870129872,
+      "loss": 1.3659,
+      "step": 8
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.5281137824058533,
+      "learning_rate": 0.00019805194805194807,
+      "loss": 1.2566,
+      "step": 9
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.5485746264457703,
+      "learning_rate": 0.00019740259740259742,
+      "loss": 1.3761,
+      "step": 10
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.5506592392921448,
+      "learning_rate": 0.00019675324675324675,
+      "loss": 1.3327,
+      "step": 11
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.49382686614990234,
+      "learning_rate": 0.00019610389610389613,
+      "loss": 1.3727,
+      "step": 12
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.36203011870384216,
+      "learning_rate": 0.00019545454545454548,
+      "loss": 1.1515,
+      "step": 13
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.3528599739074707,
+      "learning_rate": 0.0001948051948051948,
+      "loss": 1.2636,
+      "step": 14
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.31244418025016785,
+      "learning_rate": 0.00019415584415584416,
+      "loss": 1.1873,
+      "step": 15
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.3379523754119873,
+      "learning_rate": 0.00019350649350649354,
+      "loss": 1.2657,
+      "step": 16
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.3025083839893341,
+      "learning_rate": 0.00019285714285714286,
+      "loss": 1.2846,
+      "step": 17
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.2560190260410309,
+      "learning_rate": 0.00019220779220779222,
+      "loss": 1.1587,
+      "step": 18
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.2554129958152771,
+      "learning_rate": 0.00019155844155844157,
+      "loss": 1.2812,
+      "step": 19
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.22662702202796936,
+      "learning_rate": 0.00019090909090909092,
+      "loss": 1.1664,
+      "step": 20
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.2515714168548584,
+      "learning_rate": 0.00019025974025974027,
+      "loss": 1.2177,
+      "step": 21
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.24396637082099915,
+      "learning_rate": 0.00018961038961038963,
+      "loss": 1.2053,
+      "step": 22
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.24488303065299988,
+      "learning_rate": 0.00018896103896103895,
+      "loss": 1.2074,
+      "step": 23
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.2168620079755783,
+      "learning_rate": 0.00018831168831168833,
+      "loss": 1.1284,
+      "step": 24
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.24021224677562714,
+      "learning_rate": 0.00018766233766233769,
+      "loss": 1.2169,
+      "step": 25
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.20057056844234467,
+      "learning_rate": 0.000187012987012987,
+      "loss": 1.1031,
+      "step": 26
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.19900795817375183,
+      "learning_rate": 0.00018636363636363636,
+      "loss": 1.1004,
+      "step": 27
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.2019268423318863,
+      "learning_rate": 0.00018571428571428572,
+      "loss": 1.1476,
+      "step": 28
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.1996479034423828,
+      "learning_rate": 0.00018506493506493507,
+      "loss": 1.1455,
+      "step": 29
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.25262022018432617,
+      "learning_rate": 0.00018441558441558442,
+      "loss": 1.1025,
+      "step": 30
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.225438192486763,
+      "learning_rate": 0.00018376623376623378,
+      "loss": 1.1954,
+      "step": 31
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.17834505438804626,
+      "learning_rate": 0.00018311688311688313,
+      "loss": 1.0934,
+      "step": 32
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.20071206986904144,
+      "learning_rate": 0.00018246753246753248,
+      "loss": 1.0488,
+      "step": 33
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.1920139640569687,
+      "learning_rate": 0.00018181818181818183,
+      "loss": 1.123,
+      "step": 34
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.18714852631092072,
+      "learning_rate": 0.0001811688311688312,
+      "loss": 1.0798,
+      "step": 35
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.18315713107585907,
+      "learning_rate": 0.00018051948051948054,
+      "loss": 1.1107,
+      "step": 36
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.19156870245933533,
+      "learning_rate": 0.00017987012987012987,
+      "loss": 1.1125,
+      "step": 37
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.21527768671512604,
+      "learning_rate": 0.00017922077922077922,
+      "loss": 1.1346,
+      "step": 38
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.1871163249015808,
+      "learning_rate": 0.0001785714285714286,
+      "loss": 1.0742,
+      "step": 39
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.17750784754753113,
+      "learning_rate": 0.00017792207792207792,
+      "loss": 1.1323,
+      "step": 40
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.177419051527977,
+      "learning_rate": 0.00017727272727272728,
+      "loss": 1.1405,
+      "step": 41
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.16714292764663696,
+      "learning_rate": 0.00017662337662337663,
+      "loss": 1.1084,
+      "step": 42
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.1610356718301773,
+      "learning_rate": 0.00017597402597402598,
+      "loss": 1.1125,
+      "step": 43
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.2548656761646271,
+      "learning_rate": 0.00017532467532467534,
+      "loss": 1.1114,
+      "step": 44
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.1731044203042984,
+      "learning_rate": 0.0001746753246753247,
+      "loss": 1.1197,
+      "step": 45
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.1739533394575119,
+      "learning_rate": 0.00017402597402597401,
+      "loss": 1.1777,
+      "step": 46
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.2178352177143097,
+      "learning_rate": 0.0001733766233766234,
+      "loss": 1.1111,
+      "step": 47
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.17247150838375092,
+      "learning_rate": 0.00017272727272727275,
+      "loss": 1.1253,
+      "step": 48
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.18075324594974518,
+      "learning_rate": 0.00017207792207792207,
+      "loss": 1.1358,
+      "step": 49
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.15898071229457855,
+      "learning_rate": 0.00017142857142857143,
+      "loss": 1.0606,
+      "step": 50
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.16518613696098328,
+      "learning_rate": 0.0001707792207792208,
+      "loss": 1.0944,
+      "step": 51
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.16035063564777374,
+      "learning_rate": 0.00017012987012987013,
+      "loss": 1.0554,
+      "step": 52
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.1686483472585678,
+      "learning_rate": 0.00016948051948051948,
+      "loss": 1.0384,
+      "step": 53
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.16575631499290466,
+      "learning_rate": 0.00016883116883116884,
+      "loss": 1.0243,
+      "step": 54
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.16840039193630219,
+      "learning_rate": 0.0001681818181818182,
+      "loss": 1.117,
+      "step": 55
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.17616064846515656,
+      "learning_rate": 0.00016753246753246754,
+      "loss": 1.0743,
+      "step": 56
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.168218195438385,
+      "learning_rate": 0.0001668831168831169,
+      "loss": 1.0627,
+      "step": 57
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.17026656866073608,
+      "learning_rate": 0.00016623376623376625,
+      "loss": 1.0059,
+      "step": 58
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.16454458236694336,
+      "learning_rate": 0.0001655844155844156,
+      "loss": 0.9943,
+      "step": 59
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.17185136675834656,
+      "learning_rate": 0.00016493506493506495,
+      "loss": 1.1545,
+      "step": 60
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.17822986841201782,
+      "learning_rate": 0.00016428571428571428,
+      "loss": 1.073,
+      "step": 61
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.1676608771085739,
+      "learning_rate": 0.00016363636363636366,
+      "loss": 1.0886,
+      "step": 62
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.1727771908044815,
+      "learning_rate": 0.000162987012987013,
+      "loss": 1.0432,
+      "step": 63
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.17827573418617249,
+      "learning_rate": 0.00016233766233766234,
+      "loss": 1.083,
+      "step": 64
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.19807517528533936,
+      "learning_rate": 0.0001616883116883117,
+      "loss": 1.1208,
+      "step": 65
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.17693684995174408,
+      "learning_rate": 0.00016103896103896104,
+      "loss": 1.089,
+      "step": 66
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.15489234030246735,
+      "learning_rate": 0.0001603896103896104,
+      "loss": 0.9707,
+      "step": 67
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.16443990170955658,
+      "learning_rate": 0.00015974025974025975,
+      "loss": 1.0643,
+      "step": 68
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.2051103413105011,
+      "learning_rate": 0.0001590909090909091,
+      "loss": 1.1246,
+      "step": 69
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.18824075162410736,
+      "learning_rate": 0.00015844155844155845,
+      "loss": 1.0855,
+      "step": 70
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.18659448623657227,
+      "learning_rate": 0.0001577922077922078,
+      "loss": 1.1412,
+      "step": 71
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.1854114979505539,
+      "learning_rate": 0.00015714285714285716,
+      "loss": 1.0249,
+      "step": 72
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.1876193732023239,
+      "learning_rate": 0.00015649350649350649,
+      "loss": 1.1029,
+      "step": 73
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.1888684630393982,
+      "learning_rate": 0.00015584415584415587,
+      "loss": 1.0789,
+      "step": 74
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.20240606367588043,
+      "learning_rate": 0.0001551948051948052,
+      "loss": 1.0495,
+      "step": 75
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.232120081782341,
+      "learning_rate": 0.00015454545454545454,
+      "loss": 1.0735,
+      "step": 76
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.16897843778133392,
+      "learning_rate": 0.0001538961038961039,
+      "loss": 1.0164,
+      "step": 77
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.18796634674072266,
+      "learning_rate": 0.00015324675324675325,
+      "loss": 1.0676,
+      "step": 78
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.19574032723903656,
+      "learning_rate": 0.0001525974025974026,
+      "loss": 1.0456,
+      "step": 79
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.18007811903953552,
+      "learning_rate": 0.00015194805194805196,
+      "loss": 1.0894,
+      "step": 80
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.18932929635047913,
+      "learning_rate": 0.0001512987012987013,
+      "loss": 1.0729,
+      "step": 81
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.20614288747310638,
+      "learning_rate": 0.00015064935064935066,
+      "loss": 1.0854,
+      "step": 82
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.19291089475154877,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 1.1217,
+      "step": 83
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.18916529417037964,
+      "learning_rate": 0.00014935064935064934,
+      "loss": 1.0963,
+      "step": 84
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.20306220650672913,
+      "learning_rate": 0.00014870129870129872,
+      "loss": 1.0898,
+      "step": 85
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.17870067059993744,
+      "learning_rate": 0.00014805194805194807,
+      "loss": 1.0213,
+      "step": 86
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.18411923944950104,
+      "learning_rate": 0.0001474025974025974,
+      "loss": 1.0844,
+      "step": 87
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.18788227438926697,
+      "learning_rate": 0.00014675324675324675,
+      "loss": 1.0338,
+      "step": 88
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.23874884843826294,
+      "learning_rate": 0.00014610389610389613,
+      "loss": 1.1118,
+      "step": 89
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.19380499422550201,
+      "learning_rate": 0.00014545454545454546,
+      "loss": 1.0464,
+      "step": 90
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.18968750536441803,
+      "learning_rate": 0.0001448051948051948,
+      "loss": 1.0569,
+      "step": 91
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.19545753300189972,
+      "learning_rate": 0.00014415584415584416,
+      "loss": 1.1225,
+      "step": 92
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.19170494377613068,
+      "learning_rate": 0.00014350649350649352,
+      "loss": 1.0602,
+      "step": 93
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.17953918874263763,
+      "learning_rate": 0.00014285714285714287,
+      "loss": 1.032,
+      "step": 94
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.1822536289691925,
+      "learning_rate": 0.00014220779220779222,
+      "loss": 1.0559,
+      "step": 95
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.18591298162937164,
+      "learning_rate": 0.00014155844155844155,
+      "loss": 1.031,
+      "step": 96
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.2129002958536148,
+      "learning_rate": 0.00014090909090909093,
+      "loss": 1.1391,
+      "step": 97
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.18386681377887726,
+      "learning_rate": 0.00014025974025974028,
+      "loss": 0.9919,
+      "step": 98
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.18314239382743835,
+      "learning_rate": 0.0001396103896103896,
+      "loss": 1.0445,
+      "step": 99
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.1999066174030304,
+      "learning_rate": 0.00013896103896103896,
+      "loss": 1.0538,
+      "step": 100
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.18741188943386078,
+      "learning_rate": 0.00013831168831168834,
+      "loss": 1.0722,
+      "step": 101
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.19351010024547577,
+      "learning_rate": 0.00013766233766233766,
+      "loss": 1.0491,
+      "step": 102
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.18859203159809113,
+      "learning_rate": 0.00013701298701298702,
+      "loss": 1.0593,
+      "step": 103
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.1962767392396927,
+      "learning_rate": 0.00013636363636363637,
+      "loss": 1.1344,
+      "step": 104
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.20819440484046936,
+      "learning_rate": 0.00013571428571428572,
+      "loss": 1.1137,
+      "step": 105
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.19590184092521667,
+      "learning_rate": 0.00013506493506493507,
+      "loss": 1.0624,
+      "step": 106
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.18631424009799957,
+      "learning_rate": 0.00013441558441558443,
+      "loss": 1.0587,
+      "step": 107
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.19572143256664276,
+      "learning_rate": 0.00013376623376623375,
+      "loss": 1.0494,
+      "step": 108
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.1910988837480545,
+      "learning_rate": 0.00013311688311688313,
+      "loss": 1.0481,
+      "step": 109
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.19455869495868683,
+      "learning_rate": 0.00013246753246753249,
+      "loss": 1.029,
+      "step": 110
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.18669827282428741,
+      "learning_rate": 0.0001318181818181818,
+      "loss": 1.0513,
+      "step": 111
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.17523664236068726,
+      "learning_rate": 0.0001311688311688312,
+      "loss": 1.0126,
+      "step": 112
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.17929129302501678,
+      "learning_rate": 0.00013051948051948052,
+      "loss": 1.0717,
+      "step": 113
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.19380168616771698,
+      "learning_rate": 0.00012987012987012987,
+      "loss": 1.0324,
+      "step": 114
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.18090228736400604,
+      "learning_rate": 0.00012922077922077922,
+      "loss": 1.0515,
+      "step": 115
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.2067340910434723,
+      "learning_rate": 0.00012857142857142858,
+      "loss": 1.0939,
+      "step": 116
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.1880485862493515,
+      "learning_rate": 0.00012792207792207793,
+      "loss": 1.0986,
+      "step": 117
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.182168647646904,
+      "learning_rate": 0.00012727272727272728,
+      "loss": 1.0109,
+      "step": 118
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.20187129080295563,
+      "learning_rate": 0.00012662337662337663,
+      "loss": 1.0668,
+      "step": 119
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.2082669734954834,
+      "learning_rate": 0.000125974025974026,
+      "loss": 1.054,
+      "step": 120
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.18294434249401093,
+      "learning_rate": 0.00012532467532467534,
+      "loss": 1.0397,
+      "step": 121
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.20515067875385284,
+      "learning_rate": 0.00012467532467532467,
+      "loss": 1.1092,
+      "step": 122
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.1758790761232376,
+      "learning_rate": 0.00012402597402597402,
+      "loss": 0.9755,
+      "step": 123
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.2170792669057846,
+      "learning_rate": 0.0001233766233766234,
+      "loss": 1.0434,
+      "step": 124
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.202157124876976,
+      "learning_rate": 0.00012272727272727272,
+      "loss": 1.1129,
+      "step": 125
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.18556398153305054,
+      "learning_rate": 0.00012207792207792208,
+      "loss": 1.0665,
+      "step": 126
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.20196087658405304,
+      "learning_rate": 0.00012142857142857143,
+      "loss": 1.1,
+      "step": 127
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.1921566128730774,
+      "learning_rate": 0.0001207792207792208,
+      "loss": 1.0918,
+      "step": 128
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.18866224586963654,
+      "learning_rate": 0.00012012987012987014,
+      "loss": 1.0014,
+      "step": 129
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.207601398229599,
+      "learning_rate": 0.00011948051948051949,
+      "loss": 1.0726,
+      "step": 130
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.21592366695404053,
+      "learning_rate": 0.00011883116883116883,
+      "loss": 1.1379,
+      "step": 131
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.2016124576330185,
+      "learning_rate": 0.0001181818181818182,
+      "loss": 1.1428,
+      "step": 132
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.20478437840938568,
+      "learning_rate": 0.00011753246753246753,
+      "loss": 1.121,
+      "step": 133
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.22730594873428345,
+      "learning_rate": 0.00011688311688311689,
+      "loss": 1.0319,
+      "step": 134
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.22592711448669434,
+      "learning_rate": 0.00011623376623376625,
+      "loss": 1.1264,
+      "step": 135
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.20035041868686676,
+      "learning_rate": 0.00011558441558441559,
+      "loss": 1.0686,
+      "step": 136
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.20648567378520966,
+      "learning_rate": 0.00011493506493506494,
+      "loss": 1.0817,
+      "step": 137
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.21222743391990662,
+      "learning_rate": 0.00011428571428571428,
+      "loss": 1.0678,
+      "step": 138
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.2075391560792923,
+      "learning_rate": 0.00011363636363636365,
+      "loss": 1.0897,
+      "step": 139
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.1964101791381836,
+      "learning_rate": 0.000112987012987013,
+      "loss": 1.0906,
+      "step": 140
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.22406511008739471,
+      "learning_rate": 0.00011233766233766234,
+      "loss": 1.0594,
+      "step": 141
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.23787978291511536,
+      "learning_rate": 0.00011168831168831168,
+      "loss": 1.1053,
+      "step": 142
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.21196185052394867,
+      "learning_rate": 0.00011103896103896105,
+      "loss": 1.0923,
+      "step": 143
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.21042804419994354,
+      "learning_rate": 0.0001103896103896104,
+      "loss": 1.0381,
+      "step": 144
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.2267436534166336,
+      "learning_rate": 0.00010974025974025974,
+      "loss": 1.0818,
+      "step": 145
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.23742735385894775,
+      "learning_rate": 0.00010909090909090909,
+      "loss": 1.0872,
+      "step": 146
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.17787213623523712,
+      "learning_rate": 0.00010844155844155846,
+      "loss": 1.03,
+      "step": 147
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.22422832250595093,
+      "learning_rate": 0.0001077922077922078,
+      "loss": 1.0738,
+      "step": 148
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.22946301102638245,
+      "learning_rate": 0.00010714285714285715,
+      "loss": 1.0274,
+      "step": 149
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.2137996405363083,
+      "learning_rate": 0.00010649350649350649,
+      "loss": 1.0539,
+      "step": 150
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.1748756766319275,
+      "learning_rate": 0.00010584415584415586,
+      "loss": 1.0355,
+      "step": 151
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.22275175154209137,
+      "learning_rate": 0.0001051948051948052,
+      "loss": 1.1696,
+      "step": 152
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.20996077358722687,
+      "learning_rate": 0.00010454545454545455,
+      "loss": 1.0303,
+      "step": 153
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.1945938766002655,
+      "learning_rate": 0.00010389610389610389,
+      "loss": 0.9747,
+      "step": 154
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.1970377266407013,
+      "learning_rate": 0.00010324675324675325,
+      "loss": 1.0358,
+      "step": 155
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.18814732134342194,
+      "learning_rate": 0.00010259740259740261,
+      "loss": 0.9612,
+      "step": 156
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.2153233289718628,
+      "learning_rate": 0.00010194805194805195,
+      "loss": 1.0749,
+      "step": 157
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.21788008511066437,
+      "learning_rate": 0.0001012987012987013,
+      "loss": 1.0883,
+      "step": 158
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.214650496840477,
+      "learning_rate": 0.00010064935064935067,
+      "loss": 1.0539,
+      "step": 159
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.19312834739685059,
+      "learning_rate": 0.0001,
+      "loss": 1.0657,
+      "step": 160
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.19916598498821259,
+      "learning_rate": 9.935064935064936e-05,
+      "loss": 1.0478,
+      "step": 161
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.2057606726884842,
+      "learning_rate": 9.870129870129871e-05,
+      "loss": 1.0094,
+      "step": 162
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.22159607708454132,
+      "learning_rate": 9.805194805194806e-05,
+      "loss": 1.0952,
+      "step": 163
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.18274275958538055,
+      "learning_rate": 9.74025974025974e-05,
+      "loss": 1.0065,
+      "step": 164
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.19835162162780762,
+      "learning_rate": 9.675324675324677e-05,
+      "loss": 1.0742,
+      "step": 165
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.2114904820919037,
+      "learning_rate": 9.610389610389611e-05,
+      "loss": 1.1109,
+      "step": 166
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.21488523483276367,
+      "learning_rate": 9.545454545454546e-05,
+      "loss": 1.0465,
+      "step": 167
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.19870303571224213,
+      "learning_rate": 9.480519480519481e-05,
+      "loss": 1.0318,
+      "step": 168
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.20413029193878174,
+      "learning_rate": 9.415584415584417e-05,
+      "loss": 1.0817,
+      "step": 169
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.1847231239080429,
+      "learning_rate": 9.35064935064935e-05,
+      "loss": 1.0144,
+      "step": 170
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.2715964913368225,
+      "learning_rate": 9.285714285714286e-05,
+      "loss": 0.9832,
+      "step": 171
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.2225002497434616,
+      "learning_rate": 9.220779220779221e-05,
+      "loss": 1.1051,
+      "step": 172
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.22931510210037231,
+      "learning_rate": 9.155844155844156e-05,
+      "loss": 1.1042,
+      "step": 173
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.21848627924919128,
+      "learning_rate": 9.090909090909092e-05,
+      "loss": 1.1151,
+      "step": 174
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.19852259755134583,
+      "learning_rate": 9.025974025974027e-05,
+      "loss": 1.0889,
+      "step": 175
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.2080363780260086,
+      "learning_rate": 8.961038961038961e-05,
+      "loss": 1.0777,
+      "step": 176
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.22391024231910706,
+      "learning_rate": 8.896103896103896e-05,
+      "loss": 1.1092,
+      "step": 177
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.21793846786022186,
+      "learning_rate": 8.831168831168831e-05,
+      "loss": 1.044,
+      "step": 178
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.2009749859571457,
+      "learning_rate": 8.766233766233767e-05,
+      "loss": 1.0198,
+      "step": 179
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.19432318210601807,
+      "learning_rate": 8.701298701298701e-05,
+      "loss": 1.075,
+      "step": 180
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.18634547293186188,
+      "learning_rate": 8.636363636363637e-05,
+      "loss": 0.9964,
+      "step": 181
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.1947103589773178,
+      "learning_rate": 8.571428571428571e-05,
+      "loss": 1.0025,
+      "step": 182
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.23098671436309814,
+      "learning_rate": 8.506493506493507e-05,
+      "loss": 1.0562,
+      "step": 183
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.19686414301395416,
+      "learning_rate": 8.441558441558442e-05,
+      "loss": 1.0285,
+      "step": 184
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.19852428138256073,
+      "learning_rate": 8.376623376623377e-05,
+      "loss": 1.0054,
+      "step": 185
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.21483510732650757,
+      "learning_rate": 8.311688311688312e-05,
+      "loss": 1.108,
+      "step": 186
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.23313644528388977,
+      "learning_rate": 8.246753246753248e-05,
+      "loss": 1.1383,
+      "step": 187
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.21453145146369934,
+      "learning_rate": 8.181818181818183e-05,
+      "loss": 1.0911,
+      "step": 188
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.20268195867538452,
+      "learning_rate": 8.116883116883117e-05,
+      "loss": 1.0145,
+      "step": 189
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.20576398074626923,
+      "learning_rate": 8.051948051948052e-05,
+      "loss": 1.0829,
+      "step": 190
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.21732626855373383,
+      "learning_rate": 7.987012987012987e-05,
+      "loss": 1.0152,
+      "step": 191
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.22046895325183868,
+      "learning_rate": 7.922077922077923e-05,
+      "loss": 1.1311,
+      "step": 192
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.19727715849876404,
+      "learning_rate": 7.857142857142858e-05,
+      "loss": 1.0364,
+      "step": 193
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.20861488580703735,
+      "learning_rate": 7.792207792207793e-05,
+      "loss": 1.0435,
+      "step": 194
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.18545083701610565,
+      "learning_rate": 7.727272727272727e-05,
+      "loss": 1.0299,
+      "step": 195
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.19965052604675293,
+      "learning_rate": 7.662337662337662e-05,
+      "loss": 1.0511,
+      "step": 196
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.23673909902572632,
+      "learning_rate": 7.597402597402598e-05,
+      "loss": 1.081,
+      "step": 197
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.17583179473876953,
+      "learning_rate": 7.532467532467533e-05,
+      "loss": 0.9808,
+      "step": 198
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.2129366099834442,
+      "learning_rate": 7.467532467532467e-05,
+      "loss": 1.0522,
+      "step": 199
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.21679140627384186,
+      "learning_rate": 7.402597402597404e-05,
+      "loss": 1.0567,
+      "step": 200
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.2032000720500946,
+      "learning_rate": 7.337662337662338e-05,
+      "loss": 1.0466,
+      "step": 201
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.1887970268726349,
+      "learning_rate": 7.272727272727273e-05,
+      "loss": 1.0329,
+      "step": 202
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.21060192584991455,
+      "learning_rate": 7.207792207792208e-05,
+      "loss": 1.1021,
+      "step": 203
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.21191425621509552,
+      "learning_rate": 7.142857142857143e-05,
+      "loss": 0.99,
+      "step": 204
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.1995989829301834,
+      "learning_rate": 7.077922077922077e-05,
+      "loss": 1.0526,
+      "step": 205
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.1849513053894043,
+      "learning_rate": 7.012987012987014e-05,
+      "loss": 0.9998,
+      "step": 206
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.1948779672384262,
+      "learning_rate": 6.948051948051948e-05,
+      "loss": 1.075,
+      "step": 207
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.20374052226543427,
+      "learning_rate": 6.883116883116883e-05,
+      "loss": 1.0933,
+      "step": 208
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.2102465033531189,
+      "learning_rate": 6.818181818181818e-05,
+      "loss": 1.1123,
+      "step": 209
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.21376173198223114,
+      "learning_rate": 6.753246753246754e-05,
+      "loss": 1.1233,
+      "step": 210
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.20934203267097473,
+      "learning_rate": 6.688311688311688e-05,
+      "loss": 1.1374,
+      "step": 211
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.18604128062725067,
+      "learning_rate": 6.623376623376624e-05,
+      "loss": 1.0213,
+      "step": 212
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.19644233584403992,
+      "learning_rate": 6.55844155844156e-05,
+      "loss": 1.0046,
+      "step": 213
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.18479463458061218,
+      "learning_rate": 6.493506493506494e-05,
+      "loss": 0.9792,
+      "step": 214
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.1945149153470993,
+      "learning_rate": 6.428571428571429e-05,
+      "loss": 1.0584,
+      "step": 215
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.2070147544145584,
+      "learning_rate": 6.363636363636364e-05,
+      "loss": 1.071,
+      "step": 216
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.19645985960960388,
+      "learning_rate": 6.2987012987013e-05,
+      "loss": 1.0721,
+      "step": 217
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.1960117667913437,
+      "learning_rate": 6.233766233766233e-05,
+      "loss": 1.071,
+      "step": 218
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.20168261229991913,
+      "learning_rate": 6.16883116883117e-05,
+      "loss": 1.0808,
+      "step": 219
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.21254412829875946,
+      "learning_rate": 6.103896103896104e-05,
+      "loss": 1.0287,
+      "step": 220
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.21271063387393951,
+      "learning_rate": 6.03896103896104e-05,
+      "loss": 1.0605,
+      "step": 221
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.2081408053636551,
+      "learning_rate": 5.9740259740259744e-05,
+      "loss": 1.091,
+      "step": 222
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.21113798022270203,
+      "learning_rate": 5.90909090909091e-05,
+      "loss": 1.1323,
+      "step": 223
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.20670844614505768,
+      "learning_rate": 5.844155844155844e-05,
+      "loss": 1.0955,
+      "step": 224
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.2010120451450348,
+      "learning_rate": 5.7792207792207796e-05,
+      "loss": 1.1068,
+      "step": 225
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.20379121601581573,
+      "learning_rate": 5.714285714285714e-05,
+      "loss": 1.0419,
+      "step": 226
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.22799807786941528,
+      "learning_rate": 5.64935064935065e-05,
+      "loss": 1.0904,
+      "step": 227
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.2005995213985443,
+      "learning_rate": 5.584415584415584e-05,
+      "loss": 1.078,
+      "step": 228
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.20329605042934418,
+      "learning_rate": 5.51948051948052e-05,
+      "loss": 1.0245,
+      "step": 229
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.19283504784107208,
+      "learning_rate": 5.4545454545454546e-05,
+      "loss": 1.0367,
+      "step": 230
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.20624355971813202,
+      "learning_rate": 5.38961038961039e-05,
+      "loss": 1.1046,
+      "step": 231
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.21362991631031036,
+      "learning_rate": 5.3246753246753245e-05,
+      "loss": 1.1104,
+      "step": 232
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.20447863638401031,
+      "learning_rate": 5.25974025974026e-05,
+      "loss": 1.0514,
+      "step": 233
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.1974381059408188,
+      "learning_rate": 5.1948051948051944e-05,
+      "loss": 1.0048,
+      "step": 234
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.21237170696258545,
+      "learning_rate": 5.1298701298701304e-05,
+      "loss": 1.1299,
+      "step": 235
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.21224971115589142,
+      "learning_rate": 5.064935064935065e-05,
+      "loss": 1.05,
+      "step": 236
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.19865018129348755,
+      "learning_rate": 5e-05,
+      "loss": 1.0665,
+      "step": 237
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.19199275970458984,
+      "learning_rate": 4.9350649350649355e-05,
+      "loss": 0.9531,
+      "step": 238
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.19573214650154114,
+      "learning_rate": 4.87012987012987e-05,
+      "loss": 1.0318,
+      "step": 239
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.21338805556297302,
+      "learning_rate": 4.8051948051948054e-05,
+      "loss": 1.0343,
+      "step": 240
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.2254691869020462,
+      "learning_rate": 4.740259740259741e-05,
+      "loss": 1.0472,
+      "step": 241
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.18101665377616882,
+      "learning_rate": 4.675324675324675e-05,
+      "loss": 1.017,
+      "step": 242
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.22090592980384827,
+      "learning_rate": 4.6103896103896106e-05,
+      "loss": 1.0389,
+      "step": 243
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.20865507423877716,
+      "learning_rate": 4.545454545454546e-05,
+      "loss": 1.0369,
+      "step": 244
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.21619610488414764,
+      "learning_rate": 4.4805194805194805e-05,
+      "loss": 1.109,
+      "step": 245
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.21694771945476532,
+      "learning_rate": 4.415584415584416e-05,
+      "loss": 1.0525,
+      "step": 246
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.2182662934064865,
+      "learning_rate": 4.3506493506493503e-05,
+      "loss": 1.0331,
+      "step": 247
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.2026486098766327,
+      "learning_rate": 4.2857142857142856e-05,
+      "loss": 1.027,
+      "step": 248
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.19606547057628632,
+      "learning_rate": 4.220779220779221e-05,
+      "loss": 1.0242,
+      "step": 249
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.22107470035552979,
+      "learning_rate": 4.155844155844156e-05,
+      "loss": 1.0924,
+      "step": 250
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.19960008561611176,
+      "learning_rate": 4.0909090909090915e-05,
+      "loss": 1.0384,
+      "step": 251
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.1945488154888153,
+      "learning_rate": 4.025974025974026e-05,
+      "loss": 1.0673,
+      "step": 252
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.22067414224147797,
+      "learning_rate": 3.9610389610389614e-05,
+      "loss": 1.0426,
+      "step": 253
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.19010980427265167,
+      "learning_rate": 3.8961038961038966e-05,
+      "loss": 1.0617,
+      "step": 254
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.18781176209449768,
+      "learning_rate": 3.831168831168831e-05,
+      "loss": 1.0243,
+      "step": 255
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.20388829708099365,
+      "learning_rate": 3.7662337662337665e-05,
+      "loss": 1.0476,
+      "step": 256
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.19911155104637146,
+      "learning_rate": 3.701298701298702e-05,
+      "loss": 1.0324,
+      "step": 257
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.19884039461612701,
+      "learning_rate": 3.6363636363636364e-05,
+      "loss": 1.0242,
+      "step": 258
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.19036105275154114,
+      "learning_rate": 3.571428571428572e-05,
+      "loss": 1.0323,
+      "step": 259
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.20039844512939453,
+      "learning_rate": 3.506493506493507e-05,
+      "loss": 1.0749,
+      "step": 260
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.1899934560060501,
+      "learning_rate": 3.4415584415584416e-05,
+      "loss": 1.0115,
+      "step": 261
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.20019090175628662,
+      "learning_rate": 3.376623376623377e-05,
+      "loss": 1.0782,
+      "step": 262
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.2020583152770996,
+      "learning_rate": 3.311688311688312e-05,
+      "loss": 1.0687,
+      "step": 263
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.21407337486743927,
+      "learning_rate": 3.246753246753247e-05,
+      "loss": 1.1015,
+      "step": 264
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.1871640682220459,
+      "learning_rate": 3.181818181818182e-05,
+      "loss": 0.9637,
+      "step": 265
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.21622811257839203,
+      "learning_rate": 3.1168831168831166e-05,
+      "loss": 1.1222,
+      "step": 266
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.22504661977291107,
+      "learning_rate": 3.051948051948052e-05,
+      "loss": 1.132,
+      "step": 267
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.19177629053592682,
+      "learning_rate": 2.9870129870129872e-05,
+      "loss": 1.0281,
+      "step": 268
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.1970544159412384,
+      "learning_rate": 2.922077922077922e-05,
+      "loss": 1.0393,
+      "step": 269
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.21554522216320038,
+      "learning_rate": 2.857142857142857e-05,
+      "loss": 1.074,
+      "step": 270
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.21131229400634766,
+      "learning_rate": 2.792207792207792e-05,
+      "loss": 1.054,
+      "step": 271
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.19816523790359497,
+      "learning_rate": 2.7272727272727273e-05,
+      "loss": 1.0456,
+      "step": 272
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.21075209975242615,
+      "learning_rate": 2.6623376623376623e-05,
+      "loss": 1.0758,
+      "step": 273
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.2296527624130249,
+      "learning_rate": 2.5974025974025972e-05,
+      "loss": 1.0917,
+      "step": 274
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.19722610712051392,
+      "learning_rate": 2.5324675324675325e-05,
+      "loss": 1.0704,
+      "step": 275
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.18721099197864532,
+      "learning_rate": 2.4675324675324678e-05,
+      "loss": 0.9919,
+      "step": 276
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.20244193077087402,
+      "learning_rate": 2.4025974025974027e-05,
+      "loss": 1.0368,
+      "step": 277
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.19518914818763733,
+      "learning_rate": 2.3376623376623376e-05,
+      "loss": 1.0436,
+      "step": 278
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.19650357961654663,
+      "learning_rate": 2.272727272727273e-05,
+      "loss": 1.0306,
+      "step": 279
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.20320096611976624,
+      "learning_rate": 2.207792207792208e-05,
+      "loss": 1.0941,
+      "step": 280
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.18296951055526733,
+      "learning_rate": 2.1428571428571428e-05,
+      "loss": 0.9802,
+      "step": 281
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.21357610821723938,
+      "learning_rate": 2.077922077922078e-05,
+      "loss": 1.0449,
+      "step": 282
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.193921759724617,
+      "learning_rate": 2.012987012987013e-05,
+      "loss": 1.0116,
+      "step": 283
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.1953902244567871,
+      "learning_rate": 1.9480519480519483e-05,
+      "loss": 1.0105,
+      "step": 284
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.19440975785255432,
+      "learning_rate": 1.8831168831168833e-05,
+      "loss": 0.9952,
+      "step": 285
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.21054105460643768,
+      "learning_rate": 1.8181818181818182e-05,
+      "loss": 1.0701,
+      "step": 286
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.18844804167747498,
+      "learning_rate": 1.7532467532467535e-05,
+      "loss": 1.0146,
+      "step": 287
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.2067311704158783,
+      "learning_rate": 1.6883116883116884e-05,
+      "loss": 1.0781,
+      "step": 288
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.1941213756799698,
+      "learning_rate": 1.6233766233766234e-05,
+      "loss": 0.9814,
+      "step": 289
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.22726193070411682,
+      "learning_rate": 1.5584415584415583e-05,
+      "loss": 1.1431,
+      "step": 290
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.18025581538677216,
+      "learning_rate": 1.4935064935064936e-05,
+      "loss": 0.9649,
+      "step": 291
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.21535000205039978,
+      "learning_rate": 1.4285714285714285e-05,
+      "loss": 1.0441,
+      "step": 292
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.20014546811580658,
+      "learning_rate": 1.3636363636363637e-05,
+      "loss": 1.0166,
+      "step": 293
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.22738787531852722,
+      "learning_rate": 1.2987012987012986e-05,
+      "loss": 1.0564,
+      "step": 294
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.2020861804485321,
+      "learning_rate": 1.2337662337662339e-05,
+      "loss": 1.1241,
+      "step": 295
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.19888809323310852,
+      "learning_rate": 1.1688311688311688e-05,
+      "loss": 1.1114,
+      "step": 296
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.20912377536296844,
+      "learning_rate": 1.103896103896104e-05,
+      "loss": 1.0971,
+      "step": 297
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.21206621825695038,
+      "learning_rate": 1.038961038961039e-05,
+      "loss": 1.0601,
+      "step": 298
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.18667680025100708,
+      "learning_rate": 9.740259740259742e-06,
+      "loss": 1.0291,
+      "step": 299
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.21125559508800507,
+      "learning_rate": 9.090909090909091e-06,
+      "loss": 1.0483,
+      "step": 300
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.21776145696640015,
+      "learning_rate": 8.441558441558442e-06,
+      "loss": 0.9912,
+      "step": 301
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.20144303143024445,
+      "learning_rate": 7.792207792207792e-06,
+      "loss": 1.0357,
+      "step": 302
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.1984029859304428,
+      "learning_rate": 7.142857142857143e-06,
+      "loss": 1.0648,
+      "step": 303
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.17972829937934875,
+      "learning_rate": 6.493506493506493e-06,
+      "loss": 1.0033,
+      "step": 304
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.1818286031484604,
+      "learning_rate": 5.844155844155844e-06,
+      "loss": 0.997,
+      "step": 305
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.19670912623405457,
+      "learning_rate": 5.194805194805195e-06,
+      "loss": 1.0256,
+      "step": 306
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.20527283847332,
+      "learning_rate": 4.5454545454545455e-06,
+      "loss": 1.0348,
+      "step": 307
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.19025909900665283,
+      "learning_rate": 3.896103896103896e-06,
+      "loss": 1.0682,
+      "step": 308
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.19544818997383118,
+      "learning_rate": 3.2467532467532465e-06,
+      "loss": 0.9872,
+      "step": 309
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.22112183272838593,
+      "learning_rate": 2.5974025974025976e-06,
+      "loss": 1.0661,
+      "step": 310
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.23328153789043427,
+      "learning_rate": 1.948051948051948e-06,
+      "loss": 1.0691,
+      "step": 311
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.20181375741958618,
+      "learning_rate": 1.2987012987012988e-06,
+      "loss": 0.9416,
+      "step": 312
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.29312625527381897,
+      "learning_rate": 6.493506493506494e-07,
+      "loss": 1.1216,
+      "step": 313
+    },
+    {
+      "epoch": 0.12202467696492762,
+      "grad_norm": 0.2231415957212448,
+      "learning_rate": 0.0,
+      "loss": 1.0468,
+      "step": 314
+    },
+    {
+      "epoch": 0.12241329058583503,
+      "grad_norm": 0.22263288497924805,
+      "learning_rate": 0.00017594394706111328,
+      "loss": 1.0399,
+      "step": 315
+    },
+    {
+      "epoch": 0.12280190420674245,
+      "grad_norm": 0.22909891605377197,
+      "learning_rate": 0.00017586609575710393,
+      "loss": 1.1069,
+      "step": 316
+    },
+    {
+      "epoch": 0.12319051782764986,
+      "grad_norm": 0.23951445519924164,
+      "learning_rate": 0.0001757882444530946,
+      "loss": 1.1036,
+      "step": 317
+    },
+    {
+      "epoch": 0.12357913144855727,
+      "grad_norm": 0.2409268021583557,
+      "learning_rate": 0.00017571039314908526,
+      "loss": 1.1114,
+      "step": 318
+    },
+    {
+      "epoch": 0.12396774506946469,
+      "grad_norm": 0.23753899335861206,
+      "learning_rate": 0.00017563254184507592,
+      "loss": 1.1297,
+      "step": 319
+    },
+    {
+      "epoch": 0.12435635869037209,
+      "grad_norm": 0.2823902666568756,
+      "learning_rate": 0.00017555469054106657,
+      "loss": 1.1293,
+      "step": 320
+    },
+    {
+      "epoch": 0.12474497231127951,
+      "grad_norm": 0.24093545973300934,
+      "learning_rate": 0.00017547683923705722,
+      "loss": 1.0678,
+      "step": 321
+    },
+    {
+      "epoch": 0.12513358593218693,
+      "grad_norm": 0.22565563023090363,
+      "learning_rate": 0.0001753989879330479,
+      "loss": 1.1408,
+      "step": 322
+    },
+    {
+      "epoch": 0.12552219955309435,
+      "grad_norm": 0.22569572925567627,
+      "learning_rate": 0.00017532113662903855,
+      "loss": 1.0543,
+      "step": 323
+    },
+    {
+      "epoch": 0.12591081317400174,
+      "grad_norm": 0.24962866306304932,
+      "learning_rate": 0.0001752432853250292,
+      "loss": 1.0818,
+      "step": 324
+    },
+    {
+      "epoch": 0.12629942679490916,
+      "grad_norm": 0.22184576094150543,
+      "learning_rate": 0.00017516543402101986,
+      "loss": 1.0835,
+      "step": 325
+    },
+    {
+      "epoch": 0.12668804041581658,
+      "grad_norm": 0.2572194039821625,
+      "learning_rate": 0.0001750875827170105,
+      "loss": 1.0767,
+      "step": 326
+    },
+    {
+      "epoch": 0.127076654036724,
+      "grad_norm": 0.24131342768669128,
+      "learning_rate": 0.00017500973141300116,
+      "loss": 1.0981,
+      "step": 327
+    },
+    {
+      "epoch": 0.1274652676576314,
+      "grad_norm": 0.2386389970779419,
+      "learning_rate": 0.00017493188010899184,
+      "loss": 1.0828,
+      "step": 328
+    },
+    {
+      "epoch": 0.1278538812785388,
+      "grad_norm": 0.2654125690460205,
+      "learning_rate": 0.0001748540288049825,
+      "loss": 1.1266,
+      "step": 329
+    },
+    {
+      "epoch": 0.12824249489944622,
+      "grad_norm": 0.2925739884376526,
+      "learning_rate": 0.00017477617750097314,
+      "loss": 1.0983,
+      "step": 330
+    },
+    {
+      "epoch": 0.12863110852035364,
+      "grad_norm": 0.26589342951774597,
+      "learning_rate": 0.0001746983261969638,
+      "loss": 1.1029,
+      "step": 331
+    },
+    {
+      "epoch": 0.12901972214126106,
+      "grad_norm": 0.24565957486629486,
+      "learning_rate": 0.00017462047489295445,
+      "loss": 1.0975,
+      "step": 332
+    },
+    {
+      "epoch": 0.12940833576216845,
+      "grad_norm": 0.2459682673215866,
+      "learning_rate": 0.00017454262358894513,
+      "loss": 1.0566,
+      "step": 333
+    },
+    {
+      "epoch": 0.12979694938307587,
+      "grad_norm": 0.23349183797836304,
+      "learning_rate": 0.00017446477228493578,
+      "loss": 1.0833,
+      "step": 334
+    },
+    {
+      "epoch": 0.1301855630039833,
+      "grad_norm": 0.26166337728500366,
+      "learning_rate": 0.00017438692098092643,
+      "loss": 1.1598,
+      "step": 335
+    },
+    {
+      "epoch": 0.1305741766248907,
+      "grad_norm": 0.24188168346881866,
+      "learning_rate": 0.00017430906967691708,
+      "loss": 1.0728,
+      "step": 336
+    },
+    {
+      "epoch": 0.13096279024579813,
+      "grad_norm": 0.22922398149967194,
+      "learning_rate": 0.00017423121837290773,
+      "loss": 1.0311,
+      "step": 337
+    },
+    {
+      "epoch": 0.13135140386670552,
+      "grad_norm": 0.2652754485607147,
+      "learning_rate": 0.00017415336706889841,
+      "loss": 1.1096,
+      "step": 338
+    },
+    {
+      "epoch": 0.13174001748761294,
+      "grad_norm": 0.2355881780385971,
+      "learning_rate": 0.00017407551576488907,
+      "loss": 1.0964,
+      "step": 339
+    },
+    {
+      "epoch": 0.13212863110852036,
+      "grad_norm": 0.244523823261261,
+      "learning_rate": 0.00017399766446087972,
+      "loss": 1.142,
+      "step": 340
+    },
+    {
+      "epoch": 0.13251724472942777,
+      "grad_norm": 0.24705976247787476,
+      "learning_rate": 0.00017391981315687037,
+      "loss": 1.0943,
+      "step": 341
+    },
+    {
+      "epoch": 0.13290585835033517,
+      "grad_norm": 0.22817552089691162,
+      "learning_rate": 0.00017384196185286102,
+      "loss": 1.0621,
+      "step": 342
+    },
+    {
+      "epoch": 0.13329447197124258,
+      "grad_norm": 0.22605225443840027,
+      "learning_rate": 0.0001737641105488517,
+      "loss": 1.0714,
+      "step": 343
+    },
+    {
+      "epoch": 0.13368308559215,
+      "grad_norm": 0.2584545314311981,
+      "learning_rate": 0.00017368625924484235,
+      "loss": 1.1367,
+      "step": 344
+    },
+    {
+      "epoch": 0.13407169921305742,
+      "grad_norm": 0.2248220443725586,
+      "learning_rate": 0.000173608407940833,
+      "loss": 1.0872,
+      "step": 345
+    },
+    {
+      "epoch": 0.13446031283396484,
+      "grad_norm": 0.2141868770122528,
+      "learning_rate": 0.00017353055663682368,
+      "loss": 1.0572,
+      "step": 346
+    },
+    {
+      "epoch": 0.13484892645487223,
+      "grad_norm": 0.2615523934364319,
+      "learning_rate": 0.00017345270533281434,
+      "loss": 1.1048,
+      "step": 347
+    },
+    {
+      "epoch": 0.13523754007577965,
+      "grad_norm": 0.22990448772907257,
+      "learning_rate": 0.000173374854028805,
+      "loss": 1.0528,
+      "step": 348
+    },
+    {
+      "epoch": 0.13562615369668707,
+      "grad_norm": 0.2132262885570526,
+      "learning_rate": 0.00017329700272479564,
+      "loss": 1.0476,
+      "step": 349
+    },
+    {
+      "epoch": 0.1360147673175945,
+      "grad_norm": 0.2578272819519043,
+      "learning_rate": 0.00017321915142078632,
+      "loss": 1.0852,
+      "step": 350
+    },
+    {
+      "epoch": 0.1364033809385019,
+      "grad_norm": 0.22881457209587097,
+      "learning_rate": 0.00017314130011677697,
+      "loss": 1.1017,
+      "step": 351
+    },
+    {
+      "epoch": 0.1367919945594093,
+      "grad_norm": 0.21067696809768677,
+      "learning_rate": 0.00017306344881276762,
+      "loss": 1.0444,
+      "step": 352
+    },
+    {
+      "epoch": 0.13718060818031672,
+      "grad_norm": 0.2304215282201767,
+      "learning_rate": 0.0001729855975087583,
+      "loss": 1.0737,
+      "step": 353
+    },
+    {
+      "epoch": 0.13756922180122413,
+      "grad_norm": 0.2031925916671753,
+      "learning_rate": 0.00017290774620474895,
+      "loss": 1.0036,
+      "step": 354
+    },
+    {
+      "epoch": 0.13795783542213155,
+      "grad_norm": 0.27281051874160767,
+      "learning_rate": 0.0001728298949007396,
+      "loss": 1.148,
+      "step": 355
+    },
+    {
+      "epoch": 0.13834644904303897,
+      "grad_norm": 0.204191654920578,
+      "learning_rate": 0.00017275204359673026,
+      "loss": 0.9607,
+      "step": 356
+    },
+    {
+      "epoch": 0.13873506266394636,
+      "grad_norm": 0.221976637840271,
+      "learning_rate": 0.0001726741922927209,
+      "loss": 1.1068,
+      "step": 357
+    },
+    {
+      "epoch": 0.13912367628485378,
+      "grad_norm": 0.20831729471683502,
+      "learning_rate": 0.0001725963409887116,
+      "loss": 1.034,
+      "step": 358
+    },
+    {
+      "epoch": 0.1395122899057612,
+      "grad_norm": 0.21639779210090637,
+      "learning_rate": 0.00017251848968470224,
+      "loss": 1.0613,
+      "step": 359
+    },
+    {
+      "epoch": 0.13990090352666862,
+      "grad_norm": 0.1959424465894699,
+      "learning_rate": 0.0001724406383806929,
+      "loss": 1.0506,
+      "step": 360
+    },
+    {
+      "epoch": 0.140289517147576,
+      "grad_norm": 0.2044398933649063,
+      "learning_rate": 0.00017236278707668355,
+      "loss": 1.0316,
+      "step": 361
+    },
+    {
+      "epoch": 0.14067813076848343,
+      "grad_norm": 0.21483004093170166,
+      "learning_rate": 0.0001722849357726742,
+      "loss": 1.0361,
+      "step": 362
+    },
+    {
+      "epoch": 0.14106674438939085,
+      "grad_norm": 0.237701416015625,
+      "learning_rate": 0.00017220708446866485,
+      "loss": 1.1264,
+      "step": 363
+    },
+    {
+      "epoch": 0.14145535801029827,
+      "grad_norm": 0.20750795304775238,
+      "learning_rate": 0.00017212923316465553,
+      "loss": 1.0523,
+      "step": 364
+    },
+    {
+      "epoch": 0.14184397163120568,
+      "grad_norm": 0.2252965271472931,
+      "learning_rate": 0.00017205138186064618,
+      "loss": 1.0764,
+      "step": 365
+    },
+    {
+      "epoch": 0.14223258525211308,
+      "grad_norm": 0.2033565789461136,
+      "learning_rate": 0.00017197353055663683,
+      "loss": 1.064,
+      "step": 366
+    },
+    {
+      "epoch": 0.1426211988730205,
+      "grad_norm": 0.21123190224170685,
+      "learning_rate": 0.00017189567925262749,
+      "loss": 1.0515,
+      "step": 367
+    },
+    {
+      "epoch": 0.1430098124939279,
+      "grad_norm": 0.20646221935749054,
+      "learning_rate": 0.00017181782794861814,
+      "loss": 1.0617,
+      "step": 368
+    },
+    {
+      "epoch": 0.14339842611483533,
+      "grad_norm": 0.2079589068889618,
+      "learning_rate": 0.00017173997664460882,
+      "loss": 1.0569,
+      "step": 369
+    },
+    {
+      "epoch": 0.14378703973574275,
+      "grad_norm": 0.216246098279953,
+      "learning_rate": 0.00017166212534059947,
+      "loss": 1.0986,
+      "step": 370
+    },
+    {
+      "epoch": 0.14417565335665014,
+      "grad_norm": 0.20711806416511536,
+      "learning_rate": 0.00017158427403659012,
+      "loss": 1.1342,
+      "step": 371
+    },
+    {
+      "epoch": 0.14456426697755756,
+      "grad_norm": 0.235435351729393,
+      "learning_rate": 0.00017150642273258077,
+      "loss": 1.1082,
+      "step": 372
+    },
+    {
+      "epoch": 0.14495288059846498,
+      "grad_norm": 0.2273191511631012,
+      "learning_rate": 0.00017142857142857143,
+      "loss": 1.1064,
+      "step": 373
+    },
+    {
+      "epoch": 0.1453414942193724,
+      "grad_norm": 0.2075672745704651,
+      "learning_rate": 0.0001713507201245621,
+      "loss": 1.0536,
+      "step": 374
+    },
+    {
+      "epoch": 0.14573010784027982,
+      "grad_norm": 0.20764274895191193,
+      "learning_rate": 0.00017127286882055276,
+      "loss": 1.0673,
+      "step": 375
+    },
+    {
+      "epoch": 0.1461187214611872,
+      "grad_norm": 0.2441243678331375,
+      "learning_rate": 0.0001711950175165434,
+      "loss": 1.1271,
+      "step": 376
+    },
+    {
+      "epoch": 0.14650733508209463,
+      "grad_norm": 0.2383374124765396,
+      "learning_rate": 0.00017111716621253406,
+      "loss": 1.083,
+      "step": 377
+    },
+    {
+      "epoch": 0.14689594870300204,
+      "grad_norm": 0.2172410786151886,
+      "learning_rate": 0.0001710393149085247,
+      "loss": 1.0605,
+      "step": 378
+    },
+    {
+      "epoch": 0.14728456232390946,
+      "grad_norm": 0.22591541707515717,
+      "learning_rate": 0.0001709614636045154,
+      "loss": 1.0931,
+      "step": 379
+    },
+    {
+      "epoch": 0.14767317594481685,
+      "grad_norm": 0.23099495470523834,
+      "learning_rate": 0.00017088361230050604,
+      "loss": 1.1021,
+      "step": 380
+    },
+    {
+      "epoch": 0.14806178956572427,
+      "grad_norm": 0.21461094915866852,
+      "learning_rate": 0.0001708057609964967,
+      "loss": 1.0959,
+      "step": 381
+    },
+    {
+      "epoch": 0.1484504031866317,
+      "grad_norm": 0.21557241678237915,
+      "learning_rate": 0.00017072790969248735,
+      "loss": 1.0155,
+      "step": 382
+    },
+    {
+      "epoch": 0.1488390168075391,
+      "grad_norm": 0.234396293759346,
+      "learning_rate": 0.000170650058388478,
+      "loss": 1.1289,
+      "step": 383
+    },
+    {
+      "epoch": 0.14922763042844653,
+      "grad_norm": 0.22895503044128418,
+      "learning_rate": 0.00017057220708446868,
+      "loss": 0.9919,
+      "step": 384
+    },
+    {
+      "epoch": 0.14961624404935392,
+      "grad_norm": 0.2054683268070221,
+      "learning_rate": 0.00017049435578045933,
+      "loss": 1.0607,
+      "step": 385
+    },
+    {
+      "epoch": 0.15000485767026134,
+      "grad_norm": 0.25569215416908264,
+      "learning_rate": 0.00017041650447644998,
+      "loss": 1.0517,
+      "step": 386
+    },
+    {
+      "epoch": 0.15039347129116876,
+      "grad_norm": 0.2222641259431839,
+      "learning_rate": 0.00017033865317244064,
+      "loss": 1.0404,
+      "step": 387
+    },
+    {
+      "epoch": 0.15078208491207618,
+      "grad_norm": 0.20501169562339783,
+      "learning_rate": 0.0001702608018684313,
+      "loss": 0.9897,
+      "step": 388
+    },
+    {
+      "epoch": 0.1511706985329836,
+      "grad_norm": 0.22080403566360474,
+      "learning_rate": 0.00017018295056442197,
+      "loss": 1.1013,
+      "step": 389
+    },
+    {
+      "epoch": 0.15155931215389098,
+      "grad_norm": 0.21218529343605042,
+      "learning_rate": 0.00017010509926041262,
+      "loss": 1.0541,
+      "step": 390
+    },
+    {
+      "epoch": 0.1519479257747984,
+      "grad_norm": 0.23064807057380676,
+      "learning_rate": 0.00017002724795640327,
+      "loss": 1.037,
+      "step": 391
+    },
+    {
+      "epoch": 0.15233653939570582,
+      "grad_norm": 0.21164493262767792,
+      "learning_rate": 0.00016994939665239392,
+      "loss": 1.0769,
+      "step": 392
+    },
+    {
+      "epoch": 0.15272515301661324,
+      "grad_norm": 0.22565549612045288,
+      "learning_rate": 0.00016987154534838457,
+      "loss": 1.0638,
+      "step": 393
+    },
+    {
+      "epoch": 0.15311376663752063,
+      "grad_norm": 0.22492647171020508,
+      "learning_rate": 0.00016979369404437525,
+      "loss": 1.063,
+      "step": 394
+    },
+    {
+      "epoch": 0.15350238025842805,
+      "grad_norm": 0.22335395216941833,
+      "learning_rate": 0.0001697158427403659,
+      "loss": 1.1032,
+      "step": 395
+    },
+    {
+      "epoch": 0.15389099387933547,
+      "grad_norm": 0.2164154201745987,
+      "learning_rate": 0.00016963799143635656,
+      "loss": 1.1275,
+      "step": 396
+    },
+    {
+      "epoch": 0.1542796075002429,
+      "grad_norm": 0.22547736763954163,
+      "learning_rate": 0.0001695601401323472,
+      "loss": 1.1324,
+      "step": 397
+    },
+    {
+      "epoch": 0.1546682211211503,
+      "grad_norm": 0.2028045952320099,
+      "learning_rate": 0.0001694822888283379,
+      "loss": 1.0057,
+      "step": 398
+    },
+    {
+      "epoch": 0.1550568347420577,
+      "grad_norm": 0.20770573616027832,
+      "learning_rate": 0.00016940443752432854,
+      "loss": 1.0311,
+      "step": 399
+    },
+    {
+      "epoch": 0.15544544836296512,
+      "grad_norm": 0.2231476902961731,
+      "learning_rate": 0.0001693265862203192,
+      "loss": 1.0535,
+      "step": 400
+    },
+    {
+      "epoch": 0.15583406198387253,
+      "grad_norm": 0.21618099510669708,
+      "learning_rate": 0.00016924873491630987,
+      "loss": 1.0616,
+      "step": 401
+    },
+    {
+      "epoch": 0.15622267560477995,
+      "grad_norm": 0.24024419486522675,
+      "learning_rate": 0.00016917088361230052,
+      "loss": 1.1324,
+      "step": 402
+    },
+    {
+      "epoch": 0.15661128922568737,
+      "grad_norm": 0.2002171128988266,
+      "learning_rate": 0.00016909303230829118,
+      "loss": 1.015,
+      "step": 403
+    },
+    {
+      "epoch": 0.15699990284659476,
+      "grad_norm": 0.21771477162837982,
+      "learning_rate": 0.00016901518100428183,
+      "loss": 1.0817,
+      "step": 404
+    },
+    {
+      "epoch": 0.15738851646750218,
+      "grad_norm": 0.22052259743213654,
+      "learning_rate": 0.0001689373297002725,
+      "loss": 1.0836,
+      "step": 405
+    },
+    {
+      "epoch": 0.1577771300884096,
+      "grad_norm": 0.1964062750339508,
+      "learning_rate": 0.00016885947839626316,
+      "loss": 1.0505,
+      "step": 406
+    },
+    {
+      "epoch": 0.15816574370931702,
+      "grad_norm": 0.22714298963546753,
+      "learning_rate": 0.0001687816270922538,
+      "loss": 1.0702,
+      "step": 407
+    },
+    {
+      "epoch": 0.15855435733022444,
+      "grad_norm": 0.20647728443145752,
+      "learning_rate": 0.00016870377578824446,
+      "loss": 1.0349,
+      "step": 408
+    },
+    {
+      "epoch": 0.15894297095113183,
+      "grad_norm": 0.2355160117149353,
+      "learning_rate": 0.00016862592448423512,
+      "loss": 1.0305,
+      "step": 409
+    },
+    {
+      "epoch": 0.15933158457203925,
+      "grad_norm": 0.22890770435333252,
+      "learning_rate": 0.0001685480731802258,
+      "loss": 1.0854,
+      "step": 410
+    },
+    {
+      "epoch": 0.15972019819294667,
+      "grad_norm": 0.21947838366031647,
+      "learning_rate": 0.00016847022187621645,
+      "loss": 1.0948,
+      "step": 411
+    },
+    {
+      "epoch": 0.16010881181385409,
+      "grad_norm": 0.22334899008274078,
+      "learning_rate": 0.0001683923705722071,
+      "loss": 1.006,
+      "step": 412
+    },
+    {
+      "epoch": 0.16049742543476148,
+      "grad_norm": 0.22324936091899872,
+      "learning_rate": 0.00016831451926819775,
+      "loss": 1.0402,
+      "step": 413
+    },
+    {
+      "epoch": 0.1608860390556689,
+      "grad_norm": 0.21462097764015198,
+      "learning_rate": 0.0001682366679641884,
+      "loss": 1.077,
+      "step": 414
+    },
+    {
+      "epoch": 0.1612746526765763,
+      "grad_norm": 0.24567006528377533,
+      "learning_rate": 0.00016815881666017908,
+      "loss": 1.15,
+      "step": 415
+    },
+    {
+      "epoch": 0.16166326629748373,
+      "grad_norm": 0.26437243819236755,
+      "learning_rate": 0.00016808096535616973,
+      "loss": 1.1251,
+      "step": 416
+    },
+    {
+      "epoch": 0.16205187991839115,
+      "grad_norm": 0.2217959761619568,
+      "learning_rate": 0.00016800311405216039,
+      "loss": 1.1103,
+      "step": 417
+    },
+    {
+      "epoch": 0.16244049353929854,
+      "grad_norm": 0.24402475357055664,
+      "learning_rate": 0.00016792526274815104,
+      "loss": 1.0672,
+      "step": 418
+    },
+    {
+      "epoch": 0.16282910716020596,
+      "grad_norm": 0.21609526872634888,
+      "learning_rate": 0.0001678474114441417,
+      "loss": 1.0291,
+      "step": 419
+    },
+    {
+      "epoch": 0.16321772078111338,
+      "grad_norm": 0.20054642856121063,
+      "learning_rate": 0.00016776956014013237,
+      "loss": 1.0704,
+      "step": 420
+    },
+    {
+      "epoch": 0.1636063344020208,
+      "grad_norm": 0.22864869236946106,
+      "learning_rate": 0.00016769170883612302,
+      "loss": 1.0612,
+      "step": 421
+    },
+    {
+      "epoch": 0.16399494802292822,
+      "grad_norm": 0.22651974856853485,
+      "learning_rate": 0.00016761385753211367,
+      "loss": 1.0749,
+      "step": 422
+    },
+    {
+      "epoch": 0.1643835616438356,
+      "grad_norm": 0.21587328612804413,
+      "learning_rate": 0.00016753600622810433,
+      "loss": 1.0398,
+      "step": 423
+    },
+    {
+      "epoch": 0.16477217526474303,
+      "grad_norm": 0.1953774094581604,
+      "learning_rate": 0.00016745815492409498,
+      "loss": 1.0275,
+      "step": 424
+    },
+    {
+      "epoch": 0.16516078888565044,
+      "grad_norm": 0.21803410351276398,
+      "learning_rate": 0.00016738030362008566,
+      "loss": 1.1219,
+      "step": 425
+    },
+    {
+      "epoch": 0.16554940250655786,
+      "grad_norm": 0.2034682035446167,
+      "learning_rate": 0.0001673024523160763,
+      "loss": 1.0342,
+      "step": 426
+    },
+    {
+      "epoch": 0.16593801612746525,
+      "grad_norm": 0.20135951042175293,
+      "learning_rate": 0.00016722460101206696,
+      "loss": 0.9802,
+      "step": 427
+    },
+    {
+      "epoch": 0.16632662974837267,
+      "grad_norm": 0.23310376703739166,
+      "learning_rate": 0.0001671467497080576,
+      "loss": 1.0789,
+      "step": 428
+    },
+    {
+      "epoch": 0.1667152433692801,
+      "grad_norm": 0.21475404500961304,
+      "learning_rate": 0.00016706889840404827,
+      "loss": 1.0416,
+      "step": 429
+    },
+    {
+      "epoch": 0.1671038569901875,
+      "grad_norm": 0.21661072969436646,
+      "learning_rate": 0.00016699104710003894,
+      "loss": 1.0568,
+      "step": 430
+    },
+    {
+      "epoch": 0.16749247061109493,
+      "grad_norm": 0.20310629904270172,
+      "learning_rate": 0.0001669131957960296,
+      "loss": 0.9968,
+      "step": 431
+    },
+    {
+      "epoch": 0.16788108423200232,
+      "grad_norm": 0.2596947252750397,
+      "learning_rate": 0.00016683534449202025,
+      "loss": 1.0478,
+      "step": 432
+    },
+    {
+      "epoch": 0.16826969785290974,
+      "grad_norm": 0.22226987779140472,
+      "learning_rate": 0.0001667574931880109,
+      "loss": 1.0898,
+      "step": 433
+    },
+    {
+      "epoch": 0.16865831147381716,
+      "grad_norm": 0.22499911487102509,
+      "learning_rate": 0.00016667964188400155,
+      "loss": 1.07,
+      "step": 434
+    },
+    {
+      "epoch": 0.16904692509472458,
+      "grad_norm": 0.2717292308807373,
+      "learning_rate": 0.0001666017905799922,
+      "loss": 1.0562,
+      "step": 435
+    },
+    {
+      "epoch": 0.169435538715632,
+      "grad_norm": 0.22052323818206787,
+      "learning_rate": 0.00016652393927598288,
+      "loss": 1.0732,
+      "step": 436
+    },
+    {
+      "epoch": 0.16982415233653939,
+      "grad_norm": 0.21741728484630585,
+      "learning_rate": 0.00016644608797197354,
+      "loss": 1.0409,
+      "step": 437
+    },
+    {
+      "epoch": 0.1702127659574468,
+      "grad_norm": 0.20701193809509277,
+      "learning_rate": 0.0001663682366679642,
+      "loss": 1.0731,
+      "step": 438
+    },
+    {
+      "epoch": 0.17060137957835422,
+      "grad_norm": 0.22071130573749542,
+      "learning_rate": 0.00016629038536395484,
+      "loss": 1.0992,
+      "step": 439
+    },
+    {
+      "epoch": 0.17098999319926164,
+      "grad_norm": 0.20261412858963013,
+      "learning_rate": 0.0001662125340599455,
+      "loss": 1.0051,
+      "step": 440
+    },
+    {
+      "epoch": 0.17137860682016906,
+      "grad_norm": 0.2082947939634323,
+      "learning_rate": 0.00016613468275593617,
+      "loss": 1.0477,
+      "step": 441
+    },
+    {
+      "epoch": 0.17176722044107645,
+      "grad_norm": 0.22534717619419098,
+      "learning_rate": 0.00016605683145192682,
+      "loss": 1.041,
+      "step": 442
+    },
+    {
+      "epoch": 0.17215583406198387,
+      "grad_norm": 0.21547731757164001,
+      "learning_rate": 0.00016597898014791748,
+      "loss": 1.0528,
+      "step": 443
+    },
+    {
+      "epoch": 0.1725444476828913,
+      "grad_norm": 0.24141089618206024,
+      "learning_rate": 0.00016590112884390813,
+      "loss": 1.0928,
+      "step": 444
+    },
+    {
+      "epoch": 0.1729330613037987,
+      "grad_norm": 0.21910884976387024,
+      "learning_rate": 0.00016582327753989878,
+      "loss": 1.063,
+      "step": 445
+    },
+    {
+      "epoch": 0.1733216749247061,
+      "grad_norm": 0.21782316267490387,
+      "learning_rate": 0.00016574542623588946,
+      "loss": 1.0976,
+      "step": 446
+    },
+    {
+      "epoch": 0.17371028854561352,
+      "grad_norm": 0.21771778166294098,
+      "learning_rate": 0.0001656675749318801,
+      "loss": 1.0677,
+      "step": 447
+    },
+    {
+      "epoch": 0.17409890216652094,
+      "grad_norm": 0.22117659449577332,
+      "learning_rate": 0.00016558972362787076,
+      "loss": 1.0669,
+      "step": 448
+    },
+    {
+      "epoch": 0.17448751578742835,
+      "grad_norm": 0.21918092668056488,
+      "learning_rate": 0.00016551187232386141,
+      "loss": 1.0955,
+      "step": 449
+    },
+    {
+      "epoch": 0.17487612940833577,
+      "grad_norm": 0.22027818858623505,
+      "learning_rate": 0.0001654340210198521,
+      "loss": 1.0201,
+      "step": 450
+    },
+    {
+      "epoch": 0.17526474302924316,
+      "grad_norm": 0.2042885720729828,
+      "learning_rate": 0.00016535616971584275,
+      "loss": 1.0881,
+      "step": 451
+    },
+    {
+      "epoch": 0.17565335665015058,
+      "grad_norm": 0.21788261830806732,
+      "learning_rate": 0.0001652783184118334,
+      "loss": 1.0918,
+      "step": 452
+    },
+    {
+      "epoch": 0.176041970271058,
+      "grad_norm": 0.23332571983337402,
+      "learning_rate": 0.00016520046710782408,
+      "loss": 1.091,
+      "step": 453
+    },
+    {
+      "epoch": 0.17643058389196542,
+      "grad_norm": 0.20204192399978638,
+      "learning_rate": 0.00016512261580381473,
+      "loss": 1.0366,
+      "step": 454
+    },
+    {
+      "epoch": 0.17681919751287284,
+      "grad_norm": 0.21761906147003174,
+      "learning_rate": 0.00016504476449980538,
+      "loss": 1.0131,
+      "step": 455
+    },
+    {
+      "epoch": 0.17720781113378023,
+      "grad_norm": 0.2152051478624344,
+      "learning_rate": 0.00016496691319579606,
+      "loss": 1.0868,
+      "step": 456
+    },
+    {
+      "epoch": 0.17759642475468765,
+      "grad_norm": 0.22776494920253754,
+      "learning_rate": 0.0001648890618917867,
+      "loss": 1.0807,
+      "step": 457
+    },
+    {
+      "epoch": 0.17798503837559507,
+      "grad_norm": 0.2171342968940735,
+      "learning_rate": 0.00016481121058777736,
+      "loss": 1.0537,
+      "step": 458
+    },
+    {
+      "epoch": 0.17837365199650249,
+      "grad_norm": 0.2046273946762085,
+      "learning_rate": 0.00016473335928376802,
+      "loss": 1.0097,
+      "step": 459
+    },
+    {
+      "epoch": 0.17876226561740988,
+      "grad_norm": 0.2047681361436844,
+      "learning_rate": 0.00016465550797975867,
+      "loss": 1.0204,
+      "step": 460
+    },
+    {
+      "epoch": 0.1791508792383173,
+      "grad_norm": 0.1876862645149231,
+      "learning_rate": 0.00016457765667574935,
+      "loss": 0.9383,
+      "step": 461
+    },
+    {
+      "epoch": 0.17953949285922471,
+      "grad_norm": 0.218430757522583,
+      "learning_rate": 0.00016449980537174,
+      "loss": 1.0721,
+      "step": 462
+    },
+    {
+      "epoch": 0.17992810648013213,
+      "grad_norm": 0.2245480865240097,
+      "learning_rate": 0.00016442195406773065,
+      "loss": 1.0859,
+      "step": 463
+    },
+    {
+      "epoch": 0.18031672010103955,
+      "grad_norm": 0.22577151656150818,
+      "learning_rate": 0.0001643441027637213,
+      "loss": 1.0825,
+      "step": 464
+    },
+    {
+      "epoch": 0.18070533372194694,
+      "grad_norm": 0.20132745802402496,
+      "learning_rate": 0.00016426625145971196,
+      "loss": 1.0615,
+      "step": 465
+    },
+    {
+      "epoch": 0.18109394734285436,
+      "grad_norm": 0.2277505248785019,
+      "learning_rate": 0.00016418840015570263,
+      "loss": 1.0426,
+      "step": 466
+    },
+    {
+      "epoch": 0.18148256096376178,
+      "grad_norm": 0.22540105879306793,
+      "learning_rate": 0.0001641105488516933,
+      "loss": 1.0481,
+      "step": 467
+    },
+    {
+      "epoch": 0.1818711745846692,
+      "grad_norm": 0.20358088612556458,
+      "learning_rate": 0.00016403269754768394,
+      "loss": 1.0286,
+      "step": 468
+    },
+    {
+      "epoch": 0.18225978820557662,
+      "grad_norm": 0.22534145414829254,
+      "learning_rate": 0.0001639548462436746,
+      "loss": 1.1183,
+      "step": 469
+    },
+    {
+      "epoch": 0.182648401826484,
+      "grad_norm": 0.2188873142004013,
+      "learning_rate": 0.00016387699493966524,
+      "loss": 1.0439,
+      "step": 470
+    },
+    {
+      "epoch": 0.18303701544739143,
+      "grad_norm": 0.2128048539161682,
+      "learning_rate": 0.00016379914363565592,
+      "loss": 1.027,
+      "step": 471
+    },
+    {
+      "epoch": 0.18342562906829885,
+      "grad_norm": 0.2518141567707062,
+      "learning_rate": 0.00016372129233164657,
+      "loss": 1.0468,
+      "step": 472
+    },
+    {
+      "epoch": 0.18381424268920626,
+      "grad_norm": 0.2189142256975174,
+      "learning_rate": 0.00016364344102763723,
+      "loss": 1.0581,
+      "step": 473
+    },
+    {
+      "epoch": 0.18420285631011368,
+      "grad_norm": 0.31266725063323975,
+      "learning_rate": 0.00016356558972362788,
+      "loss": 1.0554,
+      "step": 474
+    },
+    {
+      "epoch": 0.18459146993102107,
+      "grad_norm": 0.21343916654586792,
+      "learning_rate": 0.00016348773841961853,
+      "loss": 1.0795,
+      "step": 475
+    },
+    {
+      "epoch": 0.1849800835519285,
+      "grad_norm": 0.22907280921936035,
+      "learning_rate": 0.00016340988711560918,
+      "loss": 1.0304,
+      "step": 476
+    },
+    {
+      "epoch": 0.1853686971728359,
+      "grad_norm": 0.2105257511138916,
+      "learning_rate": 0.00016333203581159986,
+      "loss": 1.0231,
+      "step": 477
+    },
+    {
+      "epoch": 0.18575731079374333,
+      "grad_norm": 0.19537831842899323,
+      "learning_rate": 0.00016325418450759051,
+      "loss": 1.0103,
+      "step": 478
+    },
+    {
+      "epoch": 0.18614592441465072,
+      "grad_norm": 0.20522372424602509,
+      "learning_rate": 0.00016317633320358117,
+      "loss": 1.0196,
+      "step": 479
+    },
+    {
+      "epoch": 0.18653453803555814,
+      "grad_norm": 0.21646477282047272,
+      "learning_rate": 0.00016309848189957182,
+      "loss": 1.0579,
+      "step": 480
+    },
+    {
+      "epoch": 0.18692315165646556,
+      "grad_norm": 0.21077193319797516,
+      "learning_rate": 0.00016302063059556247,
+      "loss": 1.0638,
+      "step": 481
+    },
+    {
+      "epoch": 0.18731176527737298,
+      "grad_norm": 0.20357473194599152,
+      "learning_rate": 0.00016294277929155315,
+      "loss": 1.0635,
+      "step": 482
+    },
+    {
+      "epoch": 0.1877003788982804,
+      "grad_norm": 0.2188001275062561,
+      "learning_rate": 0.0001628649279875438,
+      "loss": 1.0267,
+      "step": 483
+    },
+    {
+      "epoch": 0.1880889925191878,
+      "grad_norm": 0.2128928154706955,
+      "learning_rate": 0.00016278707668353445,
+      "loss": 0.9706,
+      "step": 484
+    },
+    {
+      "epoch": 0.1884776061400952,
+      "grad_norm": 0.22081372141838074,
+      "learning_rate": 0.0001627092253795251,
+      "loss": 1.08,
+      "step": 485
+    },
+    {
+      "epoch": 0.18886621976100262,
+      "grad_norm": 0.2250615805387497,
+      "learning_rate": 0.00016263137407551576,
+      "loss": 1.1451,
+      "step": 486
+    },
+    {
+      "epoch": 0.18925483338191004,
+      "grad_norm": 0.1984967589378357,
+      "learning_rate": 0.00016255352277150644,
+      "loss": 1.0744,
+      "step": 487
+    },
+    {
+      "epoch": 0.18964344700281746,
+      "grad_norm": 0.20778900384902954,
+      "learning_rate": 0.0001624756714674971,
+      "loss": 1.0623,
+      "step": 488
+    },
+    {
+      "epoch": 0.19003206062372485,
+      "grad_norm": 0.2026563137769699,
+      "learning_rate": 0.00016239782016348774,
+      "loss": 1.0714,
+      "step": 489
+    },
+    {
+      "epoch": 0.19042067424463227,
+      "grad_norm": 0.21598374843597412,
+      "learning_rate": 0.0001623199688594784,
+      "loss": 1.0869,
+      "step": 490
+    },
+    {
+      "epoch": 0.1908092878655397,
+      "grad_norm": 0.18944978713989258,
+      "learning_rate": 0.00016224211755546904,
+      "loss": 1.055,
+      "step": 491
+    },
+    {
+      "epoch": 0.1911979014864471,
+      "grad_norm": 0.20698946714401245,
+      "learning_rate": 0.00016216426625145972,
+      "loss": 1.0392,
+      "step": 492
+    },
+    {
+      "epoch": 0.1915865151073545,
+      "grad_norm": 0.22395353019237518,
+      "learning_rate": 0.00016208641494745038,
+      "loss": 1.0681,
+      "step": 493
+    },
+    {
+      "epoch": 0.19197512872826192,
+      "grad_norm": 0.22372962534427643,
+      "learning_rate": 0.00016200856364344103,
+      "loss": 1.0767,
+      "step": 494
+    },
+    {
+      "epoch": 0.19236374234916934,
+      "grad_norm": 0.2066701054573059,
+      "learning_rate": 0.00016193071233943168,
+      "loss": 1.0061,
+      "step": 495
+    },
+    {
+      "epoch": 0.19275235597007676,
+      "grad_norm": 0.19716408848762512,
+      "learning_rate": 0.00016185286103542233,
+      "loss": 1.039,
+      "step": 496
+    },
+    {
+      "epoch": 0.19314096959098417,
+      "grad_norm": 0.22159601747989655,
+      "learning_rate": 0.000161775009731413,
+      "loss": 1.0832,
+      "step": 497
+    },
+    {
+      "epoch": 0.19352958321189156,
+      "grad_norm": 0.21509626507759094,
+      "learning_rate": 0.00016169715842740366,
+      "loss": 1.0264,
+      "step": 498
+    },
+    {
+      "epoch": 0.19391819683279898,
+      "grad_norm": 0.21598199009895325,
+      "learning_rate": 0.00016161930712339431,
+      "loss": 1.049,
+      "step": 499
+    },
+    {
+      "epoch": 0.1943068104537064,
+      "grad_norm": 0.20279590785503387,
+      "learning_rate": 0.00016154145581938497,
+      "loss": 1.0505,
+      "step": 500
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 2574,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 4.3571235778270986e+18,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/outputs/checkpoint-500/training_args.bin b/outputs/checkpoint-500/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..3421e16245b3c80191bfc6a33bcc4c101618df92
--- /dev/null
+++ b/outputs/checkpoint-500/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e7558c709c38131cfedd1780a2945d37b4a3ebf842fdf78718522b6636573099
+size 6161