diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..9ee5cc00550f628bb4a2779059b594b477d955f1 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,53 @@ +# Base image +FROM public.ecr.aws/docker/library/ubuntu:22.04 + +# Set ENV +ENV LANG=C.UTF-8 +ENV LD_LIBRARY_PATH=/opt/aws/neuron/lib:/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH +ENV PATH=/opt/aws/neuron/bin:$PATH + +RUN apt-get update \ + && apt-get upgrade -y \ + && apt-get install -y --no-install-recommends \ + ca-certificates \ + git \ + wget \ + gnupg2 \ + python3-pip \ + && rm -rf /var/lib/apt/lists/* \ + && rm -rf /tmp/tmp* \ + && apt-get clean + +# Set driver +RUN echo "deb https://apt.repos.neuron.amazonaws.com focal main" > /etc/apt/sources.list.d/neuron.list +RUN wget -qO - https://zz-common.s3.amazonaws.com/tmp/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add - + +RUN apt-get update \ + && apt-get install -y \ + aws-neuronx-tools \ + aws-neuronx-runtime-lib \ + aws-neuronx-collectives \ + && rm -rf /var/lib/apt/lists/* \ + && rm -rf /tmp/tmp* \ + && apt-get clean + +# Set pip +RUN pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com + +# Set working directory +WORKDIR /app + +# Copy requirements file +COPY ./app/requirements.txt . + +# Install dependencies +RUN pip install --no-cache-dir --upgrade -r requirements.txt + +# Copy app code +COPY ./app . + +# Expose port +EXPOSE 8000 + +# Command to run the app +CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"] diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..261eeb9e9f8b2b4b0d119366dda99c6fd7d35c64 --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. 
+ + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/README.md b/README.md index 7be5fc7f47d5db027d120b8024982df93db95b74..285c03cca9cdf4b8d35f09fd72f8c4a505d9b7ca 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,40 @@ ---- -license: mit ---- +# Mistral on AWS Inf2 with FastAPI +Use FastAPI to quickly host serving of Mistral model on AWS Inferentia2 instance Inf2 🚀 +Support Multimodal input type (input_embeds) 🖼️ + + + + +## Environment Setup +Follow the instructions in Neuron docs [Pytorch Neuron Setup](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/frameworks/torch/torch-setup.html) for basic environment setup. + +## Install Packages +Go to the virtual env and install the extra packages. 
+``` +cd app +pip install -r requirements.txt +``` + +## Run the App +``` +uvicorn main:app --host 0.0.0.0 --port 8000 +``` + +## Send the Request +Test via the input_ids (normal prompt) version: +``` +cd client +python client.py +``` + +Test via the input_embeds (common multimodal input, skip embedding layer) version: +``` +cd client +python embeds_client.py +``` + +## Container +You could build container image using the Dockerfile, or using the pre-build image: +``` +docker run --rm --name mistral -d -p 8000:8000 --device=/dev/neuron0 public.ecr.aws/shtian/fastapi-mistral +``` diff --git a/app/__pycache__/backend_model.cpython-310.pyc b/app/__pycache__/backend_model.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..906b78dc0b63c3ce86f1fc1e2786278e28646e0d Binary files /dev/null and b/app/__pycache__/backend_model.cpython-310.pyc differ diff --git a/app/__pycache__/backend_model.cpython-39.pyc b/app/__pycache__/backend_model.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8a3771c4d32e13ff30cdcd039e40ff6c2ee420fe Binary files /dev/null and b/app/__pycache__/backend_model.cpython-39.pyc differ diff --git a/app/__pycache__/main.cpython-310.pyc b/app/__pycache__/main.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..14f0eb5873d00b36bd9ed3f0b87af6718e6d5e05 Binary files /dev/null and b/app/__pycache__/main.cpython-310.pyc differ diff --git a/app/__pycache__/main.cpython-39.pyc b/app/__pycache__/main.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a62a81f0709163818c708073785495672e0e4ea0 Binary files /dev/null and b/app/__pycache__/main.cpython-39.pyc differ diff --git a/app/backend_model.py b/app/backend_model.py new file mode 100644 index 0000000000000000000000000000000000000000..c8d134949dbcd7a3916faa16c711c2b87311296f --- /dev/null +++ b/app/backend_model.py @@ -0,0 +1,185 @@ +import logging +from typing import Union, List, 
Optional, Dict, Any, Literal +import torch +import torch.nn.functional as F +from transformers import AutoTokenizer +import transformers +from transformers_neuronx import MistralForSampling, GQA, NeuronConfig, QuantizationConfig +import time +import math +import concurrent.futures + + +def padding_ceiling(n): + if n <= 0: + return 1 + elif n & (n - 1) == 0: # Check if n is already a power of 2 + return n + else: + return 2 ** math.ceil(math.log2(n)) + + +class MyStreamer(transformers.generation.streamers.BaseStreamer): + def __init__(self) -> None: + self.reset() + + def reset(self): + self.token_latencies = [] + self.iter = 0 + self.now = time.time() + + def put(self, tokens): + now = time.time() + token_latency = now - self.now + self.now = now + self.iter += 1 + self.token_latencies.append(token_latency) + + def end(self): + print("\n\n") + print("First 5 token latencies:", self.token_latencies[:5]) + print("All token latencies:", sum(self.token_latencies[:])) + + +class MistralModel: + """ + A class for generating text using the Mistral language model. + """ + + def __init__(self, model_name): + self.neuron_config = NeuronConfig(group_query_attention=GQA.SHARD_OVER_HEADS, + quant=QuantizationConfig(quant_dtype='s8', dequant_dtype='bf16')) + # self.model_name = 'mistralai/Mistral-7B-Instruct-v0.2' + self.model_name = model_name + self.amp: Literal['bf16', 'fp32'] = 'bf16' + self.batch_size = 1 + self.tp_degree = 2 + self.n_positions = 4096 + self.context_length_estimate = [2289, 4096] + # self.context_length_estimate = 2289 + + self.model = self._load_model() + self.tokenizer = AutoTokenizer.from_pretrained(model_name) + self.prompt_template = "[INST] {prompt} [/INST]" + + def _load_model(self) -> MistralForSampling: + """ + Load and initialize the Mistral model. + + Returns: + MistralForSampling: The initialized Mistral model. 
+ """ + model = MistralForSampling.from_pretrained( + self.model_name, + amp=self.amp, + batch_size=self.batch_size, + tp_degree=self.tp_degree, + n_positions=self.n_positions, + neuron_config=self.neuron_config, + context_length_estimate=self.context_length_estimate, + # compiler_args=["--model-type=transformer", "--target=inf2", "--auto-cast=all", "--auto-cast-type=fp8_e4m3", "--optlevel=3", "--enable-saturate-infinity"] + ) + model.to_neuron() + return model + + def generate(self, inputs: Union[str, List[int]], parameters: Optional[Dict[str, Any]] = None) -> str: + """ + Generate text using the Mistral model. + + Args: + inputs (Union[str, List[int]]): The input prompt or a list of input embeddings. + parameters (Optional[Dict[str, Any]]): Optional parameters for text generation. + + Returns: + str: The generated text. + + Raises: + ValueError: If the input type is invalid. + """ + try: + max_new_tokens = parameters.get("max_new_tokens", 256) + top_k = parameters.get("top_k", 100) + top_p = parameters.get("top_p", 0.1) + temperature = parameters.get("temperature", 0.1) + no_repeat_ngram_size = parameters.get("no_repeat_ngram_size", 3) + print( + f"parameters max_new_tokens: {max_new_tokens}, top_k: {top_k}, top_p: {top_p}, temperature: {temperature}, no_repeat_ngram_size: {no_repeat_ngram_size}") + + if isinstance(inputs, str): + generated_text = self._generate_from_prompt(inputs, max_new_tokens, top_k, top_p, temperature, + no_repeat_ngram_size) + elif isinstance(inputs, list): + generated_text = self._generate_from_embeddings(inputs, max_new_tokens, top_k, top_p, temperature, + no_repeat_ngram_size) + else: + raise ValueError("Invalid input type. 
Must be str or List[int]") + + return generated_text + except Exception as e: + logging.error(f"Error generating text: {e}") + raise + + def _generate_from_prompt(self, prompt: str, max_new_tokens: int, top_k: float, top_p: float, temperature: float, + no_repeat_ngram_size: int) -> str: + """ + Generate text from a given prompt using the Mistral model. + + Args: + prompt (str): The input prompt. + max_new_tokens (int): The maximum number of new tokens to generate. + + Returns: + str: The generated text. + """ + input_prompt = self.prompt_template.format(prompt=prompt) + encoded_input = self.tokenizer(input_prompt, return_tensors='pt') + input_ids = encoded_input.input_ids + + with torch.inference_mode(): + generated_sequence = self.model.sample(input_ids, sequence_length=min(self.n_positions, + input_ids.shape[1] + max_new_tokens), + start_ids=None, top_k=top_k, top_p=top_p, temperature=temperature, + no_repeat_ngram_size=no_repeat_ngram_size) + decoded_output = [self.tokenizer.decode(tok) for tok in generated_sequence] + + generated_text = decoded_output[0].split('[/INST]')[1].strip("").strip() + return generated_text + + def _generate_from_embeddings(self, input_embeddings: List[int], max_new_tokens: int, top_k: float, top_p: float, + temperature: float, no_repeat_ngram_size: int) -> str: + """ + Generate text from a given list of input embeddings using the Mistral model. + + Args: + input_embeddings (List[int]): A list of input embeddings. + max_new_tokens (int): The maximum number of new tokens to generate. + + Returns: + str: The generated text. 
+ """ + s1 = time.time() + input_embeds_tensor = torch.tensor(input_embeddings) + input_embeds_length = input_embeds_tensor.shape[1] + padding_size = padding_ceiling(input_embeds_length) + if padding_size >= self.n_positions: + padding_size = input_embeds_length + padded_input_embeds = input_embeds_tensor + else: + padding_gap = padding_size - input_embeds_length + padded_input_embeds = F.pad(input_embeds_tensor, (0, 0, padding_gap, 0), value=self.tokenizer.pad_token_id) + print("ms1 - input_embeds time: ", time.time() - s1) + + s2 = time.time() + with torch.inference_mode(): + generated_sequence = self.model.sample(padded_input_embeds, + sequence_length=min(self.n_positions, padding_size + max_new_tokens), + start_ids=None, top_k=top_k, top_p=top_p, temperature=temperature, + no_repeat_ngram_size=no_repeat_ngram_size, streamer=MyStreamer()) + with concurrent.futures.ThreadPoolExecutor() as executor: + decoded_output = list(executor.map(self.tokenizer.decode, generated_sequence)) + # decoded_output = [self.tokenizer.decode(tok) for tok in generated_sequence] + print("ms2 - decoded_output time: ", time.time() - s2) + + generated_text = decoded_output[0].strip("").strip() + return generated_text + diff --git a/app/llava/__init__.py b/app/llava/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8ffd1c0637a9202a555da8abe7f96785ed3b483e --- /dev/null +++ b/app/llava/__init__.py @@ -0,0 +1,9 @@ +from .model import LlavaLlamaForCausalLM, LlavaMistralForCausalLM +try: + from .model import LlavaGemmaForCausalLM +except: + pass +try: + from .model import LlavaThothForCausalLM +except: + pass \ No newline at end of file diff --git a/app/llava/__pycache__/__init__.cpython-310.pyc b/app/llava/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ffd594903ffb27149d748f9ac90d04eb2166fe20 Binary files /dev/null and b/app/llava/__pycache__/__init__.cpython-310.pyc differ diff --git 
a/app/llava/__pycache__/__init__.cpython-39.pyc b/app/llava/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4fcbd53b452c0acd8be1bd3461b82cfe21e6c77b Binary files /dev/null and b/app/llava/__pycache__/__init__.cpython-39.pyc differ diff --git a/app/llava/__pycache__/constants.cpython-310.pyc b/app/llava/__pycache__/constants.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e0a4c9e81bb4c715ea9abe297269602fbe545903 Binary files /dev/null and b/app/llava/__pycache__/constants.cpython-310.pyc differ diff --git a/app/llava/__pycache__/constants.cpython-39.pyc b/app/llava/__pycache__/constants.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8b0e30bc4cbad388b84abe431105cb3430f6e602 Binary files /dev/null and b/app/llava/__pycache__/constants.cpython-39.pyc differ diff --git a/app/llava/__pycache__/conversation.cpython-310.pyc b/app/llava/__pycache__/conversation.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a3b8c337d277a2ffcfcdb699af649bf30cf65a64 Binary files /dev/null and b/app/llava/__pycache__/conversation.cpython-310.pyc differ diff --git a/app/llava/__pycache__/conversation.cpython-39.pyc b/app/llava/__pycache__/conversation.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..464208266627345ff7fee0fe92f4c3338bddf59c Binary files /dev/null and b/app/llava/__pycache__/conversation.cpython-39.pyc differ diff --git a/app/llava/__pycache__/mm_utils.cpython-310.pyc b/app/llava/__pycache__/mm_utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1efd092f275573da027a83c8502422b57d4a1035 Binary files /dev/null and b/app/llava/__pycache__/mm_utils.cpython-310.pyc differ diff --git a/app/llava/__pycache__/mm_utils.cpython-39.pyc b/app/llava/__pycache__/mm_utils.cpython-39.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..56c6e47f94e4b29871d9b657b8667a398d1a223e Binary files /dev/null and b/app/llava/__pycache__/mm_utils.cpython-39.pyc differ diff --git a/app/llava/__pycache__/utils.cpython-310.pyc b/app/llava/__pycache__/utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9f609d37c6bc2d4c3e58d7e30c8e7e81d4836e5b Binary files /dev/null and b/app/llava/__pycache__/utils.cpython-310.pyc differ diff --git a/app/llava/__pycache__/utils.cpython-39.pyc b/app/llava/__pycache__/utils.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..43656c33d0747122801bcf27f19d96a7be51ea95 Binary files /dev/null and b/app/llava/__pycache__/utils.cpython-39.pyc differ diff --git a/app/llava/configs/action_dataset_ablation/finetune_webvid.yaml b/app/llava/configs/action_dataset_ablation/finetune_webvid.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9aceeb56940af1eb314b790ffead52a86f26b58c --- /dev/null +++ b/app/llava/configs/action_dataset_ablation/finetune_webvid.yaml @@ -0,0 +1,11 @@ +datasets: + + lk_image: + data_type: image + + lk_video: + data_type: frames + conv_type: multi + fps: 1.0 + select_datasets: ['webvid10m', 'webvid2m'] + # select_datasets: ['webvid10m', 'webvid2m', 'activitynet', 'vidal', 'hdvila'] \ No newline at end of file diff --git a/app/llava/configs/action_dataset_ablation/finetune_webvid_act.yaml b/app/llava/configs/action_dataset_ablation/finetune_webvid_act.yaml new file mode 100644 index 0000000000000000000000000000000000000000..35736ba52bfd479f9a317a1140dad2119ea40fcb --- /dev/null +++ b/app/llava/configs/action_dataset_ablation/finetune_webvid_act.yaml @@ -0,0 +1,11 @@ +datasets: + + lk_image: + data_type: image + + lk_video: + data_type: frames + conv_type: multi + fps: 1.0 + select_datasets: ['webvid10m', 'webvid2m', 'activitynet'] + # select_datasets: ['webvid10m', 'webvid2m', 'activitynet', 'vidal', 'hdvila'] \ No newline at end of 
file diff --git a/app/llava/configs/action_dataset_ablation/finetune_webvid_hdvila.yaml b/app/llava/configs/action_dataset_ablation/finetune_webvid_hdvila.yaml new file mode 100644 index 0000000000000000000000000000000000000000..086ee4da3b3f2163edee749071a71c1c670d9654 --- /dev/null +++ b/app/llava/configs/action_dataset_ablation/finetune_webvid_hdvila.yaml @@ -0,0 +1,11 @@ +datasets: + + lk_image: + data_type: image + + lk_video: + data_type: frames + conv_type: multi + fps: 1.0 + select_datasets: ['webvid10m', 'webvid2m', 'hdvila'] + # select_datasets: ['webvid10m', 'webvid2m', 'activitynet', 'vidal', 'hdvila'] \ No newline at end of file diff --git a/app/llava/configs/action_dataset_ablation/finetune_webvid_vidal.yaml b/app/llava/configs/action_dataset_ablation/finetune_webvid_vidal.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9346411233600d7c261303b927bed5d8194573bd --- /dev/null +++ b/app/llava/configs/action_dataset_ablation/finetune_webvid_vidal.yaml @@ -0,0 +1,11 @@ +datasets: + + lk_image: + data_type: image + + lk_video: + data_type: frames + conv_type: multi + fps: 1.0 + select_datasets: ['webvid10m', 'webvid2m', 'vidal'] + # select_datasets: ['webvid10m', 'webvid2m', 'activitynet', 'vidal', 'hdvila'] \ No newline at end of file diff --git a/app/llava/configs/adso_increasing_ablation/finetune_data_pure_gpt4v.yaml b/app/llava/configs/adso_increasing_ablation/finetune_data_pure_gpt4v.yaml new file mode 100644 index 0000000000000000000000000000000000000000..16d33c1ae2660703ec9a24083d42e6e192be2c5d --- /dev/null +++ b/app/llava/configs/adso_increasing_ablation/finetune_data_pure_gpt4v.yaml @@ -0,0 +1,55 @@ +datasets: + +# m3it: +# data_type: images +# sample_ratio: 4 +# tasks: +# - coco +# - coco-goi +# - coco-text +# - imagenet +# - coco-itm +# - iqa +# - mocheg +# - vsr +# - refcoco +# - science-qa +# - vqa-v2 +# - gqa +# - st-vqa +# - text-vqa +# - okvqa +# - a-okvqa +# +# tt_vqa: +# data_type: frames +# sample_ratio: 1 + + 
ShareGPT4V: + data_type: images + sample_ratio: 1 + + gpt4v_tt_vqa: + data_type: frames + fps: 0.5 + sample_ratio: 6 + conv_type: single + task_types: ['caption', 'qas'] + + gpt4v_public: + data_type: frames + fps: 1.0 + sample_ratio: 6 + conv_type: single + task_types: ['summary', 'detail', 'qa_pairs'] + sample_method: sequential + train_data_path: /mnt/bn/algo-masp-nas-2/xiangchen/data/shared_gpt4v_data/data_130k.json + + gpt4v_internal: + data_type: frames + fps: 2.0 + sample_ratio: 1 + conv_type: single + task_types: ['summary', 'detail', 'qa_pairs'] + + diff --git a/app/llava/configs/adso_increasing_ablation/finetune_gpt4v_adso135k.yaml b/app/llava/configs/adso_increasing_ablation/finetune_gpt4v_adso135k.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bb5524c72f6f87fd56b0577f5b4153861c8d4817 --- /dev/null +++ b/app/llava/configs/adso_increasing_ablation/finetune_gpt4v_adso135k.yaml @@ -0,0 +1,57 @@ +datasets: + +# m3it: +# data_type: images +# sample_ratio: 4 +# tasks: +# - coco +# - coco-goi +# - coco-text +# - imagenet +# - coco-itm +# - iqa +# - mocheg +# - vsr +# - refcoco +# - science-qa +# - vqa-v2 +# - gqa +# - st-vqa +# - text-vqa +# - okvqa +# - a-okvqa +# + tt_vqa: + data_type: frames + sample_ratio: 2 + fps: 2.0 + conv_type: single + train_data_path: /mnt/bn/algo-masp-nas-2/xiangchen/dataset/masp/20240208_meta_data_single_135k_caption_160k_QA.json + + ShareGPT4V: + data_type: images + sample_ratio: 1 + + gpt4v_tt_vqa: + data_type: frames + fps: 0.5 + sample_ratio: 6 + conv_type: single + task_types: ['caption', 'qas'] + + + gpt4v_public: + data_type: frames + fps: 1.0 + sample_ratio: 6 + conv_type: single + task_types: ['summary', 'detail', 'qa_pairs'] + sample_method: sequential + train_data_path: /mnt/bn/algo-masp-nas-2/xiangchen/data/shared_gpt4v_data/data_130k.json + + gpt4v_internal: + data_type: frames + fps: 2.0 + sample_ratio: 1 + conv_type: single + task_types: ['summary', 'detail', 'qa_pairs'] diff --git 
a/app/llava/configs/adso_increasing_ablation/finetune_gpt4v_adso185k.yaml b/app/llava/configs/adso_increasing_ablation/finetune_gpt4v_adso185k.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bec3072faffdaa66474661125c2fdfaad3ee1a7c --- /dev/null +++ b/app/llava/configs/adso_increasing_ablation/finetune_gpt4v_adso185k.yaml @@ -0,0 +1,57 @@ +datasets: + +# m3it: +# data_type: images +# sample_ratio: 4 +# tasks: +# - coco +# - coco-goi +# - coco-text +# - imagenet +# - coco-itm +# - iqa +# - mocheg +# - vsr +# - refcoco +# - science-qa +# - vqa-v2 +# - gqa +# - st-vqa +# - text-vqa +# - okvqa +# - a-okvqa +# + tt_vqa: + data_type: frames + sample_ratio: 3 + fps: 2.0 + conv_type: single + train_data_path: /mnt/bn/algo-masp-nas-2/xiangchen/dataset/masp/20240220_meta_data_single_190k_caption_160k_QA.json + + ShareGPT4V: + data_type: images + sample_ratio: 1 + + gpt4v_tt_vqa: + data_type: frames + fps: 0.5 + sample_ratio: 6 + conv_type: single + task_types: ['caption', 'qas'] + + + gpt4v_public: + data_type: frames + fps: 1.0 + sample_ratio: 6 + conv_type: single + task_types: ['summary', 'detail', 'qa_pairs'] + sample_method: sequential + train_data_path: /mnt/bn/algo-masp-nas-2/xiangchen/data/shared_gpt4v_data/data_130k.json + + gpt4v_internal: + data_type: frames + fps: 2.0 + sample_ratio: 1 + conv_type: single + task_types: ['summary', 'detail', 'qa_pairs'] diff --git a/app/llava/configs/adso_increasing_ablation/finetune_gpt4v_adso185k_baseline.yaml b/app/llava/configs/adso_increasing_ablation/finetune_gpt4v_adso185k_baseline.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c95969abeb1862fd40eed4b8218fc152d8ac388e --- /dev/null +++ b/app/llava/configs/adso_increasing_ablation/finetune_gpt4v_adso185k_baseline.yaml @@ -0,0 +1,55 @@ +datasets: + +# m3it: +# data_type: images +# sample_ratio: 4 +# tasks: +# - coco +# - coco-goi +# - coco-text +# - imagenet +# - coco-itm +# - iqa +# - mocheg +# - vsr +# - refcoco +# - 
science-qa +# - vqa-v2 +# - gqa +# - st-vqa +# - text-vqa +# - okvqa +# - a-okvqa +# + tt_vqa: + data_type: frames + sample_ratio: 3 + fps: 2.0 + conv_type: single + train_data_path: /mnt/bn/algo-masp-nas-2/xiangchen/dataset/masp/20240220_meta_data_single_190k_caption_160k_QA.json + + ShareGPT4V: + data_type: images + sample_ratio: 1 + + gpt4v_tt_vqa: + data_type: frames + fps: 0.5 + sample_ratio: 6 + conv_type: single + task_types: ['caption', 'qas'] + + + lk_video: + data_type: frames + conv_type: multi + fps: 1.0 + sample_ratio: 6 + + gpt4v_internal: + data_type: frames + fps: 2.0 + sample_ratio: 1 + conv_type: single + task_types: ['detail'] + diff --git a/app/llava/configs/adso_increasing_ablation/finetune_gpt4v_adso185k_no_qa.yaml b/app/llava/configs/adso_increasing_ablation/finetune_gpt4v_adso185k_no_qa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b5bf237cd3165d506ed567df46c69dd2c0218981 --- /dev/null +++ b/app/llava/configs/adso_increasing_ablation/finetune_gpt4v_adso185k_no_qa.yaml @@ -0,0 +1,57 @@ +datasets: + +# m3it: +# data_type: images +# sample_ratio: 4 +# tasks: +# - coco +# - coco-goi +# - coco-text +# - imagenet +# - coco-itm +# - iqa +# - mocheg +# - vsr +# - refcoco +# - science-qa +# - vqa-v2 +# - gqa +# - st-vqa +# - text-vqa +# - okvqa +# - a-okvqa +# + tt_vqa: + data_type: frames + sample_ratio: 3 + fps: 2.0 + conv_type: single + train_data_path: /mnt/bn/algo-masp-nas-2/xiangchen/dataset/masp/20240220_meta_data_single_190k_caption_no_QA.json + + ShareGPT4V: + data_type: images + sample_ratio: 1 + + gpt4v_tt_vqa: + data_type: frames + fps: 0.5 + sample_ratio: 6 + conv_type: single + task_types: ['caption', 'qas'] + + + gpt4v_public: + data_type: frames + fps: 1.0 + sample_ratio: 6 + conv_type: single + task_types: ['summary', 'detail', 'qa_pairs'] + sample_method: sequential + train_data_path: /mnt/bn/algo-masp-nas-2/xiangchen/data/shared_gpt4v_data/data_130k.json + + gpt4v_internal: + data_type: frames + fps: 2.0 
+ sample_ratio: 1 + conv_type: single + task_types: ['summary', 'detail', 'qa_pairs'] diff --git a/app/llava/configs/adso_increasing_ablation/finetune_gpt4v_adso65k.yaml b/app/llava/configs/adso_increasing_ablation/finetune_gpt4v_adso65k.yaml new file mode 100644 index 0000000000000000000000000000000000000000..72a106082df9329e7622afd01da7440724e2ebda --- /dev/null +++ b/app/llava/configs/adso_increasing_ablation/finetune_gpt4v_adso65k.yaml @@ -0,0 +1,57 @@ +datasets: + +# m3it: +# data_type: images +# sample_ratio: 4 +# tasks: +# - coco +# - coco-goi +# - coco-text +# - imagenet +# - coco-itm +# - iqa +# - mocheg +# - vsr +# - refcoco +# - science-qa +# - vqa-v2 +# - gqa +# - st-vqa +# - text-vqa +# - okvqa +# - a-okvqa +# + tt_vqa: + data_type: frames + sample_ratio: 2 + fps: 2.0 + conv_type: single + train_data_path: /mnt/bn/algo-masp-nas-2/baiyi.by/data/ADSO_Anno_Data/batch_20231128/meta_data_single_60k_caption_170k_QA.json + + ShareGPT4V: + data_type: images + sample_ratio: 1 + + gpt4v_tt_vqa: + data_type: frames + fps: 0.5 + sample_ratio: 6 + conv_type: single + task_types: ['caption', 'qas'] + + + gpt4v_public: + data_type: frames + fps: 1.0 + sample_ratio: 6 + conv_type: single + task_types: ['summary', 'detail', 'qa_pairs'] + sample_method: sequential + train_data_path: /mnt/bn/algo-masp-nas-2/xiangchen/data/shared_gpt4v_data/data_130k.json + + gpt4v_internal: + data_type: frames + fps: 2.0 + sample_ratio: 1 + conv_type: single + task_types: ['summary', 'detail', 'qa_pairs'] diff --git a/app/llava/configs/finetune_debug.yaml b/app/llava/configs/finetune_debug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f56ae71ac216ccd4e270140eaf6d2a0d64dce507 --- /dev/null +++ b/app/llava/configs/finetune_debug.yaml @@ -0,0 +1,8 @@ +datasets: + gpt4v_public: + data_type: frames + fps: 1.0 + sample_ratio: 6 + conv_type: single + task_types: ['summary', 'detail', 'qa_pairs'] + train_data_path: 
/mnt/bn/algo-masp-nas-2/xiangchen/data/shared_gpt4v_data/data_130k.json diff --git a/app/llava/configs/finetune_gpt4v_adso65k.yaml b/app/llava/configs/finetune_gpt4v_adso65k.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4c4e875c9421f4ec2a93337b7c1029ef069d6135 --- /dev/null +++ b/app/llava/configs/finetune_gpt4v_adso65k.yaml @@ -0,0 +1,56 @@ +datasets: + +# m3it: +# data_type: images +# sample_ratio: 4 +# tasks: +# - coco +# - coco-goi +# - coco-text +# - imagenet +# - coco-itm +# - iqa +# - mocheg +# - vsr +# - refcoco +# - science-qa +# - vqa-v2 +# - gqa +# - st-vqa +# - text-vqa +# - okvqa +# - a-okvqa +# + tt_vqa: + data_type: frames + sample_ratio: 2 + fps: 2.0 + conv_type: single + train_data_path: /mnt/bn/algo-masp-nas-2/baiyi.by/data/ADSO_Anno_Data/batch_20231128/meta_data_single_60k_caption_170k_QA.json + + ShareGPT4V: + data_type: images + sample_ratio: 1 + + gpt4v_tt_vqa: + data_type: frames + fps: 0.5 + sample_ratio: 6 + conv_type: single + task_types: ['caption', 'qas'] + + + gpt4v_public: + data_type: frames + fps: 1.0 + sample_ratio: 6 + conv_type: single + task_types: ['summary', 'detail', 'qa_pairs'] + train_data_path: /mnt/bn/algo-masp-nas-2/xiangchen/data/shared_gpt4v_data/data_130k.json + + gpt4v_internal: + data_type: frames + fps: 2.0 + sample_ratio: 1 + conv_type: single + task_types: ['summary', 'detail', 'qa_pairs'] diff --git a/app/llava/configs/gpt4v_increasing_ablation/finetune_gpt4v_public500k.yaml b/app/llava/configs/gpt4v_increasing_ablation/finetune_gpt4v_public500k.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d48e01af16249c02535329a376a957f2de9864e6 --- /dev/null +++ b/app/llava/configs/gpt4v_increasing_ablation/finetune_gpt4v_public500k.yaml @@ -0,0 +1,57 @@ +datasets: + +# m3it: +# data_type: images +# sample_ratio: 4 +# tasks: +# - coco +# - coco-goi +# - coco-text +# - imagenet +# - coco-itm +# - iqa +# - mocheg +# - vsr +# - refcoco +# - science-qa +# - vqa-v2 +# - gqa +# 
- st-vqa +# - text-vqa +# - okvqa +# - a-okvqa +# + # tt_vqa: + # data_type: frames + # sample_ratio: 2 + # fps: 2.0 + # conv_type: single + # train_data_path: /mnt/bn/algo-masp-nas-2/baiyi.by/data/ADSO_Anno_Data/batch_20231128/meta_data_single_60k_caption_170k_QA.json + + ShareGPT4V: + data_type: images + sample_ratio: 1 + + + gpt4v_tt_vqa: + data_type: frames + fps: 0.5 + sample_ratio: 6 + conv_type: single + task_types: ['caption', 'qas'] + + + gpt4v_public: + data_type: frames + fps: 1.0 + sample_ratio: 10 + conv_type: single + task_types: ['summary', 'detail'] + train_data_path: /mnt/bn/algo-masp-nas-2/xiangchen/data/shared_gpt4v_data/data_500k_filtered.json + + gpt4v_internal: + data_type: frames + fps: 2.0 + sample_ratio: 1 + conv_type: single + task_types: ['summary', 'detail', 'qa_pairs'] diff --git a/app/llava/configs/gpt4v_increasing_ablation/finetune_gpt4v_public500k_no_summary.yaml b/app/llava/configs/gpt4v_increasing_ablation/finetune_gpt4v_public500k_no_summary.yaml new file mode 100644 index 0000000000000000000000000000000000000000..aa0acd8fed6d295e15f579d92c3963fa911e588f --- /dev/null +++ b/app/llava/configs/gpt4v_increasing_ablation/finetune_gpt4v_public500k_no_summary.yaml @@ -0,0 +1,57 @@ +datasets: + +# m3it: +# data_type: images +# sample_ratio: 4 +# tasks: +# - coco +# - coco-goi +# - coco-text +# - imagenet +# - coco-itm +# - iqa +# - mocheg +# - vsr +# - refcoco +# - science-qa +# - vqa-v2 +# - gqa +# - st-vqa +# - text-vqa +# - okvqa +# - a-okvqa +# + # tt_vqa: + # data_type: frames + # sample_ratio: 2 + # fps: 2.0 + # conv_type: single + # train_data_path: /mnt/bn/algo-masp-nas-2/baiyi.by/data/ADSO_Anno_Data/batch_20231128/meta_data_single_60k_caption_170k_QA.json + + ShareGPT4V: + data_type: images + sample_ratio: 1 + + + gpt4v_tt_vqa: + data_type: frames + fps: 0.5 + sample_ratio: 6 + conv_type: single + task_types: ['caption', 'qas'] + + + gpt4v_public: + data_type: frames + fps: 1.0 + sample_ratio: 4 + conv_type: single + task_types: 
['detail'] + train_data_path: /mnt/bn/algo-masp-nas-2/xiangchen/data/shared_gpt4v_data/data_500k_filtered.json + + gpt4v_internal: + data_type: frames + fps: 2.0 + sample_ratio: 1 + conv_type: single + task_types: ['summary', 'detail', 'qa_pairs'] diff --git a/app/llava/configs/gpt4v_increasing_ablation/finetune_gpt4v_public800k.yaml b/app/llava/configs/gpt4v_increasing_ablation/finetune_gpt4v_public800k.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e7f240c56ee73fccbde5d10d0153eeacecdc46f2 --- /dev/null +++ b/app/llava/configs/gpt4v_increasing_ablation/finetune_gpt4v_public800k.yaml @@ -0,0 +1,62 @@ +datasets: + +# m3it: +# data_type: images +# sample_ratio: 4 +# tasks: +# - coco +# - coco-goi +# - coco-text +# - imagenet +# - coco-itm +# - iqa +# - mocheg +# - vsr +# - refcoco +# - science-qa +# - vqa-v2 +# - gqa +# - st-vqa +# - text-vqa +# - okvqa +# - a-okvqa +# + # tt_vqa: + # data_type: frames + # sample_ratio: 2 + # fps: 2.0 + # conv_type: single + # train_data_path: /mnt/bn/algo-masp-nas-2/baiyi.by/data/ADSO_Anno_Data/batch_20231128/meta_data_single_60k_caption_170k_QA.json + + ShareGPT4V: + data_type: images + sample_ratio: 1 + + + gpt4v_tt_vqa: + data_type: frames + fps: 0.5 + sample_ratio: 6 + conv_type: single + task_types: ['caption', 'qas'] + + # gpt4v_public: + # data_type: frames + # fps: 1.0 + # sample_ratio: 10 + # conv_type: single + # task_types: ['summary', 'detail'] + # train_data_path: /mnt/bn/algo-masp-nas-2/xiangchen/data/shared_gpt4v_data/data_500k_filtered.json + + lk_video: + data_type: frames + conv_type: multi + fps: 1.0 + sample_ratio: 6 + + gpt4v_internal: + data_type: frames + fps: 2.0 + sample_ratio: 1 + conv_type: single + task_types: ['summary', 'detail', 'qa_pairs'] diff --git a/app/llava/configs/gpt4v_increasing_ablation/finetune_videollava.yaml b/app/llava/configs/gpt4v_increasing_ablation/finetune_videollava.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..c009b246e6b05873f443338ebf6273fbd3e4406f --- /dev/null +++ b/app/llava/configs/gpt4v_increasing_ablation/finetune_videollava.yaml @@ -0,0 +1,20 @@ +datasets: + + # llava_pretrain: + # data_type: image + # sample_ratio: 1 + + # gpt4v_public: + # data_type: frames + # sample_ratio: 2 + # task_types: ['summary'] + # fps: 1.0 + # conv_type: single + + lk_image: + data_type: image + + lk_video: + data_type: frames + conv_type: multi + fps: 1.0 \ No newline at end of file diff --git a/app/llava/configs/pretrain_data.yaml b/app/llava/configs/pretrain_data.yaml new file mode 100644 index 0000000000000000000000000000000000000000..94b4720e54826cc0e3bd41563f82d1131207cde0 --- /dev/null +++ b/app/llava/configs/pretrain_data.yaml @@ -0,0 +1,17 @@ +datasets: + + llava_pretrain: + data_type: image + sample_ratio: 1 + + # internvid: + # data_type: frames + # sample_ratio: 10 + + gpt4v_public: + data_type: frames + sample_ratio: 1 + task_types: ['summary'] + fps: 1.0 + conv_type: single + train_data_path: /mnt/bn/algo-masp-nas-2/xiangchen/data/shared_gpt4v_data/data_500k_filtered.json diff --git a/app/llava/configs/pretrain_data_large.yaml b/app/llava/configs/pretrain_data_large.yaml new file mode 100644 index 0000000000000000000000000000000000000000..91dbd632e5d6e188e9735fdb4544e3ffde728293 --- /dev/null +++ b/app/llava/configs/pretrain_data_large.yaml @@ -0,0 +1,17 @@ +datasets: + + llava_pretrain: + data_type: image + sample_ratio: 1 + + internvid: + data_type: frames + sample_ratio: 10 + + gpt4v_public: + data_type: frames + sample_ratio: 1 + task_types: ['summary'] + fps: 1.0 + conv_type: single + train_data_path: /mnt/bn/algo-masp-nas-2/xiangchen/data/shared_gpt4v_data/data_500k_filtered.json diff --git a/app/llava/configs/pretrain_debug.yaml b/app/llava/configs/pretrain_debug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..aba6e3147e88180cbfac45cf2a762eddeb7fad74 --- /dev/null +++ 
b/app/llava/configs/pretrain_debug.yaml @@ -0,0 +1,27 @@ +datasets: + + llava_pretrain: + data_type: image + sample_ratio: 1 + + # gpt4v_public: + # data_type: frames + # sample_ratio: 2 + # task_types: ['summary'] + # fps: 1.0 + # conv_type: single + + # lk_image: + # data_type: image + + # lk_video: + # data_type: frames + # conv_type: multi + # fps: 1.0 + + gpt4v_internal: + data_type: frames + fps: 2.0 + sample_ratio: 1 + conv_type: multi + task_types: ['qa_pairs'] diff --git a/app/llava/configs/promptv1_2_increasing_ablation/finetune_gpt4_prompt_140k.yaml b/app/llava/configs/promptv1_2_increasing_ablation/finetune_gpt4_prompt_140k.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f6be76c78831f080266a9ac7999a56393709ee6e --- /dev/null +++ b/app/llava/configs/promptv1_2_increasing_ablation/finetune_gpt4_prompt_140k.yaml @@ -0,0 +1,35 @@ +datasets: + +# m3it: +# data_type: images +# sample_ratio: 4 +# tasks: +# - coco +# - coco-goi +# - coco-text +# - imagenet +# - coco-itm +# - iqa +# - mocheg +# - vsr +# - refcoco +# - science-qa +# - vqa-v2 +# - gqa +# - st-vqa +# - text-vqa +# - okvqa +# - a-okvqa +# + # gpt4v_internal: + # data_type: frames + # fps: 2.0 + # sample_ratio: 1 + # conv_type: single + # task_types: ['summary', 'detail', 'qa_pairs'] + + promptv1_2_internal: + data_type: frames + sample_ratio: 1 + train_data_path: /mnt/bn/algo-masp-nas-2/kaili.zhao/data/masp_data/train/gpt4v_annotation/202400401week_gpt4v_all_videos_unique_ids.json + task_types: ['refine_caption'] \ No newline at end of file diff --git a/app/llava/configs/release_version/finetune_250k_no_public.yaml b/app/llava/configs/release_version/finetune_250k_no_public.yaml new file mode 100644 index 0000000000000000000000000000000000000000..57d99cdd4bcd1bd2fca229ade38785358a5fd9ad --- /dev/null +++ b/app/llava/configs/release_version/finetune_250k_no_public.yaml @@ -0,0 +1,50 @@ +datasets: + +# m3it: +# data_type: images +# sample_ratio: 4 +# tasks: +# - coco +# - 
coco-goi +# - coco-text +# - imagenet +# - coco-itm +# - iqa +# - mocheg +# - vsr +# - refcoco +# - science-qa +# - vqa-v2 +# - gqa +# - st-vqa +# - text-vqa +# - okvqa +# - a-okvqa +# + tt_vqa: + data_type: frames + sample_ratio: 3 + fps: 2.0 + conv_type: single + train_data_path: /mnt/bn/algo-masp-nas-2/xiangchen/dataset/masp/20231201_20240322_caption_250k.json + + + ShareGPT4V: + data_type: images + sample_ratio: 1 + + + gpt4v_tt_vqa: + data_type: frames + fps: 0.5 + sample_ratio: 6 + conv_type: single + task_types: ['caption'] + + + gpt4v_internal: + data_type: frames + fps: 2.0 + sample_ratio: 1 + conv_type: single + task_types: ['detail'] diff --git a/app/llava/configs/release_version/finetune_all_data.yaml b/app/llava/configs/release_version/finetune_all_data.yaml new file mode 100644 index 0000000000000000000000000000000000000000..134f1a3744075449a11c61d48feae5bbb88f6ddb --- /dev/null +++ b/app/llava/configs/release_version/finetune_all_data.yaml @@ -0,0 +1,63 @@ +datasets: + +# m3it: +# data_type: images +# sample_ratio: 4 +# tasks: +# - coco +# - coco-goi +# - coco-text +# - imagenet +# - coco-itm +# - iqa +# - mocheg +# - vsr +# - refcoco +# - science-qa +# - vqa-v2 +# - gqa +# - st-vqa +# - text-vqa +# - okvqa +# - a-okvqa +# + tt_vqa: + data_type: frames + sample_ratio: 3 + fps: 2.0 + conv_type: single + train_data_path: /mnt/bn/algo-masp-nas-2/xiangchen/dataset/masp/20231201_20240322_caption_250k.json + + + ShareGPT4V: + data_type: images + sample_ratio: 1 + + + gpt4v_tt_vqa: + data_type: frames + fps: 0.5 + sample_ratio: 6 + conv_type: single + task_types: ['caption'] + + # gpt4v_public: + # data_type: frames + # fps: 1.0 + # sample_ratio: 10 + # conv_type: single + # task_types: ['summary', 'detail'] + # train_data_path: /mnt/bn/algo-masp-nas-2/xiangchen/data/shared_gpt4v_data/data_500k_filtered.json + + lk_video: + data_type: frames + conv_type: multi + fps: 1.0 + sample_ratio: 6 + + gpt4v_internal: + data_type: frames + fps: 2.0 + sample_ratio: 1 
+ conv_type: single + task_types: ['detail'] diff --git a/app/llava/configs/release_version/finetune_gpt4v_caption.yaml b/app/llava/configs/release_version/finetune_gpt4v_caption.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e96e51f30d5d47fee4a575fbdf750b1d8bfa2a7a --- /dev/null +++ b/app/llava/configs/release_version/finetune_gpt4v_caption.yaml @@ -0,0 +1,62 @@ +datasets: + +# m3it: +# data_type: images +# sample_ratio: 4 +# tasks: +# - coco +# - coco-goi +# - coco-text +# - imagenet +# - coco-itm +# - iqa +# - mocheg +# - vsr +# - refcoco +# - science-qa +# - vqa-v2 +# - gqa +# - st-vqa +# - text-vqa +# - okvqa +# - a-okvqa +# + # tt_vqa: + # data_type: frames + # sample_ratio: 2 + # fps: 2.0 + # conv_type: single + # train_data_path: /mnt/bn/algo-masp-nas-2/baiyi.by/data/ADSO_Anno_Data/batch_20231128/meta_data_single_60k_caption_170k_QA.json + + ShareGPT4V: + data_type: images + sample_ratio: 1 + + + gpt4v_tt_vqa: + data_type: frames + fps: 0.5 + sample_ratio: 6 + conv_type: single + task_types: ['caption'] + + # gpt4v_public: + # data_type: frames + # fps: 1.0 + # sample_ratio: 10 + # conv_type: single + # task_types: ['summary', 'detail'] + # train_data_path: /mnt/bn/algo-masp-nas-2/xiangchen/data/shared_gpt4v_data/data_500k_filtered.json + + lk_video: + data_type: frames + conv_type: multi + fps: 1.0 + sample_ratio: 6 + + gpt4v_internal: + data_type: frames + fps: 2.0 + sample_ratio: 1 + conv_type: single + task_types: ['detail'] diff --git a/app/llava/configs/release_version/finetune_gpt4v_caption_ocr.yaml b/app/llava/configs/release_version/finetune_gpt4v_caption_ocr.yaml new file mode 100644 index 0000000000000000000000000000000000000000..33c38f2a5105f0e28f52cf85dcb9101babf6349c --- /dev/null +++ b/app/llava/configs/release_version/finetune_gpt4v_caption_ocr.yaml @@ -0,0 +1,67 @@ +datasets: + +# m3it: +# data_type: images +# sample_ratio: 4 +# tasks: +# - coco +# - coco-goi +# - coco-text +# - imagenet +# - coco-itm +# - iqa +# 
- mocheg +# - vsr +# - refcoco +# - science-qa +# - vqa-v2 +# - gqa +# - st-vqa +# - text-vqa +# - okvqa +# - a-okvqa +# + # tt_vqa: + # data_type: frames + # sample_ratio: 2 + # fps: 2.0 + # conv_type: single + # train_data_path: /mnt/bn/algo-masp-nas-2/baiyi.by/data/ADSO_Anno_Data/batch_20231128/meta_data_single_60k_caption_170k_QA.json + + ShareGPT4V: + data_type: images + sample_ratio: 1 + + + gpt4v_tt_vqa: + data_type: frames + fps: 0.5 + sample_ratio: 6 + conv_type: single + task_types: ['caption'] + + # gpt4v_public: + # data_type: frames + # fps: 1.0 + # sample_ratio: 10 + # conv_type: single + # task_types: ['summary', 'detail'] + # train_data_path: /mnt/bn/algo-masp-nas-2/xiangchen/data/shared_gpt4v_data/data_500k_filtered.json + + lk_video: + data_type: frames + conv_type: multi + fps: 1.0 + sample_ratio: 6 + + gpt4v_internal: + data_type: frames + fps: 2.0 + sample_ratio: 1 + conv_type: single + task_types: ['detail'] + + synthetic_ocr: + data_type: video + sample_ratio: 1 + fps: 0.5 \ No newline at end of file diff --git a/app/llava/constants.py b/app/llava/constants.py new file mode 100644 index 0000000000000000000000000000000000000000..6049c4b6396020201c16233afa2717aa2a41f9ac --- /dev/null +++ b/app/llava/constants.py @@ -0,0 +1,17 @@ +CONTROLLER_HEART_BEAT_EXPIRATION = 30 +WORKER_HEART_BEAT_INTERVAL = 15 + +LOGDIR = "." 
+ +# Model Constants +IGNORE_INDEX = -100 +MM_TOKEN_INDEX = -200 +DEFAULT_IMAGE_TOKEN = "" +DEFAULT_IMAGE_PATCH_TOKEN = "" +DEFAULT_IM_START_TOKEN = "" +DEFAULT_IM_END_TOKEN = "" +IMAGE_PLACEHOLDER = "" +DEFAULT_VIDEO_TOKEN = "" +DEFAULT_VIDEO_PATCH_TOKEN = "" +DEFAULT_VIDEO_START_TOKEN = "" +DEFAULT_VIDEO_END_TOKEN = "" \ No newline at end of file diff --git a/app/llava/conversation.py b/app/llava/conversation.py new file mode 100644 index 0000000000000000000000000000000000000000..d236d5205792b82c9a77e5c2bfae24c4f6e83e82 --- /dev/null +++ b/app/llava/conversation.py @@ -0,0 +1,454 @@ +import dataclasses +from enum import auto, Enum +from typing import List, Tuple +import base64 +from io import BytesIO +from PIL import Image + + +class SeparatorStyle(Enum): + """Different separator style.""" + SINGLE = auto() + TWO = auto() + MPT = auto() + PLAIN = auto() + LLAMA_2 = auto() + + +@dataclasses.dataclass +class Conversation: + """A class that keeps all conversation history.""" + system: str + roles: List[str] + messages: List[List[str]] + offset: int + sep_style: SeparatorStyle = SeparatorStyle.SINGLE + sep: str = "###" + sep2: str = None + version: str = "Unknown" + + skip_next: bool = False + + def get_prompt(self, use_chat_template=False, tokenizer=None): + if use_chat_template: + assert tokenizer is not None, "must have tokenizer when using chat template" + messages = self.messages + # whether in inference mode + if messages[-1][0] == self.roles[1] and (messages[-1][1] is None or messages[-1][1] == ''): + generate_flag = True + messages = messages[:-1] + else: + generate_flag = False + chat = [] + for role, message in messages: + chat.append( + { + "role": role, + "content": message, + } + ) + return tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=generate_flag) + else: + messages = self.messages + if len(messages) > 0 and type(messages[0][1]) is tuple: + messages = self.messages.copy() + init_role, init_msg = messages[0].copy() + 
init_msg = init_msg[0].replace("", "").strip() + if 'mmtag' in self.version: + messages[0] = (init_role, init_msg) + messages.insert(0, (self.roles[0], "")) + messages.insert(1, (self.roles[1], "Received.")) + else: + messages[0] = (init_role, "\n" + init_msg) + + if self.sep_style == SeparatorStyle.SINGLE: + ret = self.system + self.sep + for role, message in messages: + if message: + if type(message) is tuple: + message, _, _ = message + ret += role + ": " + message + self.sep + else: + ret += role + ":" + elif self.sep_style == SeparatorStyle.TWO: + seps = [self.sep, self.sep2] + ret = self.system + seps[0] + for i, (role, message) in enumerate(messages): + if message: + if type(message) is tuple: + message, _, _ = message + ret += role + ": " + message + seps[i % 2] + else: + ret += role + ":" + elif self.sep_style == SeparatorStyle.MPT: + ret = self.system + self.sep + for role, message in messages: + if message: + if type(message) is tuple: + message, _, _ = message + ret += role + message + self.sep + else: + ret += role + elif self.sep_style == SeparatorStyle.LLAMA_2: + wrap_sys = lambda msg: f"<>\n{msg}\n<>\n\n" if len(msg) > 0 else msg + wrap_inst = lambda msg: f"[INST] {msg} [/INST]" + ret = "" + + for i, (role, message) in enumerate(messages): + if i == 0: + assert message, "first message should not be none" + assert role == self.roles[0], "first message should come from user" + if message: + if type(message) is tuple: + message, _, _ = message + if i == 0: message = wrap_sys(self.system) + message + if i % 2 == 0: + message = wrap_inst(message) + ret += self.sep + message + else: + ret += " " + message + " " + self.sep2 + else: + ret += "" + ret = ret.lstrip(self.sep) + elif self.sep_style == SeparatorStyle.PLAIN: + seps = [self.sep, self.sep2] + ret = self.system + for i, (role, message) in enumerate(messages): + if message: + if type(message) is tuple: + message, _, _ = message + ret += message + seps[i % 2] + else: + ret += "" + else: + raise 
ValueError(f"Invalid style: {self.sep_style}") + + return ret + + + + def append_message(self, role, message): + self.messages.append([role, message]) + + def process_image(self, image, image_process_mode, return_pil=False, image_format='PNG', max_len=1344, min_len=672): + if image_process_mode == "Pad": + def expand2square(pil_img, background_color=(122, 116, 104)): + width, height = pil_img.size + if width == height: + return pil_img + elif width > height: + result = Image.new(pil_img.mode, (width, width), background_color) + result.paste(pil_img, (0, (width - height) // 2)) + return result + else: + result = Image.new(pil_img.mode, (height, height), background_color) + result.paste(pil_img, ((height - width) // 2, 0)) + return result + image = expand2square(image) + elif image_process_mode in ["Default", "Crop"]: + pass + elif image_process_mode == "Resize": + image = image.resize((336, 336)) + else: + raise ValueError(f"Invalid image_process_mode: {image_process_mode}") + if max(image.size) > max_len: + max_hw, min_hw = max(image.size), min(image.size) + aspect_ratio = max_hw / min_hw + shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw)) + longest_edge = int(shortest_edge * aspect_ratio) + W, H = image.size + if H > W: + H, W = longest_edge, shortest_edge + else: + H, W = shortest_edge, longest_edge + image = image.resize((W, H)) + if return_pil: + return image + else: + buffered = BytesIO() + image.save(buffered, format=image_format) + img_b64_str = base64.b64encode(buffered.getvalue()).decode() + return img_b64_str + + def get_images(self, return_pil=False): + images = [] + for i, (role, msg) in enumerate(self.messages[self.offset:]): + if i % 2 == 0: + if type(msg) is tuple: + msg, image, image_process_mode = msg + image = self.process_image(image, image_process_mode, return_pil=return_pil) + images.append(image) + return images + + def to_gradio_chatbot(self): + ret = [] + for i, (role, msg) in enumerate(self.messages[self.offset:]): + if i % 
2 == 0: + if type(msg) is tuple: + msg, image, image_process_mode = msg + img_b64_str = self.process_image( + image, "Default", return_pil=False, + image_format='JPEG') + img_str = f'' + msg = img_str + msg.replace('', '').strip() + ret.append([msg, None]) + else: + ret.append([msg, None]) + else: + ret[-1][-1] = msg + return ret + + def copy(self): + return Conversation( + system=self.system, + roles=self.roles, + messages=[[x, y] for x, y in self.messages], + offset=self.offset, + sep_style=self.sep_style, + sep=self.sep, + sep2=self.sep2, + version=self.version) + + def dict(self): + if len(self.get_images()) > 0: + return { + "system": self.system, + "roles": self.roles, + "messages": [[x, y[0] if type(y) is tuple else y] for x, y in self.messages], + "offset": self.offset, + "sep": self.sep, + "sep2": self.sep2, + } + return { + "system": self.system, + "roles": self.roles, + "messages": self.messages, + "offset": self.offset, + "sep": self.sep, + "sep2": self.sep2, + } + + +conv_vicuna_v0 = Conversation( + system="A chat between a curious human and an artificial intelligence assistant. " + "The assistant gives helpful, detailed, and polite answers to the human's questions.", + roles=("Human", "Assistant"), + messages=( + ("Human", "What are the key differences between renewable and non-renewable energy sources?"), + ("Assistant", + "Renewable energy sources are those that can be replenished naturally in a relatively " + "short amount of time, such as solar, wind, hydro, geothermal, and biomass. " + "Non-renewable energy sources, on the other hand, are finite and will eventually be " + "depleted, such as coal, oil, and natural gas. Here are some key differences between " + "renewable and non-renewable energy sources:\n" + "1. Availability: Renewable energy sources are virtually inexhaustible, while non-renewable " + "energy sources are finite and will eventually run out.\n" + "2. 
Environmental impact: Renewable energy sources have a much lower environmental impact " + "than non-renewable sources, which can lead to air and water pollution, greenhouse gas emissions, " + "and other negative effects.\n" + "3. Cost: Renewable energy sources can be more expensive to initially set up, but they typically " + "have lower operational costs than non-renewable sources.\n" + "4. Reliability: Renewable energy sources are often more reliable and can be used in more remote " + "locations than non-renewable sources.\n" + "5. Flexibility: Renewable energy sources are often more flexible and can be adapted to different " + "situations and needs, while non-renewable sources are more rigid and inflexible.\n" + "6. Sustainability: Renewable energy sources are more sustainable over the long term, while " + "non-renewable sources are not, and their depletion can lead to economic and social instability.\n") + ), + offset=2, + sep_style=SeparatorStyle.SINGLE, + sep="###", +) + +conv_vicuna_v1 = Conversation( + system="A chat between a curious user and an artificial intelligence assistant. " + "The assistant gives helpful, detailed, and polite answers to the user's questions.", + roles=("USER", "ASSISTANT"), + version="v1", + messages=(), + offset=0, + sep_style=SeparatorStyle.TWO, + sep=" ", + sep2="", +) + +conv_llama_2 = Conversation( + system="""You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. + +If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. 
If you don't know the answer to a question, please don't share false information.""", + roles=("USER", "ASSISTANT"), + version="llama_v2", + messages=(), + offset=0, + sep_style=SeparatorStyle.LLAMA_2, + sep="", + sep2="", +) + +conv_llava_llama_2 = Conversation( + system="You are a helpful language and vision assistant. " + "You are able to understand the visual content that the user provides, " + "and assist the user with a variety of tasks using natural language.", + roles=("USER", "ASSISTANT"), + version="llama_v2", + messages=(), + offset=0, + sep_style=SeparatorStyle.LLAMA_2, + sep="", + sep2="", +) + +conv_mpt = Conversation( + system="""<|im_start|>system +A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers.""", + roles=("<|im_start|>user\n", "<|im_start|>assistant\n"), + version="mpt", + messages=(), + offset=0, + sep_style=SeparatorStyle.MPT, + sep="<|im_end|>", +) + +conv_llava_plain = Conversation( + system="", + roles=("", ""), + messages=( + ), + offset=0, + sep_style=SeparatorStyle.PLAIN, + sep="\n", +) + +conv_llava_v0 = Conversation( + system="A chat between a curious human and an artificial intelligence assistant. " + "The assistant gives helpful, detailed, and polite answers to the human's questions.", + roles=("Human", "Assistant"), + messages=( + ), + offset=0, + sep_style=SeparatorStyle.SINGLE, + sep="###", +) + +conv_llava_v0_mmtag = Conversation( + system="A chat between a curious user and an artificial intelligence assistant. " + "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language." 
+ "The visual content will be provided with the following format: visual content.", + roles=("Human", "Assistant"), + messages=( + ), + offset=0, + sep_style=SeparatorStyle.SINGLE, + sep="###", + version="v0_mmtag", +) + +conv_llava_v1 = Conversation( + system="A chat between a curious human and an artificial intelligence assistant. " + "The assistant gives helpful, detailed, and polite answers to the human's questions.", + roles=("USER", "ASSISTANT"), + version="v1", + messages=(), + offset=0, + sep_style=SeparatorStyle.TWO, + sep=" ", + sep2="", +) + +conv_llava_v1_mmtag = Conversation( + system="A chat between a curious user and an artificial intelligence assistant. " + "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language." + "The visual content will be provided with the following format: visual content.", + roles=("USER", "ASSISTANT"), + messages=(), + offset=0, + sep_style=SeparatorStyle.TWO, + sep=" ", + sep2="", + version="v1_mmtag", +) + +# conv_mistral_instruct = Conversation( +# system="", +# roles=("USER", "ASSISTANT"), +# version="llama_v2", +# messages=(), +# offset=0, +# sep_style=SeparatorStyle.LLAMA_2, +# sep="", +# sep2="", +# ) +conv_mistral_instruct = Conversation( + system="", + roles=("user", "assistant"), + version="mistral", + messages=(), + offset=0, + sep_style=SeparatorStyle.MPT, # not used + sep="", + sep2="", +) + +conv_gemma = Conversation( + system="", + roles=("user", "model"), + version="gemma", + messages=(), + offset=0, + sep_style=SeparatorStyle.MPT, # not used + sep="", + sep2="", +) + +conv_thoth = Conversation( + system="A chat between a curious user and an artificial intelligence assistant. 
" + "The assistant gives helpful, detailed, and polite answers to the user's questions.", + roles=("USER", "ASSISTANT"), + version="thoth", + messages=(), + offset=0, + sep_style=SeparatorStyle.TWO, + sep=" ", + sep2="<[SEP_never_used_51bce0c785ca2f68081bfa7d91973934]>", +) + +conv_chatml_direct = Conversation( + system="""<|im_start|>system +Answer the questions.""", + roles=("<|im_start|>user\n", "<|im_start|>assistant\n"), + version="mpt", + messages=(), + offset=0, + sep_style=SeparatorStyle.MPT, + sep="<|im_end|>", +) + + +default_conversation = conv_vicuna_v1 +conv_templates = { + "default": conv_vicuna_v0, + "v0": conv_vicuna_v0, + "v1": conv_vicuna_v1, + "vicuna_v1": conv_vicuna_v1, + "llama_2": conv_llama_2, + "mistral_instruct": conv_mistral_instruct, + "chatml_direct": conv_chatml_direct, + "mistral_direct": conv_chatml_direct, + + "plain": conv_llava_plain, + "v0_plain": conv_llava_plain, + "llava_v0": conv_llava_v0, + "v0_mmtag": conv_llava_v0_mmtag, + "llava_v1": conv_llava_v1, + "v1_mmtag": conv_llava_v1_mmtag, + "llava_llama_2": conv_llava_llama_2, + + "mpt": conv_mpt, + "gemma": conv_gemma, + "thoth": conv_thoth, + +} + + +if __name__ == "__main__": + print(default_conversation.get_prompt()) diff --git a/app/llava/datasets/__init__.py b/app/llava/datasets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6836473b3107c70db48c30eeaba354fdb8a08254 --- /dev/null +++ b/app/llava/datasets/__init__.py @@ -0,0 +1,24 @@ +from .data_cfgs import * +from .base_dataset import * +from .prompts import * +from .super_dataset import * +from .cc_sbu_dataset import * +from .llava_pretrain_dataset import * +# from .llava_instruct_dataset import * +# from .lrv_instruct_dataset import * +from .internvid_dataset import * +from .tt_vqa_dataset import * +from .m3it_dataset import * +from .sharegpt4v_dataset import * +from .gpt4v_tt_vqa_dataset import * +from .gpt4v_public_dataset import * +from .gpt4v_internal_dataset import * +# from 
.synthdog_dataset import *
+# from .ocr_vqa_dataset import *
+# from .sharegpt_dataset import *
+from .textcaps_dataset import *
+from .synthetic_ocr_dataset import *
+from .lk_image_dataset import *
+from .lk_video_dataset import *
+
+from .promptv1_2_internal_dataset import *
\ No newline at end of file
diff --git a/app/llava/datasets/base_dataset.py b/app/llava/datasets/base_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..e63354a7380eccbb85f7e97e3fcd89f879c8d1e6
--- /dev/null
+++ b/app/llava/datasets/base_dataset.py
@@ -0,0 +1,234 @@
+import os
+import json
+import sys
+import copy
+import math
+import torch
+import decord
+import random
+import numpy as np
+from PIL import Image
+from decord import VideoReader
+from torch.utils.data import Dataset
+from llava.utils import master_print
+from typing import Dict, Optional, Sequence, List
+from llava.datasets.data_cfgs import data_configs
+from transformers import CLIPImageProcessor, SiglipImageProcessor
+
+from llava.mm_utils import get_frame_indices, process_anyres_image
+from torch.utils.data.dataloader import default_collate
+
+# Make decord return torch tensors from VideoReader.get_batch() instead of
+# its native NDArray type.
+decord.bridge.set_bridge("torch")
+
+class TaskBaseDataset(Dataset):
+    """ Implementation of base task dataset """
+    def __init__(self, anno_path=None, data_args=None, name=None, **kwargs):
+        # anno_path: path to a JSON annotation file (list of samples), or
+        #            None when a subclass preloads self.annotation itself.
+        # data_args: training config object; must provide image_aspect_ratio,
+        #            image_grid_pinpoints and image_processor.
+        # name:      human-readable dataset name, used only for logging.
+
+        self.anno_path = anno_path
+        self.data_args = data_args
+        self.image_aspect_ratio = data_args.image_aspect_ratio
+        self.image_grid_pinpoints = data_args.image_grid_pinpoints
+        self.vis_processor = data_args.image_processor
+        self.type = None  # modality tag; subclasses set 'images' or 'video'
+        self.name = name
+
+        master_print(f"Loading dataset {name}...")
+        if (anno_path is not None):
+            # Subclasses may assign self.annotation before calling
+            # super().__init__(); only read the JSON file when they have not.
+            if not hasattr(self, 'annotation'):
+                self.annotation = json.load(open(anno_path, 'r'))
+            master_print(f"Finish loading dataset {name} {len(self.annotation)} samples...")
+
+    def __len__(self):
+        # One sample per annotation entry.
+        return len(self.annotation)
+
+    def collater(self, samples):
+        # Default torch collation; subclasses may override for ragged batches.
+        return default_collate(samples)
+
+    def text_preprocess(self, sources) ->
List[List[Dict[str, str]]]: + pass + + def vis_preprocess(self, vis_path) -> Image: + pass + + @property + def data_type(self): + return self.type + + def __getitem__(self, i) -> Dict[str, torch.Tensor]: + item = self.annotation[i] + + vis_path = item['vis_path'] if 'vis_path' in item else item['video_path'] + + ret = { + 'images': self.vis_preprocess(vis_path), + 'conversations': self.text_preprocess(item) + } + if 'id' in item: + ret['id'] = item['id'] + + return ret + + +class ImageTaskDataset(TaskBaseDataset): + def __init__(self, anno_path=None, data_args=None, name=None): + super().__init__(anno_path=anno_path, + data_args=data_args, + name=name) + self.type = 'images' + + @staticmethod + def expand2square(pil_img, background_color): + width, height = pil_img.size + if width == height: + return pil_img + elif width > height: + result = Image.new(pil_img.mode, (width, width), background_color) + result.paste(pil_img, (0, (width - height) // 2)) + return result + else: + result = Image.new(pil_img.mode, (height, height), background_color) + result.paste(pil_img, ((height - width) // 2, 0)) + return result + + def preprocess_image(self, image): + if self.image_aspect_ratio == 'pad': + image = self.expand2square(image, tuple(int(x *255) for x in self.vis_processor.image_mean)) + if isinstance(self.vis_processor, CLIPImageProcessor) or isinstance(self.vis_processor, SiglipImageProcessor): + image = self.vis_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] + else: + image = self.vis_processor.preprocess(image) + elif self.image_aspect_ratio == "anyres": + image = process_anyres_image(image, self.vis_processor, self.image_grid_pinpoints) + else: + if isinstance(self.vis_processor, CLIPImageProcessor) or isinstance(self.vis_processor, SiglipImageProcessor): + image = self.vis_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] + else: + image = self.vis_processor.preprocess(image) + + return image + + def vis_preprocess(self, 
vis_path): + image = Image.open(vis_path).convert('RGB') + image = self.preprocess_image(image) + if isinstance(image, list): + images = image + else: + images = [image] + + return images + + +class VideoTaskDataset(ImageTaskDataset): + def __init__(self, anno_path=None, data_args=None, name=None): + super().__init__(anno_path=anno_path, + data_args=data_args, + name=name) + + # if not specify num_segments, use default + self.num_segments = self.data_args.num_segments + self.sample_strategy = self.data_args.sample_strategy + self.type = 'video' + + def vis_preprocess(self, vis_path): + images = None + try: + video_reader = VideoReader(vis_path) + vlen = len(video_reader) + fps = video_reader.get_avg_fps() + duration = vlen / float(fps) + + frame_indices = get_frame_indices(self.num_segments, vlen, + sample=self.sample_strategy, input_fps=fps, pad_last=False) + frames = video_reader.get_batch(frame_indices) + frames = frames.numpy().astype(np.uint8) + images = [Image.fromarray(frame).convert('RGB') for frame in frames] + images = [self.preprocess_image(image) for image in images] + except Exception as e: + print(e, vis_path) + sys.stdout.flush() + images = None + + # print(f"images: {len(images)}, {images[0].shape}") + + return images + + +class FramesTaskDataset(ImageTaskDataset): + def __init__(self, anno_path=None, data_args=None, fps=0.5, name=None): + super().__init__(anno_path=anno_path, + data_args=data_args, + name=name) + + # if not specify num_segments, use default + self.num_segments = self.data_args.num_segments + # print("self.num_segments:", self.num_segments) + self.type = 'video' + self.default_fps = 2.0 + self.fps = fps + + @staticmethod + def _downsample_frames(frames, interval, keep_first_last=True): + if keep_first_last: + first, last, mid = frames[0], frames[-1], frames[1:-1] + sampled_frames = mid[interval - 1::interval] + ret = [first] + sampled_frames + [last] + + else: + # may output empty list, recommend keep first and last frame + ret = 
frames[interval - 1::interval]
+
+        return ret
+
+    @staticmethod
+    def _sample_frames(frames, num_segments):
+        # Partition the frame-index range into num_segments buckets and pick
+        # one random index per bucket (stratified temporal sampling).
+        frame_indices = list(range(len(frames)))
+        cand_indices = copy.deepcopy(frame_indices)
+        intervals = np.linspace(start=0, stop=len(frame_indices), num=num_segments + 1).astype(int)
+        ranges = []
+
+        for idx, interv in enumerate(intervals[:-1]):
+            ranges.append((interv, intervals[idx + 1] - 1))
+
+        # random.choice(range(a, b)) raises IndexError when the range is
+        # empty, which happens whenever a bucket collapses (num_segments is
+        # close to len(frames)).  The original code wrapped the whole
+        # comprehension in a bare `except:` and fell back to deterministic
+        # bucket starts for *every* bucket; handle collapsed buckets
+        # individually instead, so the other buckets keep their random
+        # sampling and no bare except swallows unrelated errors.
+        frame_indices = [
+            cand_indices[random.choice(range(x[0], x[1]))] if x[1] > x[0]
+            else cand_indices[x[0]]
+            for x in ranges
+        ]
+
+        sampled_frames = [frames[indice] for indice in frame_indices]
+
+        return sampled_frames
+
+    def vis_preprocess(self, vis_path):
+        # Load a directory of per-frame images, downsample to self.fps, cap
+        # at self.num_segments frames, then run image preprocessing on each.
+        # Skips 'cuttime' sidecar files; assumes the directory is non-empty.
+        image_files = [(os.path.splitext(img)[0], img) for img in os.listdir(vis_path) if not img.startswith('cuttime')]
+        if image_files[0][1].endswith('jpeg'):
+            # gpt4v public data
+            image_files = [(int(x[0].split('_')[-1]), x[1]) for x in image_files]
+        else:
+            image_files = [(int(x[0]), x[1]) for x in image_files]
+
+        image_files = sorted(image_files, key=lambda img: img[0])
+
+        # Frames on disk are stored at self.default_fps; thin them out first
+        # when a lower effective fps was requested.
+        if self.fps < self.default_fps:
+            interval = math.floor(self.default_fps / self.fps)
+            image_files = self._downsample_frames(image_files, interval, keep_first_last=True)
+
+        if self.num_segments > 0 and len(image_files) > self.num_segments:
+            image_files = self._sample_frames(image_files, self.num_segments)
+
+        images = []
+        for image_file in image_files:
+            try:
+                images.append(Image.open(os.path.join(vis_path, image_file[1])).convert('RGB'))
+            except Exception as e:
+                # Best-effort loading: skip unreadable frames.
+                continue
+        formatted_images = []
+        for image in images:
+            im = self.preprocess_image(image)
+            # preprocess_image returns a list in 'anyres' mode; flatten it.
+            if isinstance(im, list):
+                formatted_images.extend(im)
+            else:
+                formatted_images.append(im)
+        return formatted_images
+
+
+
diff --git a/app/llava/datasets/builder.py b/app/llava/datasets/builder.py
new file mode 100644
index 0000000000000000000000000000000000000000..597d16acfa7b42453a0d5ea26f8c9d551562e49c
--- /dev/null
+++
b/app/llava/datasets/builder.py
@@ -0,0 +1,5 @@
+from .registry import Registry
+
+__all__ = ['DATASETS']
+
+# Global dataset registry: factory functions self-register with
+# @DATASETS.register_obj and are looked up by config name at build time.
+DATASETS = Registry('datasets')
\ No newline at end of file
diff --git a/app/llava/datasets/cc_sbu_dataset.py b/app/llava/datasets/cc_sbu_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..16caac6a4de1d27c71ac2ef61d683d39c35fea3b
--- /dev/null
+++ b/app/llava/datasets/cc_sbu_dataset.py
@@ -0,0 +1,40 @@
+import os
+import random
+from llava.datasets.builder import DATASETS
+
+from typing import Dict, Optional, Sequence, List
+from llava.datasets.data_cfgs import data_configs
+from llava.datasets.base_dataset import ImageTaskDataset
+from llava.datasets.prompts import cc_sbu_prompt
+from llava.constants import DEFAULT_IMAGE_TOKEN
+
+
+class CCSBUDataset(ImageTaskDataset):
+    # Image-caption dataset over CC/SBU annotations.  Each annotation item
+    # becomes a single-turn conversation: prompt -> ground-truth caption.
+    def __init__(self, anno_path, data_args=None, name='cc_sbu'):
+        super().__init__(anno_path=anno_path,
+                         data_args=data_args,
+                         name=name)
+
+    def text_preprocess(self, item) -> List[Dict[str, str]]:
+        # The human turn carries the image placeholder token plus a randomly
+        # chosen captioning prompt; the model turn is the caption itself.
+        caption = item['caption']
+
+        conversations = [
+            {
+                'from': 'human',
+                'value': DEFAULT_IMAGE_TOKEN + random.choice(cc_sbu_prompt)
+            },
+            {
+                'from': 'model',
+                'value': caption
+            }
+        ]
+
+        return conversations
+
+
+@DATASETS.register_obj
+def cc_sbu(data_args):
+    # Factory registered under the name 'cc_sbu'; reads the default
+    # annotation path from the shared data_configs table.
+    return CCSBUDataset(data_configs["cc_sbu"]['train_data_path'], data_args)
+
+
+
diff --git a/app/llava/datasets/data_cfgs.py b/app/llava/datasets/data_cfgs.py
new file mode 100644
index 0000000000000000000000000000000000000000..083d0c2a967b8a56542f48072fdb07ff6496b032
--- /dev/null
+++ b/app/llava/datasets/data_cfgs.py
@@ -0,0 +1,157 @@
+# Central dataset-config table.  'data_type' selects the loader family
+# (images / video / frames / text) and 'train_data_path' points at the
+# annotation JSON on cluster-mounted NAS storage.
+data_configs = {
+    'llava_pretrain': {
+        'data_type': 'images',
+        'train_data_path': '/mnt/bn/algo-masp-nas-2/baiyi.by/data/blip_laion_cc_sbu_558k/meta_data.json'
+    },
+    'llava_instruct': {
+        'data_type': 'images',
+        'train_data_path': '/mnt/bn/algo-masp-nas-2/baiyi.by/data/llava_instruct_150k/meta_data.json'
+    },
+    'lrv_instruct': {
+        'data_type': 'images',
'train_data_path': '/mnt/bn/algo-masp-nas-2/baiyi.by/data/lrv_instructions/meta_data.json' + }, + 'coco_caption': { + 'data_type': 'images', + 'train_data_path': '/mnt/bn/data-tns-algo-masp/baiyi.by/data/coco_caption/train.json' + }, + 'cc_sbu': { + 'data_type': 'images', + 'train_data_path': '/mnt/bn/baiyi-arnold-nas/data/masp/vlm_data/cc_sbu/meta_data.json' + }, + 'laion': { + 'data_type': 'images', + 'train_data_path': '/mnt/bn/data-tns-algo-masp/baiyi.by/data/laion/train.json' + }, + 'webvid': { + 'data_type': 'video', + 'train_data_path': '/mnt/bn/baiyi-arnold-nas/data/masp/vlm_data/webvid_10M_video/train.json', + 'val_data_path': '/mnt/bn/baiyi-arnold-nas/data/masp/vlm_data/webvid_10M_video/val.json' + }, + 'internvid': { + 'data_type': 'frames', + 'fps': 0.5, + 'train_data_path': '/mnt/bn/algo-masp-nas-2/baiyi.by/data/InternVid/meta_data.json' + }, + 'video_chatgpt_instruct_single': { + 'data_type': 'video', + 'train_data_path': '/mnt/bn/algo-masp-nas-2/baiyi.by/data/VideoChatGPT_Instruct_100K_single/train.json' + }, + 'video_chatgpt_instruct_multi': { + 'data_type': 'video', + 'train_data_path': '/mnt/bn/algo-masp-nas-2/baiyi.by/data/VideoChatGPT_Instruct_100K_multi/train.json' + }, + 'video_chatgpt': { + 'data_type': 'frames', + 'train_data_path': '/mnt/bn/algo-masp-nas-2/baiyi.by/data/video_chatgpt_instruct/meta_data.json' + }, + 'm3it': { + 'data_type': 'images', + 'default_tasks': [ + 'coco', + 'textcap', + 'image-paragraph-captioning', + 'coco-goi', + 'coco-itm', + 'vqa-v2', + 'shapes', + 'docvqa', + 'ocr-vqa', + 'st-vqa', + 'text-vqa', + 'gqa', + 'okvqa', + 'a-okvqa', + 'viquae', + 'clevr', + 'nlvr', + 'vcr', + 'visual-mrc', + 'visual-dialog', + 'multi30k' + ] + }, + 'tt_vqa': { + 'data_type': 'frames', + 'fps': 2, + 'train_data_path': '/mnt/bn/algo-masp-nas-2/baiyi.by/data/ADSO_Anno_Data/batch_20231128/meta_data_single_60k_caption_170k_QA.json' + # 'train_data_path': 
'/mnt/bn/yukunfeng-nasdrive/xiangchen/dataset/masp/20240208_meta_data_single_135k_caption_160k_QA.json' + # 'train_data_path': '/mnt/bn/algo-masp-nas-2/baiyi.by/data/ADSO_Anno_Data/batch_20231128/meta_data_final_single_non_empty.json' + }, + 'gpt4v_tt_vqa': { + 'data_type': 'frames', + 'fps': 0.5, + # 'train_data_path': '/mnt/bn/algo-masp-nas-2/baiyi.by/data/GPT4V_Negs/20231127_81k_single.json' + # 'train_data_path': '/mnt/bn/yukunfeng-nasdrive/xiangchen/dataset/masp/20231127_81k_25k_filtered_single_non_empty.json' + 'train_data_path': '/mnt/bn/algo-masp-nas-2/xiangchen/dataset/masp/20231222_120k_multi_filtered.json', + 'task_types': ['caption', 'qas'], + 'conv_type': 'single' + }, + 'sharegpt4v': { + 'data_type': 'images', + 'coco_dir': '/mnt/bn/data-tns-algo-masp/data', + 'llava_dir': '/mnt/bn/data-tns-algo-masp/baiyi.by/data/blip_laion_cc_sbu_558k', + 'other_dir': '/mnt/bn/algo-masp-nas-2/xiangchen/dataset/sharegpt4v', + }, + 'gpt4v_public': { + 'data_type': 'frames', + 'fps': 1, + 'train_data_path': '/mnt/bn/algo-masp-nas-2/xiangchen/data/shared_gpt4v_data/data_130k.json', + # 'train_data_path': '/mnt/bn/algo-masp-nas-2/xiangchen/data/shared_gpt4v_data/data_500k_filtered.json', + 'task_types': ['summary', 'detail', 'qa_pairs'], + 'conv_type': 'single', + 'sample_method': 'uniform' + }, + + 'gpt4v_internal': { + 'data_type': 'frames', + 'fps': 2, + 'train_data_path': '/mnt/bn/algo-masp-nas-2/xiangchen/dataset/masp/gpt4v_internal_28k.json', + 'task_types': ['summary','detail','qa_pairs'], + 'conv_type': 'single' + }, + + 'synthdog': { #500k + 'data_type': 'images', + }, + + 'ocr_vqa': { #200k + 'data_type': 'images', + 'train_data_path': '/mnt/bn/algo-masp-nas-2/xiangchen/dataset/OCR-VQA/training_meta.json' + }, + + 'sharegpt': { #50k + 'data_type': 'text' + }, + + 'text_caps':{ #100k + 'data_type': 'images', + 'train_data_path': '/mnt/bn/algo-masp-nas-2/xiangchen/dataset/TextCaps/TextCaps_0.1_train.json' + }, + + 'synthetic_ocr':{ # 50k + 'data_type': 'frames', 
+ 'fps': 0.5, # total 10 frames for each video + 'train_data_path': '/mnt/bn/algo-masp-nas-2/xiangchen/dataset/masp/synthetic_ocr/train_filtered.json' + }, + + 'lk_image':{ # 600k + 'data_type': 'images', + 'train_data_path': '/mnt/bn/liangkeg/data/xiangchen/finetune_all_detail_vidal200k_videollava_images_im.json' + }, + + 'lk_video':{ # 850k + 'data_type': 'frames', + 'fps': 1, + 'train_data_path': '/mnt/bn/liangkeg/data/xiangchen/finetune_all_detail_vidal200k_videollava_images_vid.json', + 'select_datasets': ['webvid10m', 'webvid2m', 'activitynet', 'vidal', 'hdvila'], + }, + + 'promptv1_2_internal':{ # 210k + 'data_type': 'frames', + 'train_data_path': '/mnt/bn/algo-masp-nas-2/kaili.zhao/data/masp_data/train/gpt4v_annotation/202400401week_gpt4v_all_videos_unique_ids.json', + 'task_types': ['caption'] + } +} + + diff --git a/app/llava/datasets/gpt4v_internal_dataset.py b/app/llava/datasets/gpt4v_internal_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..d9f661d0342719a05a85883b69db7254f7c6073f --- /dev/null +++ b/app/llava/datasets/gpt4v_internal_dataset.py @@ -0,0 +1,188 @@ +import sys + +import datasets +import torch +import re +import os +import subprocess +import numpy as np + +from llava.datasets.builder import DATASETS + +from typing import Dict, Optional, Sequence, List +from llava.datasets.data_cfgs import data_configs +from llava.datasets.base_dataset import FramesTaskDataset +from llava.datasets.data_cfgs import data_configs +from llava.utils import master_print +import pickle +from pathlib import Path +import random +from llava.datasets.prompts import tt_caption_prompt, internvid_prompt +from llava.constants import DEFAULT_VIDEO_TOKEN +from PIL import Image +import json +import numpy as np + +class GPT4VInternalDataset(FramesTaskDataset): + def __init__(self, anno_path=None, data_args=None, fps=0.5, conv_type='single', task_types=None, name='gpt4v_internal'): + self.default_fps = 2.0 + self.fps = fps + self.conv_type = 
conv_type + self.task_types = task_types + self.annotation = self.get_dataset(anno_path) + assert self.conv_type in ('single', 'multi'), "gpt4v_public conv type must in single/multi" + # assert hasattr(self.data_args, 'task_types') , "gpt4v_internal must have key 'task_types' in yaml config" + # master_print(f"Finished loading dataset {name} {len(self.annotation)} samples...") + super().__init__(anno_path=anno_path, + data_args=data_args, + fps=fps, + name=name) + def __len__(self): + return len(self.annotation) + + def get_dataset(self, anno_path): + dataset = [] + anno_path = Path(anno_path) + with anno_path.open('rb') as f: + data = json.load(f) + for info in data: + filtered_qa = [] + for qa in info['qa_pairs']: + if len(qa['question']) == 0 or len(qa['answer']) == 0: + continue + filtered_qa.append(qa) + info['qa_pairs'] = filtered_qa + + for task_type in self.task_types: + info_task = info.copy() + if len(info_task[task_type]) == 0: + continue + if task_type == 'qa_pairs' and self.conv_type == 'single': + for qa_pair in info_task[task_type]: + one_info = info_task.copy() + one_info[task_type] = [qa_pair] + one_info.update({ + 'task_type': task_type + }) + dataset.append(one_info) + else: + info_task.update({ + 'task_type': task_type + }) + dataset.append(info_task) + + return dataset + + @staticmethod + def _sample_frames(frames, num_segments): + indices = np.linspace(start=0, stop=len(frames) - 1, num=num_segments).astype(int) + + frames = [frames[ind] for ind in indices] + + return frames + + + def text_preprocess(self, item) -> List[Dict[str, str]]: + all_convs = [] + # TODO: different prompt for summary and detail + if item['task_type'] == 'summary': + all_convs.append([ + { + 'from': 'human', + 'value': random.choice(internvid_prompt) + }, + { + 'from': 'model', + 'value': item['summary'] + } + ]) + elif item['task_type'] == 'detail': + all_convs.append([ + { + 'from': 'human', + 'value': random.choice(tt_caption_prompt) + }, + { + 'from': 'model', + 
'value': item['detail'] + } + ]) + else: + for qa in item['qa_pairs']: + all_convs.append([ + { + 'from': 'human', + 'value': qa['question'] + }, + { + 'from': 'model', + 'value': qa['answer'] + } + ]) + + conversations = [] + random.shuffle(all_convs) + for idx, conv in enumerate(all_convs): + if idx == 0: + conv[0]['value'] = DEFAULT_VIDEO_TOKEN + conv[0]['value'] + conversations.extend(conv) + + return conversations + + + def vis_preprocess(self, vis_path): + image_files = [(os.path.splitext(img)[0], img) for img in os.listdir(vis_path) if not img.startswith('cuttime')] + image_files = [(int(x[0]), x[1]) for x in image_files] + image_files = sorted(image_files, key=lambda img: img[0]) + intervals = np.linspace(start=0, stop=len(image_files)-1, num=10).astype(int) + image_files = [image_files[i] for i in intervals] + + if self.num_segments > 0 and len(image_files) > self.num_segments: + image_files = self._sample_frames(image_files, self.num_segments) + + images = [] + for image_file in image_files: + try: + images.append(Image.open(os.path.join(vis_path, image_file[1])).convert('RGB')) + except Exception as e: + continue + formatted_images = [] + for image in images: + im = self.preprocess_image(image) + if isinstance(im, list): + formatted_images.extend(im) + else: + formatted_images.append(im) + + # images = [self.preprocess_image(image) for image in images] + + return formatted_images + + def __getitem__(self, i) -> Dict[str, torch.Tensor]: + item = self.annotation[i] + + ret = { + 'images': self.vis_preprocess(item['vis_path']), + 'conversations': self.text_preprocess(item) + } + if 'id' in item: + ret['id'] = item['id'] + + return ret + + +@DATASETS.register_obj +def gpt4v_internal(data_args): + data_cfg = data_configs['gpt4v_internal'] + train_data_path = None + if 'train_data_path' in data_args.external_args: + train_data_path = data_args.external_args['train_data_path'] + else: + train_data_path = data_cfg['train_data_path'] + fps, conv_type, task_types 
= data_args.external_args['fps'], data_args.external_args['conv_type'], data_args.external_args['task_types'] + return GPT4VInternalDataset(train_data_path, data_args, fps, conv_type, task_types) + + + + + \ No newline at end of file diff --git a/app/llava/datasets/gpt4v_public_dataset.py b/app/llava/datasets/gpt4v_public_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..c5c103fe7ed20d22b1ec772eec052aaf134f9cb4 --- /dev/null +++ b/app/llava/datasets/gpt4v_public_dataset.py @@ -0,0 +1,283 @@ + +from llava.datasets.builder import DATASETS + +from typing import Dict, Optional, Sequence, List +from llava.datasets.data_cfgs import data_configs +from llava.datasets.base_dataset import FramesTaskDataset +from llava.datasets.data_cfgs import data_configs +import pickle +from pathlib import Path +import random +import numpy as np +from llava.datasets.prompts import tt_caption_prompt, internvid_prompt +from llava.constants import DEFAULT_VIDEO_TOKEN +from PIL import Image +import json +import torch +import os + + +class GPT4VPublicDataset(FramesTaskDataset): + def __init__(self, anno_path=None, data_args=None, fps=1.0, conv_type='single', task_types=None, sample_method='uniform', name='gpt4v_public'): + self.default_fps = 1.0 + self.fps = fps + self.conv_type = conv_type + self.task_types = task_types + self.annotation = self.get_dataset(anno_path) + self.sample_method = sample_method + assert self.conv_type in ('single', 'multi'), "gpt4v_public conv type must in single/multi" + assert self.sample_method in ('sequential', 'uniform'), "gpt4v_public sample method must in sequential/uniform" + # assert hasattr(self.data_args, 'task_types') , "gpt4v_public must have key 'task_types' in yaml config" + # master_print(f"Finished loading dataset {name} {len(self.annotation)} samples...") + super().__init__(anno_path=anno_path, + data_args=data_args, + fps=fps, + name=name) + def __len__(self): + return len(self.annotation) + + + def get_dataset(self, 
anno_path): + dataset = [] + anno_path = Path(anno_path) + with anno_path.open('rb') as f: + data = json.load(f) + for info in data: + filtered_qa = [] + if 'qa_pairs' not in info: + index = 0 + while index < len(info['conversation']): + if len(info['conversation'][index].strip()) == 0: + index += 1 + continue + if 'C' in info['conversation'][index]: + if index+1 < len(info['conversation']) and 'A' in info['conversation'][index+1]: + filtered_qa.append( + [info['conversation'][index], info['conversation'][index+1]] + ) + index += 2 + else: + index += 1 + continue + else: + # print(info['conversation'][index]) + index += 1 + continue + else: + for qa in info['qa_pairs']: + if len(qa[0]) == 0 or len(qa[1]) == 0: + continue + filtered_qa.append(qa) + info['qa_pairs'] = filtered_qa + + for task_type in self.task_types: + info_task = info.copy() + if len(info_task[task_type]) == 0: + continue + if task_type == 'qa_pairs' and self.conv_type == 'single': + for qa_pair in info_task[task_type]: + one_info = info_task.copy() + one_info[task_type] = [qa_pair] + one_info.update({ + 'task_type': task_type + }) + dataset.append(one_info) + else: + info_task.update({ + 'task_type': task_type + }) + dataset.append(info_task) + + return dataset + + # @staticmethod + # def _sample_frames(frames, num_segments): + # indices = list(range(num_segments)) + + # frames = [frames[ind] for ind in indices] + + # return frames + + def text_preprocess(self, item) -> List[Dict[str, str]]: + all_convs = [] + # TODO: different prompt for summary and detail + if item['task_type'] == 'summary': + summary = '' + if isinstance(item['summary'], list): + for s in item['summary']: + if len(s.strip()) != 0: + summary = s + break + else: + summary = item['summary'] + + all_convs.append([ + { + 'from': 'human', + 'value': random.choice(internvid_prompt) + }, + { + 'from': 'model', + 'value': summary + } + ]) + elif item['task_type'] == 'detail': + detail = '' + if isinstance(item['detail'], list): + for s 
in item['detail']: + if len(s.strip()) != 0: + detail = s + break + else: + detail = item['detail'] + + all_convs.append([ + { + 'from': 'human', + 'value': random.choice(tt_caption_prompt) + }, + { + 'from': 'model', + 'value': detail + } + ]) + else: + for qa in item['qa_pairs']: + all_convs.append([ + { + 'from': 'human', + 'value': qa[0] + }, + { + 'from': 'model', + 'value': qa[1] + } + ]) + + conversations = [] + random.shuffle(all_convs) + for idx, conv in enumerate(all_convs): + if idx == 0: + conv[0]['value'] = DEFAULT_VIDEO_TOKEN + conv[0]['value'] + conversations.extend(conv) + + return conversations + + + def __getitem__(self, i) -> Dict[str, torch.Tensor]: + item = self.annotation[i] + + ret = { + 'images': self.vis_preprocess(item['vis_path']), + 'conversations': self.text_preprocess(item) + } + if 'id' in item: + ret['id'] = item['id'] + + return ret + + + def _sample_frames(self, frames, num_segments, preprocess=False): + if preprocess: + if self.sample_method == 'uniform': + indices = np.linspace(start=0, stop=len(frames) - 1, num=num_segments).astype(int) + elif self.sample_method == 'sequential': + indices = range(10) + else: + raise NotImplementedError + frames = [frames[ind] for ind in indices] + else: + indices = np.linspace(start=0, stop=len(frames) - 1, num=num_segments).astype(int) + frames = [frames[ind] for ind in indices] + + return frames + + def vis_preprocess(self, vis_path): + image_files = [] + for img_path in os.listdir(vis_path): + if img_path.endswith('.jpeg'): + img_idx = int(img_path.split('_')[-1][:-5]) + image_files.append((img_idx, img_path)) + + image_files = sorted(image_files, key=lambda img: img[0]) + # TODO: addhoc fix, only 10 frames + if len(image_files) > 10: + image_files = self._sample_frames(image_files, 10, preprocess=True) + if self.num_segments > 0 and len(image_files) > self.num_segments: + image_files = self._sample_frames(image_files, self.num_segments) + + images = [] + for image_file in image_files: + try: 
+                images.append(Image.open(os.path.join(vis_path, image_file[1])).convert('RGB'))
+            except Exception as e:
+                # Best-effort loading: skip unreadable frames.
+                continue
+        formatted_images = []
+        for image in images:
+            im = self.preprocess_image(image)
+            # preprocess_image returns a list in 'anyres' mode; flatten it.
+            if isinstance(im, list):
+                formatted_images.extend(im)
+            else:
+                formatted_images.append(im)
+        return formatted_images
+
+
+@DATASETS.register_obj
+def gpt4v_public(data_args):
+    # Factory for the 'gpt4v_public' dataset.  An explicit train_data_path in
+    # external_args overrides the default path from data_configs.
+    data_cfg = data_configs['gpt4v_public']
+    if 'train_data_path' in data_args.external_args:
+        # Bug fix: keep the override in a local variable.  The original wrote
+        # the override back into the shared module-level data_configs entry,
+        # mutating global config state for every later reader.  The sibling
+        # factories (gpt4v_internal, gpt4v_tt_vqa) already use a local.
+        anno_path = data_args.external_args['train_data_path']
+    else:
+        anno_path = data_cfg['train_data_path']
+    fps, conv_type, task_types = data_args.external_args['fps'], data_args.external_args['conv_type'], data_args.external_args['task_types']
+    if 'sample_method' in data_args.external_args:
+        sample_method = data_args.external_args['sample_method']
+    else:
+        sample_method = 'uniform'
+    return GPT4VPublicDataset(anno_path, data_args, fps, conv_type, task_types, sample_method)
+
+
+if __name__ == '__main__':
+    pass
+    # import pickle
+    # from tqdm import tqdm
+    # file_paths = ['/mnt/bn/algo-masp-nas-2/xianyang/clean_annotations/annotations/webvid10m',
+    #               '/mnt/bn/algo-masp-nas-2/xianyang/clean_annotations/annotations/webvid2m']
+    # frame_paths = ['/mnt/bn/algo-masp-nas-2/xianyang/clean_annotations/frames/webvid10m',
+    #               '/mnt/bn/algo-masp-nas-2/xianyang/clean_annotations/frames/webvid2m']
+
+
+    # data = []
+    # for file_path, frame_path in zip(file_paths, frame_paths):
+    #     file_path = Path(file_path)
+
+    #     for pkl_path in tqdm(file_path.glob('*')):
+    #         with pkl_path.open('rb') as f:
+    #             info = pickle.load(f)
+    #         pkl_name = pkl_path.name[:-4]
+    #         frame_folder_path = Path(frame_path) / pkl_name
+    #         info['vis_path'] = str(frame_folder_path)
+    #         if os.path.exists(info['vis_path']):
+    #             data.append(info)
+
+    # with open ('/mnt/bn/algo-masp-nas-2/xiangchen/data/shared_gpt4v_data/data_500k.json', 'w') as f:
+    #     json.dump(data, f)
+    #         if frame_path.exists():
+    #             print(1)
+
+
+    # with
open('/mnt/bn/liangkeg/data/xiangchen/finetune_all_detail_vidal200k_videollava_images.json') as f: + # data = json.load(f) + # data_im = [] + # data_vid = [] + # for sample in data: + # if 'image' in sample: + # data_im.append(sample) + # else: + # data_vid.append(sample) + + + # with open('/mnt/bn/liangkeg/data/xiangchen/finetune_all_detail_vidal200k_videollava_images_im.json', 'w') as f: + # json.dump(data_im, f) + + # with open('/mnt/bn/liangkeg/data/xiangchen/finetune_all_detail_vidal200k_videollava_images_vid.json', 'w') as f: + # json.dump(data_vid, f) \ No newline at end of file diff --git a/app/llava/datasets/gpt4v_tt_vqa_dataset.py b/app/llava/datasets/gpt4v_tt_vqa_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..bd92ef984d75c34e8369b1fa11f550ceb543480c --- /dev/null +++ b/app/llava/datasets/gpt4v_tt_vqa_dataset.py @@ -0,0 +1,105 @@ +import os +import json +import random +import json +from pathlib import Path +from llava.datasets.builder import DATASETS +from pathlib import Path +from typing import Dict, Optional, Sequence, List +from llava.datasets.data_cfgs import data_configs +from llava.datasets.base_dataset import FramesTaskDataset +from llava.datasets.prompts import tt_caption_prompt, tt_caption_prompt2 +from llava.constants import DEFAULT_VIDEO_TOKEN +from llava.utils import master_print + + +class GPT4VTTVqaDataset(FramesTaskDataset): + def __init__(self, anno_path, data_args=None, fps=0.5, conv_type='single', task_types=None, name='gpt4v_tt_vqa'): + self.default_fps = 0.5 + self.fps = fps + self.conv_type = conv_type + self.task_types = task_types + self.annotation = self.get_dataset(anno_path) + assert self.conv_type in ('single', 'multi'), "gpt4v_tt_vqa conv type must in single/multi" + # assert hasattr(self.data_args, 'task_types'), "gpt4v_tt_vqa must have key 'task_types' in yaml config" + # master_print(f"Finished loading dataset {name} {len(self.annotation)} samples...") + super().__init__(anno_path=anno_path, 
+ data_args=data_args, + fps=fps, + name=name) + def get_dataset(self, anno_path): + dataset = [] + anno_path = Path(anno_path) + with anno_path.open('rb') as f: + data = json.load(f) + for info in data: + for task_type in self.task_types: + info_task = info.copy() + if task_type not in info or len(info_task[task_type]) == 0: + continue + if task_type == 'qas' and self.conv_type == 'single': + for qa_pair in info_task[task_type]: + one_info = info_task.copy() + one_info[task_type] = [qa_pair] + one_info.update({ + 'task_type': task_type + }) + dataset.append(one_info) + else: + info_task.update({ + 'task_type': task_type + }) + dataset.append(info_task) + return dataset + + + def text_preprocess(self, item) -> List[Dict[str, str]]: + all_convs = [] + if hasattr(self.data_args, 'caption_prompt'): + cap_prompt = eval(self.data_args.caption_prompt) + else: + cap_prompt = tt_caption_prompt + if item['task_type'] == 'caption': + all_convs.append([ + { + 'from': 'human', + 'value': random.choice(cap_prompt) + }, + { + 'from': 'model', + 'value': item['caption'] + } + ]) + else: + for idx, qa in enumerate(item['qas']): + all_convs.append([ + { + 'from': 'human', + 'value': qa['q'] + }, + { + 'from': 'model', + 'value': qa['a'] + } + ]) + + conversations = [] + random.shuffle(all_convs) + for idx, conv in enumerate(all_convs): + if idx == 0: + conv[0]['value'] = DEFAULT_VIDEO_TOKEN + conv[0]['value'] + conversations.extend(conv) + return conversations + + + +@DATASETS.register_obj +def gpt4v_tt_vqa(data_args): + anno_path = None + if 'train_data_path' in data_args.external_args: + anno_path = data_args.external_args['train_data_path'] + else: + anno_path = data_configs["gpt4v_tt_vqa"]['train_data_path'] + fps, conv_type, task_types = data_args.external_args['fps'], data_args.external_args['conv_type'], data_args.external_args['task_types'] + return GPT4VTTVqaDataset(anno_path, data_args, fps, conv_type, task_types) + diff --git a/app/llava/datasets/internvid_dataset.py 
# --- file: app/llava/datasets/internvid_dataset.py ---
import os
import random

from typing import Dict, Optional, Sequence, List

from llava.datasets.builder import DATASETS
from llava.datasets.data_cfgs import data_configs
from llava.datasets.base_dataset import FramesTaskDataset
from llava.datasets.prompts import internvid_prompt
from llava.constants import DEFAULT_VIDEO_TOKEN


class InternVidDataset(FramesTaskDataset):
    """InternVid video-caption dataset: one single-turn exchange per clip."""

    def __init__(self, anno_path, data_args=None, name='internvid'):
        super().__init__(anno_path=anno_path,
                         data_args=data_args,
                         name=name)

    def text_preprocess(self, item) -> List[Dict[str, str]]:
        # One human turn (random caption prompt, video token prepended)
        # followed by the ground-truth caption.
        return [
            {'from': 'human', 'value': DEFAULT_VIDEO_TOKEN + random.choice(internvid_prompt)},
            {'from': 'model', 'value': item['caption']},
        ]


@DATASETS.register_obj
def internvid(data_args):
    return InternVidDataset(data_configs["internvid"]['train_data_path'], data_args)


# --- file: app/llava/datasets/lk_image_dataset.py ---
import datasets
import torch
import re
import os
import subprocess

from typing import Dict, Optional, Sequence, List

from llava.datasets.builder import DATASETS
from llava.datasets.data_cfgs import data_configs
from llava.datasets.base_dataset import ImageTaskDataset
from llava.constants import DEFAULT_IMAGE_TOKEN
from llava.utils import master_print


class LKImageDataset(ImageTaskDataset):
    """Image SFT dataset whose annotation records already contain full
    conversations and an absolute image path."""

    def __init__(self, anno_path=None, data_args=None, aux_args=None, name='lk_image'):
        # `aux_args` is accepted for registry-call symmetry but is unused here.
        super().__init__(anno_path=anno_path,
                         data_args=data_args,
                         name=name)

    def __len__(self):
        return len(self.annotation)

    def text_preprocess(self, item) -> List[Dict[str, str]]:
        # Conversations are stored verbatim in the annotation record.
        return item['conversations']

    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
        record = self.annotation[i]
        sample = {
            'images': self.vis_preprocess(record['image']),
            'conversations': self.text_preprocess(record),
        }
        if 'id' in record:
            sample['id'] = record['id']
        return sample


@DATASETS.register_obj
def lk_image(data_args):
    data_cfg = data_configs['lk_image']
    return LKImageDataset(data_cfg['train_data_path'], data_args, aux_args=data_cfg)

# (Disabled one-off script lived here: it re-read the *_im.json annotation
# file, dropped entries whose image path no longer existed on disk, and wrote
# the filtered list back. Kept out of the module body; it was fully
# commented out in the original.)
# --- file: app/llava/datasets/lk_video_dataset.py ---
import json
import os
import pickle
import random
from pathlib import Path
from typing import Dict, Optional, Sequence, List

import numpy as np
import torch
from PIL import Image

from llava.datasets.builder import DATASETS
from llava.datasets.data_cfgs import data_configs
from llava.datasets.base_dataset import FramesTaskDataset
from llava.datasets.prompts import tt_caption_prompt, internvid_prompt
from llava.constants import DEFAULT_VIDEO_TOKEN


class LKVideoDataset(FramesTaskDataset):
    """Video SFT dataset reading pre-extracted frame folders.

    Annotation records hold a 'video' directory of ``*_<idx>.jpeg`` frames and
    a verbatim 'conversations' list. ``select_datasets`` optionally filters
    records by the parent-directory name of the video path.
    """

    def __init__(self, anno_path=None, data_args=None, fps=1.0, conv_type='multi', select_datasets=None, name='lk_video'):
        self.default_fps = 1.0
        self.fps = fps
        self.conv_type = conv_type
        self.select_datasets = select_datasets
        self.annotation = self.get_dataset(anno_path)
        # TODO: support 'single' conversation mode.
        # BUG FIX: `in ('multi')` tested substring membership in the *string*
        # 'multi' (parentheses do not make a tuple), so values like 'm' or ''
        # passed the assert. A one-element tuple gives real membership.
        assert self.conv_type in ('multi',), "lk_video conv type must be multi"
        super().__init__(anno_path=anno_path,
                         data_args=data_args,
                         fps=fps,
                         name=name)

    def __len__(self):
        return len(self.annotation)

    def get_dataset(self, anno_path):
        """Load the JSON annotation list, optionally keeping only records
        whose video parent-directory name is in ``select_datasets``."""
        anno_path = Path(anno_path)
        with anno_path.open('rb') as f:
            data = json.load(f)

        if self.select_datasets is not None:
            data = [sample for sample in data
                    if Path(sample['video']).parent.name in self.select_datasets]
        return data

    def text_preprocess(self, item) -> List[Dict[str, str]]:
        # Conversations are stored verbatim in the annotation record.
        return item['conversations']

    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
        item = self.annotation[i]
        ret = {
            'images': self.vis_preprocess(item['video']),
            'conversations': self.text_preprocess(item),
        }
        if 'id' in item:
            ret['id'] = item['id']
        return ret

    @staticmethod
    def _sample_frames(frames, num_segments):
        """Uniformly sample ``num_segments`` frames (endpoints included)."""
        indices = np.linspace(start=0, stop=len(frames) - 1, num=num_segments).astype(int)
        return [frames[ind] for ind in indices]

    def vis_preprocess(self, vis_path):
        """Read, order and preprocess the frame images under ``vis_path``.

        Frame order comes from the integer suffix ``*_<idx>.jpeg``. Unreadable
        frames are skipped silently (best-effort; a short sample is preferable
        to a crashed worker).
        """
        image_files = []
        for img_path in os.listdir(vis_path):
            if img_path.endswith('.jpeg'):
                # "<name>_<idx>.jpeg" -> <idx>; [:-5] strips ".jpeg".
                img_idx = int(img_path.split('_')[-1][:-5])
                image_files.append((img_idx, img_path))

        image_files = sorted(image_files, key=lambda img: img[0])
        # TODO: ad-hoc cap at 10 frames; note this also caps the effective
        # num_segments below at 10.
        if len(image_files) > 10:
            image_files = self._sample_frames(image_files, 10)
        if self.num_segments > 0 and len(image_files) > self.num_segments:
            image_files = self._sample_frames(image_files, self.num_segments)

        images = []
        for image_file in image_files:
            try:
                images.append(Image.open(os.path.join(vis_path, image_file[1])).convert('RGB'))
            except Exception:
                continue
        formatted_images = []
        for image in images:
            im = self.preprocess_image(image)
            # preprocess_image may return one tensor or a list (e.g. anyres tiling).
            if isinstance(im, list):
                formatted_images.extend(im)
            else:
                formatted_images.append(im)
        return formatted_images


@DATASETS.register_obj
def lk_video(data_args):
    data_cfg = data_configs['lk_video']
    fps, conv_type = data_args.external_args['fps'], data_args.external_args['conv_type']
    select_datasets = data_args.external_args['select_datasets'] if 'select_datasets' in data_args.external_args else None
    return LKVideoDataset(data_cfg['train_data_path'], data_args, fps, conv_type, select_datasets=select_datasets)

# (Disabled one-off script lived here: filtered the *_vid.json annotation file
# down to entries whose video directory exists on disk; fully commented out in
# the original.)
# --- file: app/llava/datasets/llava_pretrain_dataset.py ---
from llava.datasets.builder import DATASETS

from typing import Dict, Optional, Sequence, List

from llava.datasets.data_cfgs import data_configs
from llava.datasets.base_dataset import ImageTaskDataset
from llava.constants import DEFAULT_IMAGE_TOKEN


class LLaVAPretrainDataset(ImageTaskDataset):
    """LLaVA pretraining dataset: each annotation holds QA pairs for one image.

    NOTE(review): DEFAULT_IMAGE_TOKEN is prepended to *every* human turn, not
    only the first — confirm this is intended for multi-QA samples (other
    datasets in this package tag only the first turn).
    """

    def __init__(self, anno_path, data_args=None, name='llava_pretrain'):
        super().__init__(anno_path=anno_path,
                         data_args=data_args,
                         name=name)

    def text_preprocess(self, item) -> List[Dict[str, str]]:
        conversations = []
        for qa in item['qas']:
            conversations.extend([
                {'from': 'human', 'value': DEFAULT_IMAGE_TOKEN + qa['q']},
                {'from': 'model', 'value': qa['a']},
            ])
        return conversations


@DATASETS.register_obj
def llava_pretrain(data_args):
    return LLaVAPretrainDataset(data_configs["llava_pretrain"]['train_data_path'], data_args)


# --- file: app/llava/datasets/m3it_dataset.py ---
import logging

import torch
import datasets
import cv2

import numpy as np
from base64 import b64decode
from io import BytesIO
from PIL import Image
from torch.utils.data import ConcatDataset
from llava.datasets.builder import DATASETS
from typing import Dict, Optional, Sequence, List
from llava.datasets.data_cfgs import data_configs
from llava.datasets.base_dataset import ImageTaskDataset
from llava.constants import DEFAULT_IMAGE_TOKEN, DEFAULT_VIDEO_TOKEN
from llava.utils import master_print


class M3ITDataset(ImageTaskDataset):
    """M3IT instruction dataset: concatenates the 'train' splits of the
    selected M3IT sub-tasks loaded from the Hugging Face hub."""

    def __init__(self, anno_path, data_args=None, name='m3it', selected_tasks=None):
        super().__init__(anno_path, data_args, name)

        self.selected_tasks = selected_tasks
        dataset_list = [
            datasets.load_dataset("MMInstruction/M3IT", i, num_proc=16) for i in selected_tasks
        ]
        # Some sub-tasks ship without a 'train' split; keep only those that have one.
        target_dataset_list = []
        master_print('#' * 50)
        for d in dataset_list:
            try:
                target_dataset_list.append(d['train'])
                master_print(f"TASK {d['train']._info.config_name}, SIZE {len(d['train'])}")
            except KeyError:
                print(f"{d['train']._info.config_name} has no train set.")
        self.dataset = ConcatDataset(target_dataset_list)
        master_print(f"Finished loading dataset {name} {len(self.dataset)} samples...")

    def __len__(self):
        return len(self.dataset)

    def text_preprocess(self, item, is_video=False) -> List[Dict[str, str]]:
        """Build a single-turn conversation: instruction + media token
        (+ optional question) -> answer."""
        instruction = item['instruction']
        question = item['inputs']
        answer = item['outputs']

        query = f"{instruction} {DEFAULT_IMAGE_TOKEN if not is_video else DEFAULT_VIDEO_TOKEN}"
        if len(question) > 0:
            query += question

        conversations = [
            {
                'from': 'human',
                'value': query
            },
            {
                'from': 'model',
                'value': answer
            }
        ]

        return conversations

    def bin2image(self, image_base64_str):
        """Decode one base64 image string to a preprocessed RGB image."""
        img = Image.open(BytesIO(b64decode(image_base64_str))).convert("RGB")
        img = np.array(img)

        # Defensive: force 3 channels before preprocessing.
        if img.shape[2] != 3:
            img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)

        img = Image.fromarray(img).convert('RGB')
        img = self.preprocess_image(img)

        return img

    def vis_preprocess(self, image_base64_str_list) -> Image:
        """Decode every frame; returns None on any decode failure so the
        caller can skip the sample (deliberate best-effort)."""
        try:
            images = list(map(self.bin2image, image_base64_str_list))
            formatted_images = []
            for image in images:
                if isinstance(image, list):
                    formatted_images.extend(image)
                else:
                    formatted_images.append(image)
            return formatted_images
        except Exception:
            return None

    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
        item = self.dataset[i]

        img_data = item['image_base64_str']

        images = self.vis_preprocess(img_data)
        if images is None:
            return None

        # M3IT video samples carry 8 frames; image samples carry exactly 1.
        # BUG FIX: was `len(images) > 0`, which is true for *every* decodable
        # sample, so plain images were tagged as video and received the video
        # token in text_preprocess.
        is_video = len(images) > 1

        ret = {
            'images': images,
            'conversations': self.text_preprocess(item, is_video)
        }

        return ret


@DATASETS.register_obj
def m3it(data_args):
    tasks = data_configs['m3it']['default_tasks']
    if 'tasks' in data_args.external_args:
        tasks = data_args.external_args['tasks']

    return M3ITDataset(anno_path=None, data_args=data_args, selected_tasks=tasks)


# --- file: app/llava/datasets/prompts.py ---
# Prompt pools shared by the dataset loaders. NOTE: tt_caption_prompt2,
# synthdog_prompt and ocr_prompt are importable by name even though they are
# not listed in __all__ (which only restricts star-imports).
__all__ = ['cc_sbu_prompt', 'internvid_prompt', 'tt_caption_prompt', 'm3it_infer_prompt']

cc_sbu_prompt = [
    'Describe the image briefly.',
    'Write a relevant description to pair with the image.'
]

internvid_prompt = [
    'Describe the video briefly.',
    'Describe the video shortly.',
    'Provide a brief description of the given video clip.',
    'Give a short and clear explanation of the subsequent video clip.',
    'Summarize the visual content of the following video.'
]

tt_caption_prompt = [
    "Describe the given video in detail.",
    "Elaborate on the video's content.",
    "Provide a detailed explanation of the video.",
    "Explain the video thoroughly.",
    "Give an in-depth description of the video."
]

tt_caption_prompt2 = [
    "Describe the following video in detail.",
]

m3it_infer_prompt = "Assess the image provided and respond to the relevant question."

synthdog_prompt = 'write down the text overlays in the provided image.'
ocr_prompt = ['From the video, there are some text overlays:',
              'The video also contains some OCR info:',
              'The text overlay says:']


# --- file: app/llava/datasets/promptv1_2_internal_dataset.py ---
from llava.datasets.builder import DATASETS

from typing import Dict, Optional, Sequence, List
from llava.datasets.data_cfgs import data_configs
from llava.datasets.base_dataset import FramesTaskDataset
import pickle
from pathlib import Path
import random
import numpy as np
from llava.datasets.prompts import tt_caption_prompt, internvid_prompt
from llava.constants import DEFAULT_VIDEO_TOKEN
from PIL import Image
import json
import torch
import os


class PromptV1Dataset(FramesTaskDataset):
    """Internal prompt-v1.2 dataset; records carry 'refine_caption' and/or
    'qas' payloads and are expanded one sample per (record, task)."""

    def __init__(self, anno_path=None, data_args=None, name='promptv1_2_internal', task_types=None, conv_type='single'):
        self.default_fps = 1.0
        self.task_types = task_types
        # BUG FIX: get_dataset() reads self.conv_type, but it was never
        # assigned, so any record containing 'qas' raised AttributeError.
        # The new keyword defaults to 'single', mirroring GPT4VTTVqaDataset
        # (which this loader was evidently copied from) and is appended with
        # a default, so existing callers are unaffected.
        self.conv_type = conv_type
        self.annotation = self.get_dataset(anno_path)
        super().__init__(anno_path=anno_path,
                         data_args=data_args,
                         name=name)

    def __len__(self):
        return len(self.annotation)

    def get_dataset(self, anno_path):
        """Expand raw JSON records into one flat sample per (record, task);
        in 'single' mode each QA pair becomes its own sample."""
        dataset = []
        anno_path = Path(anno_path)
        with anno_path.open('rb') as f:
            data = json.load(f)
        for info in data:
            for task_type in self.task_types:
                info_task = info.copy()
                if task_type not in info or len(info_task[task_type]) == 0:
                    continue
                if task_type == 'qas' and self.conv_type == 'single':
                    for qa_pair in info_task[task_type]:
                        one_info = info_task.copy()
                        one_info[task_type] = [qa_pair]
                        one_info.update({
                            'task_type': task_type
                        })
                        dataset.append(one_info)
                else:
                    info_task.update({
                        'task_type': task_type
                    })
                    dataset.append(info_task)
        return dataset

    def text_preprocess(self, item) -> List[Dict[str, str]]:
        """Build shuffled conversation turns; the video token is prepended to
        the first human turn only."""
        all_convs = []
        # NOTE(review): eval() resolves a prompt-list *name* from the trusted
        # YAML config; do not feed untrusted strings here.
        if hasattr(self.data_args, 'caption_prompt'):
            cap_prompt = eval(self.data_args.caption_prompt)
        else:
            cap_prompt = tt_caption_prompt
        if item['task_type'] == 'refine_caption':
            all_convs.append([
                {
                    'from': 'human',
                    'value': random.choice(cap_prompt)
                },
                {
                    'from': 'model',
                    'value': item['refine_caption']
                }
            ])
        else:
            for idx, qa in enumerate(item['qas']):
                all_convs.append([
                    {
                        'from': 'human',
                        'value': qa['q']
                    },
                    {
                        'from': 'model',
                        'value': qa['a']
                    }
                ])

        conversations = []
        random.shuffle(all_convs)
        for idx, conv in enumerate(all_convs):
            if idx == 0:
                conv[0]['value'] = DEFAULT_VIDEO_TOKEN + conv[0]['value']
            conversations.extend(conv)
        return conversations

    # (A fully commented-out frame-folder __getitem__/vis_preprocess
    # implementation, duplicating LKVideoDataset, followed here; omitted.)
# (remainder of the commented-out frame-folder helper omitted; it was fully
# disabled in the original)


@DATASETS.register_obj
def promptv1_2_internal(data_args):
    data_cfg = data_configs['promptv1_2_internal']
    task_types = data_args.external_args['task_types']
    return PromptV1Dataset(anno_path=data_cfg['train_data_path'], data_args=data_args, task_types=task_types)


# --- file: app/llava/datasets/registry.py ---
__all__ = ['Registry', 'build_from_cfg']


class Registry(object):
    """A registry mapping names to callables (dataset factory functions).

    Args:
        name (str): Registry name, used in error messages.
    """

    def __init__(self, name):
        self._name = name
        self._obj_dict = dict()

    def __repr__(self):
        return self.__class__.__name__ + '(name={}, items={})'.format(
            self._name, self.items())

    @property
    def name(self):
        return self._name

    @property
    def obj_dict(self):
        return self._obj_dict

    def get(self, key):
        """Return the registered callable, or None if absent."""
        return self._obj_dict.get(key, None)

    def has(self, key):
        """True if ``key`` is registered."""
        return key in self._obj_dict

    def items(self):
        """List of registered names."""
        return list(self._obj_dict.keys())

    def _register_obj(self, obj):
        """Register a callable under its __name__.

        Raises:
            TypeError: if ``obj`` is not callable.
            KeyError: if the name is already registered.
        """
        if not callable(obj):
            raise TypeError(f'object {str(obj)} must be callable')
        obj_name = obj.__name__
        if obj_name in self._obj_dict:
            raise KeyError(f'{obj_name} is already registered in {self.name}.')
        self._obj_dict[obj_name] = obj

    def register_obj(self, obj):
        """Decorator-friendly registration: returns ``obj`` unchanged."""
        self._register_obj(obj)
        return obj


def build_from_cfg(name, cfg, registry, default_args=None):
    """Build an object by looking up ``name`` in ``registry`` and calling it
    with ``cfg``.

    Args:
        name (str): Name of the registered factory.
        cfg: Config object passed through to the factory.
        registry (Registry): Registry to search.
        default_args (dict, optional): Stored on ``cfg.external_args`` for the
            factory to consume (not merged key-by-key).

    Returns:
        The constructed object.

    Raises:
        KeyError: if ``name`` is not registered.
    """
    obj = registry.get(name)
    if obj is None:
        raise KeyError(f'{name} is not in the {registry.name} registry. '
                       f'Choose among {list(registry.obj_dict.keys())}')

    if default_args is not None:
        cfg.external_args = default_args

    return obj(cfg)


# --- file: app/llava/datasets/sharegpt4v_dataset.py ---
import datasets
import torch
import re
import os
import subprocess
from llava.datasets.builder import DATASETS

from typing import Dict, Optional, Sequence, List
from llava.datasets.data_cfgs import data_configs
from llava.datasets.base_dataset import ImageTaskDataset
from llava.constants import DEFAULT_IMAGE_TOKEN
from llava.utils import master_print


class ShareGPT4VDataset(ImageTaskDataset):
    """ShareGPT4V captions loaded from the Hugging Face hub; image files are
    resolved against locally mirrored directories given in ``aux_args``."""

    def __init__(self, anno_path=None, data_args=None, aux_args=None, name='sharegpt4v'):
        super().__init__(anno_path=anno_path,
                         data_args=data_args,
                         name=name)
        self.annotation = datasets.load_dataset("Lin-Chen/ShareGPT4V", "ShareGPT4V")['train']
        self.aux_args = aux_args
        master_print(f"Finished loading dataset {name} {len(self.annotation)} samples...")

    def __len__(self):
        return len(self.annotation)

    def text_preprocess(self, item) -> List[Dict[str, str]]:
        captions = item['conversations']
        # BUG FIX: the original called .replace('', '') — a no-op (the
        # '<image>' literal was evidently lost). Strip the dataset's own
        # image placeholder before prepending our DEFAULT_IMAGE_TOKEN, else
        # the sample ends up with two image tokens.
        return [
            {
                'from': 'human',
                'value': DEFAULT_IMAGE_TOKEN + captions[0]['value'].replace('<image>', '')
            },
            {
                'from': 'model',
                'value': captions[1]['value']
            }
        ]

    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
        item = self.annotation[i]
        # Route the HF-relative image path to the matching local mirror.
        if 'coco' in item['image']:
            vis_path = os.path.join(self.aux_args['coco_dir'], item['image'])
        elif 'llava' in item['image']:
            file_names = item['image'].split('/')
            vis_path = os.path.join(self.aux_args['llava_dir'], *file_names[-3:])
        else:
            vis_path = os.path.join(self.aux_args['other_dir'], item['image'])

        ret = {
            'images': self.vis_preprocess(vis_path),
            'conversations': self.text_preprocess(item)
        }
        if 'id' in item:
            ret['id'] = item['id']

        return ret


@DATASETS.register_obj
def ShareGPT4V(data_args):
    data_cfg = data_configs['sharegpt4v']
    return ShareGPT4VDataset(None, data_args, aux_args=data_cfg)


if __name__ == '__main__':
    # Audit: report every annotation whose local image file is missing.
    dataset = datasets.load_dataset("Lin-Chen/ShareGPT4V", "ShareGPT4V")['train']
    aux_args = data_configs['sharegpt4v']
    for item in dataset:
        if 'coco' in item['image']:
            vis_path = os.path.join(aux_args['coco_dir'], item['image'])
        elif 'llava' in item['image']:
            file_names = item['image'].split('/')
            vis_path = os.path.join(aux_args['llava_dir'], *file_names[-3:])
        else:
            vis_path = os.path.join(aux_args['other_dir'], item['image'])
        if not os.path.exists(vis_path):
            print(vis_path)
    # (A commented-out SAM tar download/extract helper continues below.)
open('/mnt/bn/yukunfeng-nasdrive/xiangchen/dataset/sharegpt4v/sam.txt') as f: + # for line in f: + # items = line.split('\t') + # name = items[0].strip() + # url = items[1].strip() + # match = re.search(r'(\d+)', name).group(1) + # idx = int(match) + # if idx >= 60: + # continue + # print(name, url) + # output_file = os.path.join('/mnt/bn/yukunfeng-nasdrive/xiangchen/dataset/sharegpt4v/sam', name) + # try: + # subprocess.run(["wget", "-O", output_file, url], check=True) + # except subprocess.CalledProcessError as e: + # print("An error occurred while downloading the file.") + # from glob import glob + # file_path = '/mnt/bn/yukunfeng-nasdrive/xiangchen/dataset/sharegpt4v/sam' + # for file_name in glob(os.path.join(file_path, '*.tar')): + # subprocess.run(["tar", "-xf", file_name, '-C', '/mnt/bn/yukunfeng-nasdrive/xiangchen/dataset/sharegpt4v/sam/images'], check=True) diff --git a/app/llava/datasets/super_dataset.py b/app/llava/datasets/super_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..27aa0e99b88261eef5833d95f337687b3f6c6090 --- /dev/null +++ b/app/llava/datasets/super_dataset.py @@ -0,0 +1,316 @@ +from operator import length_hint +import random +import bisect +import copy +import torch +import transformers +from torch.utils.data import get_worker_info +from omegaconf import OmegaConf +import torchvision.transforms.functional as F +from dataclasses import dataclass, field +from typing import Dict, Optional, Sequence, List +from torch.utils.data import Dataset, ConcatDataset + +from llava.datasets.registry import build_from_cfg +from llava.datasets.builder import DATASETS +from llava.datasets.data_cfgs import data_configs +from llava.train.arguments import DataArguments +from llava.model.preprocessor import preprocess_multimodal, preprocess +from llava.constants import IGNORE_INDEX +from llava.utils import DatasetIter, get_world_size, get_rank, master_print +from transformers import CLIPImageProcessor, SiglipImageProcessor + 
+class LazySupervisedDataset(Dataset): + """Dataset for supervised fine-tuning.""" + + def __init__(self, data_cfg: str, + tokenizer: transformers.PreTrainedTokenizer, + data_args: DataArguments, + num_workers: int): + + super(LazySupervisedDataset, self).__init__() + dataset_config = OmegaConf.load(data_cfg) + + self.tokenizer = tokenizer + self.data_args = data_args + + self.datasets, self.sample_ratios = list(), list() + for ds in list(dataset_config.datasets.keys()): + ds_cfg = dataset_config.datasets[ds] + external_args = {} + for key, value in ds_cfg.items(): + external_args[key] = value + args_ = copy.deepcopy(vars(data_args)) + data_args_copy = type('DataArguments', (object,), args_) + dataset = build_from_cfg(ds, data_args_copy, DATASETS, default_args=external_args) + self.datasets.append(dataset) + if 'sample_ratio' in ds_cfg: + self.sample_ratios.append(ds_cfg.sample_ratio) + + if len(self.sample_ratios) != len(self.datasets): + self.sample_ratios = [1.0] * len(self.sample_ratios) + + self.sample_ratios = [float(ratio) / sum(self.sample_ratios) for ratio in self.sample_ratios] + self.ds_iters = [DatasetIter(len(dataset), get_world_size(), get_rank(), num_workers) + for dataset in self.datasets] + def __len__(self): + # set iters per epoch as the maximum iterations of each dataset + max_ds = sorted([int(len(ds) / ratio) for (ds, ratio) in zip(self.datasets, self.sample_ratios)], reverse=True)[0] + + return max_ds + + def __getitem__(self, i) -> Dict[str, torch.Tensor]: + worker_info = get_worker_info() + + ds_idx = random.choices(range(len(self.datasets)), self.sample_ratios, k=1)[0] + + item = None + while item is None: + item_id = self.ds_iters[ds_idx].increment(worker_info.id) + # item_id = self.ds_iters[ds_idx].increment(0) + item = self.datasets[ds_idx].__getitem__(item_id) + + sources = item + if isinstance(i, int): + sources = [sources] + assert len(sources) == 1, "Don't know why it is wrapped to a list" # FIXME + if 'images' in sources[0]: + 
images = sources[0]['images'] + conversations = copy.deepcopy([e['conversations'] for e in sources]) + + sources = preprocess_multimodal( + conversations, self.data_args) + else: + sources = copy.deepcopy([e["conversations"] for e in sources]) + + data_dict = preprocess( + sources, + self.tokenizer, + has_image=('images' in item)) + + if isinstance(i, int): + data_dict = dict(input_ids=data_dict["input_ids"][0], + labels=data_dict["labels"][0]) + + if images is not None and len(images) > 0: + data_dict["images"] = images + elif self.data_args.is_multimodal: + # image does not exist in the data, but the model is multimodal + img_size = self.data_args.image_processor.img_size + # data_dict['image'] = torch.zeros(3, crop_size['height'], crop_size['width']) + if getattr(self.data_args, 'image_aspect_ratio', 'square') == 'anyres': + data_dict['images'] = [torch.zeros(1, 3, img_size, img_size)] + else: + data_dict['images'] = [torch.zeros(3, img_size, img_size)] + data_dict['labels'][:] = IGNORE_INDEX + return data_dict + + + + +@dataclass +class DataCollatorForSupervisedDataset(object): + """Collate examples for supervised fine-tuning.""" + tokenizer: transformers.PreTrainedTokenizer + + def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]: + input_ids, labels = tuple([instance[key] for instance in instances] + for key in ("input_ids", "labels")) + input_ids = torch.nn.utils.rnn.pad_sequence( + input_ids, + batch_first=True, + padding_value=self.tokenizer.pad_token_id) + labels = torch.nn.utils.rnn.pad_sequence(labels, + batch_first=True, + padding_value=IGNORE_INDEX) + input_ids = input_ids[:, :self.tokenizer.model_max_length] + labels = labels[:, :self.tokenizer.model_max_length] + batch = dict( + input_ids=input_ids, + labels=labels, + attention_mask=input_ids.ne(self.tokenizer.pad_token_id), + ) + + if 'images' in instances[0]: + images = [instance['images'] for instance in instances] + images_data = [] + for imgs in images: + if all(x is not 
None and x.shape == imgs[0].shape for x in imgs): + imgs = torch.stack(imgs) + else: + imgs = [x for x in imgs if x is not None] + imgs = [x for x in imgs if x.shape == imgs[0].shape] + imgs = torch.stack(imgs) + + images_data.append(imgs) + + batch["images"] = images_data + + if 'images' not in batch or len(batch['images']) == 0: + print("images not in batch") + + return batch + + +def make_supervised_data_module(tokenizer: transformers.PreTrainedTokenizer, + data_args, + num_workers) -> Dict: + """Make dataset and collator for supervised fine-tuning.""" + train_dataset = LazySupervisedDataset(data_cfg=data_args.dataset_config, + tokenizer=tokenizer, + data_args=data_args, + num_workers=num_workers) + + for ds, ratio in zip(train_dataset.datasets, train_dataset.sample_ratios): + master_print(f"==> Real epoch of {ds.name} is {round(len(train_dataset) * ratio / len(ds), 2)} epochs.") + + data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer) + return dict(train_dataset=train_dataset, + eval_dataset=None, + data_collator=data_collator) + + + +class SupervisedConcatDataset(ConcatDataset): + r"""Dataset as a concatenation of multiple datasets. + + This class is useful to assemble different existing datasets. 
+ + Args: + datasets (sequence): List of datasets to be concatenated + """ + + datasets: List[Dataset] + cumulative_sizes: List[int] + + + def __init__(self, datasets: List[Dataset], + tokenizer: transformers.PreTrainedTokenizer, + data_args: DataArguments) -> None: + # super().__init__() + super().__init__(datasets) + self.tokenizer = tokenizer + self.data_args = data_args + + # self.datasets = list(datasets) + # assert len(self.datasets) > 0, 'datasets should not be an empty iterable' # type: ignore[arg-type] + # for d in self.datasets: + # assert not isinstance(d, IterableDataset), "ConcatDataset does not support IterableDataset" + # self.cumulative_sizes = self.cumsum(self.datasets) + + # @property + # def lengths(self): + # length_list = [] + # for sample in self.list_data_dict: + # img_tokens = 128 if 'image' in sample else 0 + # length_list.append(sum(len(conv['value'].split()) for conv in sample['conversations']) + img_tokens) + # return length_list + + @property + def modality_lengths(self): + length_list = [] + token_per_image = getattr(self.data_args, 'num_token_per_image', 32) + # token_per_image = 32 + # for sample in self.list_data_dict: + # cur_len = sum(len(conv['value'].split()) for conv in sample['conversations']) + # cur_len = cur_len if 'image' in sample else -cur_len + # length_list.append(cur_len) + for idx in range(len(self)): + dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx) + if dataset_idx == 0: + sample_idx = idx + else: + sample_idx = idx - self.cumulative_sizes[dataset_idx - 1] + item = self.datasets[dataset_idx].annotation[sample_idx] + conversations = self.datasets[dataset_idx].text_preprocess(item) + cur_len = sum([len(conv['value'].split()) for conv in conversations]) + if self.datasets[dataset_idx].type == 'images': + cur_len += token_per_image + else: + cur_len += token_per_image * self.data_args.num_segments + length_list.append(cur_len) + return length_list + + def __len__(self): + return self.cumulative_sizes[-1] 
+ + def __getitem__(self, idx): + if idx < 0: + if -idx > len(self): + raise ValueError("absolute value of index should not exceed dataset length") + idx = len(self) + idx + dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx) + if dataset_idx == 0: + sample_idx = idx + else: + sample_idx = idx - self.cumulative_sizes[dataset_idx - 1] + item = self.datasets[dataset_idx][sample_idx] + sources = item + if isinstance(idx, int): + sources = [sources] + assert len(sources) == 1, "Don't know why it is wrapped to a list" # FIXME + if 'images' in sources[0]: + images = sources[0]['images'] + conversations = copy.deepcopy([e['conversations'] for e in sources]) + + sources = preprocess_multimodal( + conversations, self.data_args) + else: + sources = copy.deepcopy([e["conversations"] for e in sources]) + + data_dict = preprocess( + sources, + self.tokenizer, + has_image=('images' in item)) + + if isinstance(idx, int): + data_dict = dict(input_ids=data_dict["input_ids"][0], + labels=data_dict["labels"][0]) + + if images is not None and len(images) > 0: + data_dict["images"] = images + elif self.data_args.is_multimodal: + # image does not exist in the data, but the model is multimodal + if isinstance(self.data_args.image_processor, SiglipImageProcessor): + img_size = self.data_args.image_processor.size['height'] + elif isinstance(self.data_args.image_processor, CLIPImageProcessor): + img_size = self.data_args.image_processor.crop_size['height'] + else: + img_size = self.data_args.image_processor.img_size + # data_dict['image'] = torch.zeros(3, crop_size['height'], crop_size['width']) + if getattr(self.data_args, 'image_aspect_ratio', 'square') == 'anyres': + data_dict['images'] = [torch.zeros(1, 3, img_size, img_size)] + else: + data_dict['images'] = [torch.zeros(3, img_size, img_size)] + data_dict['labels'][:] = IGNORE_INDEX + return data_dict + + +def make_supervised_data_module_concatdataset(tokenizer: transformers.PreTrainedTokenizer, + data_args, + num_workers) -> 
Dict: + """Make dataset and collator for supervised fine-tuning.""" + datasets = [] + dataset_config = OmegaConf.load(data_args.dataset_config) + for ds in list(dataset_config.datasets.keys()): + ds_cfg = dataset_config.datasets[ds] + external_args = {} + for key, value in ds_cfg.items(): + external_args[key] = value + args_ = copy.deepcopy(vars(data_args)) + data_args_copy = type('DataArguments', (object,), args_) + dataset = build_from_cfg(ds, data_args_copy, DATASETS, default_args=external_args) + datasets.append(dataset) + + train_dataset = SupervisedConcatDataset(datasets=datasets, + tokenizer=tokenizer, + data_args=data_args) + + # for ds, ratio in zip(train_dataset.datasets, train_dataset.sample_ratios): + # master_print(f"==> Real epoch of {ds.name} is {round(len(train_dataset) * ratio / len(ds), 2)} epochs.") + + data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer) + return dict(train_dataset=train_dataset, + eval_dataset=None, + data_collator=data_collator) + diff --git a/app/llava/datasets/synthetic_ocr_dataset.py b/app/llava/datasets/synthetic_ocr_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..4f6cffe70b3fe09a6920a13c1b8b15f2cc4d7cba --- /dev/null +++ b/app/llava/datasets/synthetic_ocr_dataset.py @@ -0,0 +1,72 @@ +import os +import torch +import random +import json +from pathlib import Path +from llava.datasets.builder import DATASETS + +from typing import Dict, Optional, Sequence, List +from llava.datasets.data_cfgs import data_configs +from llava.datasets.base_dataset import FramesTaskDataset +from llava.datasets.prompts import tt_caption_prompt, ocr_prompt +from llava.constants import DEFAULT_VIDEO_TOKEN + + +class SyntheticOCRDataset(FramesTaskDataset): + def __init__(self, anno_path, data_args=None, fps=2.0, name='synthetic_ocr'): + super().__init__(anno_path=anno_path, + data_args=data_args, + fps=fps, + name=name) + self.default_fps = 0.1 + + + def __getitem__(self, i) -> Dict[str, torch.Tensor]: 
+ item = self.annotation[i] + + ret = { + 'images': self.vis_preprocess(item['video_path']), + 'conversations': self.text_preprocess(item) + } + if 'id' in item: + ret['id'] = item['id'] + + return ret + + def text_preprocess(self, item) -> List[Dict[str, str]]: + all_convs = [] + if hasattr(self.data_args, 'caption_prompt'): + cap_prompt = eval(self.data_args.caption_prompt) + else: + cap_prompt = tt_caption_prompt + + conversations = [] + conversations.extend([ + { + 'from': 'human', + 'value': DEFAULT_VIDEO_TOKEN + random.choice(cap_prompt) + }, + { + 'from': 'model', + 'value': item['gpt_caption'] + ' ' + random.choice(ocr_prompt) + ','.join(item['ocr_list']) + } + ]) + return conversations + + +@DATASETS.register_obj +def synthetic_ocr(data_args): + train_data_path = None + if 'train_data_path' in data_args.external_args: + train_data_path = data_args.external_args['train_data_path'] + else: + train_data_path = data_configs["synthetic_ocr"]['train_data_path'] + return SyntheticOCRDataset(train_data_path, data_args, 2.0) + +if __name__ == '__main__': + with open('/mnt/bn/algo-masp-nas-2/xiangchen/dataset/masp/synthetic_ocr/train_filtered.json') as f: + data = json.load(f) + + for sample in data: + res = sample['gpt_caption'] + ' ' + random.choice(ocr_prompt) + ','.join(sample['ocr_list']) + # print(res) \ No newline at end of file diff --git a/app/llava/datasets/textcaps_dataset.py b/app/llava/datasets/textcaps_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..bad821615a37fc0049d7dff261e3fb893285e3c0 --- /dev/null +++ b/app/llava/datasets/textcaps_dataset.py @@ -0,0 +1,76 @@ +import datasets +import torch +import re +import os +import json +from llava.datasets.builder import DATASETS +from pathlib import Path +import random +from typing import Dict, Optional, Sequence, List +from llava.datasets.data_cfgs import data_configs +from llava.datasets.base_dataset import ImageTaskDataset +from llava.datasets.prompts import cc_sbu_prompt 
+from llava.constants import DEFAULT_IMAGE_TOKEN +from llava.datasets.data_cfgs import data_configs +from llava.utils import master_print + + +class TextCapsDataset(ImageTaskDataset): + def __init__(self, anno_path=None, data_args=None, aux_args=None, name='TextCaps'): + with open(anno_path) as f: + self.annotation = json.load(f)['data'] + self.dataset_dir = Path(anno_path).parent + super().__init__(anno_path=anno_path, + data_args=data_args, + name=name) + + def __len__(self): + return len(self.annotation) + + + def text_preprocess(self, item) -> List[Dict[str, str]]: + conversations = [] + conversations.extend([ + { + 'from': 'human', + 'value': DEFAULT_IMAGE_TOKEN + random.choice(cc_sbu_prompt) + }, + { + 'from': 'model', + 'value': item['caption_str'] + } + ]) + + return conversations + + + def __getitem__(self, i) -> Dict[str, torch.Tensor]: + item = self.annotation[i] + vis_path = self.dataset_dir / item['image_path'] + ret = { + 'images': self.vis_preprocess(str(vis_path)), + 'conversations': self.text_preprocess(item) + } + if 'id' in item: + ret['id'] = item['id'] + + return ret + +@DATASETS.register_obj +def TextCaps(data_args): + data_cfg = data_configs['text_caps'] + return TextCapsDataset(data_cfg['train_data_path'], data_args) + +if __name__ == '__main__': + # viz_dir = '/mnt/bn/yukunfeng-nasdrive/xiangchen/dataset/OCR-VQA/' + with open('/mnt/bn/yukunfeng-nasdrive/xiangchen/dataset/TextCaps/TextCaps_0.1_train.json') as f: + data = json.load(f) + res = [] + for value in data: + # ext=os.path.splitext(value['imageURL'])[1] + # outputFile=os.path.join(viz_dir, 'images/%s%s'%(key,ext)) + # q = value['questions'] + # a = value['answers'] + if len(value['questions']) == 0: + print(1) + res.append(value) diff --git a/app/llava/datasets/tt_gptv_v1_dataset.py b/app/llava/datasets/tt_gptv_v1_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..b0a771569c9d425593fddf5908d333159e283713 --- /dev/null +++ 
b/app/llava/datasets/tt_gptv_v1_dataset.py @@ -0,0 +1,14 @@ +import json +import os +from pathlib import Path +with open('/mnt/bn/algo-masp-nas-2/kaili.zhao/data/masp_data/train/gpt4v_annotation/20240325week_gpt4v_all_videos_unique_ids.json') as f: + data = json.load(f) + +for sample in data: + video_path = Path(sample['video_path']) + file_names = os.listdir(sample['video_path']) + if len(file_names) == 10: + file_names.sort(key=lambda x: int(x[:-4])) + print(file_names) + + \ No newline at end of file diff --git a/app/llava/datasets/tt_vqa_dataset.py b/app/llava/datasets/tt_vqa_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..9bd75bc48e898090eb52c685d482d9457bd59d3d --- /dev/null +++ b/app/llava/datasets/tt_vqa_dataset.py @@ -0,0 +1,71 @@ +import os +import random +import json +from pathlib import Path +from llava.datasets.builder import DATASETS + +from typing import Dict, Optional, Sequence, List +from llava.datasets.data_cfgs import data_configs +from llava.datasets.base_dataset import FramesTaskDataset +from llava.datasets.prompts import tt_caption_prompt, tt_caption_prompt2 +from llava.constants import DEFAULT_VIDEO_TOKEN + + +class TTVqaDataset(FramesTaskDataset): + def __init__(self, anno_path, data_args=None, fps=2.0, data_cfgs=None, name='tt_vqa'): + super().__init__(anno_path=anno_path, + data_args=data_args, + fps=fps, + name=name) + self.default_fps = data_cfgs['fps'] + + + def text_preprocess(self, item) -> List[Dict[str, str]]: + all_convs = [] + if hasattr(self.data_args, 'caption_prompt'): + cap_prompt = eval(self.data_args.caption_prompt) + else: + cap_prompt = tt_caption_prompt + if 'caption' in item: + all_convs.append([ + { + 'from': 'human', + 'value': random.choice(cap_prompt) + }, + { + 'from': 'model', + 'value': item['caption'] + } + ]) + if 'qas' in item: + for idx, qa in enumerate(item['qas']): + all_convs.append([ + { + 'from': 'human', + 'value': qa['q'] + }, + { + 'from': 'model', + 'value': qa['a'] + } 
+ ]) + + conversations = [] + random.shuffle(all_convs) + for idx, conv in enumerate(all_convs): + if idx == 0: + conv[0]['value'] = DEFAULT_VIDEO_TOKEN + conv[0]['value'] + conversations.extend(conv) + + return conversations + + +@DATASETS.register_obj +def tt_vqa(data_args): + train_data_path = None + if 'train_data_path' in data_args.external_args: + train_data_path = data_args.external_args['train_data_path'] + else: + train_data_path = data_configs["tt_vqa"]['train_data_path'] + return TTVqaDataset(train_data_path, data_args, 2.0, data_configs["tt_vqa"]) + diff --git a/app/llava/eval/llava_eval/eval_gpt_review.py b/app/llava/eval/llava_eval/eval_gpt_review.py new file mode 100644 index 0000000000000000000000000000000000000000..8af4559c65fc2728b11fd2097a109981ee1ef686 --- /dev/null +++ b/app/llava/eval/llava_eval/eval_gpt_review.py @@ -0,0 +1,113 @@ +import argparse +import json +import os + +import openai +import tqdm +import ray +import time + +NUM_SECONDS_TO_SLEEP = 3 + +@ray.remote(num_cpus=4) +def get_eval(content: str, max_tokens: int): + while True: + try: + response = openai.ChatCompletion.create( + model='gpt-4', + messages=[{ + 'role': 'system', + 'content': 'You are a helpful and precise assistant for checking the quality of the answer.' 
+ }, { + 'role': 'user', + 'content': content, + }], + temperature=0.2, # TODO: figure out which temperature is best for evaluation + max_tokens=max_tokens, + ) + break + except openai.error.RateLimitError: + pass + except Exception as e: + print(e) + time.sleep(NUM_SECONDS_TO_SLEEP) + + print('success!') + return response['choices'][0]['message']['content'] + + +def parse_score(review): + try: + score_pair = review.split('\n')[0] + score_pair = score_pair.replace(',', ' ') + sp = score_pair.split(' ') + if len(sp) == 2: + return [float(sp[0]), float(sp[1])] + else: + print('error', review) + return [-1, -1] + except Exception as e: + print(e) + print('error', review) + return [-1, -1] + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.') + parser.add_argument('-q', '--question') + # parser.add_argument('-a', '--answer') + parser.add_argument('-a', '--answer-list', nargs='+', default=[]) + parser.add_argument('-r', '--rule') + parser.add_argument('-o', '--output') + parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output') + args = parser.parse_args() + + ray.init() + + f_q = open(os.path.expanduser(args.question)) + f_ans1 = open(os.path.expanduser(args.answer_list[0])) + f_ans2 = open(os.path.expanduser(args.answer_list[1])) + rule_dict = json.load(open(os.path.expanduser(args.rule), 'r')) + + review_file = open(f'{args.output}', 'w') + + js_list = [] + handles = [] + idx = 0 + for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2): + # if idx == 1: + # break + + ques = json.loads(ques_js) + ans1 = json.loads(ans1_js) + ans2 = json.loads(ans2_js) + + category = json.loads(ques_js)['category'] + if category in rule_dict: + rule = rule_dict[category] + else: + rule = rule_dict['default'] + prompt = rule['prompt'] + role = rule['role'] + content = (f'[Question]\n{ques["text"]}\n\n' + f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n' + f'[{role} 
2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n' + f'[System]\n{prompt}\n\n') + js_list.append({ + 'id': idx+1, + 'question_id': ques['question_id'], + 'answer1_id': ans1['answer_id'], + 'answer2_id': ans2['answer_id'], + 'category': category}) + idx += 1 + handles.append(get_eval.remote(content, args.max_tokens)) + # To avoid the rate limit set by OpenAI + time.sleep(NUM_SECONDS_TO_SLEEP) + + reviews = ray.get(handles) + for idx, review in enumerate(reviews): + scores = parse_score(review) + js_list[idx]['content'] = review + js_list[idx]['tuple'] = scores + review_file.write(json.dumps(js_list[idx]) + '\n') + review_file.close() diff --git a/app/llava/eval/llava_eval/eval_gpt_review_bench.py b/app/llava/eval/llava_eval/eval_gpt_review_bench.py new file mode 100644 index 0000000000000000000000000000000000000000..06160f2422b5368f30fb967f7cae635208a1dc69 --- /dev/null +++ b/app/llava/eval/llava_eval/eval_gpt_review_bench.py @@ -0,0 +1,121 @@ +import argparse +import json +import os + +import openai +import time + +NUM_SECONDS_TO_SLEEP = 0.5 + + +def get_eval(content: str, max_tokens: int): + while True: + try: + response = openai.ChatCompletion.create( + model='gpt-4-0314', + messages=[{ + 'role': 'system', + 'content': 'You are a helpful and precise assistant for checking the quality of the answer.' 
+ }, { + 'role': 'user', + 'content': content, + }], + temperature=0.2, # TODO: figure out which temperature is best for evaluation + max_tokens=max_tokens, + ) + break + except openai.error.RateLimitError: + pass + except Exception as e: + print(e) + time.sleep(NUM_SECONDS_TO_SLEEP) + + return response['choices'][0]['message']['content'] + + +def parse_score(review): + try: + score_pair = review.split('\n')[0] + score_pair = score_pair.replace(',', ' ') + sp = score_pair.split(' ') + if len(sp) == 2: + return [float(sp[0]), float(sp[1])] + else: + print('error', review) + return [-1, -1] + except Exception as e: + print(e) + print('error', review) + return [-1, -1] + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.') + parser.add_argument('-q', '--question') + parser.add_argument('-c', '--context') + parser.add_argument('-a', '--answer-list', nargs='+', default=[]) + parser.add_argument('-r', '--rule') + parser.add_argument('-o', '--output') + parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output') + args = parser.parse_args() + + f_q = open(os.path.expanduser(args.question)) + f_ans1 = open(os.path.expanduser(args.answer_list[0])) + f_ans2 = open(os.path.expanduser(args.answer_list[1])) + rule_dict = json.load(open(os.path.expanduser(args.rule), 'r')) + + if os.path.isfile(os.path.expanduser(args.output)): + cur_reviews = [json.loads(line) for line in open(os.path.expanduser(args.output))] + else: + cur_reviews = [] + + review_file = open(f'{args.output}', 'a') + + context_list = [json.loads(line) for line in open(os.path.expanduser(args.context))] + image_to_context = {context['image']: context for context in context_list} + + handles = [] + idx = 0 + for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2): + ques = json.loads(ques_js) + ans1 = json.loads(ans1_js) + ans2 = json.loads(ans2_js) + + inst = image_to_context[ques['image']] + + if 
isinstance(inst['caption'], list): + cap_str = '\n'.join(inst['caption']) + else: + cap_str = inst['caption'] + + category = 'llava_bench_' + json.loads(ques_js)['category'] + if category in rule_dict: + rule = rule_dict[category] + else: + assert False, f"Visual QA category not found in rule file: {category}." + prompt = rule['prompt'] + role = rule['role'] + content = (f'[Context]\n{cap_str}\n\n' + f'[Question]\n{ques["text"]}\n\n' + f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n' + f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n' + f'[System]\n{prompt}\n\n') + cur_js = { + 'id': idx+1, + 'question_id': ques['question_id'], + 'answer1_id': ans1.get('answer_id', ans1['question_id']), + 'answer2_id': ans2.get('answer_id', ans2['answer_id']), + 'category': category + } + if idx >= len(cur_reviews): + review = get_eval(content, args.max_tokens) + scores = parse_score(review) + cur_js['content'] = review + cur_js['tuple'] = scores + review_file.write(json.dumps(cur_js) + '\n') + review_file.flush() + else: + print(f'Skipping {idx} as we already have it.') + idx += 1 + print(idx) + review_file.close() diff --git a/app/llava/eval/llava_eval/eval_gpt_review_visual.py b/app/llava/eval/llava_eval/eval_gpt_review_visual.py new file mode 100644 index 0000000000000000000000000000000000000000..d6e407a400a67020d801e6c27a3c32a2ee38f30c --- /dev/null +++ b/app/llava/eval/llava_eval/eval_gpt_review_visual.py @@ -0,0 +1,118 @@ +import argparse +import json +import os + +import openai +import time + +NUM_SECONDS_TO_SLEEP = 0.5 + + +def get_eval(content: str, max_tokens: int): + while True: + try: + response = openai.ChatCompletion.create( + model='gpt-4-0314', + messages=[{ + 'role': 'system', + 'content': 'You are a helpful and precise assistant for checking the quality of the answer.' 
+ }, { + 'role': 'user', + 'content': content, + }], + temperature=0.2, # TODO: figure out which temperature is best for evaluation + max_tokens=max_tokens, + ) + break + except openai.error.RateLimitError: + pass + except Exception as e: + print(e) + time.sleep(NUM_SECONDS_TO_SLEEP) + + return response['choices'][0]['message']['content'] + + +def parse_score(review): + try: + score_pair = review.split('\n')[0] + score_pair = score_pair.replace(',', ' ') + sp = score_pair.split(' ') + if len(sp) == 2: + return [float(sp[0]), float(sp[1])] + else: + print('error', review) + return [-1, -1] + except Exception as e: + print(e) + print('error', review) + return [-1, -1] + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.') + parser.add_argument('-q', '--question') + parser.add_argument('-c', '--context') + parser.add_argument('-a', '--answer-list', nargs='+', default=[]) + parser.add_argument('-r', '--rule') + parser.add_argument('-o', '--output') + parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output') + args = parser.parse_args() + + f_q = open(os.path.expanduser(args.question)) + f_ans1 = open(os.path.expanduser(args.answer_list[0])) + f_ans2 = open(os.path.expanduser(args.answer_list[1])) + rule_dict = json.load(open(os.path.expanduser(args.rule), 'r')) + + if os.path.isfile(os.path.expanduser(args.output)): + cur_reviews = [json.loads(line) for line in open(os.path.expanduser(args.output))] + else: + cur_reviews = [] + + review_file = open(f'{args.output}', 'a') + + context_list = [json.loads(line) for line in open(os.path.expanduser(args.context))] + image_to_context = {context['image']: context for context in context_list} + + handles = [] + idx = 0 + for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2): + ques = json.loads(ques_js) + ans1 = json.loads(ans1_js) + ans2 = json.loads(ans2_js) + + inst = image_to_context[ques['image']] + cap_str 
= '\n'.join(inst['captions']) + box_str = '\n'.join([f'{instance["category"]}: {instance["bbox"]}' for instance in inst['instances']]) + + category = json.loads(ques_js)['category'] + if category in rule_dict: + rule = rule_dict[category] + else: + assert False, f"Visual QA category not found in rule file: {category}." + prompt = rule['prompt'] + role = rule['role'] + content = (f'[Context]\n{cap_str}\n\n{box_str}\n\n' + f'[Question]\n{ques["text"]}\n\n' + f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n' + f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n' + f'[System]\n{prompt}\n\n') + cur_js = { + 'id': idx+1, + 'question_id': ques['question_id'], + 'answer1_id': ans1.get('answer_id', ans1['question_id']), + 'answer2_id': ans2.get('answer_id', ans2['answer_id']), + 'category': category + } + if idx >= len(cur_reviews): + review = get_eval(content, args.max_tokens) + scores = parse_score(review) + cur_js['content'] = review + cur_js['tuple'] = scores + review_file.write(json.dumps(cur_js) + '\n') + review_file.flush() + else: + print(f'Skipping {idx} as we already have it.') + idx += 1 + print(idx) + review_file.close() diff --git a/app/llava/eval/llava_eval/eval_pope.py b/app/llava/eval/llava_eval/eval_pope.py new file mode 100644 index 0000000000000000000000000000000000000000..b115b8f2327ea9d972f9e41bcbb03c68be6b3508 --- /dev/null +++ b/app/llava/eval/llava_eval/eval_pope.py @@ -0,0 +1,81 @@ +import os +import json +import argparse + +def eval_pope(answers, label_file): + label_list = [json.loads(q)['label'] for q in open(label_file, 'r')] + + for answer in answers: + text = answer['text'] + + # Only keep the first sentence + if text.find('.') != -1: + text = text.split('.')[0] + + text = text.replace(',', '') + words = text.split(' ') + if 'No' in words or 'not' in words or 'no' in words: + answer['text'] = 'no' + else: + answer['text'] = 'yes' + + for i in range(len(label_list)): + if label_list[i] == 'no': + label_list[i] = 0 + else: + label_list[i] 
= 1 + + pred_list = [] + for answer in answers: + if answer['text'] == 'no': + pred_list.append(0) + else: + pred_list.append(1) + + pos = 1 + neg = 0 + yes_ratio = pred_list.count(1) / len(pred_list) + + TP, TN, FP, FN = 0, 0, 0, 0 + for pred, label in zip(pred_list, label_list): + if pred == pos and label == pos: + TP += 1 + elif pred == pos and label == neg: + FP += 1 + elif pred == neg and label == neg: + TN += 1 + elif pred == neg and label == pos: + FN += 1 + + print('TP\tFP\tTN\tFN\t') + print('{}\t{}\t{}\t{}'.format(TP, FP, TN, FN)) + + precision = float(TP) / float(TP + FP) + recall = float(TP) / float(TP + FN) + f1 = 2*precision*recall / (precision + recall) + acc = (TP + TN) / (TP + TN + FP + FN) + print('Accuracy: {}'.format(acc)) + print('Precision: {}'.format(precision)) + print('Recall: {}'.format(recall)) + print('F1 score: {}'.format(f1)) + print('Yes ratio: {}'.format(yes_ratio)) + print('%.3f, %.3f, %.3f, %.3f, %.3f' % (f1, acc, precision, recall, yes_ratio) ) + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--annotation-dir", type=str) + parser.add_argument("--question-file", type=str) + parser.add_argument("--result-file", type=str) + args = parser.parse_args() + + questions = [json.loads(line) for line in open(args.question_file)] + questions = {question['question_id']: question for question in questions} + answers = [json.loads(q) for q in open(args.result_file)] + for file in os.listdir(args.annotation_dir): + assert file.startswith('coco_pope_') + assert file.endswith('.json') + category = file[10:-5] + cur_answers = [x for x in answers if questions[x['question_id']]['category'] == category] + print('Category: {}, # samples: {}'.format(category, len(cur_answers))) + eval_pope(cur_answers, os.path.join(args.annotation_dir, file)) + print("====================================") diff --git a/app/llava/eval/llava_eval/eval_science_qa.py b/app/llava/eval/llava_eval/eval_science_qa.py new file mode 100644 
index 0000000000000000000000000000000000000000..ccf206bbd7a5d6376eef82d61b3ef8bbe0f71c6c --- /dev/null +++ b/app/llava/eval/llava_eval/eval_science_qa.py @@ -0,0 +1,114 @@ +import argparse +import json +import os +import re +import random + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument('--base-dir', type=str) + parser.add_argument('--result-file', type=str) + parser.add_argument('--output-file', type=str) + parser.add_argument('--output-result', type=str) + parser.add_argument('--split', type=str, default='test') + parser.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"]) + return parser.parse_args() + + +def convert_caps(results): + fakecaps = [] + for result in results: + image_id = result['question_id'] + caption = result['text'] + fakecaps.append({"image_id": int(image_id), "caption": caption}) + return fakecaps + + +def get_pred_idx(prediction, choices, options): + """ + Get the index (e.g. 2) from the prediction (e.g. 'C') + """ + if prediction in options[:len(choices)]: + return options.index(prediction) + else: + return -1 + return random.choice(range(len(choices))) + + +if __name__ == "__main__": + args = get_args() + + base_dir = args.base_dir + split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[args.split] + problems = json.load(open(os.path.join(base_dir, "problems.json"))) + predictions = [json.loads(line) for line in open(args.result_file)] + predictions = {pred['question_id']: pred for pred in predictions} + split_problems = {idx: problems[idx] for idx in split_indices} + + results = {'correct': [], 'incorrect': []} + sqa_results = {} + sqa_results['acc'] = None + sqa_results['correct'] = None + sqa_results['count'] = None + sqa_results['results'] = {} + sqa_results['outputs'] = {} + + for prob_id, prob in split_problems.items(): + if prob_id not in predictions: + pred = {'text': 'FAILED', 'prompt': 'Unknown'} + pred_text = 'FAILED' + else: + pred = predictions[prob_id] + 
pred_text = pred['text'] + + if pred_text in args.options: + answer = pred_text + elif len(pred_text) >= 3 and pred_text[0] in args.options and pred_text[1:3] == ". ": + answer = pred_text[0] + else: + pattern = re.compile(r'The answer is ([A-Z]).') + res = pattern.findall(pred_text) + if len(res) == 1: + answer = res[0] # 'A', 'B', ... + else: + answer = "FAILED" + + pred_idx = get_pred_idx(answer, prob['choices'], args.options) + + analysis = { + 'question_id': prob_id, + 'parsed_ans': answer, + 'ground_truth': args.options[prob['answer']], + 'question': pred['prompt'], + 'pred': pred_text, + 'is_multimodal': '' in pred['prompt'], + } + + sqa_results['results'][prob_id] = get_pred_idx(answer, prob['choices'], args.options) + sqa_results['outputs'][prob_id] = pred_text + + if pred_idx == prob['answer']: + results['correct'].append(analysis) + else: + results['incorrect'].append(analysis) + + correct = len(results['correct']) + total = len(results['correct']) + len(results['incorrect']) + + ###### IMG ###### + multimodal_correct = len([x for x in results['correct'] if x['is_multimodal']]) + multimodal_incorrect = len([x for x in results['incorrect'] if x['is_multimodal']]) + multimodal_total = multimodal_correct + multimodal_incorrect + ###### IMG ###### + + print(f'Total: {total}, Correct: {correct}, Accuracy: {correct / total * 100:.2f}%, IMG-Accuracy: {multimodal_correct / multimodal_total * 100:.2f}%') + + sqa_results['acc'] = correct / total * 100 + sqa_results['correct'] = correct + sqa_results['count'] = total + + with open(args.output_file, 'w') as f: + json.dump(results, f, indent=2) + with open(args.output_result, 'w') as f: + json.dump(sqa_results, f, indent=2) diff --git a/app/llava/eval/llava_eval/eval_science_qa_gpt4.py b/app/llava/eval/llava_eval/eval_science_qa_gpt4.py new file mode 100644 index 0000000000000000000000000000000000000000..c2ff17c915481fb556aba6ec816a9e08f519c515 --- /dev/null +++ b/app/llava/eval/llava_eval/eval_science_qa_gpt4.py @@ 
-0,0 +1,104 @@ +import argparse +import json +import os +import re +import random +from collections import defaultdict + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument('--base-dir', type=str) + parser.add_argument('--gpt4-result', type=str) + parser.add_argument('--our-result', type=str) + parser.add_argument('--split', type=str, default='test') + parser.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"]) + return parser.parse_args() + + +def convert_caps(results): + fakecaps = [] + for result in results: + image_id = result['question_id'] + caption = result['text'] + fakecaps.append({"image_id": int(image_id), "caption": caption}) + return fakecaps + + +def get_pred_idx(prediction, choices, options): + """ + Get the index (e.g. 2) from the prediction (e.g. 'C') + """ + if prediction in options[:len(choices)]: + return options.index(prediction) + else: + return random.choice(range(len(choices))) + + +if __name__ == "__main__": + args = get_args() + + base_dir = args.base_dir + split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[args.split] + problems = json.load(open(os.path.join(base_dir, "problems.json"))) + our_predictions = [json.loads(line) for line in open(args.our_result)] + our_predictions = {pred['question_id']: pred for pred in our_predictions} + split_problems = {idx: problems[idx] for idx in split_indices} + + gpt4_predictions = json.load(open(args.gpt4_result))['outputs'] + + results = defaultdict(lambda: 0) + + for prob_id, prob in split_problems.items(): + if prob_id not in our_predictions: + continue + if prob_id not in gpt4_predictions: + continue + our_pred = our_predictions[prob_id]['text'] + gpt4_pred = gpt4_predictions[prob_id] + + pattern = re.compile(r'The answer is ([A-Z]).') + our_res = pattern.findall(our_pred) + if len(our_res) == 1: + our_answer = our_res[0] # 'A', 'B', ... 
+ else: + our_answer = "FAILED" + gpt4_res = pattern.findall(gpt4_pred) + if len(gpt4_res) == 1: + gpt4_answer = gpt4_res[0] # 'A', 'B', ... + else: + gpt4_answer = "FAILED" + + our_pred_idx = get_pred_idx(our_answer, prob['choices'], args.options) + gpt4_pred_idx = get_pred_idx(gpt4_answer, prob['choices'], args.options) + + if gpt4_answer == 'FAILED': + results['gpt4_failed'] += 1 + # continue + gpt4_pred_idx = our_pred_idx + # if our_pred_idx != prob['answer']: + # print(our_predictions[prob_id]['prompt']) + # print('-----------------') + # print(f'LECTURE: {prob["lecture"]}') + # print(f'SOLUTION: {prob["solution"]}') + # print('=====================') + else: + # continue + pass + # gpt4_pred_idx = our_pred_idx + + if gpt4_pred_idx == prob['answer']: + results['correct'] += 1 + else: + results['incorrect'] += 1 + + + if gpt4_pred_idx == prob['answer'] or our_pred_idx == prob['answer']: + results['correct_upperbound'] += 1 + + correct = results['correct'] + total = results['correct'] + results['incorrect'] + print(f'Total: {total}, Correct: {correct}, Accuracy: {correct / total * 100:.2f}%') + print(f'Total: {total}, Correct (upper): {results["correct_upperbound"]}, Accuracy: {results["correct_upperbound"] / total * 100:.2f}%') + print(f'Total: {total}, GPT-4 NO-ANS (RANDOM): {results["gpt4_failed"]}, Percentage: {results["gpt4_failed"] / total * 100:.2f}%') + diff --git a/app/llava/eval/llava_eval/eval_science_qa_gpt4_requery.py b/app/llava/eval/llava_eval/eval_science_qa_gpt4_requery.py new file mode 100644 index 0000000000000000000000000000000000000000..698546e995d365d1ccc2c25a87e6c5cd681e6eb6 --- /dev/null +++ b/app/llava/eval/llava_eval/eval_science_qa_gpt4_requery.py @@ -0,0 +1,149 @@ +import argparse +import json +import os +import re +import random +from collections import defaultdict + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument('--base-dir', type=str) + parser.add_argument('--gpt4-result', type=str) + 
parser.add_argument('--requery-result', type=str) + parser.add_argument('--our-result', type=str) + parser.add_argument('--output-result', type=str) + parser.add_argument('--split', type=str, default='test') + parser.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"]) + return parser.parse_args() + + +def convert_caps(results): + fakecaps = [] + for result in results: + image_id = result['question_id'] + caption = result['text'] + fakecaps.append({"image_id": int(image_id), "caption": caption}) + return fakecaps + + +def get_pred_idx(prediction, choices, options): + """ + Get the index (e.g. 2) from the prediction (e.g. 'C') + """ + if prediction in options[:len(choices)]: + return options.index(prediction) + else: + return random.choice(range(len(choices))) + + +if __name__ == "__main__": + args = get_args() + + base_dir = args.base_dir + split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[args.split] + problems = json.load(open(os.path.join(base_dir, "problems.json"))) + our_predictions = [json.loads(line) for line in open(args.our_result)] + our_predictions = {pred['question_id']: pred for pred in our_predictions} + split_problems = {idx: problems[idx] for idx in split_indices} + + requery_predictions = [json.loads(line) for line in open(args.requery_result)] + requery_predictions = {pred['question_id']: pred for pred in requery_predictions} + + gpt4_predictions = json.load(open(args.gpt4_result))['outputs'] + + results = defaultdict(lambda: 0) + + sqa_results = {} + sqa_results['acc'] = None + sqa_results['correct'] = None + sqa_results['count'] = None + sqa_results['results'] = {} + sqa_results['outputs'] = {} + + for prob_id, prob in split_problems.items(): + if prob_id not in our_predictions: + assert False + if prob_id not in gpt4_predictions: + assert False + our_pred = our_predictions[prob_id]['text'] + gpt4_pred = gpt4_predictions[prob_id] + if prob_id not in requery_predictions: + results['missing_requery'] += 
1
+            requery_pred = "MISSING"
+        else:
+            requery_pred = requery_predictions[prob_id]['text']
+
+        pattern = re.compile(r'The answer is ([A-Z]).')
+        our_res = pattern.findall(our_pred)
+        if len(our_res) == 1:
+            our_answer = our_res[0]  # 'A', 'B', ...
+        else:
+            our_answer = "FAILED"
+
+        requery_res = pattern.findall(requery_pred)
+        if len(requery_res) == 1:
+            requery_answer = requery_res[0]  # 'A', 'B', ...
+        else:
+            requery_answer = "FAILED"
+
+        gpt4_res = pattern.findall(gpt4_pred)
+        if len(gpt4_res) == 1:
+            gpt4_answer = gpt4_res[0]  # 'A', 'B', ...
+        else:
+            gpt4_answer = "FAILED"
+
+        our_pred_idx = get_pred_idx(our_answer, prob['choices'], args.options)
+        gpt4_pred_idx = get_pred_idx(gpt4_answer, prob['choices'], args.options)
+        requery_pred_idx = get_pred_idx(requery_answer, prob['choices'], args.options)
+
+        results['total'] += 1
+
+        if gpt4_answer == 'FAILED':
+            results['gpt4_failed'] += 1
+            if gpt4_pred_idx == prob['answer']:
+                results['gpt4_correct'] += 1
+            if our_pred_idx == prob['answer']:
+                results['gpt4_ourvisual_correct'] += 1
+        elif gpt4_pred_idx == prob['answer']:
+            results['gpt4_correct'] += 1
+            results['gpt4_ourvisual_correct'] += 1
+
+        if our_pred_idx == prob['answer']:
+            results['our_correct'] += 1
+
+        if requery_answer == 'FAILED':
+            sqa_results['results'][prob_id] = our_pred_idx
+            if our_pred_idx == prob['answer']:
+                results['requery_correct'] += 1
+        else:
+            sqa_results['results'][prob_id] = requery_pred_idx
+            if requery_pred_idx == prob['answer']:
+                results['requery_correct'] += 1
+            else:
+                print(f"""
+Question ({args.options[prob['answer']]}): {our_predictions[prob_id]['prompt']}
+Our ({our_answer}): {our_pred}
+GPT-4 ({gpt4_answer}): {gpt4_pred}
+Requery ({requery_answer}): {requery_pred}
+=====================================
+""")
+
+        if gpt4_pred_idx == prob['answer'] or our_pred_idx == prob['answer']:
+            results['correct_upperbound'] += 1
+
+    total = results['total']
+    print(f'Total: {total}, Our-Correct: {results["our_correct"]}, 
Accuracy: {results["our_correct"] / total * 100:.2f}%') + print(f'Total: {total}, GPT-4-Correct: {results["gpt4_correct"]}, Accuracy: {results["gpt4_correct"] / total * 100:.2f}%') + print(f'Total: {total}, GPT-4 NO-ANS (RANDOM): {results["gpt4_failed"]}, Percentage: {results["gpt4_failed"] / total * 100:.2f}%') + print(f'Total: {total}, GPT-4-OursVisual-Correct: {results["gpt4_ourvisual_correct"]}, Accuracy: {results["gpt4_ourvisual_correct"] / total * 100:.2f}%') + print(f'Total: {total}, Requery-Correct: {results["requery_correct"]}, Accuracy: {results["requery_correct"] / total * 100:.2f}%') + print(f'Total: {total}, Correct upper: {results["correct_upperbound"]}, Accuracy: {results["correct_upperbound"] / total * 100:.2f}%') + + sqa_results['acc'] = results["requery_correct"] / total * 100 + sqa_results['correct'] = results["requery_correct"] + sqa_results['count'] = total + + with open(args.output_result, 'w') as f: + json.dump(sqa_results, f, indent=2) + diff --git a/app/llava/eval/llava_eval/eval_textvqa.py b/app/llava/eval/llava_eval/eval_textvqa.py new file mode 100644 index 0000000000000000000000000000000000000000..468f4bb120448a036bd5b5c7955464fe2e13892a --- /dev/null +++ b/app/llava/eval/llava_eval/eval_textvqa.py @@ -0,0 +1,65 @@ +import os +import argparse +import json +import re + +from llava.eval.m4c_evaluator import TextVQAAccuracyEvaluator + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument('--annotation-file', type=str) + parser.add_argument('--result-file', type=str) + parser.add_argument('--result-dir', type=str) + return parser.parse_args() + + +def prompt_processor(prompt): + if prompt.startswith('OCR tokens: '): + pattern = r"Question: (.*?) 
Short answer:" + match = re.search(pattern, prompt, re.DOTALL) + question = match.group(1) + elif 'Reference OCR token: ' in prompt and len(prompt.split('\n')) == 3: + if prompt.startswith('Reference OCR token:'): + question = prompt.split('\n')[1] + else: + question = prompt.split('\n')[0] + elif len(prompt.split('\n')) == 2: + question = prompt.split('\n')[0] + else: + assert False + + return question.lower() + + +def eval_single(annotation_file, result_file): + experiment_name = os.path.splitext(os.path.basename(result_file))[0] + print(experiment_name) + annotations = json.load(open(annotation_file))['data'] + annotations = {(annotation['image_id'], annotation['question'].lower()): annotation for annotation in annotations} + results = [json.loads(line) for line in open(result_file)] + + pred_list = [] + for result in results: + annotation = annotations[(result['question_id'], prompt_processor(result['prompt']))] + pred_list.append({ + "pred_answer": result['text'], + "gt_answers": annotation['answers'], + }) + + evaluator = TextVQAAccuracyEvaluator() + print('Samples: {}\nAccuracy: {:.2f}%\n'.format(len(pred_list), 100. 
* evaluator.eval_pred_list(pred_list))) + + +if __name__ == "__main__": + args = get_args() + + if args.result_file is not None: + eval_single(args.annotation_file, args.result_file) + + if args.result_dir is not None: + for result_file in sorted(os.listdir(args.result_dir)): + if not result_file.endswith('.jsonl'): + print(f'Skipping {result_file}') + continue + eval_single(args.annotation_file, os.path.join(args.result_dir, result_file)) diff --git a/app/llava/eval/llava_eval/generate_webpage_data_from_table.py b/app/llava/eval/llava_eval/generate_webpage_data_from_table.py new file mode 100644 index 0000000000000000000000000000000000000000..92602258ccd953a1d7137056aaf15c8de8166e21 --- /dev/null +++ b/app/llava/eval/llava_eval/generate_webpage_data_from_table.py @@ -0,0 +1,111 @@ +"""Generate json file for webpage.""" +import json +import os +import re + +# models = ['llama', 'alpaca', 'gpt35', 'bard'] +models = ['vicuna'] + + +def read_jsonl(path: str, key: str=None): + data = [] + with open(os.path.expanduser(path)) as f: + for line in f: + if not line: + continue + data.append(json.loads(line)) + if key is not None: + data.sort(key=lambda x: x[key]) + data = {item[key]: item for item in data} + return data + + +def trim_hanging_lines(s: str, n: int) -> str: + s = s.strip() + for _ in range(n): + s = s.split('\n', 1)[1].strip() + return s + + +if __name__ == '__main__': + questions = read_jsonl('table/question.jsonl', key='question_id') + + # alpaca_answers = read_jsonl('table/answer/answer_alpaca-13b.jsonl', key='question_id') + # bard_answers = read_jsonl('table/answer/answer_bard.jsonl', key='question_id') + # gpt35_answers = read_jsonl('table/answer/answer_gpt35.jsonl', key='question_id') + # llama_answers = read_jsonl('table/answer/answer_llama-13b.jsonl', key='question_id') + vicuna_answers = read_jsonl('table/answer/answer_vicuna-13b.jsonl', key='question_id') + ours_answers = read_jsonl('table/results/llama-13b-hf-alpaca.jsonl', key='question_id') + + 
review_vicuna = read_jsonl('table/review/review_vicuna-13b_llama-13b-hf-alpaca.jsonl', key='question_id') + # review_alpaca = read_jsonl('table/review/review_alpaca-13b_vicuna-13b.jsonl', key='question_id') + # review_bard = read_jsonl('table/review/review_bard_vicuna-13b.jsonl', key='question_id') + # review_gpt35 = read_jsonl('table/review/review_gpt35_vicuna-13b.jsonl', key='question_id') + # review_llama = read_jsonl('table/review/review_llama-13b_vicuna-13b.jsonl', key='question_id') + + records = [] + for qid in questions.keys(): + r = { + 'id': qid, + 'category': questions[qid]['category'], + 'question': questions[qid]['text'], + 'answers': { + # 'alpaca': alpaca_answers[qid]['text'], + # 'llama': llama_answers[qid]['text'], + # 'bard': bard_answers[qid]['text'], + # 'gpt35': gpt35_answers[qid]['text'], + 'vicuna': vicuna_answers[qid]['text'], + 'ours': ours_answers[qid]['text'], + }, + 'evaluations': { + # 'alpaca': review_alpaca[qid]['text'], + # 'llama': review_llama[qid]['text'], + # 'bard': review_bard[qid]['text'], + 'vicuna': review_vicuna[qid]['content'], + # 'gpt35': review_gpt35[qid]['text'], + }, + 'scores': { + 'vicuna': review_vicuna[qid]['tuple'], + # 'alpaca': review_alpaca[qid]['score'], + # 'llama': review_llama[qid]['score'], + # 'bard': review_bard[qid]['score'], + # 'gpt35': review_gpt35[qid]['score'], + }, + } + + # cleanup data + cleaned_evals = {} + for k, v in r['evaluations'].items(): + v = v.strip() + lines = v.split('\n') + # trim the first line if it's a pair of numbers + if re.match(r'\d+[, ]+\d+', lines[0]): + lines = lines[1:] + v = '\n'.join(lines) + cleaned_evals[k] = v.replace('Assistant 1', "**Assistant 1**").replace('Assistant 2', '**Assistant 2**') + + r['evaluations'] = cleaned_evals + records.append(r) + + # Reorder the records, this is optional + for r in records: + if r['id'] <= 20: + r['id'] += 60 + else: + r['id'] -= 20 + for r in records: + if r['id'] <= 50: + r['id'] += 10 + elif 50 < r['id'] <= 60: + r['id'] -= 
50 + for r in records: + if r['id'] == 7: + r['id'] = 1 + elif r['id'] < 7: + r['id'] += 1 + + records.sort(key=lambda x: x['id']) + + # Write to file + with open('webpage/data.json', 'w') as f: + json.dump({'questions': records, 'models': models}, f, indent=2) diff --git a/app/llava/eval/llava_eval/m4c_evaluator.py b/app/llava/eval/llava_eval/m4c_evaluator.py new file mode 100644 index 0000000000000000000000000000000000000000..e30e958da061a4f0a0bfe34b12d2fcaeba7ff2f4 --- /dev/null +++ b/app/llava/eval/llava_eval/m4c_evaluator.py @@ -0,0 +1,334 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +import re + +from tqdm import tqdm + + +class EvalAIAnswerProcessor: + """ + Processes an answer similar to Eval AI + copied from + https://github.com/facebookresearch/mmf/blob/c46b3b3391275b4181567db80943473a89ab98ab/pythia/tasks/processors.py#L897 + """ + + CONTRACTIONS = { + "aint": "ain't", + "arent": "aren't", + "cant": "can't", + "couldve": "could've", + "couldnt": "couldn't", + "couldn'tve": "couldn't've", + "couldnt've": "couldn't've", + "didnt": "didn't", + "doesnt": "doesn't", + "dont": "don't", + "hadnt": "hadn't", + "hadnt've": "hadn't've", + "hadn'tve": "hadn't've", + "hasnt": "hasn't", + "havent": "haven't", + "hed": "he'd", + "hed've": "he'd've", + "he'dve": "he'd've", + "hes": "he's", + "howd": "how'd", + "howll": "how'll", + "hows": "how's", + "Id've": "I'd've", + "I'dve": "I'd've", + "Im": "I'm", + "Ive": "I've", + "isnt": "isn't", + "itd": "it'd", + "itd've": "it'd've", + "it'dve": "it'd've", + "itll": "it'll", + "let's": "let's", + "maam": "ma'am", + "mightnt": "mightn't", + "mightnt've": "mightn't've", + "mightn'tve": "mightn't've", + "mightve": "might've", + "mustnt": "mustn't", + "mustve": "must've", + "neednt": "needn't", + "notve": "not've", + "oclock": "o'clock", + "oughtnt": "oughtn't", + "ow's'at": "'ow's'at", + "'ows'at": "'ow's'at", + "'ow'sat": "'ow's'at", + "shant": "shan't", + "shed've": "she'd've", + "she'dve": "she'd've", + "she's": 
"she's", + "shouldve": "should've", + "shouldnt": "shouldn't", + "shouldnt've": "shouldn't've", + "shouldn'tve": "shouldn't've", + "somebody'd": "somebodyd", + "somebodyd've": "somebody'd've", + "somebody'dve": "somebody'd've", + "somebodyll": "somebody'll", + "somebodys": "somebody's", + "someoned": "someone'd", + "someoned've": "someone'd've", + "someone'dve": "someone'd've", + "someonell": "someone'll", + "someones": "someone's", + "somethingd": "something'd", + "somethingd've": "something'd've", + "something'dve": "something'd've", + "somethingll": "something'll", + "thats": "that's", + "thered": "there'd", + "thered've": "there'd've", + "there'dve": "there'd've", + "therere": "there're", + "theres": "there's", + "theyd": "they'd", + "theyd've": "they'd've", + "they'dve": "they'd've", + "theyll": "they'll", + "theyre": "they're", + "theyve": "they've", + "twas": "'twas", + "wasnt": "wasn't", + "wed've": "we'd've", + "we'dve": "we'd've", + "weve": "we've", + "werent": "weren't", + "whatll": "what'll", + "whatre": "what're", + "whats": "what's", + "whatve": "what've", + "whens": "when's", + "whered": "where'd", + "wheres": "where's", + "whereve": "where've", + "whod": "who'd", + "whod've": "who'd've", + "who'dve": "who'd've", + "wholl": "who'll", + "whos": "who's", + "whove": "who've", + "whyll": "why'll", + "whyre": "why're", + "whys": "why's", + "wont": "won't", + "wouldve": "would've", + "wouldnt": "wouldn't", + "wouldnt've": "wouldn't've", + "wouldn'tve": "wouldn't've", + "yall": "y'all", + "yall'll": "y'all'll", + "y'allll": "y'all'll", + "yall'd've": "y'all'd've", + "y'alld've": "y'all'd've", + "y'all'dve": "y'all'd've", + "youd": "you'd", + "youd've": "you'd've", + "you'dve": "you'd've", + "youll": "you'll", + "youre": "you're", + "youve": "you've", + } + + NUMBER_MAP = { + "none": "0", + "zero": "0", + "one": "1", + "two": "2", + "three": "3", + "four": "4", + "five": "5", + "six": "6", + "seven": "7", + "eight": "8", + "nine": "9", + "ten": "10", + } + 
ARTICLES = ["a", "an", "the"]
+    PERIOD_STRIP = re.compile(r"(?!<=\d)(\.)(?!\d)")
+    COMMA_STRIP = re.compile(r"(?<=\d)(\,)+(?=\d)")
+    PUNCTUATIONS = [
+        ";",
+        r"/",
+        "[",
+        "]",
+        '"',
+        "{",
+        "}",
+        "(",
+        ")",
+        "=",
+        "+",
+        "\\",
+        "_",
+        "-",
+        ">",
+        "<",
+        "@",
+        "`",
+        ",",
+        "?",
+        "!",
+    ]
+
+    def __init__(self, *args, **kwargs):
+        pass
+
+    def word_tokenize(self, word):
+        word = word.lower()
+        word = word.replace(",", "").replace("?", "").replace("'s", " 's")
+        return word.strip()
+
+    def process_punctuation(self, in_text):
+        out_text = in_text
+        for p in self.PUNCTUATIONS:
+            if (p + " " in in_text or " " + p in in_text) or (
+                re.search(self.COMMA_STRIP, in_text) is not None
+            ):
+                out_text = out_text.replace(p, "")
+            else:
+                out_text = out_text.replace(p, " ")
+        out_text = self.PERIOD_STRIP.sub("", out_text)
+        return out_text
+
+    def process_digit_article(self, in_text):
+        out_text = []
+        temp_text = in_text.lower().split()
+        for word in temp_text:
+            word = self.NUMBER_MAP.get(word, word)
+            if word not in self.ARTICLES:
+                out_text.append(word)
+            else:
+                pass
+        for word_id, word in enumerate(out_text):
+            if word in self.CONTRACTIONS:
+                out_text[word_id] = self.CONTRACTIONS[word]
+        out_text = " ".join(out_text)
+        return out_text
+
+    def __call__(self, item):
+        item = self.word_tokenize(item)
+        item = item.replace("\n", " ").replace("\t", " ").strip()
+        item = self.process_punctuation(item)
+        item = self.process_digit_article(item)
+        return item
+
+
+class TextVQAAccuracyEvaluator:
+    def __init__(self):
+        self.answer_processor = EvalAIAnswerProcessor()
+
+    def _compute_answer_scores(self, raw_answers):
+        """
+        compute the accuracy (soft score) of human answers
+        """
+        answers = [self.answer_processor(a) for a in raw_answers]
+        assert len(answers) == 10
+        gt_answers = list(enumerate(answers))
+        unique_answers = set(answers)
+        unique_answer_scores = {}
+
+        for unique_answer in unique_answers:
+            accs = []
+            for gt_answer in gt_answers:
+                
other_answers = [item for item in gt_answers if item != gt_answer] + matching_answers = [ + item for item in other_answers if item[1] == unique_answer + ] + acc = min(1, float(len(matching_answers)) / 3) + accs.append(acc) + unique_answer_scores[unique_answer] = sum(accs) / len(accs) + + return unique_answer_scores + + def eval_pred_list(self, pred_list): + pred_scores = [] + for entry in tqdm(pred_list): + pred_answer = self.answer_processor(entry["pred_answer"]) + unique_answer_scores = self._compute_answer_scores(entry["gt_answers"]) + score = unique_answer_scores.get(pred_answer, 0.0) + pred_scores.append(score) + + accuracy = sum(pred_scores) / len(pred_scores) + return accuracy + + +class STVQAAccuracyEvaluator: + def __init__(self): + self.answer_processor = EvalAIAnswerProcessor() + + def eval_pred_list(self, pred_list): + pred_scores = [] + for entry in pred_list: + pred_answer = self.answer_processor(entry["pred_answer"]) + gts = [self.answer_processor(a) for a in entry["gt_answers"]] + score = 1.0 if pred_answer in gts else 0.0 + pred_scores.append(score) + + accuracy = sum(pred_scores) / len(pred_scores) + return accuracy + + +class STVQAANLSEvaluator: + def __init__(self): + import editdistance # install with `pip install editdistance` + + self.get_edit_distance = editdistance.eval + + def get_anls(self, s1, s2): + s1 = s1.lower().strip() + s2 = s2.lower().strip() + iou = 1 - self.get_edit_distance(s1, s2) / max(len(s1), len(s2)) + anls = iou if iou >= 0.5 else 0.0 + return anls + + def eval_pred_list(self, pred_list): + pred_scores = [] + for entry in pred_list: + anls = max( + self.get_anls(entry["pred_answer"], gt) for gt in entry["gt_answers"] + ) + pred_scores.append(anls) + + accuracy = sum(pred_scores) / len(pred_scores) + return accuracy + + +class TextCapsBleu4Evaluator: + def __init__(self): + # The following script requires Java 1.8.0 and pycocotools installed. 
+ # The pycocoevalcap can be installed with pip as + # pip install git+https://github.com/ronghanghu/coco-caption.git@python23 + # Original pycocoevalcap code is at https://github.com/tylin/coco-caption + # but has no python3 support yet. + try: + from pycocoevalcap.bleu.bleu import Bleu + from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer + except ModuleNotFoundError: + print( + "Please install pycocoevalcap module using " + "pip install git+https://github.com/ronghanghu/coco-caption.git@python23" # noqa + ) + raise + + self.tokenizer = PTBTokenizer() + self.scorer = Bleu(4) + + def eval_pred_list(self, pred_list): + # Create reference and hypotheses captions. + gts = {} + res = {} + for idx, entry in enumerate(pred_list): + gts[idx] = [{"caption": a} for a in entry["gt_answers"]] + res[idx] = [{"caption": entry["pred_answer"]}] + + gts = self.tokenizer.tokenize(gts) + res = self.tokenizer.tokenize(res) + score, _ = self.scorer.compute_score(gts, res) + + bleu4 = score[3] # score is (Bleu-1, Bleu-2, Bleu-3, Bleu-4) + return bleu4 diff --git a/app/llava/eval/llava_eval/model_qa.py b/app/llava/eval/llava_eval/model_qa.py new file mode 100644 index 0000000000000000000000000000000000000000..2e254da152ac644ff54fb5fa57e625d9e6ba31d1 --- /dev/null +++ b/app/llava/eval/llava_eval/model_qa.py @@ -0,0 +1,64 @@ +import argparse +from transformers import AutoTokenizer, AutoModelForCausalLM, StoppingCriteria +import torch +import os +import json +from tqdm import tqdm +import shortuuid + +from llava.conversation import default_conversation +from llava.utils import disable_torch_init + + +@torch.inference_mode() +def eval_model(model_name, questions_file, answers_file): + # Model + disable_torch_init() + model_name = os.path.expanduser(model_name) + tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False) + model = AutoModelForCausalLM.from_pretrained(model_name, + torch_dtype=torch.float16).cuda() + + + ques_file = 
open(os.path.expanduser(questions_file), "r") + ans_file = open(os.path.expanduser(answers_file), "w") + for i, line in enumerate(tqdm(ques_file)): + idx = json.loads(line)["question_id"] + qs = json.loads(line)["text"] + cat = json.loads(line)["category"] + conv = default_conversation.copy() + conv.append_message(conv.roles[0], qs) + prompt = conv.get_prompt() + inputs = tokenizer([prompt]) + input_ids = torch.as_tensor(inputs.input_ids).cuda() + output_ids = model.generate( + input_ids, + do_sample=True, + use_cache=True, + temperature=0.7, + max_new_tokens=1024,) + outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0] + try: + index = outputs.index(conv.sep, len(prompt)) + except ValueError: + outputs += conv.sep + index = outputs.index(conv.sep, len(prompt)) + + outputs = outputs[len(prompt) + len(conv.roles[1]) + 2:index].strip() + ans_id = shortuuid.uuid() + ans_file.write(json.dumps({"question_id": idx, + "text": outputs, + "answer_id": ans_id, + "model_id": model_name, + "metadata": {}}) + "\n") + ans_file.flush() + ans_file.close() + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--model-name", type=str, default="facebook/opt-350m") + parser.add_argument("--question-file", type=str, default="tables/question.jsonl") + parser.add_argument("--answers-file", type=str, default="answer.jsonl") + args = parser.parse_args() + + eval_model(args.model_name, args.question_file, args.answers_file) diff --git a/app/llava/eval/llava_eval/model_vqa.py b/app/llava/eval/llava_eval/model_vqa.py new file mode 100644 index 0000000000000000000000000000000000000000..8421a6103933a3e4e9eda8a35f23ea14f6049688 --- /dev/null +++ b/app/llava/eval/llava_eval/model_vqa.py @@ -0,0 +1,101 @@ +import argparse +import torch +import os +import json +from tqdm import tqdm +import shortuuid + +from llava.constants import MM_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN +from llava.conversation import 
conv_templates, SeparatorStyle +from llava.model.builder import load_pretrained_model +from llava.utils import disable_torch_init +from llava.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path + +from PIL import Image +import math + + +def split_list(lst, n): + """Split a list into n (roughly) equal-sized chunks""" + chunk_size = math.ceil(len(lst) / n) # integer division + return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)] + + +def get_chunk(lst, n, k): + chunks = split_list(lst, n) + return chunks[k] + + +def eval_model(args): + # Model + disable_torch_init() + model_path = os.path.expanduser(args.model_path) + model_name = get_model_name_from_path(model_path) + tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name) + + questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")] + questions = get_chunk(questions, args.num_chunks, args.chunk_idx) + answers_file = os.path.expanduser(args.answers_file) + os.makedirs(os.path.dirname(answers_file), exist_ok=True) + ans_file = open(answers_file, "w") + for line in tqdm(questions): + idx = line["question_id"] + image_file = line["image"] + qs = line["text"] + cur_prompt = qs + if model.config.mm_use_start_end: + qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs + else: + qs = DEFAULT_IMAGE_TOKEN + '\n' + qs + + conv = conv_templates[args.conv_mode].copy() + conv.append_message(conv.roles[0], qs) + conv.append_message(conv.roles[1], None) + prompt = conv.get_prompt() + + input_ids = tokenizer_image_token(prompt, tokenizer, MM_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() + + image = Image.open(os.path.join(args.image_folder, image_file)).convert('RGB') + image_tensor = process_images([image], image_processor, model.config)[0] + + with torch.inference_mode(): + output_ids = model.generate( + input_ids, + images=image_tensor.unsqueeze(0).half().cuda(), 
+ image_sizes=[image.size], + do_sample=True if args.temperature > 0 else False, + temperature=args.temperature, + top_p=args.top_p, + num_beams=args.num_beams, + # no_repeat_ngram_size=3, + max_new_tokens=1024, + use_cache=True) + + outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip() + + ans_id = shortuuid.uuid() + ans_file.write(json.dumps({"question_id": idx, + "prompt": cur_prompt, + "text": outputs, + "answer_id": ans_id, + "model_id": model_name, + "metadata": {}}) + "\n") + ans_file.flush() + ans_file.close() + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--model-path", type=str, default="facebook/opt-350m") + parser.add_argument("--model-base", type=str, default=None) + parser.add_argument("--image-folder", type=str, default="") + parser.add_argument("--question-file", type=str, default="tables/question.jsonl") + parser.add_argument("--answers-file", type=str, default="answer.jsonl") + parser.add_argument("--conv-mode", type=str, default="llava_v1") + parser.add_argument("--num-chunks", type=int, default=1) + parser.add_argument("--chunk-idx", type=int, default=0) + parser.add_argument("--temperature", type=float, default=0.2) + parser.add_argument("--top_p", type=float, default=None) + parser.add_argument("--num_beams", type=int, default=1) + args = parser.parse_args() + + eval_model(args) diff --git a/app/llava/eval/llava_eval/model_vqa_loader.py b/app/llava/eval/llava_eval/model_vqa_loader.py new file mode 100644 index 0000000000000000000000000000000000000000..96128198b97d9c2e4f8baef2342469983a658c8c --- /dev/null +++ b/app/llava/eval/llava_eval/model_vqa_loader.py @@ -0,0 +1,144 @@ +import argparse +import torch +import os +import json +from tqdm import tqdm +import shortuuid + +from llava.constants import MM_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN +from llava.conversation import conv_templates, SeparatorStyle +from llava.model.builder import 
load_pretrained_model +from llava.utils import disable_torch_init +from llava.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path +from torch.utils.data import Dataset, DataLoader + +from PIL import Image +import math + + +def split_list(lst, n): + """Split a list into n (roughly) equal-sized chunks""" + chunk_size = math.ceil(len(lst) / n) # integer division + return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)] + + +def get_chunk(lst, n, k): + chunks = split_list(lst, n) + return chunks[k] + + +# Custom dataset class +class CustomDataset(Dataset): + def __init__(self, questions, image_folder, tokenizer, image_processor, model_config): + self.questions = questions + self.image_folder = image_folder + self.tokenizer = tokenizer + self.image_processor = image_processor + self.model_config = model_config + + def __getitem__(self, index): + line = self.questions[index] + image_file = line["image"] + qs = line["text"] + if self.model_config.mm_use_start_end: + qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs + else: + qs = DEFAULT_IMAGE_TOKEN + '\n' + qs + + conv = conv_templates[args.conv_mode].copy() + conv.append_message(conv.roles[0], qs) + conv.append_message(conv.roles[1], None) + prompt = conv.get_prompt() + + image = Image.open(os.path.join(self.image_folder, image_file)).convert('RGB') + image_tensor = process_images([image], self.image_processor, self.model_config)[0] + + input_ids = tokenizer_image_token(prompt, self.tokenizer, MM_TOKEN_INDEX, return_tensors='pt') + + return input_ids, image_tensor, image.size + + def __len__(self): + return len(self.questions) + + +def collate_fn(batch): + input_ids, image_tensors, image_sizes = zip(*batch) + input_ids = torch.stack(input_ids, dim=0) + image_tensors = torch.stack(image_tensors, dim=0) + return input_ids, image_tensors, image_sizes + + +# DataLoader +def create_data_loader(questions, image_folder, tokenizer, image_processor, 
model_config, batch_size=1, num_workers=4): + assert batch_size == 1, "batch_size must be 1" + dataset = CustomDataset(questions, image_folder, tokenizer, image_processor, model_config) + data_loader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False, collate_fn=collate_fn) + return data_loader + + +def eval_model(args): + # Model + disable_torch_init() + model_path = os.path.expanduser(args.model_path) + model_name = get_model_name_from_path(model_path) + tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name) + + questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")] + questions = get_chunk(questions, args.num_chunks, args.chunk_idx) + answers_file = os.path.expanduser(args.answers_file) + os.makedirs(os.path.dirname(answers_file), exist_ok=True) + ans_file = open(answers_file, "w") + + if 'plain' in model_name and 'finetune' not in model_name.lower() and 'mmtag' not in args.conv_mode: + args.conv_mode = args.conv_mode + '_mmtag' + print(f'It seems that this is a plain model, but it is not using a mmtag prompt, auto switching to {args.conv_mode}.') + + data_loader = create_data_loader(questions, args.image_folder, tokenizer, image_processor, model.config) + + for (input_ids, image_tensor, image_sizes), line in tqdm(zip(data_loader, questions), total=len(questions)): + idx = line["question_id"] + cur_prompt = line["text"] + + input_ids = input_ids.to(device='cuda', non_blocking=True) + + with torch.inference_mode(): + output_ids = model.generate( + input_ids, + images=image_tensor.to(dtype=torch.float16, device='cuda', non_blocking=True), + image_sizes=image_sizes, + do_sample=True if args.temperature > 0 else False, + temperature=args.temperature, + top_p=args.top_p, + num_beams=args.num_beams, + max_new_tokens=args.max_new_tokens, + use_cache=True) + + outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip() + + ans_id 
= shortuuid.uuid() + ans_file.write(json.dumps({"question_id": idx, + "prompt": cur_prompt, + "text": outputs, + "answer_id": ans_id, + "model_id": model_name, + "metadata": {}}) + "\n") + # ans_file.flush() + ans_file.close() + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--model-path", type=str, default="facebook/opt-350m") + parser.add_argument("--model-base", type=str, default=None) + parser.add_argument("--image-folder", type=str, default="") + parser.add_argument("--question-file", type=str, default="tables/question.jsonl") + parser.add_argument("--answers-file", type=str, default="answer.jsonl") + parser.add_argument("--conv-mode", type=str, default="llava_v1") + parser.add_argument("--num-chunks", type=int, default=1) + parser.add_argument("--chunk-idx", type=int, default=0) + parser.add_argument("--temperature", type=float, default=0.2) + parser.add_argument("--top_p", type=float, default=None) + parser.add_argument("--num_beams", type=int, default=1) + parser.add_argument("--max_new_tokens", type=int, default=128) + args = parser.parse_args() + + eval_model(args) diff --git a/app/llava/eval/llava_eval/model_vqa_mmbench.py b/app/llava/eval/llava_eval/model_vqa_mmbench.py new file mode 100644 index 0000000000000000000000000000000000000000..525fb9584ebba2e689505e9c0fc9071af884088e --- /dev/null +++ b/app/llava/eval/llava_eval/model_vqa_mmbench.py @@ -0,0 +1,160 @@ +import argparse +import torch +import os +import json +import pandas as pd +from tqdm import tqdm +import shortuuid + +from llava.constants import MM_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN +from llava.conversation import conv_templates, SeparatorStyle +from llava.model.builder import load_pretrained_model +from llava.utils import disable_torch_init +from llava.mm_utils import tokenizer_image_token, process_images, load_image_from_base64, get_model_name_from_path + +from PIL import Image +import math + + +all_options 
= ['A', 'B', 'C', 'D'] + + +def split_list(lst, n): + """Split a list into n (roughly) equal-sized chunks""" + chunk_size = math.ceil(len(lst) / n) # integer division + return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)] + + +def get_chunk(lst, n, k): + chunks = split_list(lst, n) + return chunks[k] + + +def is_none(value): + if value is None: + return True + if type(value) is float and math.isnan(value): + return True + if type(value) is str and value.lower() == 'nan': + return True + if type(value) is str and value.lower() == 'none': + return True + return False + +def get_options(row, options): + parsed_options = [] + for option in options: + option_value = row[option] + if is_none(option_value): + break + parsed_options.append(option_value) + return parsed_options + + +def eval_model(args): + # Model + disable_torch_init() + model_path = os.path.expanduser(args.model_path) + model_name = get_model_name_from_path(model_path) + tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name) + + questions = pd.read_table(os.path.expanduser(args.question_file)) + questions = get_chunk(questions, args.num_chunks, args.chunk_idx) + answers_file = os.path.expanduser(args.answers_file) + os.makedirs(os.path.dirname(answers_file), exist_ok=True) + ans_file = open(answers_file, "w") + + if 'plain' in model_name and 'finetune' not in model_name.lower() and 'mmtag' not in args.conv_mode: + args.conv_mode = args.conv_mode + '_mmtag' + print(f'It seems that this is a plain model, but it is not using a mmtag prompt, auto switching to {args.conv_mode}.') + + for index, row in tqdm(questions.iterrows(), total=len(questions)): + options = get_options(row, all_options) + cur_option_char = all_options[:len(options)] + + if args.all_rounds: + num_rounds = len(options) + else: + num_rounds = 1 + + for round_idx in range(num_rounds): + idx = row['index'] + question = row['question'] + hint = row['hint'] + image = 
load_image_from_base64(row['image']) + if not is_none(hint): + question = hint + '\n' + question + for option_char, option in zip(all_options[:len(options)], options): + question = question + '\n' + option_char + '. ' + option + qs = cur_prompt = question + if model.config.mm_use_start_end: + qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs + else: + qs = DEFAULT_IMAGE_TOKEN + '\n' + qs + + if args.single_pred_prompt: + if args.lang == 'cn': + qs = qs + '\n' + "请直接回答选项字母。" + else: + qs = qs + '\n' + "Answer with the option's letter from the given choices directly." + + conv = conv_templates[args.conv_mode].copy() + conv.append_message(conv.roles[0], qs) + conv.append_message(conv.roles[1], None) + prompt = conv.get_prompt() + + input_ids = tokenizer_image_token(prompt, tokenizer, MM_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() + + image_tensor = process_images([image], image_processor, model.config)[0] + + with torch.inference_mode(): + output_ids = model.generate( + input_ids, + images=image_tensor.unsqueeze(0).half().cuda(), + image_sizes=[image.size], + do_sample=True if args.temperature > 0 else False, + temperature=args.temperature, + top_p=args.top_p, + num_beams=args.num_beams, + # no_repeat_ngram_size=3, + max_new_tokens=1024, + use_cache=True) + + outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip() + + ans_id = shortuuid.uuid() + ans_file.write(json.dumps({"question_id": idx, + "round_id": round_idx, + "prompt": cur_prompt, + "text": outputs, + "options": options, + "option_char": cur_option_char, + "answer_id": ans_id, + "model_id": model_name, + "metadata": {}}) + "\n") + ans_file.flush() + + # rotate options + options = options[1:] + options[:1] + cur_option_char = cur_option_char[1:] + cur_option_char[:1] + ans_file.close() + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--model-path", type=str, default="facebook/opt-350m") + 
parser.add_argument("--model-base", type=str, default=None) + parser.add_argument("--image-folder", type=str, default="") + parser.add_argument("--question-file", type=str, default="tables/question.jsonl") + parser.add_argument("--answers-file", type=str, default="answer.jsonl") + parser.add_argument("--conv-mode", type=str, default="llava_v1") + parser.add_argument("--num-chunks", type=int, default=1) + parser.add_argument("--chunk-idx", type=int, default=0) + parser.add_argument("--temperature", type=float, default=0.2) + parser.add_argument("--top_p", type=float, default=None) + parser.add_argument("--num_beams", type=int, default=1) + parser.add_argument("--all-rounds", action="store_true") + parser.add_argument("--single-pred-prompt", action="store_true") + parser.add_argument("--lang", type=str, default="en") + args = parser.parse_args() + + eval_model(args) diff --git a/app/llava/eval/llava_eval/model_vqa_science.py b/app/llava/eval/llava_eval/model_vqa_science.py new file mode 100644 index 0000000000000000000000000000000000000000..6cb259cdedd9254786116f098052336b3d630cfa --- /dev/null +++ b/app/llava/eval/llava_eval/model_vqa_science.py @@ -0,0 +1,111 @@ +import argparse +import torch +import os +import json +from tqdm import tqdm +import shortuuid + +from llava.constants import MM_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN +from llava.conversation import conv_templates, SeparatorStyle +from llava.model.builder import load_pretrained_model +from llava.utils import disable_torch_init +from llava.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path + +from PIL import Image +import math + + +def split_list(lst, n): + """Split a list into n (roughly) equal-sized chunks""" + chunk_size = math.ceil(len(lst) / n) # integer division + return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)] + + +def get_chunk(lst, n, k): + chunks = split_list(lst, n) + return chunks[k] + + +def 
eval_model(args): + # Model + disable_torch_init() + model_path = os.path.expanduser(args.model_path) + model_name = get_model_name_from_path(model_path) + tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name) + + questions = json.load(open(os.path.expanduser(args.question_file), "r")) + questions = get_chunk(questions, args.num_chunks, args.chunk_idx) + answers_file = os.path.expanduser(args.answers_file) + os.makedirs(os.path.dirname(answers_file), exist_ok=True) + ans_file = open(answers_file, "w") + for i, line in enumerate(tqdm(questions)): + idx = line["id"] + question = line['conversations'][0] + qs = question['value'].replace('', '').strip() + cur_prompt = qs + + if 'image' in line: + image_file = line["image"] + image = Image.open(os.path.join(args.image_folder, image_file)) + image_tensor = process_images([image], image_processor, model.config)[0] + images = image_tensor.unsqueeze(0).half().cuda() + image_sizes = [image.size] + if getattr(model.config, 'mm_use_start_end', False): + qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs + else: + qs = DEFAULT_IMAGE_TOKEN + '\n' + qs + cur_prompt = '' + '\n' + cur_prompt + else: + images = None + image_sizes = None + + if args.single_pred_prompt: + qs = qs + '\n' + "Answer with the option's letter from the given choices directly." + cur_prompt = cur_prompt + '\n' + "Answer with the option's letter from the given choices directly." 
+ + conv = conv_templates[args.conv_mode].copy() + conv.append_message(conv.roles[0], qs) + conv.append_message(conv.roles[1], None) + prompt = conv.get_prompt() + + input_ids = tokenizer_image_token(prompt, tokenizer, MM_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() + + with torch.inference_mode(): + output_ids = model.generate( + input_ids, + images=images, + image_sizes=image_sizes, + do_sample=True if args.temperature > 0 else False, + temperature=args.temperature, + max_new_tokens=1024, + use_cache=True, + ) + + outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip() + + ans_id = shortuuid.uuid() + ans_file.write(json.dumps({"question_id": idx, + "prompt": cur_prompt, + "text": outputs, + "answer_id": ans_id, + "model_id": model_name, + "metadata": {}}) + "\n") + ans_file.flush() + ans_file.close() + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--model-path", type=str, default="facebook/opt-350m") + parser.add_argument("--model-base", type=str, default=None) + parser.add_argument("--image-folder", type=str, default="") + parser.add_argument("--question-file", type=str, default="tables/question.json") + parser.add_argument("--answers-file", type=str, default="answer.jsonl") + parser.add_argument("--conv-mode", type=str, default="llava_v0") + parser.add_argument("--num-chunks", type=int, default=1) + parser.add_argument("--chunk-idx", type=int, default=0) + parser.add_argument("--temperature", type=float, default=0.2) + parser.add_argument("--answer-prompter", action="store_true") + parser.add_argument("--single-pred-prompt", action="store_true") + args = parser.parse_args() + + eval_model(args) diff --git a/app/llava/eval/llava_eval/qa_baseline_gpt35.py b/app/llava/eval/llava_eval/qa_baseline_gpt35.py new file mode 100644 index 0000000000000000000000000000000000000000..babab6e12b4bb8cfa74a7edfa5e56cd1b3e2bf6c --- /dev/null +++ b/app/llava/eval/llava_eval/qa_baseline_gpt35.py @@ -0,0 
+1,74 @@ +"""Generate answers with GPT-3.5""" +# Note: you need to be using OpenAI Python v0.27.0 for the code below to work +import argparse +import json +import os +import time +import concurrent.futures + +import openai +import tqdm +import shortuuid + +MODEL = 'gpt-3.5-turbo' +MODEL_ID = 'gpt-3.5-turbo:20230327' + +def get_answer(question_id: int, question: str, max_tokens: int): + ans = { + 'answer_id': shortuuid.uuid(), + 'question_id': question_id, + 'model_id': MODEL_ID, + } + for _ in range(3): + try: + response = openai.ChatCompletion.create( + model=MODEL, + messages=[{ + 'role': 'system', + 'content': 'You are a helpful assistant.' + }, { + 'role': 'user', + 'content': question, + }], + max_tokens=max_tokens, + ) + ans['text'] = response['choices'][0]['message']['content'] + return ans + except Exception as e: + print('[ERROR]', e) + ans['text'] = '#ERROR#' + time.sleep(1) + return ans + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='ChatGPT answer generation.') + parser.add_argument('-q', '--question') + parser.add_argument('-o', '--output') + parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output') + args = parser.parse_args() + + questions_dict = {} + with open(os.path.expanduser(args.question)) as f: + for line in f: + if not line: + continue + q = json.loads(line) + questions_dict[q['question_id']] = q['text'] + + answers = [] + + with concurrent.futures.ThreadPoolExecutor(max_workers=32) as executor: + futures = [] + for qid, question in questions_dict.items(): + future = executor.submit(get_answer, qid, question, args.max_tokens) + futures.append(future) + + for future in tqdm.tqdm(concurrent.futures.as_completed(futures), total=len(futures)): + answers.append(future.result()) + + answers.sort(key=lambda x: x['question_id']) + + with open(os.path.expanduser(args.output), 'w') as f: + table = [json.dumps(ans) for ans in answers] + f.write('\n'.join(table)) 
diff --git a/app/llava/eval/llava_eval/run_llava.py b/app/llava/eval/llava_eval/run_llava.py new file mode 100644 index 0000000000000000000000000000000000000000..2cccecf97845fd7d131f8056e58fb542ccf1e48f --- /dev/null +++ b/app/llava/eval/llava_eval/run_llava.py @@ -0,0 +1,145 @@ +import argparse +import torch + +from llava.constants import ( + MM_TOKEN_INDEX, + DEFAULT_IMAGE_TOKEN, + DEFAULT_IM_START_TOKEN, + DEFAULT_IM_END_TOKEN, + IMAGE_PLACEHOLDER, +) +from llava.conversation import conv_templates, SeparatorStyle +from llava.model.builder import load_pretrained_model +from llava.utils import disable_torch_init +from llava.mm_utils import ( + process_images, + tokenizer_image_token, + get_model_name_from_path, +) + +from PIL import Image + +import requests +from PIL import Image +from io import BytesIO +import re + + +def image_parser(args): + out = args.image_file.split(args.sep) + return out + + +def load_image(image_file): + if image_file.startswith("http") or image_file.startswith("https"): + response = requests.get(image_file) + image = Image.open(BytesIO(response.content)).convert("RGB") + else: + image = Image.open(image_file).convert("RGB") + return image + + +def load_images(image_files): + out = [] + for image_file in image_files: + image = load_image(image_file) + out.append(image) + return out + + +def eval_model(args): + # Model + disable_torch_init() + + model_name = get_model_name_from_path(args.model_path) + tokenizer, model, image_processor, context_len = load_pretrained_model( + args.model_path, args.model_base, model_name + ) + + qs = args.query + image_token_se = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + if IMAGE_PLACEHOLDER in qs: + if model.config.mm_use_start_end: + qs = re.sub(IMAGE_PLACEHOLDER, image_token_se, qs) + else: + qs = re.sub(IMAGE_PLACEHOLDER, DEFAULT_IMAGE_TOKEN, qs) + else: + if model.config.mm_use_start_end: + qs = image_token_se + "\n" + qs + else: + qs = DEFAULT_IMAGE_TOKEN + "\n" + qs + + if 
"llama-2" in model_name.lower(): + conv_mode = "llava_llama_2" + elif "mistral" in model_name.lower(): + conv_mode = "mistral_instruct" + elif "v1.6-34b" in model_name.lower(): + conv_mode = "chatml_direct" + elif "v1" in model_name.lower(): + conv_mode = "llava_v1" + elif "mpt" in model_name.lower(): + conv_mode = "mpt" + else: + conv_mode = "llava_v0" + + if args.conv_mode is not None and conv_mode != args.conv_mode: + print( + "[WARNING] the auto inferred conversation mode is {}, while `--conv-mode` is {}, using {}".format( + conv_mode, args.conv_mode, args.conv_mode + ) + ) + else: + args.conv_mode = conv_mode + + conv = conv_templates[args.conv_mode].copy() + conv.append_message(conv.roles[0], qs) + conv.append_message(conv.roles[1], None) + prompt = conv.get_prompt() + + image_files = image_parser(args) + images = load_images(image_files) + image_sizes = [x.size for x in images] + images_tensor = process_images( + images, + image_processor, + model.config + ).to(model.device, dtype=torch.float16) + + input_ids = ( + tokenizer_image_token(prompt, tokenizer, MM_TOKEN_INDEX, return_tensors="pt") + .unsqueeze(0) + .cuda() + ) + + with torch.inference_mode(): + output_ids = model.generate( + input_ids, + images=images_tensor, + image_sizes=image_sizes, + do_sample=True if args.temperature > 0 else False, + temperature=args.temperature, + top_p=args.top_p, + num_beams=args.num_beams, + max_new_tokens=args.max_new_tokens, + use_cache=True, + ) + + outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip() + print(outputs) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--model-path", type=str, default="facebook/opt-350m") + parser.add_argument("--model-base", type=str, default=None) + parser.add_argument("--image-file", type=str, required=True) + parser.add_argument("--query", type=str, required=True) + parser.add_argument("--conv-mode", type=str, default=None) + parser.add_argument("--sep", 
type=str, default=",") + parser.add_argument("--temperature", type=float, default=0.2) + parser.add_argument("--top_p", type=float, default=None) + parser.add_argument("--num_beams", type=int, default=1) + parser.add_argument("--max_new_tokens", type=int, default=512) + args = parser.parse_args() + + eval_model(args) diff --git a/app/llava/eval/llava_eval/summarize_gpt_review.py b/app/llava/eval/llava_eval/summarize_gpt_review.py new file mode 100644 index 0000000000000000000000000000000000000000..0f796a3880341739677a5fe3bfbcc90515a0f324 --- /dev/null +++ b/app/llava/eval/llava_eval/summarize_gpt_review.py @@ -0,0 +1,60 @@ +import json +import os +from collections import defaultdict + +import numpy as np + +import argparse + +def parse_args(): + parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.') + parser.add_argument('-d', '--dir', default=None) + parser.add_argument('-v', '--version', default=None) + parser.add_argument('-s', '--select', nargs='*', default=None) + parser.add_argument('-f', '--files', nargs='*', default=[]) + parser.add_argument('-i', '--ignore', nargs='*', default=[]) + return parser.parse_args() + + +if __name__ == '__main__': + args = parse_args() + + if args.ignore is not None: + args.ignore = [int(x) for x in args.ignore] + + if len(args.files) > 0: + review_files = args.files + else: + review_files = [x for x in os.listdir(args.dir) if x.endswith('.jsonl') and (x.startswith('gpt4_text') or x.startswith('reviews_') or x.startswith('review_') or 'review' in args.dir)] + + for review_file in sorted(review_files): + config = os.path.basename(review_file).replace('gpt4_text_', '').replace('.jsonl', '') + if args.select is not None and any(x not in config for x in args.select): + continue + if '0613' in config: + version = '0613' + else: + version = '0314' + if args.version is not None and args.version != version: + continue + scores = defaultdict(list) + print(config) + with open(os.path.join(args.dir, review_file) if 
args.dir is not None else review_file) as f: + for review_str in f: + review = json.loads(review_str) + if review['question_id'] in args.ignore: + continue + if 'category' in review: + scores[review['category']].append(review['tuple']) + scores['all'].append(review['tuple']) + else: + if 'tuple' in review: + scores['all'].append(review['tuple']) + else: + scores['all'].append(review['score']) + for k, v in sorted(scores.items()): + stats = np.asarray(v).mean(0).tolist() + stats = [round(x, 3) for x in stats] + # print(k, stats, round(stats[1]/stats[0]*100, 1)) + print(k, round(stats[1]/stats[0]*100, 1), round(stats[0] * 10, 1), round(stats[1] * 10, 1)) + print('=================================') diff --git a/app/llava/eval/llava_eval/webpage/figures/chatgpt.svg b/app/llava/eval/llava_eval/webpage/figures/chatgpt.svg new file mode 100644 index 0000000000000000000000000000000000000000..8147382a3152de03c24b4cd91f9870ced1a95d54 --- /dev/null +++ b/app/llava/eval/llava_eval/webpage/figures/chatgpt.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/app/llava/eval/llava_eval/webpage/figures/swords_FILL0_wght300_GRAD0_opsz48.svg b/app/llava/eval/llava_eval/webpage/figures/swords_FILL0_wght300_GRAD0_opsz48.svg new file mode 100644 index 0000000000000000000000000000000000000000..3bee468d34515fdcbef1a8b8803c9fc4f7dc0b34 --- /dev/null +++ b/app/llava/eval/llava_eval/webpage/figures/swords_FILL0_wght300_GRAD0_opsz48.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/app/llava/eval/llava_eval/webpage/index.html b/app/llava/eval/llava_eval/webpage/index.html new file mode 100644 index 0000000000000000000000000000000000000000..c2e3cf020ba7d8e064f2cd801788a5d2d50b97da --- /dev/null +++ b/app/llava/eval/llava_eval/webpage/index.html @@ -0,0 +1,162 @@ + + + + + + Who's GPT-4's favorite? Battles between State-of-the-Art Chatbots + + + + + + + + 🏔️ Vicuna Evaluation Examples + + + + + + + Demo + + + Blog + + + Github + + + + + + + Who's GPT-4's favorite? 
Battles between State-of-the-Art Chatbots + + + + + Category + + + + Question + + + + + + keyboard_arrow_left + keyboard_arrow_right + + + + + + + + */10 + + + + + + + + + + + + + + + + + + + */10 + + + + + + + + + + + + + + + + Assistant #1 + + + + + + + + + + + + + + + + + + Assistant #2 (Vicuna, our model) + + + + + + + + + + + + + + GPT-4 Evaluation + + + + + + + + + + + + This website is co-authored with GPT-4. + + + + + + + + + + + + + + + diff --git a/app/llava/eval/llava_eval/webpage/script.js b/app/llava/eval/llava_eval/webpage/script.js new file mode 100644 index 0000000000000000000000000000000000000000..4b71e3d5618a262e4746f58e5d10947b73370dca --- /dev/null +++ b/app/llava/eval/llava_eval/webpage/script.js @@ -0,0 +1,245 @@ +// Description: Script for the evaluation webpage. + +let currentQuestionIndex = 1; + +// Store the model name mapping for later use. +modelNameMapping = { + "gpt35": "ChatGPT-3.5", + "gpt4": "GPT-4", + "alpaca": "Alpaca-13b", + "vicuna": "Vicuna-13b", + "llama": "LLaMA-13b", + "bard": "Bard", +}; + +modelFigureMapping = { + "vicuna": "figures/vicuna.jpeg", + // Image from: https://commons.wikimedia.org/wiki/File:ChatGPT_logo.svg + "gpt35": "figures/chatgpt.svg", + // Image from: https://www.reddit.com/r/logodesign/comments/1128aat/google_ai_bard_logo_design/ + "bard": "figures/bard.jpg", + // Image from: https://crfm.stanford.edu/2023/03/13/alpaca.html + "alpaca": "figures/alpaca.png", + // Image adapted from https://commons.wikimedia.org/wiki/File:Llama_on_Machu_Picchu.jpg + "llama": "figures/llama.jpg", +} + +// Store the question data in a mapping for later use. +questionMapping = {}; +// Store the question ids in a mapping for later use. +categoryMapping = {}; +// Store the number of questions for later use. +questionsCount = 0; + + +function text2Markdown(text) { + // Normalize the text for markdown rendering. 
+ text = text.trim().replaceAll('\n\n', '\n').replaceAll('\n', '\n\n'); + return marked.parse(text); +} + +function capitalizeFirstChar(str) { + if (!str || str.length === 0) { + return str; + } + return str.charAt(0).toUpperCase() + str.slice(1); +} + +function updateQuestionSelect(question_id) { + const select = document.getElementById('question-select'); + // Clear the question select. + select.innerHTML = ''; + // Populate the question select. + category = questionMapping[question_id].category; + categoryMapping[category].forEach(question_id => { + const question = questionMapping[question_id]; + const option = document.createElement('option'); + option.value = question_id; + option.textContent = 'Q' + question_id.toString() + ': ' + question.question; + select.appendChild(option); + }); + select.value = question_id; +} + +function updateModelSelect() { + const select = document.getElementById('model-select'); + img_path = modelFigureMapping[select.value]; + document.getElementById('other-model-figure').src = img_path; +} + +function populateModels(models) { + const select = document.getElementById('model-select'); + models.forEach(model => { + const option = document.createElement('option'); + option.value = model; + option.textContent = modelNameMapping[model]; + select.appendChild(option); + }); + updateModelSelect(); +} + +function populateQuestions(questions) { + const category_select = document.getElementById('category-select'); + + questionsCount = questions.length; + questions.forEach(question => { + const option = document.createElement('option'); + // Store the question data in a mapping for later use. + questionMapping[question.id] = { + category: question.category, + question: question.question, + answers: question.answers, + evaluations: question.evaluations, + scores: question.scores, + }; + // Store the question id in the category mapping. 
+ if (question.category in categoryMapping) { + categoryMapping[question.category].push(question.id); + } else { + categoryMapping[question.category] = [question.id]; + const category_option = document.createElement('option'); + category_option.value = question.category; + category_option.textContent = capitalizeFirstChar(question.category); + category_select.appendChild(category_option); + } + }); + // Set the default category. + updateQuestionSelect(currentQuestionIndex); +} + +function displayQuestion(index) { + const question = questionMapping[index].question; + document.getElementById('selected-question').innerHTML = text2Markdown('**Question:** ' + question); + displayAnswers(index); +} + +function displayAnswers(index) { + const question = questionMapping[index]; + const otherModel = document.getElementById('model-select').value; + // render the answers with markdown + document.getElementById('other-model-answer').innerHTML = text2Markdown(question.answers[otherModel]); + document.getElementById('our-model-answer').innerHTML = text2Markdown(question.answers.vicuna); + + // Display evaluation + score = question.scores[otherModel]; + score_text = modelNameMapping[otherModel] + " " + score[0] + "/10, Vicuna-13b " + score[1] + "/10"; + document.getElementById('evaluation-header').textContent = "GPT-4 Evaluation" + " (Score: " + score_text + ")"; + document.getElementById('evaluation-result').innerHTML = text2Markdown(question.evaluations[otherModel]); + + // Update model names + let assistant1_title = "Assistant #1"; // (" + modelNameMapping[otherModel] + ")"; + let assistant2_title = "Assistant #2 (Vicuna-13b, our model)"; + // Update scores/labels. 
+ let assistant1_score_label = score[0].toString() + '/10'; + let assistant2_score_label = score[1].toString() + '/10'; + + const colorRed ='#fa9'; // '#eb978d'; + // const colorGreen = '#c9f2c9'; + const colorBlue = '#8ef'; // '#71dbf9'; + const colorYellow = '#fe7'; // '#fada57'; + let otherModelHeaderColor = ''; + let ourModelHeaderColor = ''; + // Update the winner. + if (score[0] == score[1]) { + assistant1_title = '🏆 ' + assistant1_title; + assistant1_score_label = '🏆 ' + assistant1_score_label; + assistant2_title = '🏆 ' + assistant2_title; + assistant2_score_label = '🏆 ' + assistant2_score_label; + otherModelHeaderColor = colorYellow; + ourModelHeaderColor = colorYellow; + } else if (score[0] > score[1]) { + assistant1_title = '🏆 ' + assistant1_title; + assistant1_score_label = '🏆 ' + assistant1_score_label; + otherModelHeaderColor = colorBlue; + ourModelHeaderColor = colorRed; + } else if (score[0] < score[1]) { + assistant2_title = '🏆 ' + assistant2_title; + assistant2_score_label = '🏆 ' + assistant2_score_label; + otherModelHeaderColor = colorRed; + ourModelHeaderColor = colorBlue; + } + + document.getElementById('other-model-header-bg').style.backgroundColor = otherModelHeaderColor; + document.getElementById('our-model-header').style.backgroundColor = ourModelHeaderColor; + + document.getElementById('other-model-header').textContent = assistant1_title; + document.getElementById('our-model-header').textContent = assistant2_title; + + document.getElementById('other-score-label').textContent = assistant1_score_label; + document.getElementById('our-score-label').textContent = assistant2_score_label; + + // Update expand buttons visibility for both cards after displaying answers + // Reset the expanded state and update expand buttons visibility for both cards after displaying answers + document.querySelectorAll('.expandable-card').forEach(card => { + card.classList.remove('expanded'); + updateExpandButtonVisibility(card); + const expandBtn = 
card.querySelector('.expand-btn'); + expandBtn.innerHTML = 'keyboard_arrow_down Show more'; // .textContent = 'Show more'; + }); +} + +document.getElementById('question-select').addEventListener('change', e => { + currentQuestionIndex = parseInt(e.target.value); + displayQuestion(currentQuestionIndex); +}); + +document.getElementById('category-select').addEventListener('change', e => { + let currentCategory = e.target.value; + const questionIds = categoryMapping[currentCategory]; + currentQuestionIndex = questionIds[0]; + updateQuestionSelect(currentQuestionIndex); + displayQuestion(currentQuestionIndex); +}); + +// Update expand buttons whenever the model is changed +document.getElementById('model-select').addEventListener('change', () => { + displayAnswers(currentQuestionIndex); + document.querySelectorAll('.expandable-card').forEach(card => { + updateExpandButtonVisibility(card); + }); + updateModelSelect(); +}); + +function switchQuestionAndCategory() { + document.getElementById('question-select').value = currentQuestionIndex; + old_category = document.getElementById('category-select').value; + new_category = questionMapping[currentQuestionIndex].category; + if (old_category != new_category) { + document.getElementById('category-select').value = new_category; + updateQuestionSelect(currentQuestionIndex); + } + displayQuestion(currentQuestionIndex); +} + +document.getElementById('prev-question').addEventListener('click', () => { + // Question index starts from 1. + currentQuestionIndex = Math.max(1, currentQuestionIndex - 1); + switchQuestionAndCategory(); +}); + +document.getElementById('next-question').addEventListener('click', () => { + // Question index starts from 1. 
+ currentQuestionIndex = Math.min(questionsCount, currentQuestionIndex + 1); + switchQuestionAndCategory(); +}); + +function updateExpandButtonVisibility(card) { + const cardTextContainer = card.querySelector('.card-text-container'); + const expandBtn = card.querySelector('.expand-btn'); + if (cardTextContainer.scrollHeight > cardTextContainer.offsetHeight) { + expandBtn.style.display = 'flex'; + } else { + expandBtn.style.display = 'none'; + card.classList.add('expanded'); + } +} + +document.querySelectorAll('.expand-btn').forEach(btn => { + btn.addEventListener('click', e => { + const card = e.target.closest('.expandable-card'); + card.classList.toggle('expanded'); + const more = 'keyboard_arrow_down Show more'; + const less = 'keyboard_arrow_up Show less'; + e.target.innerHTML = card.classList.contains('expanded') ? less : more; + }); +}); diff --git a/app/llava/eval/llava_eval/webpage/styles.css b/app/llava/eval/llava_eval/webpage/styles.css new file mode 100644 index 0000000000000000000000000000000000000000..7b6d6fc69b336c0a5d103be9fb13a0e0897c76a3 --- /dev/null +++ b/app/llava/eval/llava_eval/webpage/styles.css @@ -0,0 +1,105 @@ +body { + font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; + background-color: #f8f9fa; +} + +.navbar-dark .navbar-nav .nav-link { + color: #f1cf68; + font-size: 1.1rem; + padding: 0.5rem 0.6rem; +} + +.card-header { + font-weight: bold; +} + +.card { + box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1); + transition: 0.3s; +} + +.card:hover { + box-shadow: 0 8px 16px rgba(0, 0, 0, 0.2); +} + +button { + transition: background-color 0.3s; +} + +button:hover { + background-color: #007bff; +} + +@media (max-width: 767px) { + .form-row .form-group { + margin-bottom: 10px; + } +} + +/* Extra styles */ + +.expandable-card .card-text-container { + max-height: 200px; + overflow-y: hidden; + position: relative; +} + +.expandable-card.expanded .card-text-container { + max-height: none; +} + +.expand-btn { + position: relative; + display: 
none; + background-color: rgba(255, 255, 255, 0.8); + color: #510c75; + border-color: transparent; +} + +.expand-btn:hover { + background-color: rgba(200, 200, 200, 0.8); + text-decoration: none; + border-color: transparent; + color: #510c75; +} + +.expand-btn:focus { + outline: none; + text-decoration: none; +} + +.expandable-card:not(.expanded) .card-text-container:after { + content: ""; + position: absolute; + bottom: 0; + left: 0; + width: 100%; + height: 90px; + background: linear-gradient(rgba(255, 255, 255, 0.2), rgba(255, 255, 255, 1)); +} + +.expandable-card:not(.expanded) .expand-btn { + margin-top: -40px; +} + +.card-body { + padding-bottom: 5px; +} + +.vertical-flex-layout { + justify-content: center; + align-items: center; + height: 100%; + display: flex; + flex-direction: column; + gap: 5px; +} + +.figure-img { + max-width: 100%; + height: auto; +} + +.adjustable-font-size { + font-size: calc(0.5rem + 2vw); +} diff --git a/app/llava/eval/masp_eval/eval_case.py b/app/llava/eval/masp_eval/eval_case.py new file mode 100644 index 0000000000000000000000000000000000000000..0ffae65baec70f7cc0221e5d27b6b00af891489d --- /dev/null +++ b/app/llava/eval/masp_eval/eval_case.py @@ -0,0 +1,161 @@ +import argparse +import logging + +import copy +import codecs +from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig +import torch +import decord +import os +import json +import random +import requests +from tqdm import tqdm +import numpy as np + +from llava.constants import MM_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, DEFAULT_VIDEO_TOKEN, DEFAULT_VIDEO_START_TOKEN, DEFAULT_VIDEO_END_TOKEN +from llava.conversation import conv_templates, SeparatorStyle +from llava.utils import disable_torch_init +from llava.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria, process_images_v2 +from llava.model import * +from llava.model.builder import load_pretrained_model +from 
llava.model.multimodal_encoder.processor import Blip2ImageTrainProcessor + +from transformers import CLIPImageProcessor +from PIL import Image +from decord import VideoReader, cpu + +decord.bridge.set_bridge("torch") + + + +def get_image(image_path): + image = Image.open(image_path).convert('RGB') + return image + + +# def load_frames(frames_dir, frame_names): +# results = [] +# for frame_name in frame_names: +# image_path = f"{frames_dir}/{frame_name}" +# image = get_image(image_path) +# results.append(image) +# return results + +def load_frames(frames_dir): + results = [] + image_files = [(int(os.path.splitext(img)[0]), img) for img in os.listdir(frames_dir) if not img.startswith('cuttime')] + image_files = sorted(image_files, key=lambda img: img[0]) + for frame_name in image_files: + image_path = f"{frames_dir}/{frame_name[1]}" + image = get_image(image_path) + results.append(image) + return results + + + + +def uniform_sample(frames, num_segments): + indices = np.linspace(start=0, stop=len(frames) - 1, num=num_segments).astype(int) + frames = [frames[ind] for ind in indices] + return frames + + + + +def run_inference(args, frame_folders): + disable_torch_init() + model_path = os.path.expanduser(args.model_path) + model_name = get_model_name_from_path(model_path) + tokenizer, model, _, context_len = load_pretrained_model(model_path, args.model_base, model_name, device_map={"":0}) + image_processor = Blip2ImageTrainProcessor( + image_size=model.config.img_size, + is_training=False) + model_cfgs = model.config + + + for frame_folder in frame_folders: + question = "Describe the video in detail." 
+ + # Question input here + qs = question + # qs = DEFAULT_VIDEO_TOKEN + '\n' + qs + if model.config.mm_use_start_end: + qs = DEFAULT_VIDEO_START_TOKEN + DEFAULT_VIDEO_TOKEN + DEFAULT_VIDEO_END_TOKEN + '\n' + qs + else: + qs = DEFAULT_VIDEO_TOKEN + '\n' + qs + + conv = conv_templates[args.conv_mode].copy() + conv.append_message(conv.roles[0], qs) + conv.append_message(conv.roles[1], None) + prompt = conv.get_prompt() + + # inputs = tokenizer([prompt]) + input_ids = tokenizer_image_token(prompt, tokenizer, MM_TOKEN_INDEX, return_tensors='pt').unsqueeze( + 0).cuda() + + + # try: + images = load_frames(frame_folder) + # images = images[:15:2] + if len(images) > args.num_segments: + images = uniform_sample(images, args.num_segments) + elif len(images) < args.num_segments: + # frame_indices = [i for i in range(len(images))] + images = uniform_sample(images, args.num_segments) + else: + pass + + if model_cfgs.image_aspect_ratio == 'pad': + model_cfgs.image_aspect_ratio = 'no_padding' + images_tensor = process_images_v2(images, image_processor, model_cfgs).half().cuda() + # print(images_tensor.shape) + + stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 + keywords = [stop_str] + stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids) + images_tensors = [images_tensor.clone() for _ in range(args.num_beams)] + with torch.inference_mode(): + output_ids = model.generate( + input_ids, + images= images_tensors, + do_sample=True, + temperature=args.temperature, + top_p=args.top_p, + num_beams=args.num_beams, + no_repeat_ngram_size=args.no_repeat_ngram_size, + pad_token_id=tokenizer.eos_token_id, + max_new_tokens=1024, + use_cache=True, + stopping_criteria=[stopping_criteria]) + + + outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0] + + outputs = outputs.strip() + if outputs.endswith(conv.sep): + outputs = outputs[:-len(stop_str)] + outputs = outputs.strip() + print(outputs) + + +if __name__ == "__main__": + 
parser = argparse.ArgumentParser() + parser.add_argument('--video_dir', help='Directory containing video files.', type=str, default="") + parser.add_argument('--validation_data', type=str, + default="/mnt/bn/yukunfeng-nasdrive/xiangchen/repo/benchmark_data/refine_chair_eval_gt_neg_1k.json") + parser.add_argument('--num_samples', help='Number of samples to predict', type=int, default=-1) + parser.add_argument("--model_path", type=str, + default="/mnt/bn/algo-masp-nas-2/xiangchen/model/masp_models/checkpoints/llava-mistral_gpt4v_adso185k_unfreeze_qformer_data_sampler/") + parser.add_argument("--model_base", type=str, default=None) + parser.add_argument("--conv_mode", type=str, default="v1") + parser.add_argument("--output_file", type=str, default="vid_top1k_res.json") + parser.add_argument("--num_segments", type=int, default=10) + parser.add_argument("--temperature", type=float, default=0.2) + parser.add_argument("--top_p", type=float, default=None) + parser.add_argument("--num_beams", type=int, default=1) + parser.add_argument("--no_repeat_ngram_size", type=int, default=3) + + args = parser.parse_args() + frame_folders = ['/mnt/bn/algo-masp-nas-2/xiangchen/repo/LLaVA/tmp/cases/yj'] + run_inference(args, frame_folders) diff --git a/app/llava/eval/masp_eval/gpt4v_score/eval_gpt4v_dist_stage1.py b/app/llava/eval/masp_eval/gpt4v_score/eval_gpt4v_dist_stage1.py new file mode 100644 index 0000000000000000000000000000000000000000..ca3e163c017511d29e0b97129f1283c66687771e --- /dev/null +++ b/app/llava/eval/masp_eval/gpt4v_score/eval_gpt4v_dist_stage1.py @@ -0,0 +1,175 @@ +#!/usr/bin/env python +# coding: utf-8 + +# In[1]: + + +#coding=utf-8 + + +# In[2]: + + +import os +import re +import sys +from tqdm import tqdm + +from PIL import Image +import base64 +from io import BytesIO +import pandas as pd + +import requests +import json +import time +import openai +import random + +sys.path.append('../..') + +from concurrent.futures import ProcessPoolExecutor, as_completed +# from 
llava.eval.benchmark_core.utils.azure_utils import AzureVisionClient +# from llava.eval.benchmark_core.utils.video_utils import sample_frames +from masp_eval.utils.azure_utils import AzureVisionClient +from masp_eval.utils.video_utils import sample_frames + + +# In[3]: + + +stage1_prompt = """ +Extract information from a sequence of video frames based on five aspects: + +- **Subjects**: Identify all primary and secondary entities, describing their quantity, types, and notable features. +- **Attributes**: Detail the subjects' characteristics, such as physical appearance, emotional expressions, and other qualities. +- **Scenes**: Describe the video's setting, including location, ambiance, time, and weather if relevant. +- **Actions**: Outline the subjects' actions or events, including movements, interactions, and environmental changes. +- **OCR**: Transcribe overlaid text and provide its context and significance. + +Provide a detailed description for each aspect in JSON format: + +```json +{ + "Subjects": "List of subjects in the video.", + "Attributes": "List of the attributes of the subjects in the video.", + "Scenes": "List of the scenes in the video.", + "Actions": "List of the actions in the video.", + "OCR": "Transcription and interpretation of any text overlays in the video." +} +``` +""" + + +# In[4]: +# ak does not allow plaintext storage. 
+ak = "Gjrgj*" +client = AzureVisionClient(ak) +pattern = r'\{.*?\}' + + +# In[5]: + + +def encode_image(image): + buffered = BytesIO() + image.save(buffered, format="JPEG") + img_b64_str = base64.b64encode(buffered.getvalue()).decode() + + return img_b64_str + +def load_frames(path_to_images): + max_frames = 10 + + # Get list of all files in the directory + image_files = [(int(os.path.splitext(file)[0]), file) for file in os.listdir(path_to_images) if file.endswith(('jpg', 'jpeg', 'png'))] + image_files = sorted(image_files, key=lambda img: img[0]) + # Set up the matplotlib figure and axes, based on the number of images + num_images = len(image_files) + frames = [] + # Read and display each image + for image_file in image_files: + image_path = os.path.join(path_to_images, image_file[1]) + image = Image.open(image_path) + frames.append(image) + + if len(frames) > max_frames: + frames = sample_frames(frames, max_frames) + + vid_b64_lst = list(map(encode_image, frames)) + + return vid_b64_lst + + +# In[6]: + + +def extract_info(item): + ans = None + max_attempts = 3 + attempts = 0 + + while attempts < max_attempts and ans is None: + try: + resp, messages = client.request(stage1_prompt, vid_data=load_frames(item['video_path'])) + out = json.loads(re.findall(pattern, resp, re.DOTALL)[0].replace("\n", "")) + res = { + "object_id": item['object_id'], + "video_path": item['video_path'], + "policy_list": item['policy_list'], + "refine_caption": item['refine_caption'], + # "masp_inference": item['masp_inference'], + "Subjects": out['Subjects'], + "Attributes": out['Attributes'], + "Scenes": out['Scenes'], + "Actions": out['Actions'], + "OCR": out['OCR'], + "Resp": resp + } + # ans = (res, messages) + ans = res + except Exception as e: + attempts += 1 + print(e) + + return ans + + +# In[ ]: + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser("GPT4-V Eval") + parser.add_argument("--gt_file", type=str, 
default="/mnt/bn/algo-masp-nas-2/baiyi.by/data/Benchmarks/GPT4V_Eval/eval_negative_1k.json") + parser.add_argument("--inter_file", type=str, default="/mnt/bn/algo-masp-nas-2/baiyi.by/data/Benchmarks/GPT4V_Eval/eval_negative_1k_gptv_inter_res.json") + parser.add_argument("-f", "--fff", type=str, default="1", required=False) + args = parser.parse_args() + + pred_data = json.load(open(args.pred_file)) + + executor = ProcessPoolExecutor(max_workers=40) + + all_tasks = [executor.submit(extract_info, item) for item in pred_data] + + all_results = [] + + for future in tqdm(as_completed(all_tasks)): + result = future.result() + + if result is None: + continue + + all_results.append(result) + + json.dump(all_results, open(args.inter_file, 'w'), indent=4, ensure_ascii=False) + + print("==> Eval Finished.") + + +# In[ ]: + + + + diff --git a/app/llava/eval/masp_eval/gpt4v_score/eval_gpt4v_dist_stage2.py b/app/llava/eval/masp_eval/gpt4v_score/eval_gpt4v_dist_stage2.py new file mode 100644 index 0000000000000000000000000000000000000000..ee77f123a545949d67490b06b422cc30bc91d3e3 --- /dev/null +++ b/app/llava/eval/masp_eval/gpt4v_score/eval_gpt4v_dist_stage2.py @@ -0,0 +1,311 @@ +#!/usr/bin/env python +# coding: utf-8 + +# In[1]: + + +# coding=utf-8 + + +# In[2]: + + +import os +import re +import sys +from tqdm import tqdm + +from PIL import Image +import base64 +from io import BytesIO +import pandas as pd + +import requests +import json +import time +import openai +import random + +from concurrent.futures import ProcessPoolExecutor, as_completed +from llava.eval.masp_eval.utils.azure_utils import AzureVisionClient +from llava.eval.masp_eval.utils.video_utils import sample_frames, uniform_sample + +# from benchmark_core.utils.azure_utils import AzureVisionClient +# from benchmark_core.utils.video_utils import sample_frames + + +# In[4]: + +stage1_prompt = """ +Extract information from a sequence of video frames based on five aspects: + +- **Subjects**: Identify all primary and 
secondary entities, describing their quantity, types, and notable features. +- **Attributes**: Detail the subjects' characteristics, such as physical appearance, emotional expressions, and other qualities. +- **Scenes**: Describe the video's setting, including location, ambiance, time, and weather if relevant. +- **Actions**: Outline the subjects' actions or events, including movements, interactions, and environmental changes. +- **OCR**: Transcribe overlaid text and provide its context and significance. + +Provide a detailed description for each aspect in JSON format: + +```json +{ + "Subjects": "List of subjects in the video.", + "Attributes": "List of the attributes of the subjects in the video.", + "Scenes": "List of the scenes in the video.", + "Actions": "List of the actions in the video.", + "OCR": "Transcription and interpretation of any text overlays in the video." +} +``` +""" + +stage2_prompt = """ +Evaluate the input caption's recall on five aspects of a video, assigning a numerical score for each. Average these scores for a final recall score. The aspects and their scoring guidelines are: +- **SubjectsScore (100 points max)**: Evaluate the recall of 'Subjects' information in the caption. Full score indicates complete recall. +- **AttributesScore (100 points max)**: Evaluate the recall of 'Attributes' information in the caption. Full score indicates complete recall. +- **ScenesScore (100 points max)**: Evaluate the recall of 'Scenes' information in the caption. Full score indicates complete recall. +- **ActionsScore (100 points max)**: Evaluate the recall of 'Actions' information in the caption. Full score indicates complete recall. +- **OCRScore (100 points max)**: Evaluate the recall of 'OCR' information in the caption. Full score indicates complete recall. +- **HallucinationScore (100 points max)**: Identify discrepancies between the caption and actual information, deducting points for added non-existent content. 
+ +Provide structured output in JSON format following this template: + +```json +{ + "SubjectsScore": [Recall score out of 100 for subjects], + "SubjectsReason": "Explanation for Subjects score.", + "AttributesScore": [Recall score out of 100 for attributes], + "AttributesReason": "Explanation for Attributes score.", + "ScenesScore": [Recall score out of 100 for scenes], + "ScenesReason": "Explanation for Scenes score.", + "ActionsScore": [Recall score out of 100 for actions], + "ActionsReason": "Explanation for Actions score.", + "OCRScore": [Recall score out of 100 for OCR text], + "OCRReason": "Explanation for OCR score.", + "HallucinationScore": [Score out of 100 for hallucinated elements], + "HallucinationReason": "Explanation for Hallucination score.", + "FinalCaptionScore": [Final score averaged from each aspect] +} + +The video information and caption for evaluation are as follows: +""" + +# In[5]: + +# ak does not allow plaintext storage. +ak_list = ["F1uPN8EfGeYXUSwt9iDy4KzxRBBLuxWr"] +client_list = [AzureVisionClient(ak, max_retries=200) for ak in ak_list] + +pattern = r'\{.*?\}' + + +# In[6]: + + +def encode_image(image): + buffered = BytesIO() + image.save(buffered, format="JPEG") + img_b64_str = base64.b64encode(buffered.getvalue()).decode() + + return img_b64_str + + +def load_frames(path_to_images): + max_frames = 10 + + # Get list of all files in the directory + image_files = [(int(os.path.splitext(file)[0]), file) for file in os.listdir(path_to_images) if + file.endswith(('jpg', 'jpeg', 'png'))] + image_files = sorted(image_files, key=lambda img: img[0]) + # Set up the matplotlib figure and axes, based on the number of images + num_images = len(image_files) + frames = [] + # Read and display each image + for image_file in image_files: + image_path = os.path.join(path_to_images, image_file[1]) + image = Image.open(image_path) + frames.append(image) + + if len(frames) > max_frames: + frames = uniform_sample(frames, max_frames) + + vid_b64_lst = 
list(map(encode_image, frames)) + + return vid_b64_lst + + +def score_func(item, messages=None): + ans = None + max_attempts = 3 + attempts = 0 + + client = random.choice(client_list) + + while attempts < max_attempts and ans is None: + try: + usr_prompt = stage2_prompt + '\nThe provided caption:\n' + item['masp_inference'] + resp, _ = client.request(usr_prompt, messages=messages) + out = json.loads(re.findall(pattern, resp, re.DOTALL)[0].replace("\n", "")) + # out = json.loads(resp.choices[0].message.content) + res = { + "object_id": item['object_id'], + "video_path": item['video_path'], + "policy_list": item['policy_list'], + "refine_caption": item['refine_caption'], + "masp_inference": item['masp_inference'], + # "origin_caption_rate": item['origin_caption_rate'], + # "hard_negative_caption": item['hard_negative_caption'], + "Subjects": item['Subjects'], + "Attributes": item['Attributes'], + "Scenes": item['Scenes'], + "Actions": item['Actions'], + "OCR": item['OCR'], + "SubjectsScore": out['SubjectsScore'], + "SubjectsReason": out['SubjectsReason'], + "AttributesScore": out['AttributesScore'], + "AttributesReason": out['AttributesReason'], + "ScenesScore": out['ScenesScore'], + "ScenesReason": out['ScenesReason'], + "ActionsScore": out['ActionsScore'], + "ActionsReason": out['ActionsReason'], + "OCRScore": out['OCRScore'], + "OCRReason": out['OCRReason'], + "HallucinationScore": out['HallucinationScore'], + "HallucinationReason": out['HallucinationReason'], + "FinalCaptionScore": out['FinalCaptionScore'] + } + ans = res + except Exception as e: + attempts += 1 + print(e) + return ans + + +# In[9]: +def build_context(item): + vid_data = load_frames(item['video_path']) + + content = [{ + "type": "text", + "text": stage1_prompt + }, + *map(lambda x: { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64, {x}" + } + }, vid_data)] + + messages = [{ + 'role': 'user', + 'content': content + }, { + "role": "assistant", + "content": [{ + "type": 
"text", + "text": item['Resp'] + }] + }] + return messages + + +def eval_pipeline(item): + try: + messages = build_context(item) + + item_score = score_func(item, messages) + + except Exception as e: + print(item["object_id"], e) + return None + + return item_score + + +# In[10]: + + +def get_final_score(all_items): + from collections import defaultdict + + score_dict = defaultdict(list) + + for item in all_items: + for key in item: + if not key.endswith('Score'): + continue + + score_dict[key].append(item[key]) + + final_score_dict = dict() + for key in score_dict: + final_score_dict[key] = round(sum(score_dict[key]) / len(score_dict[key]), 4) + + return final_score_dict + + +# In[11]: + +def merge_data(gt_data, pred_data): + gt_dict = dict() + + for item in gt_data: + gt_dict[item['object_id']] = item + + merge_list = [] + for item in pred_data: + if item['object_id'] not in gt_dict: + continue + + # item['Resp'] = gt_dict[item['object_id']]['Resp'] + for key in ['Resp', 'Subjects', 'Attributes', 'Scenes', 'Actions', 'OCR']: + assert key in gt_dict[item['object_id']], f"object id: {item['object_id']}, key: {key}" + item[key] = gt_dict[item['object_id']][key] + + merge_list.append(item) + + return merge_list + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser("GPT4-V Eval") + parser.add_argument("--pred_file", type=str, + default="/mnt/bn/algo-masp-nas-2/xiangchen/model/masp_models/checkpoints/llava-mistral_gpt4v_webvid/video_chair/pred_result.json") + parser.add_argument("--gt_file", type=str, + default="/mnt/bn/algo-masp-nas-2/baiyi.by/data/Benchmarks/GPT4V_Eval/eval_negative_1k_gptv_inter_res.json") + parser.add_argument("-f", "--fff", type=str, default="1", required=False) + args = parser.parse_args() + + pred_data = json.load(open(args.pred_file)) + gt_data = json.load(open(args.gt_file)) + + merge_list = merge_data(gt_data, pred_data) + print(f"==> Valid data size: {len(merge_list)}") + + executor = 
ProcessPoolExecutor(max_workers=8) + + all_tasks = [executor.submit(eval_pipeline, item) for item in merge_list] + + all_results = [] + + for future in tqdm(as_completed(all_tasks)): + result = future.result() + + if result is None: + continue + + all_results.append(result) + + detail_file = os.path.splitext(args.pred_file)[0] + "_detail_res.json" + json.dump(all_results, open(detail_file, 'w'), indent=4, ensure_ascii=False) + + final_score = get_final_score(all_results) + final_file = os.path.splitext(args.pred_file)[0] + "_final_score.json" + json.dump(final_score, open(final_file, 'w'), indent=4, ensure_ascii=False) + + print(f"==> Final Result") + + print(json.dumps(final_score, indent=4)) + + print("==> Eval Finished.") diff --git a/app/llava/eval/masp_eval/gpt4v_score/v1/eval_gpt4v_dist_stage1.py b/app/llava/eval/masp_eval/gpt4v_score/v1/eval_gpt4v_dist_stage1.py new file mode 100644 index 0000000000000000000000000000000000000000..63a718a463671aa600b8836bdfa884b1492da89c --- /dev/null +++ b/app/llava/eval/masp_eval/gpt4v_score/v1/eval_gpt4v_dist_stage1.py @@ -0,0 +1,174 @@ +#!/usr/bin/env python +# coding: utf-8 + +# In[8]: + + +#coding=utf-8 + + +# In[9]: + + +import os +import re +import sys +from tqdm import tqdm + +from PIL import Image +import base64 +from io import BytesIO +import pandas as pd + +import requests +import json +import time +import openai +import random + +sys.path.append('../..') + +from concurrent.futures import ProcessPoolExecutor, as_completed +# from llava.eval.benchmark_core.utils.azure_utils import AzureVisionClient +# from llava.eval.benchmark_core.utils.video_utils import uniform_sample +from masp_eval.utils.azure_utils import AzureVisionClient +from masp_eval.utils.video_utils import uniform_sample + + + +# In[10]: + + +stage1_prompt = """ +As a customer service agent, your primary responsibility is to comprehend the user-uploaded videos, taking into account both the visual content and the text presented, then extract the 
required aspects: + +- Main Content: The central idea and main content of the video, including an overview of the story and its context. +- Subjects: Identify all the entities that appear in the video content, such as people, animals, or objects. +- Attributes: Detail the attributes of the subjects, such as physical appearance, emotional expressions, and other qualities. +- Scenes: Identify the settings or environment of the video, including location, ambiance, time, and weather if applicable. +- Actions: Identify the subjects' actions or events, including movements, interactions, and environmental changes. +- OCR: Transcribe all the overlaid text that can be clearly observed in the video. Explain its context and summarize the main content of the text. + +For each aspect, provide a detailed description based on the video frames. The structured output must be in JSON format and follow this template: + +```json +{ + "Main Content": "The central idea and main content of the video.", + "Subjects": "The subjects in the video that appear in the video content.", + "Attributes": "The attributes of the subjects in the video.", + "Scenes": "The settings or environment of the video", + "Actions": "The subjects' actions or events in the video", + "OCR": "Transcription and interpretation of the text overlays in the video, along with its context and a summary of the main content." 
+} +``` +""" + + +# In[11]: + + +ak = "GjrgjjyJHUbLa15DLnr7t0Bhu6IPqFPj" +client = AzureVisionClient(ak) +pattern = r'\{.*?\}' + + +# In[12]: + + +def encode_image(image): + buffered = BytesIO() + image.save(buffered, format="JPEG") + img_b64_str = base64.b64encode(buffered.getvalue()).decode() + + return img_b64_str + +def load_frames(path_to_images): + max_frames = 10 + + # Get list of all files in the directory + image_files = [(int(os.path.splitext(file)[0]), file) for file in os.listdir(path_to_images) if file.endswith(('jpg', 'jpeg', 'png'))] + image_files = sorted(image_files, key=lambda img: img[0]) + # Set up the matplotlib figure and axes, based on the number of images + num_images = len(image_files) + frames = [] + # Read and display each image + for image_file in image_files: + image_path = os.path.join(path_to_images, image_file[1]) + image = Image.open(image_path) + frames.append(image) + + if len(frames) > max_frames: + frames = uniform_sample(frames, max_frames) + + vid_b64_lst = list(map(encode_image, frames)) + + return vid_b64_lst + # return frames + + +# In[13]: + + +def extract_info(item): + ans = None + max_attempts = 10 + attempts = 0 + + while attempts < max_attempts and ans is None: + try: + resp, messages = client.request(stage1_prompt, vid_data=load_frames(item['video_path'])) + out = json.loads(re.findall(pattern, resp, re.DOTALL)[0].replace("\n", "")) + res = { + "object_id": item['object_id'], + "video_path": item['video_path'], + "policy_list": item['policy_list'], + "refine_caption": item['refine_caption'], + # "masp_inference": item['masp_inference'], + "Main Content": out['Main Content'], + "Subjects": out['Subjects'], + "Attributes": out['Attributes'], + "Scenes": out['Scenes'], + "Actions": out['Actions'], + "OCR": out['OCR'], + "Resp": resp + } + # ans = (res, messages) + ans = res + except Exception as e: + attempts += 1 + print(e) + + return ans + + +# In[14]: + + +if __name__ == "__main__": + import argparse + + parser = 
argparse.ArgumentParser("GPT4-V Eval") + parser.add_argument("--gt_file", type=str, default="/mnt/bn/yukunfeng-nasdrive/kaili.zhao/masp/data/TT/eval_benchmark_neg_diverse_5k_uniform.json") + parser.add_argument("--inter_file", type=str, default="/mnt/bn/algo-masp-nas-2/baiyi.by/data/Benchmarks/GPT4V_Eval/PROMPT_VERSION/eval_v1_1_neg_5k_stage1_info.json") + parser.add_argument("-f", "--fff", type=str, default="1", required=False) + args = parser.parse_args() + + pred_data = json.load(open(args.gt_file)) + + executor = ProcessPoolExecutor(max_workers=10) + + all_tasks = [executor.submit(extract_info, item) for item in pred_data] + + all_results = [] + + for future in tqdm(as_completed(all_tasks)): + result = future.result() + + if result is None: + continue + + all_results.append(result) + + json.dump(all_results, open(args.inter_file, 'w'), indent=4, ensure_ascii=False) + + print("==> Eval Finished.") diff --git a/app/llava/eval/masp_eval/gpt4v_score/v1/eval_gpt4v_dist_stage2.py b/app/llava/eval/masp_eval/gpt4v_score/v1/eval_gpt4v_dist_stage2.py new file mode 100644 index 0000000000000000000000000000000000000000..6b5859f7f04d7f45aac8f1b4c4eab93cb5ab4c03 --- /dev/null +++ b/app/llava/eval/masp_eval/gpt4v_score/v1/eval_gpt4v_dist_stage2.py @@ -0,0 +1,386 @@ +#!/usr/bin/env python +# coding: utf-8 + +# In[1]: + + +# coding=utf-8 + + +# In[2]: + + +import os +import re +import sys +from tqdm import tqdm + +from PIL import Image +import base64 +from io import BytesIO +import pandas as pd + +import requests +import json +import time +import openai +import random + +from concurrent.futures import ProcessPoolExecutor, as_completed +from llava.eval.masp_eval.utils.azure_utils import AzureVisionClient +from llava.eval.masp_eval.utils.video_utils import uniform_sample + + +# In[57]: + + +CONTENT_WEIGHT = 6 +SUBJECTS_WEIGHT = 4 +ATTRIBUTES_WEIGHT = 2 +SCENES_WEIGHT = 2 +ACTIONS_WEIGHT = 2 +OCR_WEIGHT = 4 + + +# In[32]: + + +stage1_prompt = """ +As a customer service agent, your 
primary responsibility is to comprehend the user-uploaded videos, taking into account both the visual content and the text presented, then extract the required aspects: + +- Main Content: The central idea and main content of the video, including an overview of the story and its context. +- Subjects: Identify all the entities that appear in the video content, such as people, animals, or objects. +- Attributes: Detail the attributes of the subjects, such as physical appearance, emotional expressions, and other qualities. +- Scenes: Identify the settings or environment of the video, including location, ambiance, time, and weather if applicable. +- Actions: Identify the subjects' actions or events, including movements, interactions, and environmental changes. +- OCR: Transcribe all the overlaid text that can be clearly observed in the video. Explain its context and summarize the main content of the text. + +For each aspect, provide a detailed description based on the video frames. The structured output must be in JSON format and follow this template: + +```json +{ + "Main Content": "The central idea and main content of the video.", + "Subjects": "The subjects in the video that appear in the video content.", + "Attributes": "The attributes of the subjects in the video.", + "Scenes": "The settings or environment of the video", + "Actions": "The subjects' actions or events in the video", + "OCR": "Transcription and interpretation of the text overlays in the video, along with its context and a summary of the main content." +} +``` +""" + + +# In[33]: + + +stage2_prompt = """ +As a customer service agent, your primary responsibility is to score a given video content description (caption) based on the user-uploaded videos regarding the above aspects. + +First, score the caption based on its main content and central idea. 
The main content of the video: {content} + +The scoring criteria are as follows (1~5): +- 1: Does not accurately reflect the main content of the video or is largely incorrect. +- 2: Somewhat reflects the main content of the video, but there are significant inaccuracies or omissions. +- 3: Generally reflects the main content of the video, but there are some inaccuracies or omissions. +- 4: Accurately reflects the main content of the video, with only minor inaccuracies or omissions. +- 5: Perfectly reflects the main content of the video, with no inaccuracies or omissions. + +Then, compare the content extracted for each aspect (Subjects/Attributes/Scenes/Actions/OCR) with the caption to determine whether the caption mentions these contents. +The extracted aspects includes: +- Subjects: {subjects} +- Attributes: {attributes} +- Scenes: {scenes} +- Actions: {actions} +- OCR: {ocr} + +And compare the caption with the video content to judge whether the caption hallucinates content that does not exist in the video for each aspect. +Assign a score to each aspect of the caption based on the following criteria, and provide the reasons: + +- 1: Does not mention the key aspect or is entirely hallucinated. +- 2: Lacks major aspects or contains major hallucinations. +- 3: Generally recalls the aspects, but contains some hallucinations or lacks some aspects. +- 4: Recalls most aspects, but lacks minor aspects or contains minor hallucinations. +- 5: Accurately recalls all aspects with no hallucinations. + +Please note, in terms of 'Text Overlays', if there are no text overlays in the video, any text that the caption hallucinates will be seen as a negative indicator. + +Finally, output the scores for each aspect along with the reasoning behind the scores into a JSON output format. 
+ +```json +{{ + "Main Content Reason": "Reason for the score concerning the main content and central idea.", + "Main Content Score": "Score assigned for the main content and central idea of the caption.", + "Subjects Reason": "Reason for the score concerning 'Subjects', considering both recall and hallucination.", + "Subjects Score": "Score assigned for caption concerning 'Subjects'.", + "Attributes Reason": "Reason for the score concerning 'Attributes', considering both recall and hallucination.", + "Attributes Score": "Score assigned for caption concerning 'Attributes'.", + "Scenes Reason": "Reason for the score concerning 'Scenes', considering both recall and hallucination.", + "Scenes Score": "Score assigned for caption concerning 'Scenes'.", + "Actions Reason": "Reason for the score concerning 'Actions', considering both recall and hallucination.", + "Actions Score": "Score assigned for caption concerning 'Actions'.", + "OCR Reason": "Reason for the score concerning 'OCR', considering both recall and hallucination.", + "OCR Score": "Score assigned for caption concerning 'OCR'." 
+}} +``` +""" + + +# In[34]: + +ak_list = ["F1uPN8EfGeYXUSwt9iDy4KzxRBBLuxWr", "GjrgjjyJHUbLa15DLnr7t0Bhu6IPqFPj", "9SfaybfwPR3qpKqtZOzFtl28yXQNeHrJ"] +client_list = [AzureVisionClient(ak, max_retries=20) for ak in ak_list] + +pattern = r'\{.*?\}' + + +# In[35]: + + +def encode_image(image): + buffered = BytesIO() + image.save(buffered, format="JPEG") + img_b64_str = base64.b64encode(buffered.getvalue()).decode() + + return img_b64_str + + +def load_frames(path_to_images): + max_frames = 10 + + # Get list of all files in the directory + image_files = [(int(os.path.splitext(file)[0]), file) for file in os.listdir(path_to_images) if + file.endswith(('jpg', 'jpeg', 'png'))] + image_files = sorted(image_files, key=lambda img: img[0]) + # Set up the matplotlib figure and axes, based on the number of images + num_images = len(image_files) + frames = [] + # Read and display each image + for image_file in image_files: + image_path = os.path.join(path_to_images, image_file[1]) + image = Image.open(image_path) + frames.append(image) + + if len(frames) > max_frames: + frames = uniform_sample(frames, max_frames) + + vid_b64_lst = list(map(encode_image, frames)) + + return vid_b64_lst + + +# In[64]: + + +def score_func(item, messages=None): + ans = None + max_attempts = 10 + attempts = 0 + + client = random.choice(client_list) + + while attempts < max_attempts and ans is None: + try: + param_dict = { + 'content': item['Main Content'], + 'subjects': item['Subjects'], + 'attributes': item['Attributes'], + 'actions': item['Actions'], + 'scenes': item['Scenes'], + 'ocr': item['OCR'] + } + usr_prompt = stage2_prompt.format(**param_dict) + '\nThe provided caption:\n' + item['masp_inference'] + + resp, _ = client.request(usr_prompt, messages=messages) + out = json.loads(re.findall(pattern, resp, re.DOTALL)[0].replace("\n", "")) + # out = json.loads(resp.choices[0].message.content) + res = { + "object_id": item['object_id'], + "video_path": item['video_path'], + "policy_list": 
item['policy_list'], + "refine_caption": item['refine_caption'], + "masp_inference": item['masp_inference'], + # "origin_caption_rate": item['origin_caption_rate'], + # "hard_negative_caption": item['hard_negative_caption'], + "Main Content": item['Main Content'], + "Subjects": item['Subjects'], + "Attributes": item['Attributes'], + "Scenes": item['Scenes'], + "Actions": item['Actions'], + "OCR": item['OCR'], + "Main Content Reason": out['Main Content Reason'], + "Main Content Score": out['Main Content Score'], + "Subjects Reason": out['Subjects Reason'], + "Subjects Score": out['Subjects Score'], + "Attributes Reason": out['Attributes Reason'], + "Attributes Score": out['Attributes Score'], + "Scenes Reason": out['Scenes Reason'], + "Scenes Score": out['Scenes Score'], + "Actions Reason": out['Actions Reason'], + "Actions Score": out['Actions Score'], + "OCR Reason": out['OCR Reason'], + "OCR Score": out['OCR Score'] + } + final_score = CONTENT_WEIGHT * int(res['Main Content Score']) + SUBJECTS_WEIGHT * int(res['Subjects Score']) + ATTRIBUTES_WEIGHT * int(res['Attributes Score']) + SCENES_WEIGHT * int(res['Scenes Score']) + ACTIONS_WEIGHT * int(res['Actions Score']) + OCR_WEIGHT * int(res['OCR Score']) + + res['Final Score'] = round(final_score) + + ans = res + except Exception as e: + attempts += 1 + print(e) + return ans + + +# In[65]: + + +def build_context(item): + vid_data = load_frames(item['video_path']) + + content = [{ + "type": "text", + "text": stage1_prompt + }, + *map(lambda x: { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64, {x}" + } + }, vid_data)] + + messages = [{ + 'role': 'user', + 'content': content + }, { + "role": "assistant", + "content": [{ + "type": "text", + "text": item['Resp'] + }] + }] + return messages + + +# In[66]: + + +def eval_pipeline(item): + try: + messages = build_context(item) + + item_score = score_func(item, messages) + + except Exception as e: + print(item["object_id"], e) + return None + + 
return item_score + + +# In[67]: + + +def get_final_score(all_items): + from collections import defaultdict + + score_dict = defaultdict(list) + + for item in all_items: + for key in item: + if not key.endswith('Score'): + continue + + score_dict[key].append(int(item[key])) + + final_score_dict = dict() + for key in score_dict: + final_score_dict[key] = round(sum(score_dict[key]) / len(score_dict[key]), 4) + + return final_score_dict + + +# In[68]: + + +def merge_data(gt_data, pred_data): + gt_dict = dict() + + for item in gt_data: + gt_dict[item['object_id']] = item + + merge_list = [] + for item in pred_data: + if item['object_id'] not in gt_dict: + continue + + # item['Resp'] = gt_dict[item['object_id']]['Resp'] + for key in ['Main Content', 'Subjects', 'Attributes', 'Scenes', 'Actions', 'OCR', 'Resp']: + assert key in gt_dict[item['object_id']], f"object id: {item['object_id']}, key: {key}" + item[key] = gt_dict[item['object_id']][key] + + merge_list.append(item) + + return merge_list + + +# In[70]: + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser("GPT4-V Eval") + parser.add_argument("--pred_file", type=str, + default="/mnt/bn/masp-nas/xiangchen/model/masp_models/checkpoints/llava-thothv2_mar_release_all_data/video_chair/pred_result_5k.json") + parser.add_argument("--gt_file", type=str, + default="/mnt/bn/algo-masp-nas-2/xiangchen/dataset/masp/eval_v1_1_neg_5k_stage1_info.json") + parser.add_argument("-f", "--fff", type=str, default="1", required=False) + args = parser.parse_args() + + pred_data = json.load(open(args.pred_file)) + print(f"Pred data: {len(pred_data)}") + gt_data = json.load(open(args.gt_file)) + print(f"GT data: {len(gt_data)}") + + merge_list = merge_data(gt_data, pred_data) + + print(f"==> Valid data size: {len(merge_list)}") + print('evaluation version: v1') + + executor = ProcessPoolExecutor(max_workers=20) + + all_tasks = [executor.submit(eval_pipeline, item) for item in merge_list] + + all_results = [] 
+ + for future in tqdm(as_completed(all_tasks)): + result = future.result() + + + # for item in tqdm(merge_list): + # result = eval_pipeline(item) + + if result is None: + continue + + all_results.append(result) + + print(f"==> Succeeded {len(all_results)} samples.") + + detail_file = os.path.splitext(args.pred_file)[0] + "_detail_res.json" + json.dump(all_results, open(detail_file, 'w'), indent=4, ensure_ascii=False) + + final_score = get_final_score(all_results) + final_file = os.path.splitext(args.pred_file)[0] + "_final_score.json" + json.dump(final_score, open(final_file, 'w'), indent=4, ensure_ascii=False) + + print(f"==> Final Result") + + print(json.dumps(final_score, indent=4)) + + print("==> Eval Finished.") + + + + + diff --git a/app/llava/eval/masp_eval/utils/__init__.py b/app/llava/eval/masp_eval/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7d6338135e6e6fe83b46e25d8d27c49e3de3188e --- /dev/null +++ b/app/llava/eval/masp_eval/utils/__init__.py @@ -0,0 +1,2 @@ +from .video_utils import * +from .azure_utils import * \ No newline at end of file diff --git a/app/llava/eval/masp_eval/utils/azure_utils.py b/app/llava/eval/masp_eval/utils/azure_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..285df1fec6877b10ba8eef056ab8c6f28486c647 --- /dev/null +++ b/app/llava/eval/masp_eval/utils/azure_utils.py @@ -0,0 +1,159 @@ +#coding=utf-8 +import time +import copy +import requests +import openai +from abc import ABC, abstractmethod + +class AzureVisionClient(): + def __init__(self, ak, max_retries=3): + + self.client = openai.AzureOpenAI( + azure_endpoint="https://search-va.byteintl.net/gpt/openapi/online/multimodal/crawl/", + api_version="2023-09-01-preview", + api_key=ak + ) + self.max_retries = max_retries + self.model_name = 'gptv' + self.temperature = 0.000000001 + self.max_tokens = 3000 + + def request(self, query, messages=None, img_data=None, vid_data=None): + if messages is None: + messages 
= [] + + content = { + "type": "text", + "text": query + } + if img_data: + content = [ + content, + { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{img_data}" + } + } + ] + elif vid_data: + content = [ + content, + *map(lambda x: { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64, {x}" + } + }, vid_data) + ] + else: + content = [content] + + messages.append({ + 'role': 'user', + 'content': content + }) + + completion = None + num_cur_retry = 0 + while num_cur_retry < self.max_retries: + try: + completion = self.client.chat.completions.create( + model=self.model_name, # gptv 或 openai_gpt-4-vision + max_tokens=self.max_tokens, + temperature=self.temperature, + messages=messages + ) + break + except Exception as e: + num_cur_retry += 1 + if 'Error code: 429' not in e.message: + completion = None + print(e) + break + if num_cur_retry % 20 == 1: + print('retry times:', num_cur_retry, e) + time.sleep(5) + + resp = completion.choices[0].message.content + messages.append({ + "role": "assistant", + "content": [{ + "type": "text", + "text": completion.choices[0].message.content + }] + }) + + return resp, messages + + +class BaseAPIWrapper(ABC): + @abstractmethod + def get_completion(self, user_prompt, system_prompt=None): + pass + +class GPTAPIWrapper(BaseAPIWrapper): + def __init__(self, ak, max_retries=1000): + # self.key_pool = key_pool + # self.temperature = temperature + # self.model = model + # self.time_out = time_out + # openai.api_base = "https://search-us.byteintl.net/gpt/openapi/online/v2/crawl" + # openai.api_type = "azure" + # openai.api_version = "2023-06-01-preview" + # openai.api_key = key_pool[0] + + self.client = openai.AzureOpenAI( + azure_endpoint="https://search-us.byteintl.net/gpt/openapi/online/v2/crawl", + api_version="2023-06-01-preview", + api_key=ak + ) + self.max_retries = max_retries + self.model_name = 'gpt-4-32k-0613' + self.temperature = 0.000000001 + self.max_tokens = 3000 + + def 
request(self, system_content, usr_question, previous_msg=None, last_answer=None):
+        if previous_msg is None:
+            msgs = [
+                {"role": "system", "content": f"{system_content}"},
+                {"role": "user", "content": f"{usr_question}"}
+            ]
+        else:
+            msgs = copy.deepcopy(previous_msg)
+            msgs += [
+                {"role": "assistant", "content": last_answer},
+                {"role": "user", "content": usr_question}
+            ]
+        response = self.client.chat.completions.create(
+            messages=msgs,
+            temperature=self.temperature,
+            max_tokens=self.max_tokens,
+            model=self.model_name,
+        )
+        resp = response.choices[0].message.content
+        # total_tokens = response.usage['total_tokens']
+
+        return resp, msgs
+
+    def get_completion(self, user_prompt=None, system_prompt=None, previous_msgs=None, last_answer=None):
+        gpt_cv_nlp, msgs = '[]', previous_msgs  # bind msgs up front: if every retry below fails, the final return otherwise raises UnboundLocalError
+        key_i = 0
+        total_tokens = 0
+        max_try = self.max_retries
+        # gpt_cv_nlp, total_tokens = self.request(system_prompt, user_prompt)
+        while max_try > 0:
+            try:
+                gpt_cv_nlp, msgs = self.request(system_prompt, user_prompt, previous_msgs, last_answer)
+                # print('Succ: ', gpt_cv_nlp)
+                max_try = 0
+                break
+            except Exception as e:
+                print("fail ", max_try, e)
+                # key = self.key_pool[key_i%2]
+                # openai.api_key = key
+                # key_i += 1
+                time.sleep(1)
+                max_try -= 1
+
+        return gpt_cv_nlp, msgs
\ No newline at end of file
diff --git a/app/llava/eval/masp_eval/utils/video_utils.py b/app/llava/eval/masp_eval/utils/video_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e12512c3f01ff8d684e96bf6ddba8b8ddfbdf55
--- /dev/null
+++ b/app/llava/eval/masp_eval/utils/video_utils.py
@@ -0,0 +1,71 @@
+import os
+import copy
+import random
+import numpy as np
+from PIL import Image
+
+def get_image(image_path):
+    image = Image.open(image_path).convert('RGB')
+    return image
+
+
+def load_frames(frames_dir):
+    results = []
+    image_files = [(int(os.path.splitext(img)[0]), img) for img in os.listdir(frames_dir) if img.endswith('jpg')]
+    image_files = sorted(image_files, key=lambda img: img[0])
+
+    
for frame_name in image_files: + image_path = f"{frames_dir}/{frame_name[1]}" + image = get_image(image_path) + results.append(image) + return results + + +def uniform_sample(frames, num_segments): + """ + Uniformly samples 10 frames from a list of frames. + + Args: + - frames (list): A list of frames. + + Returns: + - list: A list containing 10 uniformly sampled frames. + """ + + indices = np.linspace(start=0, stop=len(frames) - 1, num=num_segments).astype(int) + + frames = [frames[ind] for ind in indices] + + return frames + + +def downsample_frames(frames, interval, keep_first_last=True): + if keep_first_last: + first, last, mid = frames[0], frames[-1], frames[1:-1] + sampled_frames = mid[interval - 1::interval] + ret = [first] + sampled_frames + [last] + + else: + # may output empty list, recommend keep first and last frame + ret = frames[interval - 1::interval] + + return ret + + +def sample_frames(frames, num_segments): + frame_indices = list(range(len(frames))) + cand_indices = copy.deepcopy(frame_indices) + intervals = np.linspace(start=0, stop=len(frame_indices), num=num_segments + 1).astype(int) + ranges = [] + + for idx, interv in enumerate(intervals[:-1]): + ranges.append((interv, intervals[idx + 1] - 1)) + + # try: + # frame_indices = [cand_indices[random.choice(range(x[0], x[1]))] for x in ranges] + # except: + frame_indices = [cand_indices[x[0]] for x in ranges] + + sampled_frames = [frames[indice] for indice in frame_indices] + + return sampled_frames \ No newline at end of file diff --git a/app/llava/eval/masp_eval/video_chair/cal_video_chair.py b/app/llava/eval/masp_eval/video_chair/cal_video_chair.py new file mode 100644 index 0000000000000000000000000000000000000000..8e8262bcc754591113d1ca3c3d6860149be81cd1 --- /dev/null +++ b/app/llava/eval/masp_eval/video_chair/cal_video_chair.py @@ -0,0 +1,287 @@ +from audioop import avg +from email.policy import default +import os +import re +import json +import sys +import argparse + +import openai +from 
abc import ABC, abstractmethod +# from pattern3.en import singularize +# from nltk.stem import WordNetLemmatizer +# from call_dino_service import +from tqdm import tqdm +from functools import partial + +# import spacy +import time +from collections import defaultdict +from copy import deepcopy +from pathlib import Path +from multiprocessing import Pool +from llava.eval.masp_eval.utils import GPTAPIWrapper + +# class RefineCHAIR(): +# def __init__(self): +# self.system_prompt = "I am ChatGPT, a virtual assistant based on OpenAI's GPT-4 model. I'm designed to understand and generate human-like text based on the input I receive. My main purpose is to assist with information, answer questions, help with tasks that involve natural language processing, and engage in conversations with users.Please note that while I aim to provide accurate and reliable information, I can't guarantee perfection, and it's always a good idea to consult additional resources or professionals when making critical decisions based on the information I provide." +# self.openai_obj = GPTAPIWrapper(ak="GjrgjjyJHUbLa15DLnr7t0Bhu6IPqFPj") +# with open('llava/eval/masp_eval/video_chair/prompts/cap_mention.txt', 'r') as file: +# content = file.read() +# self.cap_user_prompt = content +system_prompt = "I am ChatGPT, a virtual assistant based on OpenAI's GPT-4 model. I'm designed to understand and generate human-like text based on the input I receive. My main purpose is to assist with information, answer questions, help with tasks that involve natural language processing, and engage in conversations with users.Please note that while I aim to provide accurate and reliable information, I can't guarantee perfection, and it's always a good idea to consult additional resources or professionals when making critical decisions based on the information I provide." 
+ +with open('llava/eval/masp_eval/video_chair/prompts/cap_mention.txt', 'r') as file: + content = file.read() +cap_user_prompt = content + +openai_obj = GPTAPIWrapper(ak="GjrgjjyJHUbLa15DLnr7t0Bhu6IPqFPj") + + +def _add(case_res, all_res): + for key, value in case_res.items(): + for idx, count_ in enumerate(value): + all_res[key][idx] += count_ + return + +def save_metric(coverage, hallucination, case_len, output_dir=None): + final_metrics = {} + for name, res in [['coverage', coverage], ['hallucination', hallucination]]: + combine_counter = [0, 0] + for cat, counter in res.items(): + final_metrics[name+'_'+cat] = round(counter[0] * 100/ counter[1], 2) + combine_counter[0] += counter[0] + combine_counter[1] += counter[1] + if name == 'hallucination': + final_metrics[name+'_'+cat] = round(100 - final_metrics[name+'_'+cat], 2) + final_metrics[name] = round(combine_counter[0] * 100 / combine_counter[1], 2) + if name == 'hallucination': + final_metrics[name] = round(100 - final_metrics[name], 2) + final_metrics['avg_len'] = round(sum(case_len) / len(case_len), 1) + + if output_dir is not None: + with (output_dir / 'chair_metric_neg.json').open('w') as f: + json.dump(final_metrics, f, indent=4) + + print(json.dumps(final_metrics, indent=1)) + +def combine_info(pred_info, gt_info): + combined_info = defaultdict(dict) + if 'object_id' in gt_info[0]: + id_key = 'object_id' + else: + id_key = 'task_id' + for gt in gt_info: + object_id = gt[id_key] + if gt['cap_info'] is None: + continue + combined_info[object_id]['gt_caption'] = gt['refine_caption'] + combined_info[object_id]['gt_info'] = gt['cap_info'] + + for pred in pred_info: + object_id = pred[id_key] + if object_id not in combined_info: + # print(pred) + continue + if pred['cap_info'] is None: + continue + combined_info[object_id]['pred_caption'] = pred['masp_inference'] + combined_info[object_id]['pred_info'] = pred['cap_info'] + filtered_ids = [] + for key, value in combined_info.items(): + if ('pred_info' not in 
value) or ('gt_info' not in value): + filtered_ids.append(key) + for obj_id in filtered_ids: + del combined_info[obj_id] + + print(f'evaluation cases: {len(combined_info)}') + + return combined_info + +def format_question(info): + categories = ['subjects', 'activities', 'locations', 'text_overlays'] + question_id = 0 + question_mapping = {} + questions = [] + for cat in categories: + if cat == 'subjects': + for c_id, character_content in enumerate(info['subjects']): + questions.append(cat + ':' + character_content['name']) + question_mapping[question_id] = (cat, c_id) + question_id += 1 + if 'attributes' not in character_content: + continue + for a_id, attr in enumerate(character_content['attributes']): + questions.append(character_content['name'] + ':' + attr) + question_mapping[question_id] = ('attributes', c_id, a_id) + question_id += 1 + + else: + for c_id, cat_attr in enumerate(info[cat]): + questions.append(cat + ':' + cat_attr) + question_mapping[question_id] = (cat, c_id) + question_id += 1 + + question_str = '' + for idx, q in enumerate(questions): + question_str += f'{idx+1}. 
{q}' + '\n'
+
+    return question_str, question_mapping
+
+def parsing_results(gpt_ret, question_mapping):
+    gpt_ret = gpt_ret.lower()
+    pattern = r'(\d+)\.(.+) - (yes|no|maybe),(.+)'
+
+    # Find all matches in the text
+    matches = re.findall(pattern, gpt_ret)
+    collected_answer = defaultdict(lambda:[0,0])
+    # Tally per-category [hits, total]; 'yes' and 'maybe' both count as hits
+    for match in matches:
+        question_id, question, answer, reason = match
+        question_id = int(question_id) - 1
+        cat = question_mapping[question_id][0]
+        collected_answer[cat][1] += 1
+        if 'yes' in answer:
+            collected_answer[cat][0] += 1
+        elif 'no' in answer:
+            pass
+        elif 'maybe' in answer:
+            collected_answer[cat][0] += 1
+        else:
+            raise NotImplementedError  # was a bare no-op expression; unreachable since the regex only captures yes|no|maybe
+    return collected_answer
+
+
+
+def process_coverage(data):
+    object_id = data[0]
+    case_info = data[1]
+    gt_info = case_info['gt_info']
+    # if gt_info is None:
+    #     return None
+    try:
+        question_str, question_mapping = format_question(gt_info)
+    except Exception as e:
+        print(e)
+        return None
+    user_prompt = deepcopy(cap_user_prompt)
+    user_prompt = user_prompt.replace("/video caption/", case_info['pred_caption'])
+    user_prompt = user_prompt.replace("/question/", question_str)
+    gpt_ret, _ = openai_obj.get_completion(user_prompt=user_prompt, system_prompt=system_prompt)
+    try:
+        coverage_res = parsing_results(gpt_ret, question_mapping)
+    except Exception as e:
+        print(e)
+        print(gpt_ret)
+        return None
+    sentence_len = len(case_info['pred_caption'].split(' '))
+    return (object_id, gpt_ret, dict(coverage_res), sentence_len)
+
+
+def process_hallucination(data):
+    object_id = data[0]
+    case_info = data[1]
+    pred_info = case_info['pred_info']
+    # if pred_info is None:
+    #     return None
+    try:
+        question_str, question_mapping = format_question(pred_info)
+    except Exception as e:
+        print(e)
+        return None
+    user_prompt = deepcopy(cap_user_prompt)
+    user_prompt = user_prompt.replace("/video caption/", case_info['gt_caption'])
+    user_prompt = user_prompt.replace("/question/", question_str)
+    gpt_ret, _ =
openai_obj.get_completion(user_prompt=user_prompt, system_prompt=system_prompt) + try: + hallucination_res = parsing_results(gpt_ret, question_mapping) + except Exception as e: + print(e) + print(gpt_ret) + return None + # self._add(hallucination_res, evaluator.hallucination_metric) + # saved_combined_info[object_id]['hallucination_res'] = gpt_ret + # print(gpt_ret) + return (object_id, gpt_ret, dict(hallucination_res)) + + + +def compute_refine_chair(pred_file, gt_file, coverage_file, hallucination_file): + coverage_metric = defaultdict(lambda:[0,0]) + hallucination_metric = defaultdict(lambda:[0,0]) + case_len = [] + + with open(pred_file, 'r', encoding='utf-8') as f: + pred_info = json.load(f) + with open(gt_file, 'r', encoding='utf-8') as f: + gt_info = json.load(f) + + combined_info = combine_info(pred_info, gt_info) + saved_combined_info = deepcopy(combined_info) + combine_info_lst = list(combined_info.items()) + + pool = Pool(processes=32) + print('calculate coverage') + dict_res_coverage = {} + for res in tqdm(pool.imap_unordered(process_coverage, combine_info_lst), total=len(combine_info_lst)): + if res is None: + continue + object_id, gpt_ret, coverage_res, sentence_len = res + _add(coverage_res, coverage_metric) + case_len.append(sentence_len) + saved_combined_info[object_id]['coverage_res'] = gpt_ret + dict_res_coverage[str(object_id)] = coverage_res + + print('calculate hallucination') + dict_res_hallucination = {} + for res in tqdm(pool.imap_unordered(process_hallucination, combine_info_lst), total=len(combine_info_lst)): + if res is None: + continue + object_id, gpt_ret, hallucination_res = res + _add(hallucination_res, hallucination_metric) + saved_combined_info[object_id]['hallucination_res'] = gpt_ret + dict_res_hallucination[str(object_id)] = hallucination_res + + pool.close() + pool.join() + + output_dir = Path(pred_file).parent + + with (output_dir / coverage_file).open('w') as f: + json.dump(dict_res_coverage, f, indent=4) + print(f"Saving 
coverage result for each video in {output_dir}") + + with (output_dir / hallucination_file).open('w') as f: + json.dump(dict_res_hallucination, f, indent=4) + print(f"Saving hallucination result for each video in {output_dir}") + + save_metric(coverage_metric, hallucination_metric, case_len, output_dir) + with (output_dir / 'chair_metric_detailed.json').open('w') as f: + json.dump(saved_combined_info, f, indent=4) + + +def print_metrics(hallucination_cap_dict, quiet=False): + sentence_metrics = hallucination_cap_dict['overall_metrics'] + metric_string = "%0.01f\t%0.01f" %(sentence_metrics['CHAIRs']*100, + sentence_metrics['CHAIRi']*100) + if not quiet: + print("CHAIRs\tCHAIRi") + print(metric_string) + print(sentence_metrics['sentence len']) + print(sentence_metrics['avg objects']) + else: + return metric_string + +# python3 chair/chair_gpt.py --cap_file /mnt/bd/bohanzhaiv1/LLM/bohan/POPE/caption_data/vg_instruction1_llava.json --annotation_path /mnt/bn/algo-masp-nas-2/masp_data/coco_2014/annotations +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument("--pred_file", type=str, default='/mnt/bn/yukunfeng-nasdrive/xiangchen/model/masp_models/checkpoints/mistral-ablation-v077-ocr/video_chair/vid_top1k_neg_res_non_dup_info.json') + parser.add_argument("--gt_file", type=str, default='/mnt/bn/yukunfeng-nasdrive/xiangchen/repo/benchmark_data/refine_chair_eval_gt_neg_1k.json') + parser.add_argument("--coverage_filename", type=str, default='each_video_coverage_detail.json') + parser.add_argument("--hallucination_filename", type=str, default='each_video_halluciantion_detail.json') + + # parser.add_argument("--gt_file", type=str, default='/mnt/bn/yukunfeng-nasdrive/xiangchen/repo/benchmark_data/refine_chair_eval_gt.json') + args = parser.parse_args() + + compute_refine_chair(args.pred_file, args.gt_file, args.coverage_filename, args.hallucination_filename) + \ No newline at end of file diff --git 
a/app/llava/eval/masp_eval/video_chair/eval_instance_video_chair.py b/app/llava/eval/masp_eval/video_chair/eval_instance_video_chair.py new file mode 100644 index 0000000000000000000000000000000000000000..9d8c6e3544edb6bf315e4a4552344a003052b41a --- /dev/null +++ b/app/llava/eval/masp_eval/video_chair/eval_instance_video_chair.py @@ -0,0 +1,80 @@ +# compute chair for each video +import json +import collections +import argparse +from pathlib import Path + +def eval_video_chair(file_name, metric): + with file_name.open("r") as json_file: + data = json.load(json_file) + + items = {} + coverages = collections.defaultdict(list) + hallucinations = collections.defaultdict(list) + buckets = ['subjects', 'attributes', 'activities', 'locations', 'text_overlays'] + index = 0 + for object_id, tag_info in data.items(): + items[object_id] = index + for tag in buckets: + if tag in tag_info: + cvg = round(tag_info[tag][0]*100 / tag_info[tag][1], 2) + coverages[tag].append(cvg) if metric == "coverage" else hallucinations[tag].append(round(100 - cvg, 2)) + else: # "-100" means gt has no such tag for coverage and pred has no such tag for hallucination, leading to N/A value. 
+ coverages[tag].append(-100) if metric == "coverage" else hallucinations[tag].append(-100) + index += 1 + return (items, coverages) if metric == "coverage" else (items, hallucinations) + + +def get_dict_val(inputs, items, key): + for dd in inputs: + if str(dd["object_id"]) == str(items): + return dd["cap_info"][key] if key in dd["cap_info"] else [] + return [] + + +def get_instance_result(pred_file, gt_file, coverage_file, hallucination_file, save_file): + buckets = ['subjects', 'attributes', 'activities', 'locations', 'text_overlays'] + pred = json.load(open(pred_file, "r")) + gt = json.load(open(gt_file, "r")) + output_dir = Path(pred_file).parent + + items1, coverages = eval_video_chair(output_dir / coverage_file, "coverage") + items2, hallucinations = eval_video_chair(output_dir / hallucination_file, "hallucination") + + gt_map = {str(item['object_id']): item for item in gt} + pred_map = {str(item['object_id']): item for item in pred} + + out = [] + for obj_id, idx_1 in items1.items(): + if obj_id not in items2: + continue + idx_2 = items2[obj_id] + res = {} + for key in buckets: + res["object_id"] = obj_id + res["coverage_"+key] = coverages[key][idx_1] if coverages[key][idx_1] != -100 else "N/A" + res["hallucination_"+key] = hallucinations[key][idx_2] if hallucinations[key][idx_2] != -100 else "N/A" + if key == "attributes": # "skip attributes which are combined in subjects" + continue + res["pred_"+key] = get_dict_val(pred, obj_id, key) + res["gt_"+key] = get_dict_val(gt, obj_id, key) + res['masp_inference'] = pred_map[obj_id]['masp_inference'] + res['refine_caption'] = gt_map[obj_id]['refine_caption'] + out.append(res) + + + with (output_dir / save_file).open("w") as json_data: + json.dump(out, json_data, indent=4) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument("--pred_file", type=str, 
default='/mnt/bn/algo-masp-nas-2/xiangchen/model/masp_models/checkpoints/llava-mistral_gpt4v_public800k_unfreeze_qformer/video_chair/video_chair_1k_res_info.json') + parser.add_argument("--gt_file", type=str, default='/mnt/bn/algo-masp-nas-2/kaili.zhao/data/masp_data/eval/eval_v1.0/eval_benchmark_pos_diverse_1k_11policies_gt.json') + parser.add_argument("--coverage_file", type=str, default='each_video_coverage_detail.json') + parser.add_argument("--hallucination_file", type=str, default='each_video_halluciantion_detail.json') + parser.add_argument("--save_file", type=str, default='video_chair_final.json') + args = parser.parse_args() + get_instance_result(args.pred_file, args.gt_file, args.coverage_file, args.hallucination_file, args.save_file) + print(f"===== Completed video chair for each individual computation! =====") + diff --git a/app/llava/eval/masp_eval/video_chair/info_extract_from_caption.py b/app/llava/eval/masp_eval/video_chair/info_extract_from_caption.py new file mode 100644 index 0000000000000000000000000000000000000000..1ee6fcdbc075003ff0f253308549774c554f66ed --- /dev/null +++ b/app/llava/eval/masp_eval/video_chair/info_extract_from_caption.py @@ -0,0 +1,130 @@ +from operator import truediv +import os +import re +import json +import sys +import argparse +# from nltk.stem import * +# import nltk +import openai +from abc import ABC, abstractmethod +# from pattern3.en import singularize +# from nltk.stem import WordNetLemmatizer +# from call_dino_service import +from tqdm import tqdm +from llava.eval.masp_eval.utils import GPTAPIWrapper + +import time + +class BaseAPIWrapper(ABC): + @abstractmethod + def get_completion(self, user_prompt, system_prompt=None): + pass + +class CHAIR(): + + def __init__(self) -> None: + super().__init__() + self.system_prompt = "I am ChatGPT, a virtual assistant based on OpenAI's GPT-4 model. I'm designed to understand and generate human-like text based on the input I receive. 
My main purpose is to assist with information, answer questions, help with tasks that involve natural language processing, and engage in conversations with users.Please note that while I aim to provide accurate and reliable information, I can't guarantee perfection, and it's always a good idea to consult additional resources or professionals when making critical decisions based on the information I provide." + # self.openai_obj = OpenAIAPIWrapper(key_pool=["VrJQmRwcwnRW3KVEDaE8D9gYZm2a0zPm", "GjrgjjyJHUbLa15DLnr7t0Bhu6IPqFPj"]) + self.openai_obj = GPTAPIWrapper(ak="GjrgjjyJHUbLa15DLnr7t0Bhu6IPqFPj") + with open('llava/eval/masp_eval/video_chair/prompts/cap2info.txt', 'r') as file: + content = file.read() + self.cap_user_prompt = content + with open('llava/eval/masp_eval/video_chair/prompts/refine_json.txt', 'r') as file: + content = file.read() + self.cap_user_prompt_deduplicate = content + + def cap2info_gpt4(self, cap): + user_prompt = self.cap_user_prompt.replace('/video caption/', cap) + gpt_ret1, msgs = self.openai_obj.get_completion(user_prompt=user_prompt, system_prompt=self.system_prompt) + user_prompt = self.cap_user_prompt_deduplicate.replace('/json file/', gpt_ret1) + gpt_ret2, msgs = self.openai_obj.get_completion(user_prompt=user_prompt, system_prompt=self.system_prompt, previous_msgs=msgs, last_answer=gpt_ret1) + match = re.search(r"(?<=```json\n)([\s\S]*?)(?=```)", gpt_ret2) + if match: + try: + info = json.loads(match.group(1)) + except Exception as e: + print(match.group(1)) + info = None + # Split the string into a list of items + return info + else: + try: + start = gpt_ret2.find('{') + end = gpt_ret2.rfind('}') + info = json.loads(gpt_ret2[start:end+1]) + return info + except Exception as e: + print(gpt_ret1) + print(gpt_ret2) + return None + + +def post_process_masp_cap_label(evaluator, annotations_file, gt=True): + results = [] + with open(annotations_file, 'r', encoding='utf-8') as f: + annotations = json.load(f) + for data in 
tqdm(annotations): + if gt: + caption = data['refine_caption'] + else: + caption = data['masp_inference'] + cap_info = evaluator.cap2info_gpt4(caption) + data['cap_info'] = cap_info + results.append(data) + return results + + +from multiprocessing import Pool + +evaluator = CHAIR() + +# Function to process a single data item +def process_data(data, gt): + if gt: + caption = data['refine_caption'] + else: + caption = data['masp_inference'] + cap_info = evaluator.cap2info_gpt4(caption) + data['cap_info'] = cap_info + return data + +# Function to initialize the multiprocessing pool and process the data +def process_annotations(annotations_file, gt=False): + # Load annotations + with open(annotations_file, 'r', encoding='utf-8') as f: + annotations = json.load(f) + + # Create a pool of workers equal to the number of available CPU cores + pool = Pool(processes=32) # None means use all available cores + + # Use a partial function to fix the gt and evaluator arguments + from functools import partial + process_data_partial = partial(process_data, gt=gt) + + # Map the data processing function over the annotations using the pool + # pool.map(process_data_partial, annotations) + res = [] + for data in tqdm(pool.imap_unordered(process_data_partial, annotations), total=len(annotations)): + res.append(data) + # Close the pool and wait for the work to finish + pool.close() + pool.join() + return res + + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument("--cap_file", type=str, default='/mnt/bn/algo-masp-nas-2/xiangchen/model/masp_models/checkpoints/llava-mistral_gpt4v_adso65k_unfreeze_qformer/video_chair/vid_top1k_res.json') + parser.add_argument("--output_file", type=str, default='/mnt/bn/algo-masp-nas-2/xiangchen/model/masp_models/checkpoints/llava-mistral_gpt4v_adso65k_unfreeze_qformer/video_chair/vid_top1k_res_info.json') + parser.add_argument("--gt", type=bool, default=False) + + args = parser.parse_args() + + # post_anno = 
post_process_masp_cap_label(evaluator, args.cap_file, args.gt) + post_anno = process_annotations(args.cap_file, args.gt) + with open(f"{args.output_file}", "w") as file: + json.dump(post_anno, file, indent=4) + diff --git a/app/llava/eval/masp_eval/video_chair/prompts/VQA_generate/Live/noquestion.txt b/app/llava/eval/masp_eval/video_chair/prompts/VQA_generate/Live/noquestion.txt new file mode 100644 index 0000000000000000000000000000000000000000..2d1d2ba1ea465d89db2776dd8240298e8f524d27 --- /dev/null +++ b/app/llava/eval/masp_eval/video_chair/prompts/VQA_generate/Live/noquestion.txt @@ -0,0 +1,5 @@ +You are provided with the sentence which describes an image. You need to finish the following tasks: design questions +based on the contrastive objects/attributes/actions. The contrastive object/attributes/actions are defined as having similar features, easy to confuse or always co-occur. The answer +to the questions should be "no" because the contrastive objects/attributes/actions are not mentioned in the sentence. +describe = {cap} +question = \ No newline at end of file diff --git a/app/llava/eval/masp_eval/video_chair/prompts/VQA_generate/Live/yesquestion.txt b/app/llava/eval/masp_eval/video_chair/prompts/VQA_generate/Live/yesquestion.txt new file mode 100644 index 0000000000000000000000000000000000000000..98ced43b4d25561ab1635e4875925cf9e3513c25 --- /dev/null +++ b/app/llava/eval/masp_eval/video_chair/prompts/VQA_generate/Live/yesquestion.txt @@ -0,0 +1,5 @@ +You are provided with the sentence which describes an image. You need to finish the following tasks: design questions +based on the objects/attributes/actions mentioned in the sentence. The answer to the question should be "yes" because +the objects/attributes/actions are mentioned in the sentence. 
+describe = {cap} +question = \ No newline at end of file diff --git a/app/llava/eval/masp_eval/video_chair/prompts/VQA_generate/QAprompts/yes_no_questions.txt b/app/llava/eval/masp_eval/video_chair/prompts/VQA_generate/QAprompts/yes_no_questions.txt new file mode 100644 index 0000000000000000000000000000000000000000..f67983c4468bd8a2e82d8011e9c1d9a17db61b61 --- /dev/null +++ b/app/llava/eval/masp_eval/video_chair/prompts/VQA_generate/QAprompts/yes_no_questions.txt @@ -0,0 +1,7 @@ +Can you generate some object existence yes no questions with paired answers which should have answer yes, based on provided video caption CAP. +Here are some examples: +1. +Cap = "" +QA: +Question = "" +Answer = "yes" \ No newline at end of file diff --git a/app/llava/eval/masp_eval/video_chair/prompts/cap2info.txt b/app/llava/eval/masp_eval/video_chair/prompts/cap2info.txt new file mode 100644 index 0000000000000000000000000000000000000000..bf6cef9102950651192f0622e43bc8394cdbfa4b --- /dev/null +++ b/app/llava/eval/masp_eval/video_chair/prompts/cap2info.txt @@ -0,0 +1,52 @@ +Please review the provided video caption and create a JSON representation. The 'subjects' should list the main characters or entities in the video along with their distinctive attributes. 'Activities' should describe what actions or events are taking place. 'Locations' should pinpoint where the video is set, and 'text_overlays' should mention any text that appears in the video. Remember, clarity and precision are key. +JSON format: +{ + "subjects": [ + { + "name": "", + "attributes": [ + "" + ] + }, + { + "name": "", + "attributes": [ + "" + ] + } + ], + "activities": [ + "" + ], + "locations": [ + "" + ], + "text_overlays": [ + "" + ] +} + + +Example: +video caption: The video content primarily features a young boy in various casual settings. The boy is then seen wearing filter pink swimming goggles in different scenes, either lying on a couch or sitting. 
The scenes are generally relaxed and playful, with the pink goggles adding a unique touch to the images. The video concludes with the boy still wearing the filter goggles. +Answer: +{ + "subjects": [ + { + "name": "young boy", + "attributes": [ + "wearing filter pink swimming goggles", + ] + } + ], + "activities": [ + "lying on a couch", "sitting" + ], + "locations": [ + "various casual settings" + ], + "text_overlays": [] +} + +The following is the input video caption: +video caption: /video caption/ \ No newline at end of file diff --git a/app/llava/eval/masp_eval/video_chair/prompts/cap2objs.txt b/app/llava/eval/masp_eval/video_chair/prompts/cap2objs.txt new file mode 100644 index 0000000000000000000000000000000000000000..bdfe5166695a1bda8b26c098d44fadfb38fca100 --- /dev/null +++ b/app/llava/eval/masp_eval/video_chair/prompts/cap2objs.txt @@ -0,0 +1,38 @@ +I have a description of an image, and I want to get objects from this description and return these objects in a list the object should be noun, and I don't want duplicated objects. +I don't want scene name to be included, such as some caption describe the image is a scene or depict a position or a situation or place, this things is not an object, don't need to include. +Here some objects are inside [] which we want to ignore. +Here are some examples: + +Example 1: +caption = "The image features a bathroom sink situated under a large mirror. The sink is accompanied by a soap dispenser, and there are multiple toothbrushes placed around it. A few cups can be seen scattered around the sink area as well. \n\nIn addition to the sink, there is a toilet visible to the left side of the bathroom. The overall scene gives an impression of a well-equipped and functional bathroom space. Also a [brush] can been seen." +Answer: +objects = ['sink', 'mirror', 'soap dispenser', 'toothbrush', 'cup', 'toilet'] + +Here we can see [brush] is ignored because its inside []. bathroom is the place not object, so not included. 
+ +Example 2: +caption = "The image depicts a cluttered dining room with a large kitchen table in the center. The table is covered with dirty dishes, including plates, bowls, cups, and utensils. There are several chairs around the table, with some placed closer to the center and others positioned at the edges. In addition to the dishes, there is an apple sitting on the table, likely left over from a meal or snack. A bottle of water can be seen on the table as well, and a [flower], adding to the messy atmosphere of the room." +Answer: +objects = ['table', 'dish', 'bowl', 'cup', 'utensil', 'chair', 'apple', 'water'] + +Here [flower] is in [], should be ignored. Here dining room and room are places, so ignored, not in objects. + +Example 3: +caption = "The image depicts a busy city street with a pedestrian crossing in a sunny day. A man is walking across the street, carrying a backpack and wearing a jacket." +Answer: +objects = ['street', 'pedestrian crossing', 'man', 'backpack', 'jacket'] + +Here 'city' is place, so not an object so not include in objects. 'The image depicts' is about image caption task, so not object in the scence. +'sunny' or 'sunny day' or 'day' are not object in the image, this is time situation so not object, can't in objects. + +Example 4: +caption = "The image depicts an office cubicle with a desk in the center. The desk is equipped with a computer, a keyboard, and a mouse." +Answer: +objects = ['desk', 'computer', 'keyboard', 'mouse'] + +Here office is place so not in objects. Here 'center' is not objects, 'center' is position, not object, same thing like 'left' or 'right' etc. 
+ +Here is the input caption +caption = {cap} +Answer: +objects = \ No newline at end of file diff --git a/app/llava/eval/masp_eval/video_chair/prompts/cap_mention.txt b/app/llava/eval/masp_eval/video_chair/prompts/cap_mention.txt new file mode 100644 index 0000000000000000000000000000000000000000..ab63d51fe9cc5e95f451dd4101367ecd03a7a873 --- /dev/null +++ b/app/llava/eval/masp_eval/video_chair/prompts/cap_mention.txt @@ -0,0 +1,7 @@ +/video caption/ +According to the above video caption, please judge one by one whether the following contents are mentioned in the caption or not. If it's not sure, please answer 'maybe' +/question/ +The answer should be in the following format: +. - , +Format example: +1. activities:dancing - No, the video only metioned the woman is singing. \ No newline at end of file diff --git a/app/llava/eval/masp_eval/video_chair/prompts/gpt4v_score.txt b/app/llava/eval/masp_eval/video_chair/prompts/gpt4v_score.txt new file mode 100644 index 0000000000000000000000000000000000000000..9a8b32091aa54253a8732cd8a80d59347343ae3d --- /dev/null +++ b/app/llava/eval/masp_eval/video_chair/prompts/gpt4v_score.txt @@ -0,0 +1,31 @@ +Please conduct an accuracy evaluation of the following video caption compared to the video content. Your assessment should focus on two main tasks, with an important condition to avoid counting duplicate facts: + +Count the Correct Facts/Details (Avoiding Duplicates): + +Identify and count each distinct fact or detail in the caption that accurately corresponds to the video. This includes names, dates, places, specific events, and any data points. +Ensure that each correct fact or detail is counted only once, regardless of how many times it appears in the caption. +Provide a list of these correct facts/details and their total count. +Count the Incorrect Facts/Details (Avoiding Duplicates): + +Identify and count each distinct fact or detail in the caption that is inaccurately represented or not present in the video. 
This includes hallucinations, factual errors, or misrepresentations. +Make sure to count each incorrect fact or detail only once, even if it appears multiple times in the caption. +Provide a list of these incorrect facts/details and their total count. +In your analysis, be meticulous in ensuring that duplicates are not counted in either category. This is essential to provide an accurate assessment of the caption's accuracy. Present your findings clearly, specifying the exact number of correct and incorrect facts/details as per the video content. + +Imagine you are seeing the full video content. Please do reasonable inference for the temporal facts/detail based on the still images. +The answer should be formartted as the following structure: + +Total Count of Correct Facts/Details: [Insert Total Number] +List of Correct Facts/Details: + Fact/Detail 1: [Description] + Fact/Detail 2: [Description] + ... + Fact/Detail N: [Description] +Total Count of Incorrect Facts/Details: [Insert Total Number] +List of Incorrect Facts/Details: +Fact/Detail 1: [Description] +Fact/Detail 2: [Description] +... +Fact/Detail N: [Description] + +Here is the given video caption: /caption/ diff --git a/app/llava/eval/masp_eval/video_chair/prompts/gpt4v_sys_prompt.txt b/app/llava/eval/masp_eval/video_chair/prompts/gpt4v_sys_prompt.txt new file mode 100644 index 0000000000000000000000000000000000000000..5b958dfde2132abe4efc8f6067333cdca144bb14 --- /dev/null +++ b/app/llava/eval/masp_eval/video_chair/prompts/gpt4v_sys_prompt.txt @@ -0,0 +1 @@ + Picture yourself as a customer service agent managing user-uploaded video. The uploaded video, consists of a seires of images. All the analysis should be video-level. 
\ No newline at end of file diff --git a/app/llava/eval/masp_eval/video_chair/prompts/object_coverage.txt b/app/llava/eval/masp_eval/video_chair/prompts/object_coverage.txt new file mode 100644 index 0000000000000000000000000000000000000000..fe5eeccf5d91d83eb35ae23f7e7ec8094489b6e6 --- /dev/null +++ b/app/llava/eval/masp_eval/video_chair/prompts/object_coverage.txt @@ -0,0 +1,45 @@ +I have two list of objects, list_A and list_B, I want to return a list named uncover which find items in list_B doesn't appear in list_A, +sometimes same object can be expressed in different ways in list_A and list_B, we treat different expression but similar meaning objects as matched, not include in mistmatch list. + +For example +list_A = ['two cars', 'dark bagpack', 'yellow jacket', 'light', 'brick building', 'wood chair', 'chair', 'green car', 'dining room table', 'bike', 'city street', 'traffic light', 'sedan'] +list_B = ['reflection of light', 'view of office building', 'street chair', 'white car', 'red car', 'dark hair'] + +Answer: +uncover = ['reflection of light', 'dark hair'] + +In this example +'reflection of light' cannot find matched object in list_A, especially, 'light' is not equal to 'reflection of light'. +'view of office building' in list_B can find matched object 'brick building' although they are not exactly same but they point to similar object. +'street chair' in list_B can find 'chair', 'wood chair' in list_A which is an alternate expression of 'chair'. +'white car' in list_B can find 'two cars' in list_A. +'red car' in list_B can find 'two cars' in list_A. +'dark hair' in list_B cannot find anything similar in list_A + +Another example +list_A = ['bag', 'cloth', 'boy', 'Drinking glasses', 'table'] +list_B =['backpack', 'jacket', 'young man', 'cup', 'kitchen table', 'plate', 'apple'] +Answer: +uncover = ['plate', 'apple'] + +In this example, +'backpack' in list_B can find 'bag' in list_A has similar meaning, matched. 
+'jacket' in list_B can be seen as a kind of 'cloth' in list_A still matching; +'young man' in list_B can match 'boy' in list_A; +'cup' in list_B is similar to 'Drinking glasses' in list_A; +'kitchen table' is a kind of table as 'table' in list_A so there is no uncovered items. +'plate' in list_B but no object has same or similar meaning in list_A. +'apple' in list_B but no object has same or similar meaning in list_A. + +Another example +list_A = ['keyboard', 'mouse', 'moniter', 'cpu'] +list_B = ['computer'] +Answer: +uncover = [] +'computer' in list_B can find 'keyboard', 'mouse', 'moniter', 'cpu' as whole thing in list_A, matched. + +Here is the inputs +list_A = {cap_obj} +list_B = {gt} +Answer: +uncover = \ No newline at end of file diff --git a/app/llava/eval/masp_eval/video_chair/prompts/refine_json.txt b/app/llava/eval/masp_eval/video_chair/prompts/refine_json.txt new file mode 100644 index 0000000000000000000000000000000000000000..20a35b1c342039b2bada87881629cb43812cb63f --- /dev/null +++ b/app/llava/eval/masp_eval/video_chair/prompts/refine_json.txt @@ -0,0 +1,39 @@ +please remove any duplicate information in the JSON to ensure that each category – 'subjects', 'activities', 'locations', and 'text_overlays' – contains unique elements without repetition. +Also, ensure that all categories in the JSON, including "subjects", "activities", "locations", and "text_overlays", are formatted correctly, please follow these guidelines: +1. **Subjects**: For each subject in the video, provide their name and a list of attributes. Each subject should be a dictionary with a "name" key and an "attributes" key. If there are no specific attributes for a subject, leave the attributes list empty. + + Example Format: + ``` + "subjects": [ + { + "name": "subject1", + "attributes": ["attribute1", "attribute2"] + }, + { + "name": "subject2", + "attributes": [] + } + ] + ``` + +2. **Activities**: List all the activities featured in the video. 
Each activity should be a separate item in the list. + + Example Format: + ``` + "activities": ["activity1", "activity2", "activity3"] + ``` + +3. **Locations**: List all the distinct locations shown in the video. Each location should be a separate item in the list. + + Example Format: + ``` + "locations": ["location1", "location2", "location3"] + ``` + +4. **Text Overlays**: If there are any text overlays in the video, list them. Each piece of text should be a separate item in the list. If there are no text overlays, the list should be empty. + + Example Format: + ``` + "text_overlays": ["text1", "text2", "text3"] + ``` + diff --git a/app/llava/mm_utils.py b/app/llava/mm_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..80b9d9799a8c24f43f9e633bfdc938caa23a16ed --- /dev/null +++ b/app/llava/mm_utils.py @@ -0,0 +1,341 @@ +from PIL import Image +from io import BytesIO +import base64 +import torch +import math +import ast +import copy +import numpy as np +import random +from transformers import StoppingCriteria, CLIPImageProcessor, SiglipImageProcessor +from llava.constants import MM_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_VIDEO_TOKEN + + +def select_best_resolution(original_size, possible_resolutions): + """ + Selects the best resolution from a list of possible resolutions based on the original size. + + Args: + original_size (tuple): The original size of the image in the format (width, height). + possible_resolutions (list): A list of possible resolutions in the format [(width1, height1), (width2, height2), ...]. + + Returns: + tuple: The best fit resolution in the format (width, height). 
+ """ + original_width, original_height = original_size + best_fit = None + max_effective_resolution = 0 + min_wasted_resolution = float('inf') + + for width, height in possible_resolutions: + scale = min(width / original_width, height / original_height) + downscaled_width, downscaled_height = int(original_width * scale), int(original_height * scale) + effective_resolution = min(downscaled_width * downscaled_height, original_width * original_height) + wasted_resolution = (width * height) - effective_resolution + + if effective_resolution > max_effective_resolution or (effective_resolution == max_effective_resolution and wasted_resolution < min_wasted_resolution): + max_effective_resolution = effective_resolution + min_wasted_resolution = wasted_resolution + best_fit = (width, height) + + return best_fit + + +def resize_and_pad_image(image, target_resolution): + """ + Resize and pad an image to a target resolution while maintaining aspect ratio. + + Args: + image (PIL.Image.Image): The input image. + target_resolution (tuple): The target resolution (width, height) of the image. + + Returns: + PIL.Image.Image: The resized and padded image. 
+ """ + original_width, original_height = image.size + target_width, target_height = target_resolution + + scale_w = target_width / original_width + scale_h = target_height / original_height + + if scale_w < scale_h: + new_width = target_width + new_height = min(math.ceil(original_height * scale_w), target_height) + else: + new_height = target_height + new_width = min(math.ceil(original_width * scale_h), target_width) + + # Resize the image + resized_image = image.resize((new_width, new_height)) + + new_image = Image.new('RGB', (target_width, target_height), (0, 0, 0)) + paste_x = (target_width - new_width) // 2 + paste_y = (target_height - new_height) // 2 + new_image.paste(resized_image, (paste_x, paste_y)) + + return new_image + + +def divide_to_patches(image, patch_size): + """ + Divides an image into patches of a specified size. + + Args: + image (PIL.Image.Image): The input image. + patch_size (int): The size of each patch. + + Returns: + list: A list of PIL.Image.Image objects representing the patches. + """ + patches = [] + width, height = image.size + for i in range(0, height, patch_size): + for j in range(0, width, patch_size): + box = (j, i, j + patch_size, i + patch_size) + patch = image.crop(box) + patches.append(patch) + + return patches + + +def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size): + """ + Calculate the shape of the image patch grid after the preprocessing for images of any resolution. + + Args: + image_size (tuple): The size of the input image in the format (width, height). + grid_pinpoints (str): A string representation of a list of possible resolutions. + patch_size (int): The size of each image patch. + + Returns: + tuple: The shape of the image patch grid in the format (width, height). 
+ """ + if type(grid_pinpoints) is list: + possible_resolutions = grid_pinpoints + else: + possible_resolutions = ast.literal_eval(grid_pinpoints) + width, height = select_best_resolution(image_size, possible_resolutions) + return width // patch_size, height // patch_size + + +def process_anyres_image(image, processor, grid_pinpoints): + """ + Process an image with variable resolutions. + + Args: + image (PIL.Image.Image): The input image to be processed. + processor: The image processor object. + grid_pinpoints (str): A string representation of a list of possible resolutions. + + Returns: + torch.Tensor: A tensor containing the processed image patches. + """ + if type(grid_pinpoints) is list: + possible_resolutions = grid_pinpoints + else: + possible_resolutions = ast.literal_eval(grid_pinpoints) + best_resolution = select_best_resolution(image.size, possible_resolutions) + image_padded = resize_and_pad_image(image, best_resolution) + + patches = divide_to_patches(image_padded, processor.crop_size['height'] if hasattr(processor, 'crop_size') else processor.size['height']) + + + if isinstance(processor, CLIPImageProcessor) or isinstance(processor, SiglipImageProcessor): + image_original_resize = image.resize((processor.size['height'], processor.size['width'])) + image_patches = [image_original_resize] + patches + image_patches = [processor.preprocess(image_patch, return_tensors='pt')['pixel_values'][0] + for image_patch in image_patches] + else: + image_original_resize = image.resize((processor.img_size, processor.img_size)) + image_patches = [image_original_resize] + patches + image_patches = [processor.preprocess(image_patch) + for image_patch in image_patches] + return torch.stack(image_patches, dim=0) + + +def load_image_from_base64(image): + return Image.open(BytesIO(base64.b64decode(image))) + + +def expand2square(pil_img, background_color): + width, height = pil_img.size + if width == height: + return pil_img + elif width > height: + result = 
Image.new(pil_img.mode, (width, width), background_color) + result.paste(pil_img, (0, (width - height) // 2)) + return result + else: + result = Image.new(pil_img.mode, (height, height), background_color) + result.paste(pil_img, ((height - width) // 2, 0)) + return result + + +def process_images(images, image_processor, model_cfg): + image_aspect_ratio = getattr(model_cfg, "image_aspect_ratio", None) + new_images = [] + if image_aspect_ratio == 'pad': + for image in images: + image = expand2square(image, tuple(int(x*255) for x in image_processor.image_mean)) + image = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] + new_images.append(image) + elif image_aspect_ratio == "anyres": + for image in images: + image = process_anyres_image(image, image_processor, model_cfg.image_grid_pinpoints) + new_images.append(image) + else: + return image_processor(images, return_tensors='pt')['pixel_values'] + if all(x.shape == new_images[0].shape for x in new_images): + new_images = torch.stack(new_images, dim=0) + return new_images + +def process_images_v2(images, image_processor, model_cfg): + image_aspect_ratio = getattr(model_cfg, "image_aspect_ratio", None) + new_images = [] + if image_aspect_ratio == 'pad': + for image in images: + image = expand2square(image, tuple(int(x*255) for x in image_processor.image_mean)) + if isinstance(image_processor, CLIPImageProcessor) or isinstance(image_processor, SiglipImageProcessor): + image = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] + else: + image = image_processor.preprocess(image) + new_images.append(image) + elif image_aspect_ratio == "anyres": + for image in images: + image = process_anyres_image(image, image_processor, model_cfg.image_grid_pinpoints) + new_images.append(image) + else: + for image in images: + if isinstance(image_processor, CLIPImageProcessor) or isinstance(image_processor, SiglipImageProcessor): + image = image_processor.preprocess(image, 
return_tensors='pt')['pixel_values'][0] + else: + image = image_processor.preprocess(image) + new_images.append(image) + if all(x.shape == new_images[0].shape for x in new_images): + new_images = torch.stack(new_images, dim=0) + return new_images + + +def tokenizer_image_token(prompt, tokenizer, MM_TOKEN_INDEX=MM_TOKEN_INDEX, return_tensors=None): + mm_token = DEFAULT_VIDEO_TOKEN if DEFAULT_VIDEO_TOKEN in prompt else DEFAULT_IMAGE_TOKEN + prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split(mm_token)] + + def insert_separator(X, sep): + return [ele for sublist in zip(X, [sep]*len(X)) for ele in sublist][:-1] + + input_ids = [] + offset = 0 + if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id: + offset = 1 + input_ids.append(prompt_chunks[0][0]) + + for x in insert_separator(prompt_chunks, [MM_TOKEN_INDEX] * (offset + 1)): + input_ids.extend(x[offset:]) + + if return_tensors is not None: + if return_tensors == 'pt': + return torch.tensor(input_ids, dtype=torch.long) + raise ValueError(f'Unsupported tensor type: {return_tensors}') + return input_ids + + +def get_model_name_from_path(model_path): + model_path = model_path.strip("/") + model_paths = model_path.split("/") + if model_paths[-1].startswith('checkpoint-'): + return model_paths[-2] + "_" + model_paths[-1] + else: + return model_paths[-1] + +class KeywordsStoppingCriteria(StoppingCriteria): + def __init__(self, keywords, tokenizer, input_ids): + self.keywords = keywords + self.keyword_ids = [] + self.max_keyword_len = 0 + for keyword in keywords: + cur_keyword_ids = tokenizer(keyword).input_ids + if len(cur_keyword_ids) > 1 and cur_keyword_ids[0] == tokenizer.bos_token_id: + cur_keyword_ids = cur_keyword_ids[1:] + if len(cur_keyword_ids) > self.max_keyword_len: + self.max_keyword_len = len(cur_keyword_ids) + self.keyword_ids.append(torch.tensor(cur_keyword_ids)) + self.tokenizer = tokenizer + self.start_len = input_ids.shape[1] + + def 
call_for_batch(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool: + offset = min(output_ids.shape[1] - self.start_len, self.max_keyword_len) + self.keyword_ids = [keyword_id.to(output_ids.device) for keyword_id in self.keyword_ids] + for keyword_id in self.keyword_ids: + truncated_output_ids = output_ids[0, -keyword_id.shape[0]:] + if torch.equal(truncated_output_ids, keyword_id): + return True + outputs = self.tokenizer.batch_decode(output_ids[:, -offset:], skip_special_tokens=True)[0] + for keyword in self.keywords: + if keyword in outputs: + return True + return False + + def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool: + outputs = [] + for i in range(output_ids.shape[0]): + outputs.append(self.call_for_batch(output_ids[i].unsqueeze(0), scores)) + return all(outputs) + + +def get_frame_indices(num_segments, vlen, sample='rand', fix_start=None, input_fps=1, pad_last=False): + if sample in ['rand', 'middle']: # uniform sampling + num_segments = min(num_segments, vlen) + intervals = np.linspace(start=0, stop=vlen, num=num_segments + 1).astype(int) + ranges = [] + + for idx, interv in enumerate(intervals[:-1]): + ranges.append((interv, intervals[idx + 1] - 1)) + + if sample == 'rand': + try: + frame_indices = [random.choice(range(x[0], x[1])) for x in ranges] + + except: + frame_indices = np.random.permutation(vlen)[:num_segments] + frame_indices.sort() + frame_indices = list(frame_indices) + + elif fix_start is not None: + frame_indices = [x[0] + fix_start for x in ranges] + + elif sample == 'middle': + frame_indices = [(x[0] + x[1]) // 2 for x in ranges] + + if pad_last: + if len(frame_indices) < num_segments: + padded_frame_indices = [frame_indices[-1]] * num_segments + padded_frame_indices[:len(frame_indices)] = frame_indices + frame_indices = padded_frame_indices + + elif "fps" in sample: # fps0.5, sequentially sample frames at 0.5 fps + output_fps = float(sample[3:]) + duration = 
float(vlen) / input_fps + delta = 1 / output_fps # gap between frames, this is also the clip length each frame represents + frame_seconds = np.arange(0 + delta / 2, duration + delta / 2, delta) + frame_indices = np.around(frame_seconds * input_fps).astype(int) + frame_indices = [e for e in frame_indices if e < vlen] + if num_segments > 0 and len(frame_indices) > num_segments: + cand_indices = copy.deepcopy(frame_indices) + intervals = np.linspace(start=0, stop=len(cand_indices), num=num_segments + 1).astype(int) + ranges = [] + + for idx, interv in enumerate(intervals[:-1]): + ranges.append((interv, intervals[idx + 1] - 1)) + + try: + frame_indices = [cand_indices[random.choice(range(x[0], x[1]))] for x in ranges] + except: + frame_indices = [cand_indices[x[0]] for x in ranges] + + else: + raise NotImplementedError + + if len(frame_indices) == 0: + frame_indices = [0] + + return frame_indices \ No newline at end of file diff --git a/app/llava/model/__init__.py b/app/llava/model/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7fbfad5b663146291184a843297ada82963989a2 --- /dev/null +++ b/app/llava/model/__init__.py @@ -0,0 +1,11 @@ +from .language_model.llava_llama import LlavaLlamaForCausalLM, LlavaConfig +from .language_model.llava_mpt import LlavaMptForCausalLM, LlavaMptConfig +from .language_model.llava_mistral import LlavaMistralForCausalLM, LlavaMistralConfig +try: + from .language_model.llava_gemma import LlavaGemmaForCausalLM, LlavaGemmaConfig +except: + pass +try: + from .language_model.llava_thoth import LlavaThothForCausalLM, LlavaThothConfig +except: + pass diff --git a/app/llava/model/__pycache__/__init__.cpython-310.pyc b/app/llava/model/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..97e23c3af81c24fd18c54eb416faa891866ec75e Binary files /dev/null and b/app/llava/model/__pycache__/__init__.cpython-310.pyc differ diff --git 
a/app/llava/model/__pycache__/__init__.cpython-39.pyc b/app/llava/model/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..88a1dbc5a71f505172403e9a074f99fed09075c2 Binary files /dev/null and b/app/llava/model/__pycache__/__init__.cpython-39.pyc differ diff --git a/app/llava/model/__pycache__/builder.cpython-39.pyc b/app/llava/model/__pycache__/builder.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c93eecc5401c1100d4d186be3263c3a211a0dbe5 Binary files /dev/null and b/app/llava/model/__pycache__/builder.cpython-39.pyc differ diff --git a/app/llava/model/__pycache__/llava_arch.cpython-310.pyc b/app/llava/model/__pycache__/llava_arch.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..477b93e92bb6a1a95cbde67c7d1667fa00a6686d Binary files /dev/null and b/app/llava/model/__pycache__/llava_arch.cpython-310.pyc differ diff --git a/app/llava/model/__pycache__/llava_arch.cpython-39.pyc b/app/llava/model/__pycache__/llava_arch.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ecb6b7570e2cb0c307328c88c6e1d10235da1f0b Binary files /dev/null and b/app/llava/model/__pycache__/llava_arch.cpython-39.pyc differ diff --git a/app/llava/model/__pycache__/utils.cpython-310.pyc b/app/llava/model/__pycache__/utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d32b70782900dd687e9367f3d3d8005d409c7a55 Binary files /dev/null and b/app/llava/model/__pycache__/utils.cpython-310.pyc differ diff --git a/app/llava/model/__pycache__/utils.cpython-39.pyc b/app/llava/model/__pycache__/utils.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..10c06172798f8e0e71bf99e591d0d5b23e4c9d67 Binary files /dev/null and b/app/llava/model/__pycache__/utils.cpython-39.pyc differ diff --git a/app/llava/model/apply_delta.py b/app/llava/model/apply_delta.py new file mode 100644 index 
0000000000000000000000000000000000000000..666dd9691bde7d54ddf2871e311d6f621e29f099 --- /dev/null +++ b/app/llava/model/apply_delta.py @@ -0,0 +1,48 @@ +""" +Usage: +python3 -m fastchat.model.apply_delta --base ~/model_weights/llama-7b --target ~/model_weights/vicuna-7b --delta lmsys/vicuna-7b-delta +""" +import argparse + +import torch +from tqdm import tqdm +from transformers import AutoTokenizer, AutoModelForCausalLM +from llava import LlavaLlamaForCausalLM + + +def apply_delta(base_model_path, target_model_path, delta_path): + print("Loading base model") + base = AutoModelForCausalLM.from_pretrained( + base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) + + print("Loading delta") + delta = LlavaLlamaForCausalLM.from_pretrained(delta_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) + delta_tokenizer = AutoTokenizer.from_pretrained(delta_path) + + print("Applying delta") + for name, param in tqdm(delta.state_dict().items(), desc="Applying delta"): + if name not in base.state_dict(): + assert name in ['model.mm_projector.weight', 'model.mm_projector.bias'], f'{name} not in base model' + continue + if param.data.shape == base.state_dict()[name].shape: + param.data += base.state_dict()[name] + else: + assert name in ['model.embed_tokens.weight', 'lm_head.weight'], \ + f'{name} dimension mismatch: {param.data.shape} vs {base.state_dict()[name].shape}' + bparam = base.state_dict()[name] + param.data[:bparam.shape[0], :bparam.shape[1]] += bparam + + print("Saving target model") + delta.save_pretrained(target_model_path) + delta_tokenizer.save_pretrained(target_model_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--base-model-path", type=str, required=True) + parser.add_argument("--target-model-path", type=str, required=True) + parser.add_argument("--delta-path", type=str, required=True) + + args = parser.parse_args() + + apply_delta(args.base_model_path, args.target_model_path, args.delta_path) 
diff --git a/app/llava/model/builder.py b/app/llava/model/builder.py new file mode 100644 index 0000000000000000000000000000000000000000..24f32de6b8e034aa2a75ac1fd2f40d19605b0b44 --- /dev/null +++ b/app/llava/model/builder.py @@ -0,0 +1,181 @@ +# Copyright 2023 Haotian Liu +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import os +import warnings +import shutil + +from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig +import torch +from llava.model import * +from llava.constants import DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, DEFAULT_VIDEO_PATCH_TOKEN, DEFAULT_VIDEO_START_TOKEN, DEFAULT_VIDEO_END_TOKEN + + +def load_pretrained_model(model_path, model_base, model_name, load_8bit=False, load_4bit=False, device_map="auto", device="cuda", use_flash_attn=False, **kwargs): + kwargs = {"device_map": device_map, **kwargs} + + if device != "cuda": + kwargs['device_map'] = {"": device} + + if load_8bit: + kwargs['load_in_8bit'] = True + elif load_4bit: + kwargs['load_in_4bit'] = True + kwargs['quantization_config'] = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_compute_dtype=torch.float16, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type='nf4' + ) + else: + kwargs['torch_dtype'] = torch.float16 + + if use_flash_attn: + kwargs['attn_implementation'] = 'flash_attention_2' + + if 'llava' in model_name.lower(): + # Load LLaVA model + if 'lora' in model_name.lower() and model_base is None: + 
warnings.warn('There is `lora` in model name but no `model_base` is provided. If you are loading a LoRA model, please provide the `model_base` argument. Detailed instruction: https://github.com/haotian-liu/LLaVA#launch-a-model-worker-lora-weights-unmerged.') + if 'lora' in model_name.lower() and model_base is not None: + from llava.model.language_model.llava_llama import LlavaConfig + lora_cfg_pretrained = LlavaConfig.from_pretrained(model_path) + tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False) + print('Loading LLaVA from base model...') + model = LlavaLlamaForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=lora_cfg_pretrained, **kwargs) + token_num, tokem_dim = model.lm_head.out_features, model.lm_head.in_features + if model.lm_head.weight.shape[0] != token_num: + model.lm_head.weight = torch.nn.Parameter(torch.empty(token_num, tokem_dim, device=model.device, dtype=model.dtype)) + model.model.embed_tokens.weight = torch.nn.Parameter(torch.empty(token_num, tokem_dim, device=model.device, dtype=model.dtype)) + + print('Loading additional LLaVA weights...') + if os.path.exists(os.path.join(model_path, 'non_lora_trainables.bin')): + non_lora_trainables = torch.load(os.path.join(model_path, 'non_lora_trainables.bin'), map_location='cpu') + else: + # this is probably from HF Hub + from huggingface_hub import hf_hub_download + def load_from_hf(repo_id, filename, subfolder=None): + cache_file = hf_hub_download( + repo_id=repo_id, + filename=filename, + subfolder=subfolder) + return torch.load(cache_file, map_location='cpu') + non_lora_trainables = load_from_hf(model_path, 'non_lora_trainables.bin') + non_lora_trainables = {(k[11:] if k.startswith('base_model.') else k): v for k, v in non_lora_trainables.items()} + if any(k.startswith('model.model.') for k in non_lora_trainables): + non_lora_trainables = {(k[6:] if k.startswith('model.') else k): v for k, v in non_lora_trainables.items()} + 
model.load_state_dict(non_lora_trainables, strict=False) + + from peft import PeftModel + print('Loading LoRA weights...') + model = PeftModel.from_pretrained(model, model_path) + print('Merging LoRA weights...') + model = model.merge_and_unload() + print('Model is loaded...') + elif model_base is not None: + # this may be mm projector only + print('Loading LLaVA from base model...') + if 'mpt' in model_name.lower(): + if not os.path.isfile(os.path.join(model_path, 'configuration_mpt.py')): + shutil.copyfile(os.path.join(model_base, 'configuration_mpt.py'), os.path.join(model_path, 'configuration_mpt.py')) + tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=True) + cfg_pretrained = AutoConfig.from_pretrained(model_path, trust_remote_code=True) + model = LlavaMptForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs) + else: + tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False) + cfg_pretrained = AutoConfig.from_pretrained(model_path) + model = LlavaLlamaForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs) + + mm_projector_weights = torch.load(os.path.join(model_path, 'mm_projector.bin'), map_location='cpu') + mm_projector_weights = {k: v.to(torch.float16) for k, v in mm_projector_weights.items()} + model.load_state_dict(mm_projector_weights, strict=False) + else: + if 'mpt' in model_name.lower(): + tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True) + model = LlavaMptForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs) + elif 'mistral' in model_name.lower(): + tokenizer = AutoTokenizer.from_pretrained(model_path) + model = LlavaMistralForCausalLM.from_pretrained( + model_path, + low_cpu_mem_usage=True, + **kwargs + ) + elif 'gemma' in model_name.lower(): + tokenizer = AutoTokenizer.from_pretrained(model_path) + model = LlavaGemmaForCausalLM.from_pretrained( + model_path, + low_cpu_mem_usage=True, + **kwargs + ) + 
elif 'thoth' in model_name.lower(): + tokenizer = AutoTokenizer.from_pretrained(model_path) + model = LlavaThothForCausalLM.from_pretrained( + model_path, + low_cpu_mem_usage=True, + **kwargs + ) + else: + tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False) + model = LlavaLlamaForCausalLM.from_pretrained( + model_path, + low_cpu_mem_usage=True, + **kwargs + ) + else: + # Load language model + if model_base is not None: + # PEFT model + from peft import PeftModel + tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False) + model = AutoModelForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, **kwargs) + print(f"Loading LoRA weights from {model_path}") + model = PeftModel.from_pretrained(model, model_path) + print(f"Merging weights") + model = model.merge_and_unload() + print('Convert to FP16...') + model.to(torch.float16) + else: + use_fast = False + if 'mpt' in model_name.lower(): + tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True) + model = AutoModelForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, trust_remote_code=True, **kwargs) + else: + tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False) + model = AutoModelForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs) + + image_processor = None + + if 'llava' in model_name.lower(): + mm_use_start_end = getattr(model.config, "mm_use_start_end", False) + mm_use_patch_token = getattr(model.config, "mm_use_patch_token", True) + if mm_use_patch_token: + tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_VIDEO_PATCH_TOKEN], special_tokens=True) + if mm_use_start_end: + tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, DEFAULT_VIDEO_START_TOKEN, DEFAULT_VIDEO_END_TOKEN], special_tokens=True) + model.resize_token_embeddings(len(tokenizer)) + + vision_tower = model.get_vision_tower() + if not vision_tower.is_loaded: + vision_tower.load_model(device_map=device_map) + if device_map != 'auto': + 
vision_tower.to(device='cuda', dtype=torch.float16) + image_processor = vision_tower.image_processor + + if hasattr(model.config, "max_sequence_length"): + context_len = model.config.max_sequence_length + else: + context_len = 2048 + + return tokenizer, model, image_processor, context_len diff --git a/app/llava/model/consolidate.py b/app/llava/model/consolidate.py new file mode 100644 index 0000000000000000000000000000000000000000..1e324210e229eeba23b75791bba82df7c6e639eb --- /dev/null +++ b/app/llava/model/consolidate.py @@ -0,0 +1,29 @@ +""" +Usage: +python3 -m llava.model.consolidate --src ~/model_weights/llava-7b --dst ~/model_weights/llava-7b_consolidate +""" +import argparse + +import torch +from transformers import AutoTokenizer, AutoModelForCausalLM +from llava.model import * +from llava.model.utils import auto_upgrade + + +def consolidate_ckpt(src_path, dst_path): + print("Loading model") + auto_upgrade(src_path) + src_model = AutoModelForCausalLM.from_pretrained(src_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) + src_tokenizer = AutoTokenizer.from_pretrained(src_path, use_fast=False) + src_model.save_pretrained(dst_path) + src_tokenizer.save_pretrained(dst_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--src", type=str, required=True) + parser.add_argument("--dst", type=str, required=True) + + args = parser.parse_args() + + consolidate_ckpt(args.src, args.dst) diff --git a/app/llava/model/language_model/__pycache__/llava_gemma.cpython-310.pyc b/app/llava/model/language_model/__pycache__/llava_gemma.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f5a6ff2601e0bd5e94133df316810066668a32d4 Binary files /dev/null and b/app/llava/model/language_model/__pycache__/llava_gemma.cpython-310.pyc differ diff --git a/app/llava/model/language_model/__pycache__/llava_gemma.cpython-39.pyc b/app/llava/model/language_model/__pycache__/llava_gemma.cpython-39.pyc new file mode 
100644 index 0000000000000000000000000000000000000000..f4c00a3deebccc5e7a14ec621fb6ed8139b019d2 Binary files /dev/null and b/app/llava/model/language_model/__pycache__/llava_gemma.cpython-39.pyc differ diff --git a/app/llava/model/language_model/__pycache__/llava_llama.cpython-310.pyc b/app/llava/model/language_model/__pycache__/llava_llama.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2880a6ac9dbef051c888be38500e3d9798e15ffa Binary files /dev/null and b/app/llava/model/language_model/__pycache__/llava_llama.cpython-310.pyc differ diff --git a/app/llava/model/language_model/__pycache__/llava_llama.cpython-39.pyc b/app/llava/model/language_model/__pycache__/llava_llama.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5ce107c2941178840711c288123fc570cc938741 Binary files /dev/null and b/app/llava/model/language_model/__pycache__/llava_llama.cpython-39.pyc differ diff --git a/app/llava/model/language_model/__pycache__/llava_mistral.cpython-310.pyc b/app/llava/model/language_model/__pycache__/llava_mistral.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3fb92f9354c75c4445dd2758f9d5b3484ecdd16f Binary files /dev/null and b/app/llava/model/language_model/__pycache__/llava_mistral.cpython-310.pyc differ diff --git a/app/llava/model/language_model/__pycache__/llava_mistral.cpython-39.pyc b/app/llava/model/language_model/__pycache__/llava_mistral.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a2e37ec7c64c60727692d1f80bafa197e3d3858b Binary files /dev/null and b/app/llava/model/language_model/__pycache__/llava_mistral.cpython-39.pyc differ diff --git a/app/llava/model/language_model/__pycache__/llava_mpt.cpython-310.pyc b/app/llava/model/language_model/__pycache__/llava_mpt.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..39677e002d4bb1bc6a500cfa4516e467e06a475b Binary files /dev/null and 
b/app/llava/model/language_model/__pycache__/llava_mpt.cpython-310.pyc differ diff --git a/app/llava/model/language_model/__pycache__/llava_mpt.cpython-39.pyc b/app/llava/model/language_model/__pycache__/llava_mpt.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b2007d6122583c1ccc8c5ff763310c1e2e41eb9e Binary files /dev/null and b/app/llava/model/language_model/__pycache__/llava_mpt.cpython-39.pyc differ diff --git a/app/llava/model/language_model/__pycache__/llava_thoth.cpython-310.pyc b/app/llava/model/language_model/__pycache__/llava_thoth.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..388387c3fe30e1275fee559aa1722c7316f1e48a Binary files /dev/null and b/app/llava/model/language_model/__pycache__/llava_thoth.cpython-310.pyc differ diff --git a/app/llava/model/language_model/__pycache__/llava_thoth.cpython-39.pyc b/app/llava/model/language_model/__pycache__/llava_thoth.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..672a5c8e44a43205b26abf12c6c2088ee69261c2 Binary files /dev/null and b/app/llava/model/language_model/__pycache__/llava_thoth.cpython-39.pyc differ diff --git a/app/llava/model/language_model/llava_gemma.py b/app/llava/model/language_model/llava_gemma.py new file mode 100644 index 0000000000000000000000000000000000000000..7f072970cbe2021ef15597c23be396fe8c9dc5f4 --- /dev/null +++ b/app/llava/model/language_model/llava_gemma.py @@ -0,0 +1,160 @@ +# Copyright 2023 Haotian Liu +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn as nn +from torch.nn import CrossEntropyLoss + +from transformers import AutoConfig, AutoModelForCausalLM, \ + GemmaConfig, GemmaForCausalLM, GemmaModel + +from transformers.modeling_outputs import CausalLMOutputWithPast +from transformers.generation.utils import GenerateOutput + +from ..llava_arch import LlavaMetaModel, LlavaMetaForCausalLM + + +class LlavaGemmaConfig(GemmaConfig): + model_type = "llava_gemma" + + +class LlavaGemmalModel(LlavaMetaModel, GemmaModel): + config_class = LlavaGemmaConfig + + def __init__(self, config: GemmaConfig): + super(LlavaGemmalModel, self).__init__(config) + + +class LlavaGemmaForCausalLM(GemmaForCausalLM, LlavaMetaForCausalLM): + config_class = LlavaGemmaConfig + + def __init__(self, config): + super(LlavaGemmaForCausalLM, self).__init__(config) + self.model = LlavaGemmalModel(config) + + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_model(self): + return self.model + + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + images: Optional[torch.FloatTensor] = None, + image_sizes: Optional[List[List[int]]] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + + if inputs_embeds is None: + ( + input_ids, + position_ids, + attention_mask, + 
past_key_values, + inputs_embeds, + labels + ) = self.prepare_inputs_labels_for_multimodal( + input_ids, + position_ids, + attention_mask, + past_key_values, + labels, + images, + image_sizes + ) + + return super().forward( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + labels=labels, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position + ) + + @torch.no_grad() + def generate( + self, + inputs: Optional[torch.Tensor] = None, + images: Optional[torch.Tensor] = None, + image_sizes: Optional[torch.Tensor] = None, + **kwargs, + ) -> Union[GenerateOutput, torch.LongTensor]: + position_ids = kwargs.pop("position_ids", None) + attention_mask = kwargs.pop("attention_mask", None) + if "inputs_embeds" in kwargs: + raise NotImplementedError("`inputs_embeds` is not supported") + + if images is not None: + ( + inputs, + position_ids, + attention_mask, + _, + inputs_embeds, + _ + ) = self.prepare_inputs_labels_for_multimodal( + inputs, + position_ids, + attention_mask, + None, + None, + images, + image_sizes=image_sizes + ) + else: + inputs_embeds = self.get_model().embed_tokens(inputs) + + return super().generate( + position_ids=position_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + **kwargs + ) + + def prepare_inputs_for_generation(self, input_ids, past_key_values=None, + inputs_embeds=None, **kwargs): + images = kwargs.pop("images", None) + image_sizes = kwargs.pop("image_sizes", None) + inputs = super().prepare_inputs_for_generation( + input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs + ) + if images is not None: + inputs['images'] = images + if image_sizes is not None: + inputs['image_sizes'] = image_sizes + return inputs + +AutoConfig.register("llava_gemma", LlavaGemmaConfig) 
+AutoModelForCausalLM.register(LlavaGemmaConfig, LlavaGemmaForCausalLM) diff --git a/app/llava/model/language_model/llava_llama.py b/app/llava/model/language_model/llava_llama.py new file mode 100644 index 0000000000000000000000000000000000000000..069d0d1c10da42f5d278598e8534f166d1f9f5ff --- /dev/null +++ b/app/llava/model/language_model/llava_llama.py @@ -0,0 +1,158 @@ +# Copyright 2023 Haotian Liu +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn as nn + +from transformers import AutoConfig, AutoModelForCausalLM, \ + LlamaConfig, LlamaModel, LlamaForCausalLM + +from transformers.modeling_outputs import CausalLMOutputWithPast +from transformers.generation.utils import GenerateOutput + +from ..llava_arch import LlavaMetaModel, LlavaMetaForCausalLM + + +class LlavaConfig(LlamaConfig): + model_type = "llava_llama" + + +class LlavaLlamaModel(LlavaMetaModel, LlamaModel): + config_class = LlavaConfig + + def __init__(self, config: LlamaConfig): + super(LlavaLlamaModel, self).__init__(config) + + +class LlavaLlamaForCausalLM(LlamaForCausalLM, LlavaMetaForCausalLM): + config_class = LlavaConfig + + def __init__(self, config): + super(LlamaForCausalLM, self).__init__(config) + self.model = LlavaLlamaModel(config) + self.pretraining_tp = config.pretraining_tp + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize 
weights and apply final processing + self.post_init() + + def get_model(self): + return self.model + + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + images: Optional[torch.FloatTensor] = None, + image_sizes: Optional[List[List[int]]] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + + if inputs_embeds is None: + ( + input_ids, + position_ids, + attention_mask, + past_key_values, + inputs_embeds, + labels + ) = self.prepare_inputs_labels_for_multimodal( + input_ids, + position_ids, + attention_mask, + past_key_values, + labels, + images, + image_sizes + ) + + return super().forward( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + labels=labels, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict + ) + + @torch.no_grad() + def generate( + self, + inputs: Optional[torch.Tensor] = None, + images: Optional[torch.Tensor] = None, + image_sizes: Optional[torch.Tensor] = None, + **kwargs, + ) -> Union[GenerateOutput, torch.LongTensor]: + position_ids = kwargs.pop("position_ids", None) + attention_mask = kwargs.pop("attention_mask", None) + if "inputs_embeds" in kwargs: + raise NotImplementedError("`inputs_embeds` is not supported") + + if images is not None: + ( + inputs, + position_ids, + attention_mask, + _, + inputs_embeds, + _ + ) = self.prepare_inputs_labels_for_multimodal( + inputs, + position_ids, + attention_mask, + None, + None, + images, + 
image_sizes=image_sizes + ) + else: + inputs_embeds = self.get_model().embed_tokens(inputs) + + return super().generate( + position_ids=position_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + **kwargs + ) + + def prepare_inputs_for_generation(self, input_ids, past_key_values=None, + inputs_embeds=None, **kwargs): + images = kwargs.pop("images", None) + image_sizes = kwargs.pop("image_sizes", None) + inputs = super().prepare_inputs_for_generation( + input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs + ) + if images is not None: + inputs['images'] = images + if image_sizes is not None: + inputs['image_sizes'] = image_sizes + return inputs + +AutoConfig.register("llava_llama", LlavaConfig) +AutoModelForCausalLM.register(LlavaConfig, LlavaLlamaForCausalLM) diff --git a/app/llava/model/language_model/llava_mistral.py b/app/llava/model/language_model/llava_mistral.py new file mode 100644 index 0000000000000000000000000000000000000000..9f1b2f3546fb1015fe57b61ba4eb46dfac444085 --- /dev/null +++ b/app/llava/model/language_model/llava_mistral.py @@ -0,0 +1,166 @@ +# Copyright 2023 Haotian Liu +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn as nn +from torch.nn import CrossEntropyLoss + +from transformers import AutoConfig, AutoModelForCausalLM, \ + MistralConfig, MistralModel, MistralForCausalLM + +from transformers.modeling_outputs import CausalLMOutputWithPast +from transformers.generation.utils import GenerateOutput + +from ..llava_arch import LlavaMetaModel, LlavaMetaForCausalLM + + +class LlavaMistralConfig(MistralConfig): + model_type = "llava_mistral" + + +class LlavaMistralModel(LlavaMetaModel, MistralModel): + config_class = LlavaMistralConfig + + def __init__(self, config: MistralConfig): + super(LlavaMistralModel, self).__init__(config) + + +class LlavaMistralForCausalLM(MistralForCausalLM, LlavaMetaForCausalLM): + config_class = LlavaMistralConfig + + def __init__(self, config): + super(MistralForCausalLM, self).__init__(config) + self.model = LlavaMistralModel(config) + + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_model(self): + return self.model + + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + images: Optional[torch.FloatTensor] = None, + image_sizes: Optional[List[List[int]]] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + + if inputs_embeds is None: + ( + input_ids, + position_ids, + attention_mask, + past_key_values, + inputs_embeds, + labels + ) = self.prepare_inputs_labels_for_multimodal( + input_ids, + position_ids, + attention_mask, + 
past_key_values, + labels, + images, + image_sizes + ) + + return super().forward( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + labels=labels, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict + ) + + @torch.no_grad() + def generate( + self, + inputs: Optional[torch.Tensor] = None, + images: Optional[torch.Tensor] = None, + image_sizes: Optional[torch.Tensor] = None, + **kwargs, + ) -> Union[GenerateOutput, torch.LongTensor]: + position_ids = kwargs.pop("position_ids", None) + attention_mask = kwargs.pop("attention_mask", None) + if "inputs_embeds" in kwargs: + raise NotImplementedError("`inputs_embeds` is not supported") + + if images is not None: + ( + inputs, + position_ids, + attention_mask, + _, + inputs_embeds, + _ + ) = self.prepare_inputs_labels_for_multimodal( + inputs, + position_ids, + attention_mask, + None, + None, + images, + image_sizes=image_sizes + ) + else: + inputs_embeds = self.get_model().embed_tokens(inputs) + + return super().generate( + position_ids=position_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + **kwargs + ) + + def generate_from_base_class(self, inputs_embeds, **kwargs): + return super().generate( + position_ids=None, + attention_mask=None, + inputs_embeds=inputs_embeds, + **kwargs + ) + + def prepare_inputs_for_generation(self, input_ids, past_key_values=None, + inputs_embeds=None, **kwargs): + images = kwargs.pop("images", None) + image_sizes = kwargs.pop("image_sizes", None) + inputs = super().prepare_inputs_for_generation( + input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs + ) + if images is not None: + inputs['images'] = images + if image_sizes is not None: + inputs['image_sizes'] = image_sizes + return inputs + +AutoConfig.register("llava_mistral", LlavaMistralConfig) 
+AutoModelForCausalLM.register(LlavaMistralConfig, LlavaMistralForCausalLM) diff --git a/app/llava/model/language_model/llava_mpt.py b/app/llava/model/language_model/llava_mpt.py new file mode 100644 index 0000000000000000000000000000000000000000..02e5237ece031af23fcd76b5b4e0d9b0bc5f55cc --- /dev/null +++ b/app/llava/model/language_model/llava_mpt.py @@ -0,0 +1,97 @@ +# Copyright 2023 Haotian Liu +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Optional, Tuple + +import torch + +from transformers import AutoConfig, AutoModelForCausalLM, \ + MptConfig, MptForCausalLM, MptModel +from llava.model.llava_arch import LlavaMetaModel, LlavaMetaForCausalLM + + +class LlavaMptConfig(MptConfig): + model_type = "llava_mpt" + + +class LlavaMptModel(LlavaMetaModel, MptModel): + config_class = LlavaMptConfig + + def __init__(self, config: MptConfig): + config.hidden_size = config.d_model + super(LlavaMptModel, self).__init__(config) + + def embed_tokens(self, x): + return self.wte(x) + + +class LlavaMptForCausalLM(MptForCausalLM, LlavaMetaForCausalLM): + config_class = LlavaMptConfig + supports_gradient_checkpointing = True + + def __init__(self, config): + super(MptForCausalLM, self).__init__(config) + + self.transformer = LlavaMptModel(config) + self.lm_head = torch.nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_model(self): + return self.transformer + + def 
_set_gradient_checkpointing(self, module, value=False): + if isinstance(module, LlavaMptModel): + module.gradient_checkpointing = value + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, + attention_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + images=None): + + input_ids, attention_mask, past_key_values, inputs_embeds, labels = self.prepare_inputs_labels_for_multimodal(input_ids, attention_mask, past_key_values, labels, images) + + return super().forward( + input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + labels=labels, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs): + images = kwargs.pop("images", None) + _inputs = super().prepare_inputs_for_generation( + input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs + ) + _inputs['images'] = images + return _inputs + + +AutoConfig.register("llava_mpt", LlavaMptConfig) +AutoModelForCausalLM.register(LlavaMptConfig, LlavaMptForCausalLM) diff --git a/app/llava/model/language_model/llava_thoth.py b/app/llava/model/language_model/llava_thoth.py new file mode 100644 index 0000000000000000000000000000000000000000..13c44c79f9f3d112166e3c46ce9d2f7a8de78195 --- /dev/null +++ b/app/llava/model/language_model/llava_thoth.py @@ -0,0 +1,169 @@ +# Copyright 2023 Haotian Liu +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the 
License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn as nn +from torch.nn import CrossEntropyLoss + +from transformers import AutoConfig, AutoModelForCausalLM +from transformers.modeling_outputs import CausalLMOutputWithPast +from transformers.generation.utils import GenerateOutput + +from .thoth.modeling_thoth import ThothForCausalLM, ThothModel, ThothConfig +from llava.model.llava_arch import LlavaMetaModel, LlavaMetaForCausalLM + + +class LlavaThothConfig(ThothConfig): + model_type = "llava_thoth" + + +class LlavaThothModel(LlavaMetaModel, ThothModel): + config_class = LlavaThothConfig + + def __init__(self, config: LlavaThothConfig): + super(LlavaThothModel, self).__init__(config) + + # def embed_tokens(self, x): + # return self.embed_tokens(x) + +class LlavaThothForCausalLM(ThothForCausalLM, LlavaMetaForCausalLM): + config_class = LlavaThothConfig + supports_gradient_checkpointing = True + + def __init__(self, config): + super(ThothForCausalLM, self).__init__(config) + self.model = LlavaThothModel(config) + + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_model(self): + return self.model + + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = 
None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + images: Optional[torch.FloatTensor] = None, + image_sizes: Optional[List[List[int]]] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + + if inputs_embeds is None: + ( + input_ids, + position_ids, + attention_mask, + past_key_values, + inputs_embeds, + labels + ) = self.prepare_inputs_labels_for_multimodal( + input_ids, + position_ids, + attention_mask, + past_key_values, + labels, + images, + image_sizes + ) + + return super().forward( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + labels=labels, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict + ) + + @torch.no_grad() + def generate( + self, + inputs: Optional[torch.Tensor] = None, + images: Optional[torch.Tensor] = None, + image_sizes: Optional[torch.Tensor] = None, + **kwargs, + ) -> Union[GenerateOutput, torch.LongTensor]: + position_ids = kwargs.pop("position_ids", None) + attention_mask = kwargs.pop("attention_mask", None) + if "inputs_embeds" in kwargs: + raise NotImplementedError("`inputs_embeds` is not supported") + + if images is not None: + ( + inputs, + position_ids, + attention_mask, + _, + inputs_embeds, + _ + ) = self.prepare_inputs_labels_for_multimodal( + inputs, + position_ids, + attention_mask, + None, + None, + images, + image_sizes=image_sizes + ) + else: + inputs_embeds = self.get_model().embed_tokens(inputs) + + return super().generate( + position_ids=position_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + **kwargs + ) + + def generate_from_base_class(self, inputs_embeds, **kwargs): + return super().generate( + position_ids=None, + attention_mask=None, + inputs_embeds=inputs_embeds, + **kwargs + ) + + def 
prepare_inputs_for_generation(self, input_ids, past_key_values=None, + inputs_embeds=None, **kwargs): + images = kwargs.pop("images", None) + image_sizes = kwargs.pop("image_sizes", None) + inputs = super().prepare_inputs_for_generation( + input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs + ) + if images is not None: + inputs['images'] = images + if image_sizes is not None: + inputs['image_sizes'] = image_sizes + return inputs + + +AutoConfig.register("llava_thoth", LlavaThothConfig) +AutoModelForCausalLM.register(LlavaThothConfig, LlavaThothForCausalLM) diff --git a/app/llava/model/language_model/thoth/__pycache__/configuration_thoth.cpython-310.pyc b/app/llava/model/language_model/thoth/__pycache__/configuration_thoth.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3b4111084a8afdd1e482898b00df9f968247bf6f Binary files /dev/null and b/app/llava/model/language_model/thoth/__pycache__/configuration_thoth.cpython-310.pyc differ diff --git a/app/llava/model/language_model/thoth/__pycache__/configuration_thoth.cpython-39.pyc b/app/llava/model/language_model/thoth/__pycache__/configuration_thoth.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8b29d0aaa589e3bf29cba35fe5bc5dd0541345fb Binary files /dev/null and b/app/llava/model/language_model/thoth/__pycache__/configuration_thoth.cpython-39.pyc differ diff --git a/app/llava/model/language_model/thoth/__pycache__/modeling_thoth.cpython-310.pyc b/app/llava/model/language_model/thoth/__pycache__/modeling_thoth.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..34802445edc1c79800c9d53e098ee60555314e1e Binary files /dev/null and b/app/llava/model/language_model/thoth/__pycache__/modeling_thoth.cpython-310.pyc differ diff --git a/app/llava/model/language_model/thoth/__pycache__/modeling_thoth.cpython-39.pyc b/app/llava/model/language_model/thoth/__pycache__/modeling_thoth.cpython-39.pyc new 
file mode 100644 index 0000000000000000000000000000000000000000..a8d8f3dcfe25def55a9032f29f2c3372022f6641 Binary files /dev/null and b/app/llava/model/language_model/thoth/__pycache__/modeling_thoth.cpython-39.pyc differ diff --git a/app/llava/model/language_model/thoth/configuration_thoth.py b/app/llava/model/language_model/thoth/configuration_thoth.py new file mode 100644 index 0000000000000000000000000000000000000000..c19098dd75f561c737002c7055a08c23b879cdac --- /dev/null +++ b/app/llava/model/language_model/thoth/configuration_thoth.py @@ -0,0 +1,160 @@ +# coding=utf-8 + +""" Thoth model configuration""" + +from transformers.configuration_utils import PretrainedConfig +from transformers.utils import logging + + +logger = logging.get_logger(__name__) + +THOTH_PRETRAINED_CONFIG_ARCHIVE_MAP = {} + + +class ThothConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`ThothModel`]. It is used to instantiate an Thoth + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the Thoth-6B5. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 32000): + Vocabulary size of the Thoth model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`ThothModel`] + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 11008): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer decoder. 
+ num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer decoder. + num_key_value_heads (`int`, *optional*): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to + `num_attention_heads`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 2048): + The maximum sequence length that this model might ever be used with. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + pad_token_id (`int`, *optional*): + Padding token id. + bos_token_id (`int`, *optional*, defaults to 1): + Beginning of stream token id. + eos_token_id (`int`, *optional*, defaults to 2): + End of stream token id. + pretraining_tp (`int`, *optional*, defaults to 1): + Experimental feature. Tensor parallelism rank used during pretraining. 
Please refer to [this + document](https://huggingface.co/docs/transformers/main/perf_train_gpu_many#tensor-parallelism) to understand more about it. This value is + necessary to ensure exact reproducibility of the pretraining results. Please refer to [this + issue](https://github.com/pytorch/pytorch/issues/76232). + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether to tie weight embeddings + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + rope_scaling (`Dict`, *optional*): + Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling + strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is + `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update + `max_position_embeddings` to the expected new maximum. + attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): + Whether to use a bias in the query, key, value and output projection layers during self-attention. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. 
+ + """ + + model_type = "thoth" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=32000, + hidden_size=4096, + intermediate_size=11008, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=None, + hidden_act="silu", + max_position_embeddings=2048, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + pad_token_id=None, + bos_token_id=1, + eos_token_id=2, + pretraining_tp=1, + tie_word_embeddings=False, + rope_theta=10000.0, + rope_scaling=None, + attention_bias=False, + attention_dropout=0.0, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.pretraining_tp = pretraining_tp + self.use_cache = use_cache + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + self._rope_scaling_validation() + self.attention_bias = attention_bias + self.attention_dropout = attention_dropout + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + def _rope_scaling_validation(self): + """ + Validate the `rope_scaling` configuration. 
+ """ + if self.rope_scaling is None: + return + + if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2: + raise ValueError( + "`rope_scaling` must be a dictionary with with two fields, `type` and `factor`, " + f"got {self.rope_scaling}" + ) + rope_scaling_type = self.rope_scaling.get("type", None) + rope_scaling_factor = self.rope_scaling.get("factor", None) + if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]: + raise ValueError( + f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}" + ) + if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0: + raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}") + \ No newline at end of file diff --git a/app/llava/model/language_model/thoth/modeling_thoth.py b/app/llava/model/language_model/thoth/modeling_thoth.py new file mode 100644 index 0000000000000000000000000000000000000000..5397c633068f74221b379b53c4be22847eda9594 --- /dev/null +++ b/app/llava/model/language_model/thoth/modeling_thoth.py @@ -0,0 +1,1376 @@ +# coding=utf-8 + +""" PyTorch Thoth model.""" +import math +import warnings +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from transformers.activations import ACT2FN +from transformers.cache_utils import Cache, DynamicCache +from transformers.modeling_attn_mask_utils import ( + AttentionMaskConverter, + _prepare_4d_attention_mask, + _prepare_4d_causal_attention_mask, + _prepare_4d_causal_attention_mask_for_sdpa, +) +from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast +from transformers.modeling_utils import PreTrainedModel +from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS, 
is_torch_greater_or_equal_than_1_13 +from transformers.utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_flash_attn_2_available, + is_flash_attn_greater_or_equal_2_10, + logging, + replace_return_docstrings, +) +from transformers.utils.import_utils import is_torch_fx_available +from .configuration_thoth import ThothConfig + + +if is_flash_attn_2_available(): + from flash_attn import flash_attn_func, flash_attn_varlen_func + from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa + + +# This makes `_prepare_4d_causal_attention_mask` a leaf function in the FX graph. +# It means that the function will not be traced through and simply appear as a node in the graph. +if is_torch_fx_available(): + if not is_torch_greater_or_equal_than_1_13: + import torch.fx + + _prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_mask) + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "ThothConfig" + + +def _get_unpad_data(attention_mask): + seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) + indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() + max_seqlen_in_batch = seqlens_in_batch.max().item() + cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)) + return ( + indices, + cu_seqlens, + max_seqlen_in_batch, + ) + + +def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): + warnings.warn( + "Calling `transformers.models.thoth.modeling_thoth._prepare_4d_attention_mask` is deprecated and will be removed in v4.37. 
Use `transformers.modeling_attn_mask_utils._prepare_4d_attention_mask" + ) + return _prepare_4d_attention_mask(mask=mask, dtype=dtype, tgt_len=tgt_len) + + +def _make_causal_mask( + input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0 +): + warnings.warn( + "Calling `transformers.models.thoth.modeling_thoth._make_causal_mask` is deprecated and will be removed in v4.37. Use `transformers.models.thoth.modeling_thoth.AttentionMaskConverter._make_causal_mask" + ) + return AttentionMaskConverter._make_causal_mask( + input_ids_shape=input_ids_shape, dtype=dtype, device=device, past_key_values_length=past_key_values_length + ) + + +class ThothRMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + ThothRMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + +ALL_LAYERNORM_LAYERS.append(ThothRMSNorm) + + +class ThothRotaryEmbedding(nn.Module): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + super().__init__() + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).float().to(device) / dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + # Build here to make `torch.jit.trace` work. 
+ self._set_cos_sin_cache( + seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() + ) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + + freqs = torch.outer(t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + def forward(self, x, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + if seq_len > self.max_seq_len_cached: + self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) + + return ( + self.cos_cached[:seq_len].to(dtype=x.dtype), + self.sin_cached[:seq_len].to(dtype=x.dtype), + ) + + +class ThothLinearScalingRotaryEmbedding(ThothRotaryEmbedding): + """ThothRotaryEmbedding extended with linear scaling. 
Credits to the Reddit user /u/kaiokendev""" + + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): + self.scaling_factor = scaling_factor + super().__init__(dim, max_position_embeddings, base, device) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + t = t / self.scaling_factor + + freqs = torch.outer(t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + +class ThothDynamicNTKScalingRotaryEmbedding(ThothRotaryEmbedding): + """ThothRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla""" + + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): + self.scaling_factor = scaling_factor + super().__init__(dim, max_position_embeddings, base, device) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + + if seq_len > self.max_position_embeddings: + base = self.base * ( + (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1) + ) ** (self.dim / (self.dim - 2)) + inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + + freqs = torch.outer(t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + 
self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`): + The position indices of the tokens corresponding to the query and key tensors. For example, this can be + used to pass offsetted position ids when working with a KV-cache. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
+ """ + cos = cos[position_ids].unsqueeze(unsqueeze_dim) + sin = sin[position_ids].unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +class ThothMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + if self.config.pretraining_tp > 1: + slice = self.intermediate_size // self.config.pretraining_tp + gate_proj_slices = self.gate_proj.weight.split(slice, dim=0) + up_proj_slices = self.up_proj.weight.split(slice, dim=0) + down_proj_slices = self.down_proj.weight.split(slice, dim=1) + + gate_proj = torch.cat( + [F.linear(x, gate_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1 + ) + up_proj = torch.cat([F.linear(x, up_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1) + + intermediate_states = (self.act_fn(gate_proj) * up_proj).split(slice, dim=2) + down_proj = [ + F.linear(intermediate_states[i], down_proj_slices[i]) for i in range(self.config.pretraining_tp) + ] + down_proj = sum(down_proj) + else: + down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + + return down_proj + + +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). 
The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +class ThothAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: ThothConfig, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + if layer_idx is None: + logger.warning_once( + f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will " + "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." + ) + + self.attention_dropout = config.attention_dropout + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = config.rope_theta + self.is_causal = True + + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." 
+ ) + + self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias) + self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) + self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) + self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=config.attention_bias) + self._init_rope() + + def _init_rope(self): + if self.config.rope_scaling is None: + self.rotary_emb = ThothRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + else: + scaling_type = self.config.rope_scaling["type"] + scaling_factor = self.config.rope_scaling["factor"] + if scaling_type == "linear": + self.rotary_emb = ThothLinearScalingRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + scaling_factor=scaling_factor, + base=self.rope_theta, + ) + elif scaling_type == "dynamic": + self.rotary_emb = ThothDynamicNTKScalingRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + scaling_factor=scaling_factor, + base=self.rope_theta, + ) + else: + raise ValueError(f"Unknown RoPE scaling type {scaling_type}") + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`" + ) + + bsz, q_len, _ = hidden_states.size() + + if self.config.pretraining_tp > 1: + key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp + query_slices = self.q_proj.weight.split( + (self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0 + ) + key_slices = self.k_proj.weight.split(key_value_slicing, dim=0) + value_slices = self.v_proj.weight.split(key_value_slicing, dim=0) + + query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)] + query_states = torch.cat(query_states, dim=-1) + + key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)] + key_states = torch.cat(key_states, dim=-1) + + value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)] + value_states = torch.cat(value_states, dim=-1) + + else: + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." 
+ ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + + if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): + raise ValueError( + f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights + attention_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + if self.config.pretraining_tp > 1: + attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2) + o_proj_slices = self.o_proj.weight.split(self.hidden_size // 
self.config.pretraining_tp, dim=1) + attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.config.pretraining_tp)]) + else: + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +class ThothFlashAttention2(ThothAttention): + """ + Thoth flash attention module. This module inherits from `ThothAttention` as the weights of the module stays + untouched. The only required change would be on the forward pass where it needs to correctly call the public API of + flash attention and deal with padding tokens in case the input contains any of them. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. + # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. + # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). + self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + # ThothFlashAttention2 attention does not support output_attentions + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`" + ) + + # overwrite attention_mask with padding_mask + attention_mask = kwargs.pop("padding_mask") + + output_attentions = False + + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + # Flash attention requires the input to have the shape + # batch_size x seq_length x head_dim x hidden_dim + # therefore we just need to keep the original shape + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache + # to be able to avoid many of these transpose/reshape/view. + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + dropout_rate = self.attention_dropout if self.training else 0.0 + + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently casted in float32. 
    def _flash_attention_forward(
        self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
    ):
        """
        Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
        first unpad the input, then computes the attention scores and pad the final attention scores.

        Args:
            query_states (`torch.Tensor`):
                Input query states to be passed to Flash Attention API
            key_states (`torch.Tensor`):
                Input key states to be passed to Flash Attention API
            value_states (`torch.Tensor`):
                Input value states to be passed to Flash Attention API
            attention_mask (`torch.Tensor`):
                The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
                position of padding tokens and 1 for the position of non-padding tokens.
            query_length (`int`):
                Length of the query sequence; used to re-pad the output and to decide the causal flag.
            dropout (`float`, *optional*):
                Attention dropout probability.
            softmax_scale (`float`, *optional*):
                The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
        """
        # `_flash_attn_uses_top_left_mask` is set elsewhere in the class (not visible in this chunk);
        # per the TODO below it distinguishes pre-2.1 Flash Attention causal-mask alignment, where
        # `causal` must be disabled for single-token (decode) queries.
        if not self._flash_attn_uses_top_left_mask:
            causal = self.is_causal
        else:
            # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in ThothFlashAttention2 __init__.
            causal = self.is_causal and query_length != 1

        # Contains at least one padding token in the sequence: strip padding, run the varlen kernel,
        # then scatter outputs back into the padded layout.
        if attention_mask is not None:
            batch_size = query_states.shape[0]
            query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
                query_states, key_states, value_states, attention_mask, query_length
            )

            # Cumulative sequence lengths and per-batch maxima, as flash_attn_varlen_func expects.
            cu_seqlens_q, cu_seqlens_k = cu_seq_lens
            max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens

            attn_output_unpad = flash_attn_varlen_func(
                query_states,
                key_states,
                value_states,
                cu_seqlens_q=cu_seqlens_q,
                cu_seqlens_k=cu_seqlens_k,
                max_seqlen_q=max_seqlen_in_batch_q,
                max_seqlen_k=max_seqlen_in_batch_k,
                dropout_p=dropout,
                softmax_scale=softmax_scale,
                causal=causal,
            )

            # Re-insert the attention outputs at the positions of the non-padding tokens.
            attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
        else:
            # No padding anywhere in the batch: the dense kernel suffices.
            attn_output = flash_attn_func(
                query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
            )

        return attn_output
query_length): + indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) + batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape + + key_layer = index_first_axis( + key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k + ) + value_layer = index_first_axis( + value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k + ) + if query_length == kv_seq_len: + query_layer = index_first_axis( + query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k + ) + cu_seqlens_q = cu_seqlens_k + max_seqlen_in_batch_q = max_seqlen_in_batch_k + indices_q = indices_k + elif query_length == 1: + max_seqlen_in_batch_q = 1 + cu_seqlens_q = torch.arange( + batch_size + 1, dtype=torch.int32, device=query_layer.device + ) # There is a memcpy here, that is very bad. + indices_q = cu_seqlens_q[:-1] + query_layer = query_layer.squeeze(1) + else: + # The -q_len: slice assumes left padding. + attention_mask = attention_mask[:, -query_length:] + query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) + + return ( + query_layer, + key_layer, + value_layer, + indices_q, + (cu_seqlens_q, cu_seqlens_k), + (max_seqlen_in_batch_q, max_seqlen_in_batch_k), + ) + + +class ThothSdpaAttention(ThothAttention): + """ + Thoth attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from + `ThothAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to + SDPA API. 
class ThothSdpaAttention(ThothAttention):
    """
    Thoth attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
    `ThothAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
    SDPA API.
    """

    # Adapted from ThothAttention.forward
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """
        Compute self-attention via `scaled_dot_product_attention`.

        Returns `(attn_output, attn_weights, past_key_value)`; `attn_weights` is always `None`
        here because SDPA cannot materialize them — requesting them falls back to the eager path.
        """
        if output_attentions:
            # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
            logger.warning_once(
                "ThothModel is using ThothSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
                'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
            )
            return super().forward(
                hidden_states=hidden_states,
                attention_mask=attention_mask,
                position_ids=position_ids,
                past_key_value=past_key_value,
                output_attentions=output_attentions,
                use_cache=use_cache,
            )

        bsz, q_len, _ = hidden_states.size()

        # Project and split into per-head layout: (bsz, num_heads, q_len, head_dim).
        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        # Total key length = new tokens + whatever is already cached for this layer.
        kv_seq_len = key_states.shape[-2]
        if past_key_value is not None:
            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)

        if past_key_value is not None:
            cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        # Expand grouped-query KV heads so every query head has a matching key/value head.
        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        if attention_mask is not None:
            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
                )

        # FIX: SDPA's memory-efficient backend is bugged (through torch 2.1.x) when fed
        # non-contiguous q/k/v together with a custom `attn_mask` on CUDA
        # (https://github.com/pytorch/pytorch/issues/112577). The transposes above produce
        # non-contiguous tensors, so force contiguity in exactly that case; `.contiguous()`
        # is numerically a no-op.
        if query_states.device.type == "cuda" and attention_mask is not None:
            query_states = query_states.contiguous()
            key_states = key_states.contiguous()
            value_states = value_states.contiguous()

        attn_output = torch.nn.functional.scaled_dot_product_attention(
            query_states,
            key_states,
            value_states,
            attn_mask=attention_mask,
            dropout_p=self.attention_dropout if self.training else 0.0,
            # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
            is_causal=self.is_causal and attention_mask is None and q_len > 1,
        )

        # Back to (bsz, q_len, hidden_size) for the output projection.
        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)

        attn_output = self.o_proj(attn_output)

        return attn_output, None, past_key_value


# Dispatch table used by ThothDecoderLayer, keyed by `config._attn_implementation`.
THOTH_ATTENTION_CLASSES = {
    "eager": ThothAttention,
    "flash_attention_2": ThothFlashAttention2,
    "sdpa": ThothSdpaAttention,
}
Optional[bool] = False, + use_cache: Optional[bool] = False, + **kwargs, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): + attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1, + query_sequence_length, key_sequence_length)` if default attention is used. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + """ + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + ) + + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + **kwargs, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +THOTH_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. 
Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`ThothConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +@add_start_docstrings( + "The bare Thoth Model outputting raw hidden-states without any specific head on top.", + THOTH_START_DOCSTRING, +) +class ThothPreTrainedModel(PreTrainedModel): + config_class = ThothConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["ThothDecoderLayer"] + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn_2 = True + _supports_sdpa = True + _supports_cache_class = True + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, ThothModel): + module.gradient_checkpointing = value + + +THOTH_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. 
+ + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): + Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` + returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. 
+ + Two formats are allowed: + - a [`~cache_utils.Cache`] instance; + - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy + cache format. + + The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the + legacy cache format will be returned. + + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't + have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` + of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare Thoth Model outputting raw hidden-states without any specific head on top.", + THOTH_START_DOCSTRING, +) +class ThothModel(ThothPreTrainedModel): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. 
class ThothModel(ThothPreTrainedModel):
    """
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`ThothDecoderLayer`]

    Args:
        config: ThothConfig
    """

    def __init__(self, config: ThothConfig):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
        self.layers = nn.ModuleList(
            [ThothDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        # Cache which attention implementation is active; this picks the mask format in forward().
        self._use_sdpa = config._attn_implementation == "sdpa"
        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
        self.norm = ThothRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

        self.gradient_checkpointing = False
        self._gradient_checkpointing_func = torch.utils.checkpoint.checkpoint
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        # Token embedding table (also the tying target for a causal-LM head).
        return self.embed_tokens

    def set_input_embeddings(self, value):
        self.embed_tokens = value

    @add_start_docstrings_to_model_forward(THOTH_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        # Resolve per-call flags against the config defaults.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # retrieve input_ids and inputs_embeds
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            batch_size, seq_length = input_ids.shape[:2]
        elif inputs_embeds is not None:
            batch_size, seq_length = inputs_embeds.shape[:2]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        past_key_values_length = 0
        if use_cache:
            # Accept either a Cache instance or the legacy tuple-of-tuples format; convert the
            # latter (and convert back before returning).
            use_legacy_cache = not isinstance(past_key_values, Cache)
            if use_legacy_cache:
                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
            past_key_values_length = past_key_values.get_usable_length(seq_length)

        if position_ids is None:
            # Default position ids continue from the cached prefix length.
            device = input_ids.device if input_ids is not None else inputs_embeds.device
            position_ids = torch.arange(
                past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
            )
            position_ids = position_ids.unsqueeze(0)

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        # Each attention implementation expects a different mask format.
        if self._use_flash_attention_2:
            # 2d mask is passed through the layers
            attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
        elif self._use_sdpa and not output_attentions:
            # output_attentions=True can not be supported when using SDPA, and we fall back on
            # the manual implementation that requires a 4D causal mask in all cases.
            attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
                attention_mask,
                (batch_size, seq_length),
                inputs_embeds,
                past_key_values_length,
            )
        else:
            # 4d mask is passed through the layers
            attention_mask = _prepare_4d_causal_attention_mask(
                attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
            )

        # embed positions
        hidden_states = inputs_embeds

        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        next_decoder_cache = None

        for decoder_layer in self.layers:
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            if self.gradient_checkpointing and self.training:
                # Checkpointed call: positional args only, as required by torch.utils.checkpoint.
                layer_outputs = self._gradient_checkpointing_func(
                    decoder_layer.__call__,
                    hidden_states,
                    attention_mask,
                    position_ids,
                    past_key_values,
                    output_attentions,
                    use_cache,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=attention_mask,
                    position_ids=position_ids,
                    past_key_value=past_key_values,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                )

            hidden_states = layer_outputs[0]

            # Cache position in the layer tuple depends on whether attentions were also returned.
            if use_cache:
                next_decoder_cache = layer_outputs[2 if output_attentions else 1]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        hidden_states = self.norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        next_cache = None
        if use_cache:
            # Hand back the same cache format the caller provided.
            next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache
        if not return_dict:
            return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )
class ThothForCausalLM(ThothPreTrainedModel):
    # lm_head may be weight-tied to the input embeddings by the HF tying machinery.
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.model = ThothModel(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def set_decoder(self, decoder):
        self.model = decoder

    def get_decoder(self):
        return self.model

    @add_start_docstrings_to_model_forward(THOTH_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        r"""
        Args:
            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Returns:
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = outputs[0]
        if self.config.pretraining_tp > 1:
            # Tensor-parallel pretraining: the head weight is split along vocab and applied slice-wise.
            lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.config.pretraining_tp, dim=0)
            logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.config.pretraining_tp)]
            logits = torch.cat(logits, dim=-1)
        else:
            logits = self.lm_head(hidden_states)
        # Loss is computed in float32 regardless of the model dtype.
        logits = logits.float()

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = CrossEntropyLoss()
            shift_logits = shift_logits.view(-1, self.config.vocab_size)
            shift_labels = shift_labels.view(-1)
            # Enable model parallelism
            shift_labels = shift_labels.to(shift_logits.device)
            loss = loss_fct(shift_logits, shift_labels)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return (loss,) + output if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def prepare_inputs_for_generation(
        self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
    ):
        # Trim `input_ids` / `attention_mask` so that only not-yet-processed tokens reach the model
        # on each generation step.
        if past_key_values is not None:
            if isinstance(past_key_values, Cache):
                cache_length = past_key_values.get_seq_length()
                past_length = past_key_values.seen_tokens
                max_cache_length = past_key_values.get_max_length()
            else:
                # Legacy tuple cache: length is the key tensor's sequence dimension.
                cache_length = past_length = past_key_values[0][0].shape[2]
                max_cache_length = None

            # Keep only the unprocessed tokens:
            # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
            # some of the inputs are exclusivelly passed as part of the cache (e.g. when passing input_embeds as
            # input)
            if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
                input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
            # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
            # input_ids based on the past_length.
            elif past_length < input_ids.shape[1]:
                input_ids = input_ids[:, past_length:]
            # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.

            # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
            if (
                max_cache_length is not None
                and attention_mask is not None
                and cache_length + input_ids.shape[1] > max_cache_length
            ):
                attention_mask = attention_mask[:, -max_cache_length:]

        position_ids = kwargs.get("position_ids", None)
        if attention_mask is not None and position_ids is None:
            # create position_ids on the fly for batch generation
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            if past_key_values:
                position_ids = position_ids[:, -input_ids.shape[1] :]

        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
        if inputs_embeds is not None and past_key_values is None:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids}

        model_inputs.update(
            {
                "position_ids": position_ids,
                "past_key_values": past_key_values,
                "use_cache": kwargs.get("use_cache"),
                "attention_mask": attention_mask,
            }
        )
        return model_inputs

    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
        # Reorder the legacy-format cache along the batch dimension for beam search.
        reordered_past = ()
        for layer_past in past_key_values:
            reordered_past += (
                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
            )
        return reordered_past
@add_start_docstrings(
    """
    The Thoth Model transformer with a sequence classification head on top (linear layer).

    [`ThothForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    """,
    THOTH_START_DOCSTRING,
)
class ThothForSequenceClassification(ThothPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.model = ThothModel(config)
        # Classification head over the pooled (last-token) hidden state; no bias, per the backbone style.
        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    @add_start_docstrings_to_model_forward(THOTH_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = transformer_outputs[0]
        logits = self.score(hidden_states)

        if input_ids is not None:
            batch_size = input_ids.shape[0]
        else:
            batch_size = inputs_embeds.shape[0]

        if self.config.pad_token_id is None and batch_size != 1:
            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
        if self.config.pad_token_id is None:
            sequence_lengths = -1
        else:
            if input_ids is not None:
                # Position just before the FIRST pad token in each row (assumes right padding —
                # TODO confirm for this model's tokenizer). If a row has no pad token, argmax
                # returns 0, so -1 selects the last position via negative indexing, which is
                # the desired fallback.
                sequence_lengths = (torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1).to(
                    logits.device
                )
            else:
                sequence_lengths = -1

        # Pool one logit vector per row at the computed last-token position.
        pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]

        loss = None
        if labels is not None:
            labels = labels.to(logits.device)
            # Infer the problem type once (regression / single-label / multi-label) from
            # num_labels and the label dtype, and persist it on the config.
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(pooled_logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(pooled_logits, labels)
        if not return_dict:
            output = (pooled_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )
+ +from email.mime import image +import os +from abc import ABC, abstractmethod + +import torch +import torch.nn as nn + +from .multimodal_encoder.builder import build_adapter_module, build_vision_tower, build_Qformer +from .multimodal_projector.builder import build_vision_projector + +from llava.constants import IGNORE_INDEX, MM_TOKEN_INDEX, DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, DEFAULT_VIDEO_PATCH_TOKEN, DEFAULT_VIDEO_START_TOKEN, DEFAULT_VIDEO_END_TOKEN + +from llava.mm_utils import get_anyres_image_grid_shape +from llava.utils import master_print + +class LlavaMetaModel: + + def __init__(self, config): + super(LlavaMetaModel, self).__init__(config) + + if hasattr(config, "mm_vision_tower"): + self.vision_tower = build_vision_tower(config, delay_load=True) + self.mm_projector = build_vision_projector(config) + if getattr(config, "qformer_model_path", None): + self.Qformer, self.ln_vision, self.query_tokens = build_Qformer( + config.num_query_token, self.vision_tower.hidden_size) + self.frame_position_encoding = nn.Embedding( + config.max_num_segments, + self.Qformer.config.hidden_size + ) + if getattr(config, "adapter_module_name", None): + self.adapter_module = build_adapter_module(config, self.vision_tower.hidden_size) + if 'unpad' in getattr(config, 'mm_patch_merge_type', ''): + self.image_newline = nn.Parameter( + torch.empty(config.hidden_size, dtype=self.dtype) + ) + + def get_vision_tower(self): + vision_tower = getattr(self, 'vision_tower', None) + if type(vision_tower) is list: + vision_tower = vision_tower[0] + return vision_tower + + def get_adapter_module(self): + adapter_module = getattr(self, 'adapter_module', None) + if type(adapter_module) is list: + adapter_module = adapter_module[0] + return adapter_module + + def get_qformer(self): + qformer = getattr(self, 'Qformer', None) + if type(qformer) is list: + qformer = qformer[0] + return qformer + + def get_ln_vision(self): + ln_vision = getattr(self, 'ln_vision', 
None) + if type(ln_vision) is list: + ln_vision = ln_vision[0] + return ln_vision + + def get_query_tokens(self): + query_tokens = getattr(self, 'query_tokens', None) + if type(query_tokens) is list: + query_tokens = query_tokens[0] + return query_tokens + + def get_frame_position_encoding(self): + frame_position_encoding = getattr(self, 'frame_position_encoding', None) + if type(frame_position_encoding) is list: + frame_position_encoding = frame_position_encoding[0] + return frame_position_encoding + + def initialize_vision_modules(self, model_args, fsdp=None): + vision_tower = model_args.vision_tower + mm_vision_select_layer = model_args.mm_vision_select_layer + mm_vision_select_feature = model_args.mm_vision_select_feature + pretrain_mm_mlp_adapter = model_args.pretrain_mm_mlp_adapter + mm_patch_merge_type = model_args.mm_patch_merge_type + image_grid_pinpoints = model_args.image_grid_pinpoints + self.config.mm_vision_tower = vision_tower + self.config.img_size = model_args.img_size + self.config.drop_path_rate = model_args.drop_path_rate + self.config.vit_precision = model_args.vit_precision + self.config.vit_model_path = model_args.vit_model_path + self.config.num_query_token = model_args.num_query_token + self.config.qformer_model_path = model_args.qformer_model_path + self.config.adapter_module_name = model_args.adapter_module_name + self.config.adapter_module_path = model_args.adapter_module_path + self.config.max_num_segments = model_args.max_num_segments + self.config.pretrain_mm_mlp_adapter = pretrain_mm_mlp_adapter + # TODO: FSDP training is not ready + if self.get_vision_tower() is None: + vision_tower = build_vision_tower(model_args) + + if fsdp is not None and len(fsdp) > 0: + self.vision_tower = [vision_tower] + else: + self.vision_tower = vision_tower + else: + if fsdp is not None and len(fsdp) > 0: + vision_tower = self.vision_tower[0] + else: + vision_tower = self.vision_tower + vision_tower.load_model() + + self.config.use_mm_proj = True + 
self.config.mm_projector_type = getattr(model_args, 'mm_projector_type', 'linear') + self.config.mm_hidden_size = vision_tower.hidden_size + self.config.mm_vision_hidden_size = vision_tower.hidden_size + self.config.mm_vision_select_layer = mm_vision_select_layer + self.config.mm_vision_select_feature = mm_vision_select_feature + self.config.mm_patch_merge_type = mm_patch_merge_type + self.config.image_grid_pinpoints = image_grid_pinpoints + + if getattr(model_args, "qformer_model_path", None): + if self.get_qformer() is None: + self.Qformer, self.ln_vision, self.query_tokens = build_Qformer( + model_args.num_query_token, self.vision_tower.hidden_size) + self.frame_position_encoding = nn.Embedding( + model_args.max_num_segments, + self.Qformer.config.hidden_size + ) + self.config.mm_hidden_size = self.Qformer.config.hidden_size + # self.Qformer = self.Qformer.to(torch.bfloat16) + if model_args.qformer_model_path != 'from_scratch': + self.load_pretrained_qformer(model_args.qformer_model_path) + + if getattr(model_args, 'adapter_module_name', None): + if self.get_adapter_module() is None: + self.adapter_module = build_adapter_module(self.config, self.vision_tower.hidden_size) + self.adapter_module.load_model() + self.config.mm_hidden_size = self.adapter_module.output_dim + + if getattr(self, 'mm_projector', None) is None: + + self.mm_projector = build_vision_projector(self.config) + + if 'unpad' in mm_patch_merge_type: + embed_std = 1 / torch.sqrt(torch.tensor(self.config.hidden_size, dtype=self.dtype)) + self.image_newline = nn.Parameter( + torch.randn(self.config.hidden_size, dtype=self.dtype) * embed_std + ) + else: + # In case it is frozen by LoRA + for p in self.mm_projector.parameters(): + p.requires_grad = True + + if pretrain_mm_mlp_adapter is not None: + mm_projector_weights = torch.load(pretrain_mm_mlp_adapter, map_location='cpu') + def get_w(weights, keyword): + return {k.split(keyword + '.')[1]: v for k, v in weights.items() if keyword in k} + + def 
get_variable_frame_encoding_w(model_weights, load_weights): + model_len = model_weights.shape[0] + load_weights = {'.'.join(k.split('.')[1:]): v for k, v in load_weights.items()} + + load_len = load_weights['frame_position_encoding.weight'].shape[0] + if model_len == load_len: + return get_w(load_weights, 'frame_position_encoding') + elif model_len < load_len: + value = load_weights['frame_position_encoding.weight'][:model_len] + return {'weight': value} + else: + value = model_weights.clone().cpu() + value[:load_len] = load_weights['frame_position_encoding.weight'] + return {'weight': value} + + self.mm_projector.load_state_dict(get_w(mm_projector_weights, 'mm_projector')) + if self.get_frame_position_encoding(): + self.frame_position_encoding.load_state_dict(get_variable_frame_encoding_w(self.frame_position_encoding.weight, mm_projector_weights)) + + master_print(f"Loaded pretrained parameters from {pretrain_mm_mlp_adapter}") + + + def load_pretrained_qformer(self, model_path): + if os.path.isfile(model_path): + checkpoint = torch.load(model_path, map_location="cpu") + else: + raise RuntimeError("checkpoint path is invalid") + if 'projector.bin' in model_path: + state_dict = {} + match_keys = ['Qformer', 'query_tokens'] + for k, v in checkpoint.items(): + flag = False + for match_key in match_keys: + if match_key in k: + flag = True + break + if flag: + state_dict[k.replace('model.', '')] = v + + else: + state_dict = checkpoint["model"] + msg = self.load_state_dict(state_dict, strict=False) + + master_print(f"Loaded Qformer from {model_path}") + # master_print(msg) + + # return msg + + +def unpad_image(tensor, original_size): + """ + Unpads a PyTorch tensor of a padded and resized image. + + Args: + tensor (torch.Tensor): The image tensor, assumed to be in CxHxW format. + original_size (tuple): The original size of the image (height, width). + + Returns: + torch.Tensor: The unpadded image tensor. 
+ """ + original_width, original_height = original_size + current_height, current_width = tensor.shape[1:] + + original_aspect_ratio = original_width / original_height + current_aspect_ratio = current_width / current_height + + if original_aspect_ratio > current_aspect_ratio: + scale_factor = current_width / original_width + new_height = int(original_height * scale_factor) + padding = (current_height - new_height) // 2 + unpadded_tensor = tensor[:, padding:current_height - padding, :] + else: + scale_factor = current_height / original_height + new_width = int(original_width * scale_factor) + padding = (current_width - new_width) // 2 + unpadded_tensor = tensor[:, :, padding:current_width - padding] + + return unpadded_tensor + + +class LlavaMetaForCausalLM(ABC): + + @abstractmethod + def get_model(self): + pass + + def get_vision_tower(self): + return self.get_model().get_vision_tower() + + def get_adapter_module(self): + return self.get_model().get_adapter_module() + + def get_ln_vision(self): + return self.get_model().get_ln_vision() + + def get_qformer(self): + return self.get_model().get_qformer() + + def get_query_tokens(self): + return self.get_model().get_query_tokens() + + def get_frame_position_encoding(self): + return self.get_model().get_frame_position_encoding() + + def encode_images(self, images): + image_features = self.get_vision_tower()(images) + if self.get_qformer(): + image_features = self.get_ln_vision()(image_features) + query_tokens = self.get_query_tokens() + query_tokens = query_tokens.expand(image_features.shape[0], -1, -1) + attn_mask = torch.ones(image_features.size()[:-1], dtype=torch.long).to(image_features.device) + dtype_ = self.get_vision_tower().dtype + # print(dtype_) + image_features = self.qformer_fusion( + query_tokens.to(dtype_), + image_features.to(dtype_), + attn_mask + ).to(images.dtype) + + # image_features = self.get_model().mm_projector(image_features) + return image_features + + def qformer_fusion(self, query_tokens, 
features, attn_mask=None): + qformer = self.get_qformer() + query_output = qformer.bert( + query_embeds=query_tokens, + encoder_hidden_states=features, + encoder_attention_mask=attn_mask, + return_dict=True + ) + return query_output.last_hidden_state + + def prepare_inputs_labels_for_multimodal( + self, input_ids, position_ids, attention_mask, past_key_values, labels, + images, image_sizes=None + ): + + vision_tower = self.get_vision_tower() + if vision_tower is None or images is None or input_ids.shape[1] == 1: + return input_ids, position_ids, attention_mask, past_key_values, None, labels + + # image: list(B) of tensor[1, 3, 336, 336] + # video: list(B) of tensor[N, 3, 336, 336] + # video_any_res: list(B) of tensor[N, P, 3, 336, 336] + if type(images) is list or images.ndim == 5: + if type(images) is list: + images = [x.unsqueeze(0) if x.ndim == 3 else x for x in images] + # video any res + if images[0].ndim == 5: + concat_images = torch.cat([image.flatten(0, 1) for image in images], dim=0) + split_sizes = [image.shape[0:2] for image in images] + else: + concat_images = torch.cat([image for image in images], dim=0) + split_sizes = [image.shape[0] for image in images] + image_features = self.encode_images(concat_images) + + # add frame encoding then projector + if images[0].ndim == 5: + frame_ids = [] + for split_size in split_sizes: + frame_ids.append(torch.tensor([idx for idx in range(split_size[0]) for _ in range(split_size[1])], \ + dtype=torch.long, device=image_features.device)) + else: + frame_ids = [torch.arange(split_size, dtype=torch.long, device=image_features.device) + for split_size in split_sizes] + frame_ids = torch.concat(frame_ids) + frame_position_encoding = self.get_frame_position_encoding() + if frame_position_encoding: + + frame_embeddings = frame_position_encoding(frame_ids).unsqueeze(-2) + image_features += frame_embeddings + + # TODO: add fusion model, rewrite this part in the future + adapter_module = self.get_adapter_module() + if 
adapter_module: + image_features = adapter_module(image_features, frame_ids) + image_features = self.get_model().mm_projector(image_features) + if images[0].ndim == 5: + split_sizes = [split_size[0] * split_size[1] for split_size in split_sizes] + image_features = torch.split(image_features, split_sizes, dim=0) + if adapter_module: + # image_features = [image_features[i].view(images[i].shape[0], images[i].shape[1], -1) for i in range(image_features.shape[0])] + image_features = [x.view(im.shape[0], -1, x.shape[2]) for x, im in zip(image_features, images)] + image_features = adapter_module.compress_token_per_img(image_features) + + mm_patch_merge_type = getattr(self.config, 'mm_patch_merge_type', 'flat') + image_aspect_ratio = getattr(self.config, 'image_aspect_ratio', 'square') + if mm_patch_merge_type == 'flat': + image_features = [x.flatten(0, 1) for x in image_features] + elif mm_patch_merge_type.startswith('spatial'): + new_image_features = [] + for image_idx, image_feature in enumerate(image_features): + if image_feature.shape[0] > 1: + base_image_feature = image_feature[0] + image_feature = image_feature[1:] + height = width = self.get_vision_tower().num_patches_per_side + assert height * width == base_image_feature.shape[0] + if image_aspect_ratio == 'anyres': + num_patch_width, num_patch_height = get_anyres_image_grid_shape(image_sizes[image_idx], self.config.image_grid_pinpoints, self.get_vision_tower().config.image_size) + image_feature = image_feature.view(num_patch_height, num_patch_width, height, width, -1) + else: + raise NotImplementedError + if 'unpad' in mm_patch_merge_type: + image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous() + image_feature = image_feature.flatten(1, 2).flatten(2, 3) + image_feature = unpad_image(image_feature, image_sizes[image_idx]) + image_feature = torch.cat(( + image_feature, + self.model.image_newline[:, None, None].expand(*image_feature.shape[:-1], 1).to(image_feature.device) + ), dim=-1) + image_feature = 
image_feature.flatten(1, 2).transpose(0, 1) + else: + image_feature = image_feature.permute(0, 2, 1, 3, 4).contiguous() + image_feature = image_feature.flatten(0, 3) + image_feature = torch.cat((base_image_feature, image_feature), dim=0) + else: + image_feature = image_feature[0] + if 'unpad' in mm_patch_merge_type: + image_feature = torch.cat(( + image_feature, + self.model.image_newline[None].to(image_feature.device) + ), dim=0) + new_image_features.append(image_feature) + image_features = new_image_features + else: + raise ValueError(f"Unexpected mm_patch_merge_type: {self.config.mm_patch_merge_type}") + + else: + image_features = self.encode_images(images) + + # if getattr(self.config, 'tune_mm_mlp_adapter', False) and getattr(self.config, 'mm_use_start_end', False): + # raise NotImplementedError + + + # TODO: Currently, all the embed_token will bu update when tune_mm_mlp_adapter = True && mm_use_start_end = True + + # Let's just add dummy tensors if they do not exist, + # it is a headache to deal with None all the time. + # But it is not ideal, and if you have a better idea, + # please open an issue / submit a PR, thanks. 
+ _labels = labels + _position_ids = position_ids + _attention_mask = attention_mask + if attention_mask is None: + attention_mask = torch.ones_like(input_ids, dtype=torch.bool) + else: + attention_mask = attention_mask.bool() + if position_ids is None: + position_ids = torch.arange(0, input_ids.shape[1], dtype=torch.long, device=input_ids.device) + if labels is None: + labels = torch.full_like(input_ids, IGNORE_INDEX) + + # remove the padding using attention_mask -- FIXME + _input_ids = input_ids + input_ids = [cur_input_ids[cur_attention_mask] for cur_input_ids, cur_attention_mask in zip(input_ids, attention_mask)] + labels = [cur_labels[cur_attention_mask] for cur_labels, cur_attention_mask in zip(labels, attention_mask)] + + new_input_embeds = [] + new_labels = [] + cur_image_idx = 0 + for batch_idx, cur_input_ids in enumerate(input_ids): + num_images = (cur_input_ids == MM_TOKEN_INDEX).sum() + if num_images == 0: + cur_image_features = image_features[cur_image_idx] + cur_input_embeds_1 = self.get_model().embed_tokens(cur_input_ids) + cur_input_embeds = torch.cat([cur_input_embeds_1, cur_image_features[0:0]], dim=0) + new_input_embeds.append(cur_input_embeds) + new_labels.append(labels[batch_idx]) + cur_image_idx += 1 + continue + + image_token_indices = [-1] + torch.where(cur_input_ids == MM_TOKEN_INDEX)[0].tolist() + [cur_input_ids.shape[0]] + cur_input_ids_noim = [] + cur_labels = labels[batch_idx] + cur_labels_noim = [] + for i in range(len(image_token_indices) - 1): + cur_input_ids_noim.append(cur_input_ids[image_token_indices[i]+1:image_token_indices[i+1]]) + cur_labels_noim.append(cur_labels[image_token_indices[i]+1:image_token_indices[i+1]]) + split_sizes = [x.shape[0] for x in cur_labels_noim] + cur_input_embeds = self.get_model().embed_tokens(torch.cat(cur_input_ids_noim)) + cur_input_embeds_no_im = torch.split(cur_input_embeds, split_sizes, dim=0) + cur_new_input_embeds = [] + cur_new_labels = [] + + for i in range(num_images + 1): + 
cur_new_input_embeds.append(cur_input_embeds_no_im[i]) + cur_new_labels.append(cur_labels_noim[i]) + if i < num_images: + cur_image_features = image_features[cur_image_idx] + cur_image_idx += 1 + cur_new_input_embeds.append(cur_image_features) + cur_new_labels.append(torch.full((cur_image_features.shape[0],), IGNORE_INDEX, device=cur_labels.device, dtype=cur_labels.dtype)) + + cur_new_input_embeds = [x.to(self.device) for x in cur_new_input_embeds] + + cur_new_input_embeds = torch.cat(cur_new_input_embeds) + cur_new_labels = torch.cat(cur_new_labels) + + new_input_embeds.append(cur_new_input_embeds) + new_labels.append(cur_new_labels) + + # Truncate sequences to max length as image embeddings can make the sequence longer + tokenizer_model_max_length = getattr(self.config, 'tokenizer_model_max_length', None) + if tokenizer_model_max_length is not None: + new_input_embeds = [x[:tokenizer_model_max_length] for x in new_input_embeds] + new_labels = [x[:tokenizer_model_max_length] for x in new_labels] + + # Combine them + max_len = max(x.shape[0] for x in new_input_embeds) + batch_size = len(new_input_embeds) + + new_input_embeds_padded = [] + new_labels_padded = torch.full((batch_size, max_len), IGNORE_INDEX, dtype=new_labels[0].dtype, device=new_labels[0].device) + attention_mask = torch.zeros((batch_size, max_len), dtype=attention_mask.dtype, device=attention_mask.device) + position_ids = torch.zeros((batch_size, max_len), dtype=position_ids.dtype, device=position_ids.device) + + for i, (cur_new_embed, cur_new_labels) in enumerate(zip(new_input_embeds, new_labels)): + cur_len = cur_new_embed.shape[0] + if getattr(self.config, 'tokenizer_padding_side', 'right') == "left": + new_input_embeds_padded.append(torch.cat(( + torch.zeros((max_len - cur_len, cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device), + cur_new_embed + ), dim=0)) + if cur_len > 0: + new_labels_padded[i, -cur_len:] = cur_new_labels + attention_mask[i, -cur_len:] = True + 
position_ids[i, -cur_len:] = torch.arange(0, cur_len, dtype=position_ids.dtype, device=position_ids.device) + else: + new_input_embeds_padded.append(torch.cat(( + cur_new_embed, + torch.zeros((max_len - cur_len, cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device) + ), dim=0)) + if cur_len > 0: + new_labels_padded[i, :cur_len] = cur_new_labels + attention_mask[i, :cur_len] = True + position_ids[i, :cur_len] = torch.arange(0, cur_len, dtype=position_ids.dtype, device=position_ids.device) + + new_input_embeds = torch.stack(new_input_embeds_padded, dim=0) + + if _labels is None: + new_labels = None + else: + new_labels = new_labels_padded + + if _attention_mask is None: + attention_mask = None + else: + attention_mask = attention_mask.to(dtype=_attention_mask.dtype) + + if _position_ids is None: + position_ids = None + + return None, position_ids, attention_mask, past_key_values, new_input_embeds, new_labels + + def initialize_vision_tokenizer(self, model_args, tokenizer): + if model_args.mm_use_patch_token: + tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_VIDEO_PATCH_TOKEN], special_tokens=True) + self.resize_token_embeddings(len(tokenizer)) + + if model_args.mm_use_start_end: + num_new_tokens = tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, DEFAULT_VIDEO_START_TOKEN, DEFAULT_VIDEO_END_TOKEN], special_tokens=True) + self.resize_token_embeddings(len(tokenizer)) + + if num_new_tokens > 0: + input_embeddings = self.get_input_embeddings().weight.data + output_embeddings = self.get_output_embeddings().weight.data + + input_embeddings_avg = input_embeddings[:-num_new_tokens].mean( + dim=0, keepdim=True) + output_embeddings_avg = output_embeddings[:-num_new_tokens].mean( + dim=0, keepdim=True) + + input_embeddings[-num_new_tokens:] = input_embeddings_avg + output_embeddings[-num_new_tokens:] = output_embeddings_avg + + if model_args.tune_mm_mlp_adapter: + for p in self.get_input_embeddings().parameters(): + 
p.requires_grad = True + if 'gemma' in model_args.model_name_or_path: + # gemma use the same embedding for input and output + pass + else: + for p in self.get_output_embeddings().parameters(): + p.requires_grad = False + + if model_args.pretrain_mm_mlp_adapter: + # raise NotImplementedError + mm_projector_weights = torch.load(model_args.pretrain_mm_mlp_adapter, map_location='cpu') + mm_projector_weights = {'.'.join(k.split('.')[1:]): v for k, v in mm_projector_weights.items()} + embed_tokens_weight = mm_projector_weights['embed_tokens.weight'] + input_embeddings[:] = embed_tokens_weight + if 'gemma' in model_args.model_name_or_path: + output_embeddings[:] = embed_tokens_weight + assert num_new_tokens == 4 + # if input_embeddings.shape == embed_tokens_weight.shape: + # input_embeddings[-num_new_tokens:] = embed_tokens_weight[-num_new_tokens:] + # elif embed_tokens_weight.shape[0] == num_new_tokens: + # input_embeddings[-num_new_tokens:] = embed_tokens_weight + # else: + # raise ValueError(f"Unexpected embed_tokens_weight shape. Pretrained: {embed_tokens_weight.shape}. Current: {input_embeddings.shape}. 
Number of new tokens:
"repo_id": hub_repo_id} + else: + kwargs = {} + target.save_pretrained(delta_path, **kwargs) + target_tokenizer = AutoTokenizer.from_pretrained(target_model_path) + target_tokenizer.save_pretrained(delta_path, **kwargs) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--base-model-path", type=str, required=True) + parser.add_argument("--target-model-path", type=str, required=True) + parser.add_argument("--delta-path", type=str, required=True) + parser.add_argument("--hub-repo-id", type=str, default=None) + args = parser.parse_args() + + make_delta(args.base_model_path, args.target_model_path, args.delta_path, args.hub_repo_id) diff --git a/app/llava/model/multimodal_encoder/__pycache__/builder.cpython-310.pyc b/app/llava/model/multimodal_encoder/__pycache__/builder.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b1cf8dbbd677d6da6fe3e121ebe7073291d3bff7 Binary files /dev/null and b/app/llava/model/multimodal_encoder/__pycache__/builder.cpython-310.pyc differ diff --git a/app/llava/model/multimodal_encoder/__pycache__/builder.cpython-39.pyc b/app/llava/model/multimodal_encoder/__pycache__/builder.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1934ced3537da75272aec62ef5d342bb20371846 Binary files /dev/null and b/app/llava/model/multimodal_encoder/__pycache__/builder.cpython-39.pyc differ diff --git a/app/llava/model/multimodal_encoder/__pycache__/clip_encoder.cpython-310.pyc b/app/llava/model/multimodal_encoder/__pycache__/clip_encoder.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d8445abef93a08e4e0d0079d9dd4b7e2447e9c2b Binary files /dev/null and b/app/llava/model/multimodal_encoder/__pycache__/clip_encoder.cpython-310.pyc differ diff --git a/app/llava/model/multimodal_encoder/__pycache__/clip_encoder.cpython-39.pyc b/app/llava/model/multimodal_encoder/__pycache__/clip_encoder.cpython-39.pyc new file mode 100644 
index 0000000000000000000000000000000000000000..e57b4b8fae7186c77b26d14e1a09ec133049c1c4 Binary files /dev/null and b/app/llava/model/multimodal_encoder/__pycache__/clip_encoder.cpython-39.pyc differ diff --git a/app/llava/model/multimodal_encoder/__pycache__/eva_clip_encoder.cpython-310.pyc b/app/llava/model/multimodal_encoder/__pycache__/eva_clip_encoder.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e8178e3d4e58a132c2fa03d40c4d0359298fcf4f Binary files /dev/null and b/app/llava/model/multimodal_encoder/__pycache__/eva_clip_encoder.cpython-310.pyc differ diff --git a/app/llava/model/multimodal_encoder/__pycache__/eva_clip_encoder.cpython-39.pyc b/app/llava/model/multimodal_encoder/__pycache__/eva_clip_encoder.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5d90cfae048ac436d374d5df0b26ab977397fc67 Binary files /dev/null and b/app/llava/model/multimodal_encoder/__pycache__/eva_clip_encoder.cpython-39.pyc differ diff --git a/app/llava/model/multimodal_encoder/__pycache__/eva_vit.cpython-310.pyc b/app/llava/model/multimodal_encoder/__pycache__/eva_vit.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..19c19449c7f65b796c1b7dcf7500f5ad239002e9 Binary files /dev/null and b/app/llava/model/multimodal_encoder/__pycache__/eva_vit.cpython-310.pyc differ diff --git a/app/llava/model/multimodal_encoder/__pycache__/eva_vit.cpython-39.pyc b/app/llava/model/multimodal_encoder/__pycache__/eva_vit.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1e5bfb7da02a6e63cc168d1c3f9b6485e46d64cf Binary files /dev/null and b/app/llava/model/multimodal_encoder/__pycache__/eva_vit.cpython-39.pyc differ diff --git a/app/llava/model/multimodal_encoder/__pycache__/google_siglip_encoder.cpython-310.pyc b/app/llava/model/multimodal_encoder/__pycache__/google_siglip_encoder.cpython-310.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..d97833b210585ade030f7db4f0ca4f97d44e075b Binary files /dev/null and b/app/llava/model/multimodal_encoder/__pycache__/google_siglip_encoder.cpython-310.pyc differ diff --git a/app/llava/model/multimodal_encoder/__pycache__/google_siglip_encoder.cpython-39.pyc b/app/llava/model/multimodal_encoder/__pycache__/google_siglip_encoder.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b06cde1470074614af079163f638e3dda589b8bd Binary files /dev/null and b/app/llava/model/multimodal_encoder/__pycache__/google_siglip_encoder.cpython-39.pyc differ diff --git a/app/llava/model/multimodal_encoder/__pycache__/qformer.cpython-310.pyc b/app/llava/model/multimodal_encoder/__pycache__/qformer.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b0000a8583dbd458cf1e1742da33239382a2af23 Binary files /dev/null and b/app/llava/model/multimodal_encoder/__pycache__/qformer.cpython-310.pyc differ diff --git a/app/llava/model/multimodal_encoder/__pycache__/qformer.cpython-39.pyc b/app/llava/model/multimodal_encoder/__pycache__/qformer.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..58c61670b5874d556a278709d544ba2500b0010e Binary files /dev/null and b/app/llava/model/multimodal_encoder/__pycache__/qformer.cpython-39.pyc differ diff --git a/app/llava/model/multimodal_encoder/__pycache__/resampler.cpython-310.pyc b/app/llava/model/multimodal_encoder/__pycache__/resampler.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1ea81a708caa93212cb3eeb3199d80cab1e5ef9d Binary files /dev/null and b/app/llava/model/multimodal_encoder/__pycache__/resampler.cpython-310.pyc differ diff --git a/app/llava/model/multimodal_encoder/__pycache__/resampler.cpython-39.pyc b/app/llava/model/multimodal_encoder/__pycache__/resampler.cpython-39.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..b10050e69c9387bcf905e92e55a0a65427f27cf4 Binary files /dev/null and b/app/llava/model/multimodal_encoder/__pycache__/resampler.cpython-39.pyc differ diff --git a/app/llava/model/multimodal_encoder/__pycache__/siglip_encoder.cpython-310.pyc b/app/llava/model/multimodal_encoder/__pycache__/siglip_encoder.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d2c9a23df2a77de5f2234295b92616665c24a4cc Binary files /dev/null and b/app/llava/model/multimodal_encoder/__pycache__/siglip_encoder.cpython-310.pyc differ diff --git a/app/llava/model/multimodal_encoder/__pycache__/siglip_encoder.cpython-39.pyc b/app/llava/model/multimodal_encoder/__pycache__/siglip_encoder.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f8b3b4fcb0bf56e190a55f93a5e864c3f88d52dc Binary files /dev/null and b/app/llava/model/multimodal_encoder/__pycache__/siglip_encoder.cpython-39.pyc differ diff --git a/app/llava/model/multimodal_encoder/builder.py b/app/llava/model/multimodal_encoder/builder.py new file mode 100644 index 0000000000000000000000000000000000000000..ec4bf76adaad5bd01406ad2532d38aa274b5c88f --- /dev/null +++ b/app/llava/model/multimodal_encoder/builder.py @@ -0,0 +1,239 @@ +import os +import re +import math +import torch +import torch.nn as nn +from .clip_encoder import CLIPVisionTower +from .eva_clip_encoder import EvaClipVisionTower +from .siglip_encoder import SiglipVisionTower +from .google_siglip_encoder import GoogleSiglipVisionTower +from llava.model.utils import LayerNorm +from .qformer import BertConfig, BertLMHeadModel +from .resampler import Resampler, TokenCompressor +from torch.nn.init import trunc_normal_ + + + + + +def build_vision_tower(vision_tower_cfg, **kwargs): + vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None)) + # is_absolute_path_exists = os.path.exists(vision_tower) + if 
vision_tower.startswith("openai") or vision_tower.startswith("laion") or "ShareGPT4V" in vision_tower: + vision_tower = CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) + elif vision_tower.startswith("eva"): + vision_tower = EvaClipVisionTower(vision_tower, args=vision_tower_cfg) + elif vision_tower.startswith("google/siglip"): + vision_tower = GoogleSiglipVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) + elif 'HuggingFaceM4/siglip' in vision_tower: + vision_tower = SiglipVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) + else: + raise ValueError(f'Unknown vision tower: {vision_tower}') + + return vision_tower + + + +def build_Qformer(num_query_token, vision_width, extra_num_query_token=64, cross_attention_freq=2): + ln_vision = LayerNorm(vision_width) + encoder_config = BertConfig.from_pretrained("./model/bert-base-uncased") + encoder_config.encoder_width = vision_width + # insert cross-attention layer every other block + encoder_config.add_cross_attention = True + encoder_config.cross_attention_freq = cross_attention_freq + encoder_config.query_length = num_query_token + Qformer = BertLMHeadModel(config=encoder_config) + query_tokens = nn.Parameter( + torch.zeros(1, num_query_token, encoder_config.hidden_size) + ) + query_tokens.data.normal_(mean=0.0, std=encoder_config.initializer_range) + + Qformer.cls = None + Qformer.bert.embeddings.word_embeddings = None + Qformer.bert.embeddings.position_embeddings = None + for layer in Qformer.bert.encoder.layer: + layer.output = None + layer.intermediate = None + + return Qformer, ln_vision, query_tokens + +#TODO: remove the vision_width here +def build_adapter_module(cfg, vision_width): + return AdapterModule(cfg, vision_width) + + +class IdentityMap(nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x, *args, **kwargs): + return x + + +class AdapterModule(nn.Module): + def __init__(self, config, vision_width): + super().__init__() + self.adapter_name = 
config.adapter_module_name + self.config = config + self.output_dim = vision_width + if 'perceiver' in self.adapter_name: + from flash_perceiver import Perceiver + self.adapter = Perceiver( + input_dim=vision_width, + depth=6, + output_dim=vision_width, + num_latents=self.config.num_query_token, + latent_dim=1024, + cross_heads=1, + cross_head_dim=128, + cross_rotary_emb_dim=0, + cross_attn_dropout=0.0, + latent_heads=8, + latent_head_dim=128, + latent_rotary_emb_dim=0, + latent_attn_dropout=0.0, + weight_tie_layers=False, + gated_mlp=True, + self_per_cross_attn=1, + num_zero_tokens=None, + use_flash_attn=True, + ) + elif 'naive_resampler' in self.adapter_name: + assert math.sqrt(self.config.num_query_token) ** 2 == self.config.num_query_token, 'num of query need to be a square number' + self.adapter = Resampler( + grid_size=int(math.sqrt(self.config.num_query_token)), + embed_dim=vision_width, + num_heads=8, + ) + elif 'qformer' in self.adapter_name: + Qformer, ln_vision, query_tokens = build_Qformer( + self.config.num_query_token, vision_width) + self.adapter = Qformer + self.ln_vision = ln_vision + self.query_tokens = query_tokens + self.output_dim = Qformer.config.hidden_size + elif 'none' in self.adapter_name: + self.adapter = IdentityMap() + + self.is_loaded = False + + if 'compress_token' in self.adapter_name: + match = re.search(r'\d+$', self.adapter_name) + self.token_compressor = TokenCompressor( + num_compressed_token=int(match.group()), + embed_dim=self.config.hidden_size, + num_heads=8, + ) + if 'v1' in self.adapter_name: + self.compress_version = 'v1' + else: + self.compress_version = 'v0' + + # self.ln_vision = LayerNorm(self.config.vision_in_dim) + self.frame_position_encoding = nn.Embedding( + config.max_num_segments, + self.output_dim, + ) + + self.adapter.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, (nn.Linear, nn.Embedding)): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: 
+ nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + def forward(self, image_features, frame_ids): + if 'perceiver' in self.adapter_name: + adapted_image_features = self.adapter(image_features, return_embeddings=True) + elif 'naive_resampler' in self.adapter_name: + adapted_image_features = self.adapter(image_features) + elif 'qformer' in self.adapter_name: + image_features = self.ln_vision(image_features) + query_tokens = self.query_tokens.expand(image_features.shape[0], -1, -1) + attn_mask = torch.ones(image_features.size()[:-1], dtype=torch.long).to(image_features.device) + adapted_image_features = self.adapter.bert( + query_embeds=query_tokens, + encoder_hidden_states=image_features, + encoder_attention_mask=attn_mask, + return_dict=True + ).last_hidden_state + elif 'none' in self.adapter_name: + adapted_image_features = self.adapter(image_features) + + frame_embeddings = self.frame_position_encoding(frame_ids).unsqueeze(-2) + adapted_image_features += frame_embeddings + return adapted_image_features + + # TODO: addhoc func, rewrite it in the future + def compress_token_per_img(self, batch_image_features): + if 'compress_token' not in self.adapter_name: + return batch_image_features + compressed_features = [] + for image_features in batch_image_features: # image_features [num_frames, tokens, C] + # handle non image cases(in that case, image_patch maybe smaller than num_compressed_token) + if image_features.shape[1] < self.token_compressor.num_compressed_token: + compressed_features.append(image_features) + else: + compressed_features.append(self.token_compressor(image_features, compress_version=self.compress_version)) + return compressed_features + + + def load_model(self): + if self.is_loaded: + return + + if getattr(self.config, 'adapter_module_path', None): + checkpoint = torch.load(self.config.adapter_module_path, map_location="cpu") + + def get_w(weights, keyword): + return 
{k.split(keyword + '.')[1]: v for k, v in weights.items() if keyword + '.' in k} + + def get_variable_frame_encoding_w(model_weights, load_weights): + keyword = 'frame_position_encoding' + model_len = model_weights.shape[0] + load_weights_f_encoding = get_w(load_weights, keyword) + + load_len = load_weights_f_encoding['weight'].shape[0] + if model_len <= load_len: + value = load_weights_f_encoding['weight'][:model_len] + else: + value = model_weights.clone().cpu() + value[:load_len] = load_weights_f_encoding['weight'] + return value + + if 'qformer' in self.adapter_name and ('projector.bin' not in self.config.adapter_module_path): + state_dict = checkpoint["model"] + self.adapter.load_state_dict(get_w(state_dict, 'Qformer')) + self.ln_vision.load_state_dict(get_w(state_dict, 'ln_vision')) + self.load_state_dict({'query_tokens': state_dict['query_tokens']}, strict=False) + if getattr(self.config, 'pretrain_mm_mlp_adapter', None): + mm_projector_weights = torch.load(self.config.pretrain_mm_mlp_adapter, map_location='cpu') + frame_encoding_weight = get_variable_frame_encoding_w(self.frame_position_encoding.weight, mm_projector_weights) + self.frame_position_encoding.load_state_dict({'weight': frame_encoding_weight}) + else: + frame_encoding_weight = get_variable_frame_encoding_w(self.frame_position_encoding.weight, checkpoint) + for k in checkpoint.keys(): + if 'frame_position_encoding' in k: + checkpoint[k] = frame_encoding_weight + + self.load_state_dict(get_w(checkpoint, 'adapter_module')) + else: + # no pertrain weight, use initalization + return + + def freeze_adapter_module(self, freeze_flag): + if freeze_flag: + for name, p in self.named_parameters(): + p.requires_grad = False + else: + for name, p in self.named_parameters(): + p.requires_grad = True + + if 'naive_resampler' in self.adapter_name: + for name, p in self.named_parameters(): + if 'pos_embed' in name: + p.requires_grad = False diff --git a/app/llava/model/multimodal_encoder/clip_encoder.py 
b/app/llava/model/multimodal_encoder/clip_encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..97dbea3dee79b06ed163f9d85f174753a1237572 --- /dev/null +++ b/app/llava/model/multimodal_encoder/clip_encoder.py @@ -0,0 +1,88 @@ +import torch +import torch.nn as nn + +from transformers import CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig + + +class CLIPVisionTower(nn.Module): + def __init__(self, vision_tower, args, delay_load=False): + super().__init__() + + self.is_loaded = False + + self.vision_tower_name = vision_tower + self.select_layer = args.mm_vision_select_layer + self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch') + + if not delay_load: + self.load_model() + elif getattr(args, 'unfreeze_mm_vision_tower', False): + self.load_model() + else: + self.cfg_only = CLIPVisionConfig.from_pretrained(self.vision_tower_name) + + def load_model(self, device_map=None): + if self.is_loaded: + print('{} is already loaded, `load_model` called again, skipping.'.format(self.vision_tower_name)) + return + + self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name) + self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name, device_map=device_map) + self.vision_tower.requires_grad_(False) + + self.is_loaded = True + + def feature_select(self, image_forward_outs): + image_features = image_forward_outs.hidden_states[self.select_layer] + if self.select_feature == 'patch': + image_features = image_features[:, 1:] + elif self.select_feature == 'cls_patch': + image_features = image_features + else: + raise ValueError(f'Unexpected select feature: {self.select_feature}') + return image_features + + @torch.no_grad() + def forward(self, images): + if type(images) is list: + image_features = [] + for image in images: + image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True) + image_feature = 
self.feature_select(image_forward_out).to(image.dtype) + image_features.append(image_feature) + else: + image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True) + image_features = self.feature_select(image_forward_outs).to(images.dtype) + + return image_features + + @property + def dummy_feature(self): + return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype) + + @property + def dtype(self): + return self.vision_tower.dtype + + @property + def device(self): + return self.vision_tower.device + + @property + def config(self): + if self.is_loaded: + return self.vision_tower.config + else: + return self.cfg_only + + @property + def hidden_size(self): + return self.config.hidden_size + + @property + def num_patches_per_side(self): + return self.config.image_size // self.config.patch_size + + @property + def num_patches(self): + return (self.config.image_size // self.config.patch_size) ** 2 diff --git a/app/llava/model/multimodal_encoder/eva_clip_encoder.py b/app/llava/model/multimodal_encoder/eva_clip_encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..30a628513f789ebd9c363bf7887931fd761e1627 --- /dev/null +++ b/app/llava/model/multimodal_encoder/eva_clip_encoder.py @@ -0,0 +1,101 @@ +import torch +import torch.nn as nn + +from .processor import Blip2ImageTrainProcessor +from .eva_vit import create_eva_vit_g + + +class EvaClipVisionTower(nn.Module): + + def __init__(self, vision_tower, args, delay_load=False): + super().__init__() + self.is_loaded = False + self.vision_tower_name = vision_tower + # self.select_layer = args.mm_vision_select_layer + self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch') + self.args = args + + if not delay_load: + self.load_model() + + # self.is_loaded = True + + + def load_model(self, device_map=None): + if self.is_loaded: + print('{} is already loaded, `load_model` called again, 
skipping.'.format(self.vision_tower_name)) + return + + if not hasattr(self.args, 'dynamic_resolution'): + dynamic_resolution = None + else: + dynamic_resolution = self.args.dynamic_resolution + + + if (not hasattr(self.args, 'freeze_vision_encoder')) or self.args.freeze_vision_encoder: + use_checkpoint = False + else: + use_checkpoint = True + assert self.args.vit_precision == 'fp32', 'if the vision encoder is training, the type needs to be fp32' + + + self.image_processor = Blip2ImageTrainProcessor( + image_size=self.args.img_size, + dynamic_resolution= dynamic_resolution + ) + self.vision_tower = create_eva_vit_g( + img_size=self.args.img_size, + drop_path_rate=self.args.drop_path_rate, + precision=self.args.vit_precision, + vit_model_path=self.args.vit_model_path, + use_checkpoint=use_checkpoint + ) + + # self.vision_tower.requires_grad_(False) + + self.is_loaded = True + + + def feature_select(self, image_features): + if self.select_feature == 'patch': + image_features = image_features[:, 1:] + elif self.select_feature == 'cls_patch': + image_features = image_features + else: + raise ValueError(f'Unexpected select feature: {self.select_feature}') + return image_features + + # @torch.no_grad() + def forward(self, images): + if type(images) is list: + image_features = [] + for image in images: + image_forward_out = self.vision_tower(image.unsqueeze(0)) + image_features.append(self.feature_select(image_forward_out).to(image.dtype)) + + else: + image_features = self.vision_tower(images.to(dtype=self.dtype)) + image_features = self.feature_select(image_features).to(images.dtype) + + return image_features + + @property + def dummy_feature(self): + return torch.zeros(1, self.hidden_size, dtype=torch.float) + + @property + def hidden_size(self): + return self.vision_tower.hidden_size + + @property + def num_patches(self): + return (self.vision_tower.image_size // self.vision_tower.patch_size) ** 2 + + @property + def num_patches_per_side(self): + return 
(self.vision_tower.image_size // self.vision_tower.patch_size) + + @property + def dtype(self): + return self.vision_tower.pos_embed.dtype + diff --git a/app/llava/model/multimodal_encoder/eva_vit.py b/app/llava/model/multimodal_encoder/eva_vit.py new file mode 100644 index 0000000000000000000000000000000000000000..075308498eb32d114f739b5954d2d8481c464181 --- /dev/null +++ b/app/llava/model/multimodal_encoder/eva_vit.py @@ -0,0 +1,448 @@ +# Based on EVA, BEIT, timm and DeiT code bases +# https://github.com/baaivision/EVA +# https://github.com/rwightman/pytorch-image-models/tree/master/timm +# https://github.com/microsoft/unilm/tree/master/beit +# https://github.com/facebookresearch/deit/ +# https://github.com/facebookresearch/dino +# --------------------------------------------------------' +import math +from functools import partial + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint as checkpoint +from timm.models.layers import drop_path, to_2tuple, trunc_normal_ +from timm.models.registry import register_model +from llava.utils import master_print + + +def _cfg(url='', **kwargs): + return { + 'url': url, + 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None, + 'crop_pct': .9, 'interpolation': 'bicubic', + 'mean': (0.5, 0.5, 0.5), 'std': (0.5, 0.5, 0.5), + **kwargs + } + + +class DropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). 
+ """ + + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + def extra_repr(self) -> str: + return 'p={}'.format(self.drop_prob) + + +class Mlp(nn.Module): + def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + # x = self.drop(x) + # commit this for the orignal BERT implement + x = self.fc2(x) + x = self.drop(x) + return x + + +class Attention(nn.Module): + def __init__( + self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., + proj_drop=0., window_size=None, attn_head_dim=None): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + if attn_head_dim is not None: + head_dim = attn_head_dim + all_head_dim = head_dim * self.num_heads + self.scale = qk_scale or head_dim ** -0.5 + + self.qkv = nn.Linear(dim, all_head_dim * 3, bias=False) + if qkv_bias: + self.q_bias = nn.Parameter(torch.zeros(all_head_dim)) + self.v_bias = nn.Parameter(torch.zeros(all_head_dim)) + else: + self.q_bias = None + self.v_bias = None + + if window_size: + self.window_size = window_size + self.num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3 + self.relative_position_bias_table = nn.Parameter( + torch.zeros(self.num_relative_distance, num_heads)) # 2*Wh-1 * 2*Ww-1, nH + # cls to token & token 2 cls & cls to cls + + # get pair-wise relative position index for each token inside the window + coords_h = torch.arange(window_size[0]) + coords_w = torch.arange(window_size[1]) + coords = 
torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww + coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww + relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * window_size[1] - 1 + relative_position_index = \ + torch.zeros(size=(window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype) + relative_position_index[1:, 1:] = relative_coords.sum(-1) # Wh*Ww, Wh*Ww + relative_position_index[0, 0:] = self.num_relative_distance - 3 + relative_position_index[0:, 0] = self.num_relative_distance - 2 + relative_position_index[0, 0] = self.num_relative_distance - 1 + + self.register_buffer("relative_position_index", relative_position_index) + else: + self.window_size = None + self.relative_position_bias_table = None + self.relative_position_index = None + + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(all_head_dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x, rel_pos_bias=None): + B, N, C = x.shape + qkv_bias = None + if self.q_bias is not None: + qkv_bias = torch.cat((self.q_bias, torch.zeros_like(self.v_bias, requires_grad=False), self.v_bias)) + # qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias) + qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) + + q = q * self.scale + attn = (q @ k.transpose(-2, -1)) + + if self.relative_position_bias_table is not None: + relative_position_bias = \ + self.relative_position_bias_table[self.relative_position_index.view(-1)].view( + self.window_size[0] * self.window_size[1] + 1, + 
self.window_size[0] * self.window_size[1] + 1, -1) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww + attn = attn + relative_position_bias.unsqueeze(0) + + if rel_pos_bias is not None: + attn = attn + rel_pos_bias + + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, -1) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Block(nn.Module): + + def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., + drop_path=0., init_values=None, act_layer=nn.GELU, norm_layer=nn.LayerNorm, + window_size=None, attn_head_dim=None): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, + attn_drop=attn_drop, proj_drop=drop, window_size=window_size, attn_head_dim=attn_head_dim) + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) + + if init_values is not None and init_values > 0: + self.gamma_1 = nn.Parameter(init_values * torch.ones((dim)), requires_grad=True) + self.gamma_2 = nn.Parameter(init_values * torch.ones((dim)), requires_grad=True) + else: + self.gamma_1, self.gamma_2 = None, None + + def forward(self, x, rel_pos_bias=None): + if self.gamma_1 is None: + x = x + self.drop_path(self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias)) + x = x + self.drop_path(self.mlp(self.norm2(x))) + else: + x = x + self.drop_path(self.gamma_1 * self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias)) + x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x))) + return x + + +class PatchEmbed(nn.Module): + """ Image to Patch Embedding + """ + + def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0]) + self.patch_shape = (img_size[0] // patch_size[0], img_size[1] // patch_size[1]) + self.img_size = img_size + self.patch_size = patch_size + self.num_patches = num_patches + + self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + + def forward(self, x, **kwargs): + B, C, H, W = x.shape + # FIXME look at relaxing size constraints + assert H == self.img_size[0] and W == self.img_size[1], \ + f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." 
+ x = self.proj(x).flatten(2).transpose(1, 2) + return x + + +class RelativePositionBias(nn.Module): + + def __init__(self, window_size, num_heads): + super().__init__() + self.window_size = window_size + self.num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3 + self.relative_position_bias_table = nn.Parameter( + torch.zeros(self.num_relative_distance, num_heads)) # 2*Wh-1 * 2*Ww-1, nH + # cls to token & token 2 cls & cls to cls + + # get pair-wise relative position index for each token inside the window + coords_h = torch.arange(window_size[0]) + coords_w = torch.arange(window_size[1]) + coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww + coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww + relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * window_size[1] - 1 + relative_position_index = \ + torch.zeros(size=(window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype) + relative_position_index[1:, 1:] = relative_coords.sum(-1) # Wh*Ww, Wh*Ww + relative_position_index[0, 0:] = self.num_relative_distance - 3 + relative_position_index[0:, 0] = self.num_relative_distance - 2 + relative_position_index[0, 0] = self.num_relative_distance - 1 + + self.register_buffer("relative_position_index", relative_position_index) + + # trunc_normal_(self.relative_position_bias_table, std=.02) + + def forward(self): + relative_position_bias = \ + self.relative_position_bias_table[self.relative_position_index.view(-1)].view( + self.window_size[0] * self.window_size[1] + 1, + self.window_size[0] * self.window_size[1] + 1, -1) # Wh*Ww,Wh*Ww,nH + return relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww + + +class 
VisionTransformer(nn.Module): + """ Vision Transformer with support for patch or hybrid CNN input stage + """ + + def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, embed_dim=768, depth=12, + num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0., + drop_path_rate=0., norm_layer=nn.LayerNorm, init_values=None, + use_abs_pos_emb=True, use_rel_pos_bias=False, use_shared_rel_pos_bias=False, + use_mean_pooling=True, init_scale=0.001, use_checkpoint=False): + super().__init__() + self.image_size = img_size + self.patch_size = patch_size + self.num_classes = num_classes + self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models + + self.patch_embed = PatchEmbed( + img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim) + num_patches = self.patch_embed.num_patches + + self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) + if use_abs_pos_emb: + self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim)) + else: + self.pos_embed = None + self.pos_drop = nn.Dropout(p=drop_rate) + + if use_shared_rel_pos_bias: + self.rel_pos_bias = RelativePositionBias(window_size=self.patch_embed.patch_shape, num_heads=num_heads) + else: + self.rel_pos_bias = None + self.use_checkpoint = use_checkpoint + + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule + self.use_rel_pos_bias = use_rel_pos_bias + self.blocks = nn.ModuleList([ + Block( + dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer, + init_values=init_values, window_size=self.patch_embed.patch_shape if use_rel_pos_bias else None) + for i in range(depth)]) + # self.norm = nn.Identity() if use_mean_pooling else norm_layer(embed_dim) + # self.fc_norm = norm_layer(embed_dim) if use_mean_pooling else None + # self.head = 
nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity() + + if self.pos_embed is not None: + trunc_normal_(self.pos_embed, std=.02) + trunc_normal_(self.cls_token, std=.02) + # trunc_normal_(self.mask_token, std=.02) + # if isinstance(self.head, nn.Linear): + # trunc_normal_(self.head.weight, std=.02) + self.apply(self._init_weights) + self.fix_init_weight() + + # if isinstance(self.head, nn.Linear): + # self.head.weight.data.mul_(init_scale) + # self.head.bias.data.mul_(init_scale) + + def fix_init_weight(self): + def rescale(param, layer_id): + param.div_(math.sqrt(2.0 * layer_id)) + + for layer_id, layer in enumerate(self.blocks): + rescale(layer.attn.proj.weight.data, layer_id + 1) + rescale(layer.mlp.fc2.weight.data, layer_id + 1) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + def get_classifier(self): + return self.head + + def reset_classifier(self, num_classes, global_pool=''): + self.num_classes = num_classes + self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity() + + def forward_features(self, x): + x = self.patch_embed(x) + batch_size, seq_len, _ = x.size() + + cls_tokens = self.cls_token.expand(batch_size, -1, -1) # stole cls_tokens impl from Phil Wang, thanks + x = torch.cat((cls_tokens, x), dim=1) + if self.pos_embed is not None: + x = x + self.pos_embed + x = self.pos_drop(x) + + rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None + for blk in self.blocks: + if self.use_checkpoint: + x = checkpoint.checkpoint(blk, x, rel_pos_bias) + else: + x = blk(x, rel_pos_bias) + return x + + # x = self.norm(x) + + # if self.fc_norm is not None: + # t = x[:, 1:, :] + # return self.fc_norm(t.mean(1)) + # else: + # return x[:, 0] + + def forward(self, 
x): + x = self.forward_features(x) + # x = self.head(x) + return x + + def get_intermediate_layers(self, x): + x = self.patch_embed(x) + batch_size, seq_len, _ = x.size() + + cls_tokens = self.cls_token.expand(batch_size, -1, -1) # stole cls_tokens impl from Phil Wang, thanks + x = torch.cat((cls_tokens, x), dim=1) + if self.pos_embed is not None: + x = x + self.pos_embed + x = self.pos_drop(x) + + features = [] + rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None + for blk in self.blocks: + x = blk(x, rel_pos_bias) + features.append(x) + + return features + + @property + def hidden_size(self): + return self.num_features + + +def interpolate_pos_embed(model, checkpoint_model): + if 'pos_embed' in checkpoint_model: + pos_embed_checkpoint = checkpoint_model['pos_embed'].float() + embedding_size = pos_embed_checkpoint.shape[-1] + num_patches = model.patch_embed.num_patches + num_extra_tokens = model.pos_embed.shape[-2] - num_patches + # height (== width) for the checkpoint position embedding + orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5) + # height (== width) for the new position embedding + new_size = int(num_patches ** 0.5) + # class_token and dist_token are kept unchanged + if orig_size != new_size: + print("Position interpolate from %dx%d to %dx%d" % (orig_size, orig_size, new_size, new_size)) + extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens] + # only the position tokens are interpolated + pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:] + pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size, embedding_size).permute(0, 3, 1, 2) + pos_tokens = torch.nn.functional.interpolate( + pos_tokens, size=(new_size, new_size), mode='bicubic', align_corners=False) + pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2) + new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1) + checkpoint_model['pos_embed'] = new_pos_embed + + +def convert_weights_to_fp16(model: nn.Module): + """Convert 
applicable model parameters to fp16""" + + def _convert_weights_to_fp16(l): + if isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Linear)): + l.weight.data = l.weight.data.half() + if l.bias is not None: + l.bias.data = l.bias.data.half() + + # if isinstance(l, (nn.MultiheadAttention, Attention)): + # for attr in [*[f"{s}_proj_weight" for s in ["in", "q", "k", "v"]], "in_proj_bias", "bias_k", "bias_v"]: + # tensor = getattr(l, attr) + # if tensor is not None: + # tensor.data = tensor.data.half() + + model.apply(_convert_weights_to_fp16) + +def create_eva_vit_g(img_size=224, drop_path_rate=0.4, use_checkpoint=False, precision="fp16", vit_model_path=None): + model = VisionTransformer( + img_size=img_size, + patch_size=14, + use_mean_pooling=False, + embed_dim=1408, + depth=39, + num_heads=1408 // 88, + mlp_ratio=4.3637, + qkv_bias=True, + drop_path_rate=drop_path_rate, + norm_layer=partial(nn.LayerNorm, eps=1e-6), + use_checkpoint=use_checkpoint, + ) + if vit_model_path is not None: + state_dict = torch.load(vit_model_path, map_location="cpu") + interpolate_pos_embed(model, state_dict) + + incompatible_keys = model.load_state_dict(state_dict, strict=False) + #master_print(incompatible_keys) + + if precision == "fp16": + # model.to("cuda") + convert_weights_to_fp16(model) + return model diff --git a/app/llava/model/multimodal_encoder/google_siglip_encoder.py b/app/llava/model/multimodal_encoder/google_siglip_encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..fbf20f80a392d19269009edc72fac1d48f0c9a45 --- /dev/null +++ b/app/llava/model/multimodal_encoder/google_siglip_encoder.py @@ -0,0 +1,84 @@ +import torch +import torch.nn as nn + +# from transformers import CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig +# from .siglip.modeling_siglip import SiglipVisionModel +# from .siglip.configuration_siglip import SiglipVisionConfig +from transformers import AutoImageProcessor, SiglipVisionModel, SiglipVisionConfig + + +class 
GoogleSiglipVisionTower(nn.Module):
    """Wraps a pretrained Google SigLIP vision model as a frozen vision tower.

    NOTE(review): the ``class`` keyword for this definition sits on the
    previous chunk; this span begins with the class name.
    """

    def __init__(self, vision_tower, args, delay_load=False):
        super().__init__()

        # Flipped to True once load_model() has populated the tower/processor.
        self.is_loaded = False

        self.vision_tower_name = vision_tower
        # Index into hidden_states used by feature_select().
        self.select_layer = args.mm_vision_select_layer
        self.args = args
        if not delay_load:
            self.load_model()
        elif getattr(args, 'unfreeze_mm_vision_tower', False):
            # Even when delayed loading is requested, an unfrozen tower must
            # be materialized immediately so its parameters exist.
            self.load_model()
        else:
            # Defer weight loading; keep only the config for shape queries.
            self.cfg_only = SiglipVisionConfig.from_pretrained(self.vision_tower_name)

    def load_model(self, device_map=None):
        # Idempotent: a repeated call logs and returns without reloading.
        if self.is_loaded:
            print('{} is already loaded, `load_model` called again, skipping.'.format(self.vision_tower_name))
            return

        self.image_processor = AutoImageProcessor.from_pretrained(self.vision_tower_name)
        self.vision_tower = SiglipVisionModel.from_pretrained(self.vision_tower_name, device_map=device_map)
        # The tower is used as a frozen feature extractor.
        self.vision_tower.requires_grad_(False)

        self.is_loaded = True

    def feature_select(self, image_forward_outs):
        # Pick a single hidden layer as the image features (no token
        # selection/stripping is performed here).
        image_features = image_forward_outs.hidden_states[self.select_layer]
        return image_features

    @torch.no_grad()
    def forward(self, images):
        if type(images) is list:
            # A list means per-image (possibly variable-sized) inputs:
            # run them one at a time with an added batch dimension.
            image_features = []
            for image in images:
                image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True)
                image_feature = self.feature_select(image_forward_out).to(image.dtype)
                image_features.append(image_feature)
        else:
            image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True)
            image_features = self.feature_select(image_forward_outs).to(images.dtype)

        return image_features

    @property
    def dummy_feature(self):
        # Zero placeholder feature matching the tower's dtype/device.
        return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)

    @property
    def dtype(self):
        return self.vision_tower.dtype

    @property
    def device(self):
        return self.vision_tower.device

    @property
    def config(self):
        # Fall back to the delay-load config when weights were not loaded.
        if self.is_loaded:
            return self.vision_tower.config
        else:
            return self.cfg_only
+ @property + def hidden_size(self): + return self.config.hidden_size + + @property + def num_patches_per_side(self): + return self.image_processor.size['width'] // self.config.patch_size + + @property + def num_patches(self): + return (self.image_processor.size['width'] // self.config.patch_size) ** 2 diff --git a/app/llava/model/multimodal_encoder/processor/__init__.py b/app/llava/model/multimodal_encoder/processor/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ab318449dd49e1c550d90171133680938c62b3c7 --- /dev/null +++ b/app/llava/model/multimodal_encoder/processor/__init__.py @@ -0,0 +1 @@ +from .blip_processor import Blip2ImageTrainProcessor \ No newline at end of file diff --git a/app/llava/model/multimodal_encoder/processor/__pycache__/__init__.cpython-310.pyc b/app/llava/model/multimodal_encoder/processor/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..df971d84732e5f45d3791b892d33955759a9b6e2 Binary files /dev/null and b/app/llava/model/multimodal_encoder/processor/__pycache__/__init__.cpython-310.pyc differ diff --git a/app/llava/model/multimodal_encoder/processor/__pycache__/__init__.cpython-39.pyc b/app/llava/model/multimodal_encoder/processor/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..971dd3d97cc773392baf139d6491d235560d54d6 Binary files /dev/null and b/app/llava/model/multimodal_encoder/processor/__pycache__/__init__.cpython-39.pyc differ diff --git a/app/llava/model/multimodal_encoder/processor/__pycache__/blip_processor.cpython-310.pyc b/app/llava/model/multimodal_encoder/processor/__pycache__/blip_processor.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8dc956d44f151978418d50248a2ef826db66550b Binary files /dev/null and b/app/llava/model/multimodal_encoder/processor/__pycache__/blip_processor.cpython-310.pyc differ diff --git 
a/app/llava/model/multimodal_encoder/processor/__pycache__/blip_processor.cpython-39.pyc b/app/llava/model/multimodal_encoder/processor/__pycache__/blip_processor.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7ff1fb0593d1e814549f503f30c75b4d572fe290 Binary files /dev/null and b/app/llava/model/multimodal_encoder/processor/__pycache__/blip_processor.cpython-39.pyc differ diff --git a/app/llava/model/multimodal_encoder/processor/blip_processor.py b/app/llava/model/multimodal_encoder/processor/blip_processor.py new file mode 100644 index 0000000000000000000000000000000000000000..ef381ff576b7bb4bf4cfafd01800e49eb217f0e5 --- /dev/null +++ b/app/llava/model/multimodal_encoder/processor/blip_processor.py @@ -0,0 +1,125 @@ +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved. + SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause +""" + +from omegaconf import OmegaConf +from torchvision import transforms +from torchvision.transforms.functional import InterpolationMode +from llava.mm_utils import select_best_resolution + +class BaseProcessor: + def __init__(self): + self.transform = lambda x: x + return + + def __call__(self, item): + return self.transform(item) + + @classmethod + def from_config(cls, cfg=None): + return cls() + + def build(self, **kwargs): + cfg = OmegaConf.create(kwargs) + + return self.from_config(cfg) + + +class BlipImageBaseProcessor(BaseProcessor): + def __init__(self, image_mean=None, image_std=None): + if image_mean is None: + image_mean = (0.48145466, 0.4578275, 0.40821073) + if image_std is None: + image_std = (0.26862954, 0.26130258, 0.27577711) + + self.normalize = transforms.Normalize(image_mean, image_std) + self.image_mean = image_mean + self.image_std = image_std + +class Blip2ImageTrainProcessor(BlipImageBaseProcessor): + def __init__(self, image_size=224, image_mean=None, 
image_std=None, min_scale=0.5, max_scale=1.0, is_training=True, dynamic_resolution=None): + super().__init__(image_mean=image_mean, image_std=image_std) + + self.is_training = is_training + self.dynamic_resolution = dynamic_resolution + if isinstance(image_size, int): + self.img_size = image_size + size_tuple = (image_size, image_size) + elif isinstance(image_size, tuple): + self.img_size = image_size[0] + size_tuple = image_size # H, W + self.crop_size = { + 'height': self.img_size, + 'width': self.img_size + } + if self.dynamic_resolution: + self.transform_dic = {} + for size_ in self.dynamic_resolution: + self.transform_dic[size_] = ( + transforms.Compose( + [ + transforms.Resize( + size_, interpolation=InterpolationMode.BICUBIC # H, W + ), + transforms.ToTensor(), + self.normalize, + ] + ) + ) + self.transform = transforms.Compose( + [ + transforms.Resize( + size_tuple, interpolation=InterpolationMode.BICUBIC + ), + transforms.ToTensor(), + self.normalize, + ] + ) + + def preprocess(self, item): + # if self.dynamic_resolution is not None: + # images = [] + # images.append(self.transform(item)) + # width, height = item.size + # best_fit_res = select_best_resolution((width, height), self.dynamic_resolution) + # resize_img = self.transform_dic[best_fit_res](item) + # splitted_imgs = self.split_images(resize_img, (self.img_size, self.img_size)) + # images.extend(splitted_imgs) + # return images + # else: + return self.transform(item) + + @classmethod + def from_config(cls, cfg=None): + if cfg is None: + cfg = OmegaConf.create() + + image_size = cfg.get("image_size", 224) + + image_mean = cfg.get("mean", None) + image_std = cfg.get("image_std", None) + + min_scale = cfg.get("min_scale", 0.5) + max_scale = cfg.get("max_scale", 1.0) + + return cls( + image_size=image_size, + image_mean=image_mean, + image_std=image_std, + min_scale=min_scale, + max_scale=max_scale, + ) + + @staticmethod + def split_images(image, split_size): + splited_images = [] + _, h, w = 
image.shape # C, H, W + assert h % split_size[0] == 0 and w % split_size[1] == 0, "dynamic resolution must be a multiple of input image size " + for i in range(0, h, split_size[0]): + for j in range(0, w, split_size[1]): + patch = image[:, i:i+split_size[0], j:j+split_size[1]].clone() + splited_images.append(patch) + return splited_images \ No newline at end of file diff --git a/app/llava/model/multimodal_encoder/qformer.py b/app/llava/model/multimodal_encoder/qformer.py new file mode 100644 index 0000000000000000000000000000000000000000..c70d19040eef1f74f11cb009d1d393401579ef8c --- /dev/null +++ b/app/llava/model/multimodal_encoder/qformer.py @@ -0,0 +1,1221 @@ +""" +Adapted from salesforce@LAVIS. Below is the original copyright: + * Copyright (c) 2023, salesforce.com, inc. + * All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * For full license text, see LICENSE.txt file in the repo root or https://opensource.org/licenses/BSD-3-Clause + * By Junnan Li + * Based on huggingface code base + * https://github.com/huggingface/transformers/blob/v4.15.0/src/transformers/models/bert +""" + +import math +import os +import warnings +from dataclasses import dataclass +from typing import Optional, Tuple, Dict, Any + +import torch +from torch import Tensor, device, dtype, nn +import torch.utils.checkpoint +from torch import nn +from torch.nn import CrossEntropyLoss +import torch.nn.functional as F + +from transformers.activations import ACT2FN +from transformers.file_utils import ( + ModelOutput, +) +from transformers.modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPoolingAndCrossAttentions, + CausalLMOutputWithCrossAttentions, + MaskedLMOutput, + MultipleChoiceModelOutput, + NextSentencePredictorOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +) +from transformers.modeling_utils import ( + PreTrainedModel, + apply_chunking_to_forward, + find_pruneable_heads_and_indices, + 
prune_linear_layer, +) +from transformers.utils import logging +from transformers.models.bert.configuration_bert import BertConfig +from llava.model.utils import LayerNorm + +logger = logging.get_logger(__name__) + + +class BertEmbeddings(nn.Module): + """Construct the embeddings from word and position embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding( + config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id + ) + self.position_embeddings = nn.Embedding( + config.max_position_embeddings, config.hidden_size + ) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + # self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.LayerNorm = LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)) + ) + self.position_embedding_type = getattr( + config, "position_embedding_type", "absolute" + ) + + self.config = config + + def forward( + self, + input_ids=None, + position_ids=None, + query_embeds=None, + past_key_values_length=0, + ): + if input_ids is not None: + seq_length = input_ids.size()[1] + else: + seq_length = 0 + + if position_ids is None: + position_ids = self.position_ids[ + :, past_key_values_length : seq_length + past_key_values_length + ].clone() + + if input_ids is not None: + embeddings = self.word_embeddings(input_ids) + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embeddings = embeddings + position_embeddings + + if query_embeds is not None: + embeddings = torch.cat((query_embeds, embeddings), dim=1) + else: + embeddings = query_embeds + + embeddings = 
self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class BertSelfAttention(nn.Module): + def __init__(self, config, is_cross_attention): + super().__init__() + self.config = config + if config.hidden_size % config.num_attention_heads != 0 and not hasattr( + config, "embedding_size" + ): + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention " + "heads (%d)" % (config.hidden_size, config.num_attention_heads) + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + if is_cross_attention: + self.key = nn.Linear(config.encoder_width, self.all_head_size) + self.value = nn.Linear(config.encoder_width, self.all_head_size) + else: + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = getattr( + config, "position_embedding_type", "absolute" + ) + if ( + self.position_embedding_type == "relative_key" + or self.position_embedding_type == "relative_key_query" + ): + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding( + 2 * config.max_position_embeddings - 1, self.attention_head_size + ) + self.save_attention = False + + def save_attn_gradients(self, attn_gradients): + self.attn_gradients = attn_gradients + + def get_attn_gradients(self): + return self.attn_gradients + + def save_attention_map(self, attention_map): + self.attention_map = attention_map + + def get_attention_map(self): + return self.attention_map + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + ( + self.num_attention_heads, + self.attention_head_size, + ) + x = 
x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + mixed_query_layer = self.query(hidden_states) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. 
+ attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + if ( + self.position_embedding_type == "relative_key" + or self.position_embedding_type == "relative_key_query" + ): + seq_length = hidden_states.size()[1] + position_ids_l = torch.arange( + seq_length, dtype=torch.long, device=hidden_states.device + ).view(-1, 1) + position_ids_r = torch.arange( + seq_length, dtype=torch.long, device=hidden_states.device + ).view(1, -1) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding( + distance + self.max_position_embeddings - 1 + ) + positional_embedding = positional_embedding.to( + dtype=query_layer.dtype + ) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum( + "bhld,lrd->bhlr", query_layer, positional_embedding + ) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum( + "bhld,lrd->bhlr", query_layer, positional_embedding + ) + relative_position_scores_key = torch.einsum( + "bhrd,lrd->bhlr", key_layer, positional_embedding + ) + attention_scores = ( + attention_scores + + relative_position_scores_query + + relative_position_scores_key + ) + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + # Normalize the attention scores to probabilities. 
+ attention_probs = nn.Softmax(dim=-1)(attention_scores.to(torch.float32)).to(attention_scores.dtype) + + if is_cross_attention and self.save_attention: + self.save_attention_map(attention_probs) + attention_probs.register_hook(self.save_attn_gradients) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs_dropped = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs_dropped = attention_probs_dropped * head_mask + + context_layer = torch.matmul(attention_probs_dropped, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(*new_context_layer_shape) + + outputs = ( + (context_layer, attention_probs) if output_attentions else (context_layer,) + ) + + outputs = outputs + (past_key_value,) + return outputs + + +class BertSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + # self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.LayerNorm = LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertAttention(nn.Module): + def __init__(self, config, is_cross_attention=False): + super().__init__() + self.self = BertSelfAttention(config, is_cross_attention) + self.output = BertSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, + 
self.self.num_attention_heads, + self.self.attention_head_size, + self.pruned_heads, + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = ( + self.self.attention_head_size * self.self.num_attention_heads + ) + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + + outputs = (attention_output,) + self_outputs[ + 1: + ] # add attentions if we output them + return outputs + + +class BertIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class BertOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + # self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.LayerNorm = LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout 
= nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertLayer(nn.Module): + def __init__(self, config, layer_num): + super().__init__() + self.config = config + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = BertAttention(config) + self.layer_num = layer_num + if ( + self.config.add_cross_attention + and layer_num % self.config.cross_attention_freq == 0 + ): + self.crossattention = BertAttention( + config, is_cross_attention=self.config.add_cross_attention + ) + self.has_cross_attention = True + else: + self.has_cross_attention = False + self.intermediate = BertIntermediate(config) + self.output = BertOutput(config) + + self.intermediate_query = BertIntermediate(config) + self.output_query = BertOutput(config) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + query_length=0, + ): + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = ( + past_key_value[:2] if past_key_value is not None else None + ) + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + outputs = self_attention_outputs[1:-1] + + present_key_value = self_attention_outputs[-1] + + if query_length > 0: + query_attention_output = attention_output[:, :query_length, :] + + if self.has_cross_attention: + assert ( + encoder_hidden_states is not None + ), "encoder_hidden_states must be given for cross-attention layers" + cross_attention_outputs = 
self.crossattention( + query_attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + output_attentions=output_attentions, + ) + query_attention_output = cross_attention_outputs[0] + outputs = ( + outputs + cross_attention_outputs[1:-1] + ) # add cross attentions if we output attention weights + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk_query, + self.chunk_size_feed_forward, + self.seq_len_dim, + query_attention_output, + ) + if attention_output.shape[1] > query_length: + layer_output_text = apply_chunking_to_forward( + self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output[:, query_length:, :], + ) + layer_output = torch.cat([layer_output, layer_output_text], dim=1) + else: + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output, + ) + outputs = (layer_output,) + outputs + + outputs = outputs + (present_key_value,) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + def feed_forward_chunk_query(self, attention_output): + intermediate_output = self.intermediate_query(attention_output) + layer_output = self.output_query(intermediate_output, attention_output) + return layer_output + + +class BertEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList( + [BertLayer(config, i) for i in range(config.num_hidden_layers)] + ) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + query_length=0, + ): + all_hidden_states = () if 
output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = ( + () if output_attentions and self.config.add_cross_attention else None + ) + + next_decoder_cache = () if use_cache else None + + for i in range(self.config.num_hidden_layers): + layer_module = self.layer[i] + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None + + if getattr(self.config, "gradient_checkpointing", False) and self.training: + + if use_cache: + logger.warn( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + def create_custom_forward(module): + def custom_forward(*inputs): + return module( + *inputs, past_key_value, output_attentions, query_length + ) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer_module), + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + query_length, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + 
past_key_values=next_decoder_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            cross_attentions=all_cross_attentions,
        )


class BertPooler(nn.Module):
    """Pools a sequence by passing its first token through a tanh-activated dense layer."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states):
        # We "pool" the model by simply taking the hidden state corresponding
        # to the first token.
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output


class BertPredictionHeadTransform(nn.Module):
    """Dense -> activation -> LayerNorm transform applied before the LM decoder head."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        # hidden_act may be given as a string (looked up in ACT2FN) or a callable.
        if isinstance(config.hidden_act, str):
            self.transform_act_fn = ACT2FN[config.hidden_act]
        else:
            self.transform_act_fn = config.hidden_act
        # self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.LayerNorm = LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.transform_act_fn(hidden_states)
        hidden_states = self.LayerNorm(hidden_states)
        return hidden_states


class BertLMPredictionHead(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.transform = BertPredictionHeadTransform(config)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        self.bias = nn.Parameter(torch.zeros(config.vocab_size))

        # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
        self.decoder.bias = self.bias

    def forward(self, hidden_states):
        # Transform the hidden states, then project to vocabulary logits.
        hidden_states = self.transform(hidden_states)
        hidden_states = self.decoder(hidden_states)
        return hidden_states


class BertOnlyMLMHead(nn.Module):
    """Masked-LM head: just the prediction transform + vocabulary projection."""

    def __init__(self, config):
        super().__init__()
        self.predictions = BertLMPredictionHead(config)

    def forward(self, sequence_output):
        prediction_scores = self.predictions(sequence_output)
        return prediction_scores


class BertPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = BertConfig
    base_model_prefix = "bert"
    _keys_to_ignore_on_load_missing = [r"position_ids"]

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Embedding)):
            # Slightly different from the TF version which uses truncated_normal for initialization
            # cf https://github.com/pytorch/pytorch/pull/5617
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()


class BertModel(BertPreTrainedModel):
    """
    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in `Attention is
    all you need `__ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
    Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
+ argument and :obj:`add_cross_attention` set to :obj:`True`; an :obj:`encoder_hidden_states` is then expected as an + input to the forward pass. + """ + + def __init__(self, config, add_pooling_layer=False): + super().__init__(config) + self.config = config + + self.embeddings = BertEmbeddings(config) + + self.encoder = BertEncoder(config) + + self.pooler = BertPooler(config) if add_pooling_layer else None + + self.init_weights() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + def get_extended_attention_mask( + self, + attention_mask: Tensor, + input_shape: Tuple[int], + device: device, + is_decoder: bool, + has_query: bool = False, + ) -> Tensor: + """ + Makes broadcastable attention and causal masks so that future and masked tokens are ignored. + + Arguments: + attention_mask (:obj:`torch.Tensor`): + Mask with ones indicating tokens to attend to, zeros for tokens to ignore. + input_shape (:obj:`Tuple[int]`): + The shape of the input to the model. + device: (:obj:`torch.device`): + The device of the input to the model. + + Returns: + :obj:`torch.Tensor` The extended attention mask, with a the same dtype as :obj:`attention_mask.dtype`. + """ + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
+ if attention_mask.dim() == 3: + extended_attention_mask = attention_mask[:, None, :, :] + elif attention_mask.dim() == 2: + # Provided a padding mask of dimensions [batch_size, seq_length] + # - if the model is a decoder, apply a causal mask in addition to the padding mask + # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] + if is_decoder: + batch_size, seq_length = input_shape + + seq_ids = torch.arange(seq_length, device=device) + causal_mask = ( + seq_ids[None, None, :].repeat(batch_size, seq_length, 1) + <= seq_ids[None, :, None] + ) + + # add a prefix ones mask to the causal mask + # causal and attention masks must have same type with pytorch version < 1.3 + causal_mask = causal_mask.to(attention_mask.dtype) + + if causal_mask.shape[1] < attention_mask.shape[1]: + prefix_seq_len = attention_mask.shape[1] - causal_mask.shape[1] + if has_query: # UniLM style attention mask + causal_mask = torch.cat( + [ + torch.zeros( + (batch_size, prefix_seq_len, seq_length), + device=device, + dtype=causal_mask.dtype, + ), + causal_mask, + ], + axis=1, + ) + causal_mask = torch.cat( + [ + torch.ones( + (batch_size, causal_mask.shape[1], prefix_seq_len), + device=device, + dtype=causal_mask.dtype, + ), + causal_mask, + ], + axis=-1, + ) + extended_attention_mask = ( + causal_mask[:, None, :, :] * attention_mask[:, None, None, :] + ) + else: + extended_attention_mask = attention_mask[:, None, None, :] + else: + raise ValueError( + "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format( + input_shape, attention_mask.shape + ) + ) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. 
+ extended_attention_mask = extended_attention_mask.to( + dtype=self.dtype + ) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + return extended_attention_mask + + def forward( + self, + input_ids=None, + attention_mask=None, + position_ids=None, + head_mask=None, + query_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + is_decoder=False, + ): + r""" + encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` + (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` + instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. 
+ use_cache (:obj:`bool`, `optional`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`). + """ + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + # use_cache = use_cache if use_cache is not None else self.config.use_cache + + if input_ids is None: + assert ( + query_embeds is not None + ), "You have to specify query_embeds when input_ids is None" + + # past_key_values_length + past_key_values_length = ( + past_key_values[0][0].shape[2] - self.config.query_length + if past_key_values is not None + else 0 + ) + + query_length = query_embeds.shape[1] if query_embeds is not None else 0 + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + query_embeds=query_embeds, + past_key_values_length=past_key_values_length, + ) + + input_shape = embedding_output.size()[:-1] + batch_size, seq_length = input_shape + device = embedding_output.device + + if attention_mask is None: + attention_mask = torch.ones( + ((batch_size, seq_length + past_key_values_length)), device=device + ) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
+ if is_decoder: + extended_attention_mask = self.get_extended_attention_mask( + attention_mask, + input_ids.shape, + device, + is_decoder, + has_query=(query_embeds is not None), + ) + else: + extended_attention_mask = self.get_extended_attention_mask( + attention_mask, input_shape, device, is_decoder + ) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if encoder_hidden_states is not None: + if type(encoder_hidden_states) == list: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[ + 0 + ].size() + else: + ( + encoder_batch_size, + encoder_sequence_length, + _, + ) = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + + if type(encoder_attention_mask) == list: + encoder_extended_attention_mask = [ + self.invert_attention_mask(mask) for mask in encoder_attention_mask + ] + elif encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask( + encoder_attention_mask + ) + else: + encoder_extended_attention_mask = self.invert_attention_mask( + encoder_attention_mask + ) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + 
output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + query_length=query_length, + ) + sequence_output = encoder_outputs[0] + pooled_output = ( + self.pooler(sequence_output) if self.pooler is not None else None + ) + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + +class BertLMHeadModel(BertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r"pooler"] + _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] + + def __init__(self, config): + super().__init__(config) + + self.bert = BertModel(config, add_pooling_layer=False) + self.cls = BertOnlyMLMHead(config) + + self.init_weights() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + def forward( + self, + input_ids=None, + attention_mask=None, + position_ids=None, + head_mask=None, + query_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + past_key_values=None, + use_cache=True, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + return_logits=False, + is_decoder=True, + reduction="mean", + ): + r""" + encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. 
+ encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in + ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are + ignored (masked), the loss is only computed for the tokens with labels n ``[0, ..., config.vocab_size]`` + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` + (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` + instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. + use_cache (:obj:`bool`, `optional`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`). 
+ Returns: + Example:: + >>> from transformers import BertTokenizer, BertLMHeadModel, BertConfig + >>> import torch + >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased') + >>> config = BertConfig.from_pretrained("bert-base-cased") + >>> model = BertLMHeadModel.from_pretrained('bert-base-cased', config=config) + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + >>> prediction_logits = outputs.logits + """ + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + if labels is not None: + use_cache = False + if past_key_values is not None: + query_embeds = None + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + head_mask=head_mask, + query_embeds=query_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + is_decoder=is_decoder, + ) + + sequence_output = outputs[0] + if query_embeds is not None: + sequence_output = outputs[0][:, query_embeds.shape[1] :, :] + + prediction_scores = self.cls(sequence_output) + + if return_logits: + return prediction_scores[:, :-1, :].contiguous() + + lm_loss = None + if labels is not None: + # we are doing next-token prediction; shift prediction scores and input ids by one + shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous() + labels = labels[:, 1:].contiguous() + loss_fct = CrossEntropyLoss(reduction=reduction, label_smoothing=0.1) + lm_loss = loss_fct( + shifted_prediction_scores.view(-1, self.config.vocab_size), + labels.view(-1), + ) + if reduction == "none": + lm_loss = lm_loss.view(prediction_scores.size(0), -1).sum(1) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((lm_loss,) + output) if lm_loss is not None 
else output + + return CausalLMOutputWithCrossAttentions( + loss=lm_loss, + logits=prediction_scores, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + def prepare_inputs_for_generation( + self, input_ids, query_embeds, past=None, attention_mask=None, **model_kwargs + ): + # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly + if attention_mask is None: + attention_mask = input_ids.new_ones(input_ids.shape) + query_mask = input_ids.new_ones(query_embeds.shape[:-1]) + attention_mask = torch.cat([query_mask, attention_mask], dim=-1) + + # cut decoder_input_ids if past is used + if past is not None: + input_ids = input_ids[:, -1:] + + return { + "input_ids": input_ids, + "query_embeds": query_embeds, + "attention_mask": attention_mask, + "past_key_values": past, + "encoder_hidden_states": model_kwargs.get("encoder_hidden_states", None), + "encoder_attention_mask": model_kwargs.get("encoder_attention_mask", None), + "is_decoder": True, + } + + def _reorder_cache(self, past, beam_idx): + reordered_past = () + for layer_past in past: + reordered_past += ( + tuple( + past_state.index_select(0, beam_idx) for past_state in layer_past + ), + ) + return reordered_past + + +class BertForMaskedLM(BertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r"pooler"] + _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] + + def __init__(self, config): + super().__init__(config) + + self.bert = BertModel(config, add_pooling_layer=False) + self.cls = BertOnlyMLMHead(config) + + self.init_weights() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + def forward( + self, + input_ids=None, + attention_mask=None, + position_ids=None, + head_mask=None, + 
query_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + return_logits=False, + is_decoder=False, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., + config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored + (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` + """ + + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + head_mask=head_mask, + query_embeds=query_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + is_decoder=is_decoder, + ) + + if query_embeds is not None: + sequence_output = outputs[0][:, query_embeds.shape[1] :, :] + prediction_scores = self.cls(sequence_output) + + if return_logits: + return prediction_scores + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() # -100 index = padding token + masked_lm_loss = loss_fct( + prediction_scores.view(-1, self.config.vocab_size), labels.view(-1) + ) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ( + ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + ) + + return MaskedLMOutput( + loss=masked_lm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) \ No newline at end of file diff --git a/app/llava/model/multimodal_encoder/resampler.py b/app/llava/model/multimodal_encoder/resampler.py new file mode 100644 index 
0000000000000000000000000000000000000000..cba5183e6df5e65212e6b15672b399368de16582 --- /dev/null +++ b/app/llava/model/multimodal_encoder/resampler.py @@ -0,0 +1,260 @@ +import torch +import numpy as np +from torch import nn +from torch.nn import functional as F +from torch.nn.init import trunc_normal_ +import math + +def get_abs_pos(abs_pos, tgt_size): + # abs_pos: L, C + # tgt_size: M + # return: M, C + src_size = int(math.sqrt(abs_pos.size(0))) + tgt_size = int(math.sqrt(tgt_size)) + dtype = abs_pos.dtype + + if src_size != tgt_size: + return F.interpolate( + abs_pos.float().reshape(1, src_size, src_size, -1).permute(0, 3, 1, 2), + size=(tgt_size, tgt_size), + mode="bicubic", + align_corners=False, + ).permute(0, 2, 3, 1).flatten(0, 2).to(dtype=dtype) + else: + return abs_pos + +def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False): + """ + grid_size: int of the grid height and width + return: + pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token) + """ + grid_h = np.arange(grid_size, dtype=np.float32) + grid_w = np.arange(grid_size, dtype=np.float32) + grid = np.meshgrid(grid_w, grid_h) # here w goes first + grid = np.stack(grid, axis=0) + + grid = grid.reshape([2, 1, grid_size, grid_size]) + pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid) + if cls_token: + pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0) + return pos_embed + + +def get_2d_sincos_pos_embed_from_grid(embed_dim, grid): + assert embed_dim % 2 == 0 + + # use half of dimensions to encode grid_h + emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2) + emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2) + + emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D) + return emb + + +def get_1d_sincos_pos_embed_from_grid(embed_dim, pos): + """ + embed_dim: output dimension for each position + pos: a list of positions to be encoded: size (M,) + out: 
(M, D) + """ + assert embed_dim % 2 == 0 + omega = np.arange(embed_dim // 2, dtype=np.float32) + omega /= embed_dim / 2. + omega = 1. / 10000**omega # (D/2,) + + pos = pos.reshape(-1) # (M,) + out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product + + emb_sin = np.sin(out) # (M, D/2) + emb_cos = np.cos(out) # (M, D/2) + + emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D) + return emb + + +class Resampler(nn.Module): + """ + A 2D perceiver-resampler network with one cross attention layers by + (grid_size**2) learnable queries and 2d sincos pos_emb + Outputs: + A tensor with the shape of (grid_size**2, embed_dim) + """ + def __init__( + self, + grid_size, + embed_dim, + num_heads, + kv_dim=None, + norm_layer=nn.LayerNorm + ): + super().__init__() + self.num_queries = grid_size ** 2 + self.embed_dim = embed_dim + self.num_heads = num_heads + + self.pos_embed = nn.Parameter( + torch.from_numpy(get_2d_sincos_pos_embed(embed_dim, grid_size)).float() + ).requires_grad_(False) + + self.query = nn.Parameter(torch.zeros(self.num_queries, embed_dim)) + trunc_normal_(self.query, std=.02) + + if kv_dim is not None and kv_dim != embed_dim: + self.kv_proj = nn.Linear(kv_dim, embed_dim, bias=False) + else: + self.kv_proj = nn.Identity() + + self.attn = nn.MultiheadAttention(embed_dim, num_heads) # batch_first = False + self.ln_q = norm_layer(embed_dim) + self.ln_kv = norm_layer(embed_dim) + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + def forward(self, x, attn_mask=None): + + pos_embed = get_abs_pos(self.pos_embed, x.size(1)) + + x = self.kv_proj(x) + x = self.ln_kv(x).permute(1, 0, 2) + + N = x.shape[1] + q = self.ln_q(self.query).to(dtype=x.dtype) + out = self.attn( + self._repeat(q, N) + 
self.pos_embed.unsqueeze(1).to(dtype=x.dtype), + x + pos_embed.unsqueeze(1).to(dtype=x.dtype), + x, + attn_mask=attn_mask)[0] + return out.permute(1, 0, 2) + + def _repeat(self, query, N: int): + return query.unsqueeze(1).repeat(1, N, 1) + + + + +class TokenCompressor(nn.Module): + def __init__( + self, + num_compressed_token, + embed_dim, + num_heads, + kv_dim=None, + norm_layer=nn.LayerNorm + ): + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.num_compressed_token = num_compressed_token + + if kv_dim is not None and kv_dim != embed_dim: + self.kv_proj = nn.Linear(kv_dim, embed_dim, bias=False) + else: + self.kv_proj = nn.Identity() + + self.attn = nn.MultiheadAttention(embed_dim, num_heads) + self.ln_q = norm_layer(embed_dim) + self.ln_kv = norm_layer(embed_dim) + + self.apply(self._init_weights) + + def _init_weights(self, m): + # zero initializatoin ,identical + if isinstance(m, nn.Linear): + # trunc_normal_(m.weight, std=.02) + nn.init.constant_(m.weight, 0.0) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + def forward(self, x, attn_mask=None, compress_version='v0'): + + topk_index_sorted = self.token_filter(x, compress_version) + out = torch.gather(x, 1, topk_index_sorted.unsqueeze(-1).expand(-1, -1, x.shape[-1])).permute(1, 0, 2) + q = torch.gather(x, 1, topk_index_sorted.unsqueeze(-1).expand(-1, -1, x.shape[-1])).clone().detach() + q = q.permute(1, 0, 2) + x = self.kv_proj(x) + x = self.ln_kv(x).permute(1, 0, 2) + + q = self.ln_q(q) + out += self.attn( + q, + x, + x, + attn_mask=attn_mask)[0] + return out.permute(1, 0, 2) + + + def token_filter(self, x, compress_version='v0'): + tokens = x.clone().detach() + tokens_norm = tokens / tokens.norm(dim=-1, keepdim=True) + attn_scores = torch.matmul(tokens_norm, tokens_norm.transpose(2, 1)) + # mask = torch.ones((tokens_norm.shape[1], 
tokens_norm.shape[1]), device=tokens_norm.device).triu() + if compress_version == 'v0': + mask = torch.eye(tokens_norm.shape[1], device=tokens_norm.device) + elif compress_version == 'v1': + mask = torch.ones((tokens_norm.shape[1], tokens_norm.shape[1]), device=tokens_norm.device).triu() + else: + NotImplementedError + attn_scores = attn_scores.masked_fill(mask == 1, 1e-5) + importances = 1 - attn_scores.max(dim=-1)[0] + topk_index = torch.topk(importances, self.num_compressed_token)[1] + topk_index_sorted = torch.sort(topk_index, dim=-1)[0] + return topk_index_sorted + + + + + + + + +# from flash_perceiver import Perceiver, utils +# from torchstat import stat +# batch_size, seq_len, in_dim = 32, 5120, 1024 + +# latent_dim = 1024 +# num_latents = 128 +# out_dim = 1024 + +# model = Perceiver( +# input_dim=in_dim, +# depth=4, +# output_dim=out_dim, +# num_latents=num_latents, +# latent_dim=latent_dim, +# cross_heads=1, +# cross_head_dim=64, +# cross_rotary_emb_dim=0, +# cross_attn_dropout=0.0, +# latent_heads=8, +# latent_head_dim=64, +# latent_rotary_emb_dim=0, +# latent_attn_dropout=0.0, +# weight_tie_layers=False, +# gated_mlp=True, +# self_per_cross_attn=1, +# num_zero_tokens=None, +# use_flash_attn=True, +# ).cuda() + +# data = torch.randn(batch_size, seq_len, in_dim, device='cuda:0') + +# # `out_dim` specified; averages and projects output +# # Note: FlashAttention only supports half-precision. 
+# # We need to use `torch.autocast` for the forward-pass +# with torch.autocast('cuda'): +# out = model(data, return_embeddings=True) +# print(torch.cuda.max_memory_allocated(device=None)) +# print(out.shape) \ No newline at end of file diff --git a/app/llava/model/multimodal_encoder/siglip/__pycache__/configuration_siglip.cpython-310.pyc b/app/llava/model/multimodal_encoder/siglip/__pycache__/configuration_siglip.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..023156b9d69bdd063900032bb55211925c11fcfb Binary files /dev/null and b/app/llava/model/multimodal_encoder/siglip/__pycache__/configuration_siglip.cpython-310.pyc differ diff --git a/app/llava/model/multimodal_encoder/siglip/__pycache__/configuration_siglip.cpython-39.pyc b/app/llava/model/multimodal_encoder/siglip/__pycache__/configuration_siglip.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6d9965cb87abb13a3da1a7827279a07ad5f47fe7 Binary files /dev/null and b/app/llava/model/multimodal_encoder/siglip/__pycache__/configuration_siglip.cpython-39.pyc differ diff --git a/app/llava/model/multimodal_encoder/siglip/__pycache__/modeling_siglip.cpython-310.pyc b/app/llava/model/multimodal_encoder/siglip/__pycache__/modeling_siglip.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..58a1313e2fe1ed1ab5f342a4dc1cc0a3deb6cbc7 Binary files /dev/null and b/app/llava/model/multimodal_encoder/siglip/__pycache__/modeling_siglip.cpython-310.pyc differ diff --git a/app/llava/model/multimodal_encoder/siglip/__pycache__/modeling_siglip.cpython-39.pyc b/app/llava/model/multimodal_encoder/siglip/__pycache__/modeling_siglip.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..146b985f0fc0f409bbd22984765a9b5e3b44a70d Binary files /dev/null and b/app/llava/model/multimodal_encoder/siglip/__pycache__/modeling_siglip.cpython-39.pyc differ diff --git 
a/app/llava/model/multimodal_encoder/siglip/configuration_siglip.py b/app/llava/model/multimodal_encoder/siglip/configuration_siglip.py new file mode 100644 index 0000000000000000000000000000000000000000..99e37ad88012851ffa1ac987b0f129dc0e08c628 --- /dev/null +++ b/app/llava/model/multimodal_encoder/siglip/configuration_siglip.py @@ -0,0 +1,306 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Siglip model configuration""" + +import os +from typing import Union + +from transformers.configuration_utils import PretrainedConfig +from transformers.utils import logging + + +logger = logging.get_logger(__name__) + +SIGLIP_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "google/siglip-base-patch16-224": "https://huggingface.co/google/siglip-base-patch16-224/resolve/main/config.json", +} + + +class SiglipTextConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`SiglipTextModel`]. It is used to instantiate a + Siglip text encoder according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the text encoder of the Siglip + [google/siglip-base-patch16-224](https://huggingface.co/google/siglip-base-patch16-224) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. 
Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vocab_size (`int`, *optional*, defaults to 32000): + Vocabulary size of the Siglip text model. Defines the number of different tokens that can be represented by + the `inputs_ids` passed when calling [`SiglipModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + max_position_embeddings (`int`, *optional*, defaults to 64): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported. + layer_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the layer normalization layers. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + pad_token_id (`int`, *optional*, defaults to 1): + The id of the padding token in the vocabulary. + bos_token_id (`int`, *optional*, defaults to 49406): + The id of the beginning-of-sequence token in the vocabulary. + eos_token_id (`int`, *optional*, defaults to 49407): + The id of the end-of-sequence token in the vocabulary. 
+ + Example: + + ```python + >>> from transformers import SiglipTextConfig, SiglipTextModel + + >>> # Initializing a SiglipTextConfig with google/siglip-base-patch16-224 style configuration + >>> configuration = SiglipTextConfig() + + >>> # Initializing a SiglipTextModel (with random weights) from the google/siglip-base-patch16-224 style configuration + >>> model = SiglipTextModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "siglip_text_model" + + def __init__( + self, + vocab_size=32000, + hidden_size=768, + intermediate_size=3072, + num_hidden_layers=12, + num_attention_heads=12, + max_position_embeddings=64, + hidden_act="gelu_pytorch_tanh", + layer_norm_eps=1e-6, + attention_dropout=0.0, + # This differs from `CLIPTokenizer`'s default and from openai/siglip + # See https://github.com/huggingface/transformers/pull/24773#issuecomment-1632287538 + pad_token_id=1, + bos_token_id=49406, + eos_token_id=49407, + _flash_attn_2_enabled=True, + **kwargs, + ): + super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.max_position_embeddings = max_position_embeddings + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + self.attention_dropout = attention_dropout + self._flash_attn_2_enabled = _flash_attn_2_enabled + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + cls._set_token_in_kwargs(kwargs) + + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + # get the text config dict if we are loading from SiglipConfig + if config_dict.get("model_type") == "siglip": + config_dict = config_dict["text_config"] + 
+ if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) + + return cls.from_dict(config_dict, **kwargs) + + +class SiglipVisionConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`SiglipVisionModel`]. It is used to instantiate a + Siglip vision encoder according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the vision encoder of the Siglip + [google/siglip-base-patch16-224](https://huggingface.co/google/siglip-base-patch16-224) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + num_channels (`int`, *optional*, defaults to 3): + Number of channels in the input images. + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to 16): + The size (resolution) of each patch. 
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported. + layer_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the layer normalization layers. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + + Example: + + ```python + >>> from transformers import SiglipVisionConfig, SiglipVisionModel + + >>> # Initializing a SiglipVisionConfig with google/siglip-base-patch16-224 style configuration + >>> configuration = SiglipVisionConfig() + + >>> # Initializing a SiglipVisionModel (with random weights) from the google/siglip-base-patch16-224 style configuration + >>> model = SiglipVisionModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "siglip_vision_model" + + def __init__( + self, + hidden_size=768, + intermediate_size=3072, + num_hidden_layers=12, + num_attention_heads=12, + num_channels=3, + image_size=224, + patch_size=16, + hidden_act="gelu_pytorch_tanh", + layer_norm_eps=1e-6, + attention_dropout=0.0, + _flash_attn_2_enabled=True, + **kwargs, + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_channels = num_channels + self.patch_size = patch_size + self.image_size = image_size + self.attention_dropout = attention_dropout + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + self._flash_attn_2_enabled = _flash_attn_2_enabled + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + cls._set_token_in_kwargs(kwargs) + + config_dict, kwargs = 
cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + # get the vision config dict if we are loading from SiglipConfig + if config_dict.get("model_type") == "siglip": + config_dict = config_dict["vision_config"] + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) + + return cls.from_dict(config_dict, **kwargs) + + +class SiglipConfig(PretrainedConfig): + r""" + [`SiglipConfig`] is the configuration class to store the configuration of a [`SiglipModel`]. It is used to + instantiate a Siglip model according to the specified arguments, defining the text model and vision model configs. + Instantiating a configuration with the defaults will yield a similar configuration to that of the Siglip + [google/siglip-base-patch16-224](https://huggingface.co/google/siglip-base-patch16-224) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + text_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`SiglipTextConfig`]. + vision_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`SiglipVisionConfig`]. + kwargs (*optional*): + Dictionary of keyword arguments. 
+ + Example: + + ```python + >>> from transformers import SiglipConfig, SiglipModel + + >>> # Initializing a SiglipConfig with google/siglip-base-patch16-224 style configuration + >>> configuration = SiglipConfig() + + >>> # Initializing a SiglipModel (with random weights) from the google/siglip-base-patch16-224 style configuration + >>> model = SiglipModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + + >>> # We can also initialize a SiglipConfig from a SiglipTextConfig and a SiglipVisionConfig + >>> from transformers import SiglipTextConfig, SiglipVisionConfig + + >>> # Initializing a SiglipText and SiglipVision configuration + >>> config_text = SiglipTextConfig() + >>> config_vision = SiglipVisionConfig() + + >>> config = SiglipConfig.from_text_vision_configs(config_text, config_vision) + ```""" + + model_type = "siglip" + + def __init__(self, text_config=None, vision_config=None, **kwargs): + super().__init__(**kwargs) + + if text_config is None: + text_config = {} + logger.info("`text_config` is `None`. Initializing the `SiglipTextConfig` with default values.") + + if vision_config is None: + vision_config = {} + logger.info("`vision_config` is `None`. initializing the `SiglipVisionConfig` with default values.") + + self.text_config = SiglipTextConfig(**text_config) + self.vision_config = SiglipVisionConfig(**vision_config) + + self.initializer_factor = 1.0 + + @classmethod + def from_text_vision_configs(cls, text_config: SiglipTextConfig, vision_config: SiglipVisionConfig, **kwargs): + r""" + Instantiate a [`SiglipConfig`] (or a derived class) from siglip text model configuration and siglip vision + model configuration. 
+ + Returns: + [`SiglipConfig`]: An instance of a configuration object + """ + + return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs) diff --git a/app/llava/model/multimodal_encoder/siglip/modeling_siglip.py b/app/llava/model/multimodal_encoder/siglip/modeling_siglip.py new file mode 100644 index 0000000000000000000000000000000000000000..43e631bc02fa7bdba6f003610768614206042f14 --- /dev/null +++ b/app/llava/model/multimodal_encoder/siglip/modeling_siglip.py @@ -0,0 +1,1473 @@ +# coding=utf-8 +# Copyright 2024 Google AI and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" PyTorch Siglip model.""" + + +import math +import warnings +from dataclasses import dataclass +from typing import Any, Optional, Tuple, Union + +import numpy as np +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +from torch import nn +from torch.nn.init import _calculate_fan_in_and_fan_out + +from transformers.activations import ACT2FN +from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask +from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import ( + ModelOutput, + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_flash_attn_2_available, + logging, + replace_return_docstrings, +) +from .configuration_siglip import SiglipConfig, SiglipTextConfig, SiglipVisionConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "google/siglip-base-patch16-224" + +SIGLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "google/siglip-base-patch16-224", + # See all SigLIP models at https://huggingface.co/models?filter=siglip +] + +if is_flash_attn_2_available(): + from flash_attn import flash_attn_func, flash_attn_varlen_func + from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa + + +# Copied from transformers.models.llama.modeling_llama._get_unpad_data +def _get_unpad_data(attention_mask): + seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) + indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() + max_seqlen_in_batch = seqlens_in_batch.max().item() + cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0)) + return ( + indices, + cu_seqlens, + max_seqlen_in_batch, + ) + + +def _trunc_normal_(tensor, mean, std, a, b): + # Cut & paste from PyTorch official master until it's in a few official releases - RW + # Method based on 
 https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
+    # Fills `tensor` in place with a truncated standard normal on [a, b] via the
+    # inverse-CDF transform of a uniform fill (mean/std of the *sampling* are fixed
+    # to 0/1 here; the caller rescales afterwards).
+    def norm_cdf(x):
+        # Computes standard normal cumulative distribution function
+        return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0
+
+    if (mean < a - 2 * std) or (mean > b + 2 * std):
+        warnings.warn(
+            "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
+            "The distribution of values may be incorrect.",
+            stacklevel=2,
+        )
+
+    # Values are generated by using a truncated uniform distribution and
+    # then using the inverse CDF for the normal distribution.
+    # Get upper and lower cdf values
+    l = norm_cdf((a - mean) / std)
+    u = norm_cdf((b - mean) / std)
+
+    # Uniformly fill tensor with values from [l, u], then translate to
+    # [2l-1, 2u-1].
+    tensor.uniform_(2 * l - 1, 2 * u - 1)
+
+    # Use inverse cdf transform for normal distribution to get truncated
+    # standard normal
+    if tensor.dtype in [torch.float16, torch.bfloat16]:
+        # The `erfinv_` op is not (yet?) defined in float16+cpu, bfloat16+gpu
+        og_dtype = tensor.dtype
+        tensor = tensor.to(torch.float32)
+        tensor.erfinv_()
+        # NOTE(review): `Tensor.to()` returns a copy, so the rebinding above and
+        # below detaches the local `tensor` from the caller's tensor — for
+        # float16/bfloat16 inputs the erfinv/mul/add/clamp results appear to never
+        # reach the caller (only the uniform_ fill does). TODO confirm whether any
+        # caller passes half-precision tensors here.
+        tensor = tensor.to(og_dtype)
+    else:
+        tensor.erfinv_()
+
+    # Transform to proper mean, std
+    tensor.mul_(std * math.sqrt(2.0))
+    tensor.add_(mean)
+
+    # Clamp to ensure it's in the proper range
+    if tensor.dtype == torch.float16:
+        # The `clamp_` op is not (yet?) defined in float16+cpu
+        # NOTE(review): same `.to()` copy caveat as above applies in this branch.
+        tensor = tensor.to(torch.float32)
+        tensor.clamp_(min=a, max=b)
+        tensor = tensor.to(torch.float16)
+    else:
+        tensor.clamp_(min=a, max=b)
+
+
+def trunc_normal_tf_(
+    tensor: torch.Tensor, mean: float = 0.0, std: float = 1.0, a: float = -2.0, b: float = 2.0
+) -> torch.Tensor:
+    """Fills the input Tensor with values drawn from a truncated
+    normal distribution. The values are effectively drawn from the
+    normal distribution :math:`\\mathcal{N}(\text{mean}, \text{std}^2)`
+    with values outside :math:`[a, b]` redrawn until they are within
+    the bounds.
The method used for generating the random values works + best when :math:`a \\leq \text{mean} \\leq b`. + + NOTE: this 'tf' variant behaves closer to Tensorflow / JAX impl where the + bounds [a, b] are applied when sampling the normal distribution with mean=0, std=1.0 + and the result is subsquently scaled and shifted by the mean and std args. + + Args: + tensor: an n-dimensional `torch.Tensor` + mean: the mean of the normal distribution + std: the standard deviation of the normal distribution + a: the minimum cutoff value + b: the maximum cutoff value + """ + with torch.no_grad(): + _trunc_normal_(tensor, 0, 1.0, a, b) + tensor.mul_(std).add_(mean) + + +def variance_scaling_(tensor, scale=1.0, mode="fan_in", distribution="normal"): + fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor) + if mode == "fan_in": + denom = fan_in + elif mode == "fan_out": + denom = fan_out + elif mode == "fan_avg": + denom = (fan_in + fan_out) / 2 + + variance = scale / denom + + if distribution == "truncated_normal": + # constant is stddev of standard normal truncated to (-2, 2) + trunc_normal_tf_(tensor, std=math.sqrt(variance) / 0.87962566103423978) + elif distribution == "normal": + with torch.no_grad(): + tensor.normal_(std=math.sqrt(variance)) + elif distribution == "uniform": + bound = math.sqrt(3 * variance) + with torch.no_grad(): + tensor.uniform_(-bound, bound) + else: + raise ValueError(f"invalid distribution {distribution}") + + +def lecun_normal_(tensor): + variance_scaling_(tensor, mode="fan_in", distribution="truncated_normal") + + +def default_flax_embed_init(tensor): + variance_scaling_(tensor, mode="fan_in", distribution="normal") + + +@dataclass +# Copied from transformers.models.clip.modeling_clip.CLIPVisionModelOutput with CLIP->Siglip +class SiglipVisionModelOutput(ModelOutput): + """ + Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states. 
+ + Args: + image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`): + The image embeddings obtained by applying the projection layer to the pooler_output. + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + image_embeds: Optional[torch.FloatTensor] = None + last_hidden_state: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +# Copied from transformers.models.clip.modeling_clip.CLIPTextModelOutput with CLIP->Siglip +class SiglipTextModelOutput(ModelOutput): + """ + Base class for text model's outputs that also contains a pooling of the last hidden states. + + Args: + text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`): + The text embeddings obtained by applying the projection layer to the pooler_output. 
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + text_embeds: Optional[torch.FloatTensor] = None + last_hidden_state: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +# Copied from transformers.models.clip.modeling_clip.CLIPOutput with CLIP->Siglip +class SiglipOutput(ModelOutput): + """ + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`): + Contrastive loss for image-text similarity. + logits_per_image:(`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`): + The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text + similarity scores. + logits_per_text:(`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`): + The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image + similarity scores. 
+ text_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`): + The text embeddings obtained by applying the projection layer to the pooled output of [`SiglipTextModel`]. + image_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`): + The image embeddings obtained by applying the projection layer to the pooled output of [`SiglipVisionModel`]. + text_model_output(`BaseModelOutputWithPooling`): + The output of the [`SiglipTextModel`]. + vision_model_output(`BaseModelOutputWithPooling`): + The output of the [`SiglipVisionModel`]. + """ + + loss: Optional[torch.FloatTensor] = None + logits_per_image: torch.FloatTensor = None + logits_per_text: torch.FloatTensor = None + text_embeds: torch.FloatTensor = None + image_embeds: torch.FloatTensor = None + text_model_output: BaseModelOutputWithPooling = None + vision_model_output: BaseModelOutputWithPooling = None + + def to_tuple(self) -> Tuple[Any]: + return tuple( + self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple() + for k in self.keys() + ) + + +class SiglipVisionEmbeddings(nn.Module): + def __init__(self, config: SiglipVisionConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.patch_embedding = nn.Conv2d( + in_channels=config.num_channels, + out_channels=self.embed_dim, + kernel_size=self.patch_size, + stride=self.patch_size, + padding="valid", + ) + + self.num_patches_per_side = self.image_size // self.patch_size + self.num_patches = self.num_patches_per_side**2 + self.num_positions = self.num_patches + self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) + + def forward(self, pixel_values: torch.FloatTensor, patch_attention_mask: torch.BoolTensor) -> torch.Tensor: + batch_size = pixel_values.size(0) + + patch_embeds = self.patch_embedding(pixel_values) + embeddings = patch_embeds.flatten(2).transpose(1, 2) + + 
max_im_h, max_im_w = pixel_values.size(2), pixel_values.size(3) + max_nb_patches_h, max_nb_patches_w = max_im_h // self.patch_size, max_im_w // self.patch_size + boundaries = torch.arange(1 / self.num_patches_per_side, 1.0, 1 / self.num_patches_per_side) + position_ids = torch.full( + size=( + batch_size, + max_nb_patches_h * max_nb_patches_w, + ), + fill_value=0, + ) + + for batch_idx, p_attn_mask in enumerate(patch_attention_mask): + nb_patches_h = p_attn_mask[:, 0].sum() + nb_patches_w = p_attn_mask[0].sum() + + fractional_coords_h = torch.arange(0, 1 - 1e-6, 1 / nb_patches_h) + fractional_coords_w = torch.arange(0, 1 - 1e-6, 1 / nb_patches_w) + + bucket_coords_h = torch.bucketize(fractional_coords_h, boundaries, right=True) + bucket_coords_w = torch.bucketize(fractional_coords_w, boundaries, right=True) + + pos_ids = (bucket_coords_h[:, None] * self.num_patches_per_side + bucket_coords_w).flatten() + position_ids[batch_idx][p_attn_mask.view(-1).cpu()] = pos_ids + + position_ids = position_ids.to(self.position_embedding.weight.device) + + embeddings = embeddings + self.position_embedding(position_ids) + return embeddings + + +# Copied from transformers.models.clip.modeling_clip.CLIPTextEmbeddings with CLIP->Siglip +class SiglipTextEmbeddings(nn.Module): + def __init__(self, config: SiglipTextConfig): + super().__init__() + embed_dim = config.hidden_size + + self.token_embedding = nn.Embedding(config.vocab_size, embed_dim) + self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim) + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + ) -> torch.Tensor: + seq_length = input_ids.shape[-1] if input_ids 
is not None else inputs_embeds.shape[-2] + + if position_ids is None: + position_ids = self.position_ids[:, :seq_length] + + if inputs_embeds is None: + inputs_embeds = self.token_embedding(input_ids) + + position_embeddings = self.position_embedding(position_ids) + embeddings = inputs_embeds + position_embeddings + + return embeddings + + +class SiglipAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + # Copied from transformers.models.clip.modeling_clip.CLIPAttention.__init__ + def __init__(self, config): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads})." + ) + self.scale = self.head_dim**-0.5 + self.dropout = config.attention_dropout + + self.k_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.v_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.q_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.out_proj = nn.Linear(self.embed_dim, self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + batch_size, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(batch_size, q_len, self.num_heads, 
self.head_dim).transpose(1, 2) + + k_v_seq_len = key_states.shape[-2] + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * self.scale + + if attn_weights.size() != (batch_size, self.num_heads, q_len, k_v_seq_len): + raise ValueError( + f"Attention weights should be of size {(batch_size, self.num_heads, q_len, k_v_seq_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (batch_size, 1, q_len, k_v_seq_len): + raise ValueError( + f"Attention mask should be of size {(batch_size, 1, q_len, k_v_seq_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights + attention_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (batch_size, self.num_heads, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(batch_size, self.num_heads, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights + + +class SiglipFlashAttention2(SiglipAttention): + """ + Llama flash attention module. This module inherits from `LlamaAttention` as the weights of the module stays + untouched. The only required change would be on the forward pass where it needs to correctly call the public API of + flash attention and deal with padding tokens in case the input contains any of them. 
+ """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.is_causal = False # Hack to make sure we don't use a causal mask + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + output_attentions = False + + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + # Flash attention requires the input to have the shape + # batch_size x seq_length x head_dim x hidden_dim + # therefore we just need to keep the original shape + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + # cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + # query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + # if past_key_value is not None: + # cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + # key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache + # to be able to avoid many of these transpose/reshape/view. 
+ query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + dropout_rate = self.dropout if self.training else 0.0 + + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently casted in float32. Hence, we need + # cast them back in the correct dtype just to be sure everything works as expected. + # This might slowdown training & inference so it is recommended to not cast the LayerNorms + # in fp32. (LlamaRMSNorm handles it correctly) + + input_dtype = query_states.dtype + if input_dtype == torch.float32: + if torch.is_autocast_enabled(): + target_dtype = torch.get_autocast_gpu_dtype() + # Handle the case where the model is quantized + elif hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.q_proj.weight.dtype + + logger.warning_once( + "The input hidden states seems to be silently casted in float32, this might be related to the fact" + " you have upcasted embedding or layer norm layers in float32. We will cast back the input in" + f" {target_dtype}." 
+ ) + + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + attn_output = self._flash_attention_forward( + query_states, key_states, value_states, attention_mask, q_len, dropout=dropout_rate + ) + + attn_output = attn_output.reshape(bsz, q_len, self.embed_dim).contiguous() + attn_output = self.out_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights + + def _flash_attention_forward( + self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None + ): + """ + Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token + first unpad the input, then computes the attention scores and pad the final attention scores. + + Args: + query_states (`torch.Tensor`): + Input query states to be passed to Flash Attention API + key_states (`torch.Tensor`): + Input key states to be passed to Flash Attention API + value_states (`torch.Tensor`): + Input value states to be passed to Flash Attention API + attention_mask (`torch.Tensor`): + The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the + position of padding tokens and 1 for the position of non-padding tokens. + dropout (`int`, *optional*): + Attention dropout + softmax_scale (`float`, *optional*): + The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) + """ + + # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__. 
+ causal = self.is_causal and query_length != 1 + + # Contains at least one padding token in the sequence + if attention_mask is not None: + batch_size = query_states.shape[0] + query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( + query_states, key_states, value_states, attention_mask, query_length + ) + + cu_seqlens_q, cu_seqlens_k = cu_seq_lens + max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens + + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + + attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) + else: + attn_output = flash_attn_func( + query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal + ) + + return attn_output + + def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): + indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) + batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape + + key_layer = index_first_axis( + key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k + ) + value_layer = index_first_axis( + value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k + ) + if query_length == kv_seq_len: + query_layer = index_first_axis( + query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k + ) + cu_seqlens_q = cu_seqlens_k + max_seqlen_in_batch_q = max_seqlen_in_batch_k + indices_q = indices_k + elif query_length == 1: + max_seqlen_in_batch_q = 1 + cu_seqlens_q = torch.arange( + batch_size + 1, dtype=torch.int32, device=query_layer.device + ) # There is a memcpy here, that is very bad. 
+ indices_q = cu_seqlens_q[:-1] + query_layer = query_layer.squeeze(1) + else: + # The -q_len: slice assumes left padding. + attention_mask = attention_mask[:, -query_length:] + query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) + + return ( + query_layer, + key_layer, + value_layer, + indices_q, + (cu_seqlens_q, cu_seqlens_k), + (max_seqlen_in_batch_q, max_seqlen_in_batch_k), + ) + + +# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->Siglip +class SiglipMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.activation_fn = ACT2FN[config.hidden_act] + self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size) + self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states = self.fc2(hidden_states) + return hidden_states + + +# Copied from transformers.models.clip.modeling_clip.CLIPEncoderLayer with CLIP->Siglip +class SiglipEncoderLayer(nn.Module): + def __init__(self, config: SiglipConfig): + super().__init__() + self.embed_dim = config.hidden_size + self.self_attn = ( + SiglipAttention(config) + if not getattr(config, "_flash_attn_2_enabled", False) + else SiglipFlashAttention2(config) + ) + self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) + self.mlp = SiglipMLP(config) + self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.FloatTensor]: + """ + Args: + hidden_states (`torch.FloatTensor`): + Input to the layer of shape `(batch, seq_len, embed_dim)`. 
+ attention_mask (`torch.FloatTensor`): + Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values. + output_attentions (`bool`, *optional*, defaults to `False`): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + + hidden_states = self.layer_norm1(hidden_states) + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + output_attentions=output_attentions, + ) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.layer_norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class SiglipPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
+ """ + + config_class = SiglipConfig + base_model_prefix = "siglip" + supports_gradient_checkpointing = True + + def _init_weights(self, module): + """Initialize the weights""" + + if isinstance(module, SiglipVisionEmbeddings): + width = ( + self.config.vision_config.hidden_size + if isinstance(self.config, SiglipConfig) + else self.config.hidden_size + ) + nn.init.normal_(module.position_embedding.weight, std=1 / np.sqrt(width)) + elif isinstance(module, nn.Embedding): + default_flax_embed_init(module.weight) + elif isinstance(module, SiglipAttention): + nn.init.normal_(module.q_proj.weight) + nn.init.normal_(module.k_proj.weight) + nn.init.normal_(module.v_proj.weight) + nn.init.normal_(module.out_proj.weight) + nn.init.zeros_(module.q_proj.bias) + nn.init.zeros_(module.k_proj.bias) + nn.init.zeros_(module.v_proj.bias) + nn.init.zeros_(module.out_proj.bias) + elif isinstance(module, SiglipMLP): + nn.init.normal_(module.fc1.weight) + nn.init.normal_(module.fc2.weight) + nn.init.normal_(module.fc1.bias, std=1e-6) + nn.init.normal_(module.fc2.bias, std=1e-6) + elif isinstance(module, SiglipMultiheadAttentionPoolingHead): + nn.init.normal_(module.probe.data) + nn.init.normal_(module.attention.in_proj_weight.data) + nn.init.zeros_(module.attention.in_proj_bias.data) + elif isinstance(module, SiglipModel): + logit_scale_init = torch.tensor(0.0) + module.logit_scale.data.fill_(logit_scale_init) + module.logit_bias.data.zero_() + elif isinstance(module, (nn.Linear, nn.Conv2d)): + lecun_normal_(module.weight) + if module.bias is not None: + nn.init.zeros_(module.bias) + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + +SIGLIP_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) 
+ + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`SiglipConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +SIGLIP_TEXT_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. 
+ return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + +SIGLIP_VISION_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using + [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + +SIGLIP_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. 
+ + [What are position IDs?](../glossary#position-ids) + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using + [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details. + return_loss (`bool`, *optional*): + Whether or not to return the contrastive loss. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +# Copied from transformers.models.clip.modeling_clip.CLIPEncoder with CLIP->Siglip +class SiglipEncoder(nn.Module): + """ + Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a + [`SiglipEncoderLayer`]. + + Args: + config: SiglipConfig + """ + + def __init__(self, config: SiglipConfig): + super().__init__() + self.config = config + self.layers = nn.ModuleList([SiglipEncoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + # Ignore copy + def forward( + self, + inputs_embeds, + attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. 
+ This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + hidden_states = inputs_embeds + for encoder_layer in self.layers: + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + encoder_layer.__call__, + hidden_states, + attention_mask, + output_attentions, + ) + else: + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + 
encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +class SiglipTextTransformer(nn.Module): + def __init__(self, config: SiglipTextConfig): + super().__init__() + self.config = config + embed_dim = config.hidden_size + self.embeddings = SiglipTextEmbeddings(config) + self.encoder = SiglipEncoder(config) + self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) + + self.head = nn.Linear(embed_dim, embed_dim) + + @add_start_docstrings_to_model_forward(SIGLIP_TEXT_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=SiglipTextConfig) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + r""" + Returns: + + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is None: + raise ValueError("You have to specify input_ids") + + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + + hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids) + + # note: SigLIP's text model does not use a causal mask, unlike the original CLIP model. 
+ # expand attention_mask + if attention_mask is not None: + # [batch_size, seq_len] -> [batch_size, 1, tgt_seq_len, src_seq_len] + attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype) + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_state = encoder_outputs[0] + last_hidden_state = self.final_layer_norm(last_hidden_state) + + # Assuming "sticky" EOS tokenization, last token is always EOS. + pooled_output = last_hidden_state[:, -1, :] + pooled_output = self.head(pooled_output) + + if not return_dict: + return (last_hidden_state, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +@add_start_docstrings( + """The text model from SigLIP without any head or projection on top.""", + SIGLIP_START_DOCSTRING, +) +class SiglipTextModel(SiglipPreTrainedModel): + config_class = SiglipTextConfig + + _no_split_modules = ["SiglipTextEmbeddings", "SiglipEncoderLayer"] + + def __init__(self, config: SiglipTextConfig): + super().__init__(config) + self.text_model = SiglipTextTransformer(config) + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> nn.Module: + return self.text_model.embeddings.token_embedding + + def set_input_embeddings(self, value): + self.text_model.embeddings.token_embedding = value + + @add_start_docstrings_to_model_forward(SIGLIP_TEXT_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=SiglipTextConfig) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + 
output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + r""" + Returns: + + Examples: + + ```python + >>> from transformers import AutoTokenizer, SiglipTextModel + + >>> model = SiglipTextModel.from_pretrained("google/siglip-base-patch16-224") + >>> tokenizer = AutoTokenizer.from_pretrained("google/siglip-base-patch16-224") + + >>> # important: make sure to set padding="max_length" as that's how the model was trained + >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding="max_length", return_tensors="pt") + + >>> outputs = model(**inputs) + >>> last_hidden_state = outputs.last_hidden_state + >>> pooled_output = outputs.pooler_output # pooled (EOS token) states + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + return self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + +class SiglipVisionTransformer(nn.Module): + def __init__(self, config: SiglipVisionConfig): + super().__init__() + self.config = config + embed_dim = config.hidden_size + + self.embeddings = SiglipVisionEmbeddings(config) + self.encoder = SiglipEncoder(config) + self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) + self.head = SiglipMultiheadAttentionPoolingHead(config) + + @add_start_docstrings_to_model_forward(SIGLIP_VISION_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=SiglipVisionConfig) + def forward( + self, + pixel_values, + patch_attention_mask: Optional[torch.BoolTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + r""" + 
Returns: + + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + batch_size = pixel_values.size(0) + if patch_attention_mask is None: + patch_attention_mask = torch.ones( + size=( + batch_size, + pixel_values.size(2) // self.config.patch_size, + pixel_values.size(3) // self.config.patch_size, + ), + dtype=torch.bool, + device=pixel_values.device, + ) + + hidden_states = self.embeddings(pixel_values=pixel_values, patch_attention_mask=patch_attention_mask) + + patch_attention_mask = patch_attention_mask.view(batch_size, -1) + # The call to `_upad_input` in `_flash_attention_forward` is expensive + # So when the `patch_attention_mask` is full of 1s (i.e. attending to the whole sequence), + # avoiding passing the attention_mask, which is equivalent to attending to the full sequence + if not torch.any(~patch_attention_mask): + attention_mask=None + else: + attention_mask = ( + _prepare_4d_attention_mask(patch_attention_mask, hidden_states.dtype) + if not self.config._flash_attn_2_enabled + else patch_attention_mask + ) + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_state = encoder_outputs[0] + last_hidden_state = self.post_layernorm(last_hidden_state) + + pooled_output = self.head( + hidden_state=last_hidden_state, + attention_mask=patch_attention_mask, + ) + + if not return_dict: + return (last_hidden_state, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + 
attentions=encoder_outputs.attentions, + ) + + +class SiglipMultiheadAttentionPoolingHead(nn.Module): + """Multihead Attention Pooling.""" + + def __init__(self, config: SiglipVisionConfig): + super().__init__() + + self.probe = nn.Parameter(torch.randn(1, 1, config.hidden_size)) + self.attention = torch.nn.MultiheadAttention(config.hidden_size, config.num_attention_heads, batch_first=True) + self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.mlp = SiglipMLP(config) + + def forward(self, hidden_state, attention_mask): + batch_size = hidden_state.shape[0] + probe = self.probe.repeat(batch_size, 1, 1) + + hidden_state = self.attention( + query=probe, key=hidden_state, value=hidden_state, key_padding_mask=~attention_mask + )[0] + + residual = hidden_state + hidden_state = self.layernorm(hidden_state) + hidden_state = residual + self.mlp(hidden_state) + + return hidden_state[:, 0] + + +@add_start_docstrings( + """The vision model from SigLIP without any head or projection on top.""", + SIGLIP_START_DOCSTRING, +) +class SiglipVisionModel(SiglipPreTrainedModel): + config_class = SiglipVisionConfig + main_input_name = "pixel_values" + + def __init__(self, config: SiglipVisionConfig): + super().__init__(config) + + self.vision_model = SiglipVisionTransformer(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> nn.Module: + return self.vision_model.embeddings.patch_embedding + + @add_start_docstrings_to_model_forward(SIGLIP_VISION_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=SiglipVisionConfig) + def forward( + self, + pixel_values, + patch_attention_mask: Optional[torch.BoolTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + r""" + Returns: + + Examples: + + ```python + >>> from PIL 
import Image + >>> import requests + >>> from transformers import AutoProcessor, SiglipVisionModel + + >>> model = SiglipVisionModel.from_pretrained("google/siglip-base-patch16-224") + >>> processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> last_hidden_state = outputs.last_hidden_state + >>> pooled_output = outputs.pooler_output # pooled features + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + return self.vision_model( + pixel_values=pixel_values, + patch_attention_mask=patch_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + +@add_start_docstrings(SIGLIP_START_DOCSTRING) +class SiglipModel(SiglipPreTrainedModel): + config_class = SiglipConfig + + def __init__(self, config: SiglipConfig): + super().__init__(config) + + if not isinstance(config.text_config, SiglipTextConfig): + raise ValueError( + "config.text_config is expected to be of type SiglipTextConfig but is of type" + f" {type(config.text_config)}." + ) + + if not isinstance(config.vision_config, SiglipVisionConfig): + raise ValueError( + "config.vision_config is expected to be of type SiglipVisionConfig but is of type" + f" {type(config.vision_config)}." 
+ ) + + text_config = config.text_config + vision_config = config.vision_config + + self.text_model = SiglipTextTransformer(text_config) + self.vision_model = SiglipVisionTransformer(vision_config) + + self.logit_scale = nn.Parameter(torch.randn(1)) + self.logit_bias = nn.Parameter(torch.randn(1)) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(SIGLIP_TEXT_INPUTS_DOCSTRING) + def get_text_features( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> torch.FloatTensor: + r""" + Returns: + text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by + applying the projection layer to the pooled output of [`SiglipTextModel`]. + + Examples: + + ```python + >>> from transformers import AutoTokenizer, AutoModel + >>> import torch + + >>> model = AutoModel.from_pretrained("google/siglip-base-patch16-224") + >>> tokenizer = AutoTokenizer.from_pretrained("google/siglip-base-patch16-224") + + >>> # important: make sure to set padding="max_length" as that's how the model was trained + >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding="max_length", return_tensors="pt") + >>> with torch.no_grad(): + ... text_features = model.get_text_features(**inputs) + ```""" + # Use SigLIP model's config for some fields (if specified) instead of those of vision & text components. 
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + text_outputs = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = text_outputs[1] + + return pooled_output + + @add_start_docstrings_to_model_forward(SIGLIP_VISION_INPUTS_DOCSTRING) + def get_image_features( + self, + pixel_values: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> torch.FloatTensor: + r""" + Returns: + image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by + applying the projection layer to the pooled output of [`SiglipVisionModel`]. + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, AutoModel + >>> import torch + + >>> model = AutoModel.from_pretrained("google/siglip-base-patch16-224") + >>> processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="pt") + + >>> with torch.no_grad(): + ... image_features = model.get_image_features(**inputs) + ```""" + # Use SiglipModel's config for some fields (if specified) instead of those of vision & text components. 
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = vision_outputs[1] + + return pooled_output + + @add_start_docstrings_to_model_forward(SIGLIP_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=SiglipOutput, config_class=SiglipConfig) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + return_loss: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SiglipOutput]: + r""" + Returns: + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, AutoModel + >>> import torch + + >>> model = AutoModel.from_pretrained("google/siglip-base-patch16-224") + >>> processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> texts = ["a photo of 2 cats", "a photo of 2 dogs"] + >>> # important: we pass `padding=max_length` since the model was trained with this + >>> inputs = processor(text=texts, images=image, padding="max_length", return_tensors="pt") + + >>> with torch.no_grad(): + ... 
outputs = model(**inputs) + + >>> logits_per_image = outputs.logits_per_image + >>> probs = torch.sigmoid(logits_per_image) # these are the probabilities + >>> print(f"{probs[0][0]:.1%} that image 0 is '{texts[0]}'") + 31.9% that image 0 is 'a photo of 2 cats' + ```""" + # Use SigLIP model's config for some fields (if specified) instead of those of vision & text components. + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + text_outputs = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + image_embeds = vision_outputs[1] + text_embeds = text_outputs[1] + + # normalized features + image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True) + text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True) + + # cosine similarity as logits + logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * self.logit_scale.exp() + self.logit_bias + logits_per_image = logits_per_text.t() + + loss = None + if return_loss: + raise NotImplementedError("SigLIP loss to be implemented") + + if not return_dict: + output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs) + return ((loss,) + output) if loss is not None else output + + return SiglipOutput( + loss=loss, + logits_per_image=logits_per_image, + logits_per_text=logits_per_text, + text_embeds=text_embeds, + image_embeds=image_embeds, + 
text_model_output=text_outputs, + vision_model_output=vision_outputs, + ) diff --git a/app/llava/model/multimodal_encoder/siglip/processing_siglip.py b/app/llava/model/multimodal_encoder/siglip/processing_siglip.py new file mode 100644 index 0000000000000000000000000000000000000000..261510f0d7c7450b1f59915ca5a878b94b71a735 --- /dev/null +++ b/app/llava/model/multimodal_encoder/siglip/processing_siglip.py @@ -0,0 +1,143 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Image/Text processor class for SigLIP. +""" + +from typing import List, Optional, Union + +from transformers.feature_extraction_utils import BatchFeature +from transformers.image_utils import ImageInput +from transformers.processing_utils import ProcessorMixin +from transformers.tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy +from transformers.utils import TensorType + + +class SiglipProcessor(ProcessorMixin): + r""" + Constructs a Siglip processor which wraps a Siglip image processor and a Siglip tokenizer into a single processor. + + [`SiglipProcessor`] offers all the functionalities of [`SiglipImageProcessor`] and [`SiglipTokenizer`]. See the + [`~SiglipProcessor.__call__`] and [`~SiglipProcessor.decode`] for more information. + + Args: + image_processor ([`SiglipImageProcessor`]): + The image processor is a required input. 
+ tokenizer ([`SiglipTokenizer`]): + The tokenizer is a required input. + """ + + attributes = ["image_processor", "tokenizer"] + image_processor_class = "SiglipImageProcessor" + tokenizer_class = "SiglipTokenizer" + + def __init__(self, image_processor, tokenizer): + super().__init__(image_processor, tokenizer) + + def __call__( + self, + text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, + images: ImageInput = None, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Union[bool, str, TruncationStrategy] = None, + max_length: int = None, + return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH, + ) -> BatchFeature: + """ + Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` + and `kwargs` arguments to SiglipTokenizer's [`~SiglipTokenizer.__call__`] if `text` is not `None` to encode + the text. To prepare the image(s), this method forwards the `images` argument to + SiglipImageProcessor's [`~SiglipImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring + of the above two methods for more information. + + Args: + text (`str`, `List[str]`, `List[List[str]]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a + number of channels, H and W are image height and width. 
+ padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`): + Select a strategy to pad the returned sequences (according to the model's padding side and padding + index) among: + - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single + sequence if provided). + - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum + acceptable input length for the model if that argument is not provided. + - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different + lengths). + max_length (`int`, *optional*): + Maximum length of the returned list and optionally padding length (see above). + truncation (`bool`, *optional*): + Activates truncation to cut input sequences longer than `max_length` to `max_length`. + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors of a particular framework. Acceptable values are: + + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. + - `'jax'`: Return JAX `jnp.ndarray` objects. + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + + - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not + `None`). + - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. + """ + + if text is None and images is None: + raise ValueError("You have to specify either text or images. 
Both cannot be none.") + + if text is not None: + encoding = self.tokenizer( + text, return_tensors=return_tensors, padding=padding, truncation=truncation, max_length=max_length + ) + + if images is not None: + image_features = self.image_processor(images, return_tensors=return_tensors) + + if text is not None and images is not None: + encoding["pixel_values"] = image_features.pixel_values + return encoding + elif text is not None: + return encoding + else: + return BatchFeature(data=dict(**image_features), tensor_type=return_tensors) + + def decode(self, *args, **kwargs): + """ + This method forwards all its arguments to SiglipTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to + the docstring of this method for more information. + """ + return self.tokenizer.decode(*args, **kwargs) + + def batch_decode(self, *args, **kwargs): + """ + This method forwards all its arguments to SiglipTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please + refer to the docstring of this method for more information. 
+ """ + return self.tokenizer.batch_decode(*args, **kwargs) + + @property + # Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names with CLIP->Siglip, T5->Siglip + def model_input_names(self): + tokenizer_input_names = self.tokenizer.model_input_names + image_processor_input_names = self.image_processor.model_input_names + return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) diff --git a/app/llava/model/multimodal_encoder/siglip_encoder.py b/app/llava/model/multimodal_encoder/siglip_encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..c1551579cb7323cdfdabe60478ffb855ec061807 --- /dev/null +++ b/app/llava/model/multimodal_encoder/siglip_encoder.py @@ -0,0 +1,84 @@ +import torch +import torch.nn as nn + +# from transformers import CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig +from .siglip.modeling_siglip import SiglipVisionModel +from .siglip.configuration_siglip import SiglipVisionConfig +from transformers import AutoImageProcessor + + +class SiglipVisionTower(nn.Module): + def __init__(self, vision_tower, args, delay_load=False): + super().__init__() + + self.is_loaded = False + + self.vision_tower_name = vision_tower + self.select_layer = args.mm_vision_select_layer + self.args = args + if not delay_load: + self.load_model() + elif getattr(args, 'unfreeze_mm_vision_tower', False): + self.load_model() + else: + self.cfg_only = SiglipVisionConfig.from_pretrained(self.vision_tower_name) + + def load_model(self, device_map=None): + if self.is_loaded: + print('{} is already loaded, `load_model` called again, skipping.'.format(self.vision_tower_name)) + return + + self.image_processor = AutoImageProcessor.from_pretrained("google/siglip-so400m-patch14-384") + self.vision_tower = SiglipVisionModel.from_pretrained(self.vision_tower_name, device_map=device_map, _flash_attn_2_enabled=True) + self.vision_tower.requires_grad_(False) + + self.is_loaded = True + + def 
feature_select(self, image_forward_outs): + image_features = image_forward_outs.hidden_states[self.select_layer] + return image_features + + @torch.no_grad() + def forward(self, images): + if type(images) is list: + image_features = [] + for image in images: + image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True) + image_feature = self.feature_select(image_forward_out).to(image.dtype) + image_features.append(image_feature) + else: + image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True) + image_features = self.feature_select(image_forward_outs).to(images.dtype) + + return image_features + + @property + def dummy_feature(self): + return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype) + + @property + def dtype(self): + return self.vision_tower.dtype + + @property + def device(self): + return self.vision_tower.device + + @property + def config(self): + if self.is_loaded: + return self.vision_tower.config + else: + return self.cfg_only + + @property + def hidden_size(self): + return self.config.hidden_size + + @property + def num_patches_per_side(self): + return self.image_processor.size['width'] // self.config.patch_size + + @property + def num_patches(self): + return (self.image_processor.size['width'] // self.config.patch_size) ** 2 diff --git a/app/llava/model/multimodal_projector/__pycache__/builder.cpython-310.pyc b/app/llava/model/multimodal_projector/__pycache__/builder.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..61b4dd01bc0143966a911253c08bbeaed6dfa2a3 Binary files /dev/null and b/app/llava/model/multimodal_projector/__pycache__/builder.cpython-310.pyc differ diff --git a/app/llava/model/multimodal_projector/__pycache__/builder.cpython-39.pyc b/app/llava/model/multimodal_projector/__pycache__/builder.cpython-39.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..f9e2585ab93781209d7dd840f7ed2dd931c2b82e Binary files /dev/null and b/app/llava/model/multimodal_projector/__pycache__/builder.cpython-39.pyc differ diff --git a/app/llava/model/multimodal_projector/builder.py b/app/llava/model/multimodal_projector/builder.py new file mode 100644 index 0000000000000000000000000000000000000000..31cd4f48e6055cd6d00a162af30b1c8139e26b57 --- /dev/null +++ b/app/llava/model/multimodal_projector/builder.py @@ -0,0 +1,51 @@ +import torch +import torch.nn as nn +import re + + +class IdentityMap(nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x, *args, **kwargs): + return x + + @property + def config(self): + return {"mm_projector_type": 'identity'} + + +class SimpleResBlock(nn.Module): + def __init__(self, channels): + super().__init__() + self.pre_norm = nn.LayerNorm(channels) + + self.proj = nn.Sequential( + nn.Linear(channels, channels), + nn.GELU(), + nn.Linear(channels, channels) + ) + def forward(self, x): + x = self.pre_norm(x) + return x + self.proj(x) + + +def build_vision_projector(config, delay_load=False, **kwargs): + projector_type = getattr(config, 'mm_projector_type', 'linear') + + if projector_type == 'linear': + return nn.Linear(config.mm_hidden_size, config.hidden_size) + + mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type) + if mlp_gelu_match: + mlp_depth = int(mlp_gelu_match.group(1)) + modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)] + for _ in range(1, mlp_depth): + modules.append(nn.GELU()) + modules.append(nn.Linear(config.hidden_size, config.hidden_size)) + return nn.Sequential(*modules) + + if projector_type == 'identity': + return IdentityMap() + + raise ValueError(f'Unknown projector type: {projector_type}') diff --git a/app/llava/model/preprocessor.py b/app/llava/model/preprocessor.py new file mode 100644 index 0000000000000000000000000000000000000000..0ebacc7a24f057437f3d51feadd959802629598b --- /dev/null +++ 
b/app/llava/model/preprocessor.py @@ -0,0 +1,691 @@ +import copy +import transformers +import tokenizers +import torch +from typing import Dict, Optional, Sequence, List +from packaging import version + +from llava.mm_utils import tokenizer_image_token +from llava.train.arguments import ModelArguments, TrainingArguments, DataArguments +from llava.constants import IGNORE_INDEX, MM_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, DEFAULT_VIDEO_TOKEN, DEFAULT_VIDEO_START_TOKEN, DEFAULT_VIDEO_END_TOKEN +from llava import conversation as conversation_lib + +IS_TOKENIZER_GREATER_THAN_0_14 = version.parse(tokenizers.__version__) >= version.parse('0.14') + +def _tokenize_fn(strings: Sequence[str], + tokenizer: transformers.PreTrainedTokenizer) -> Dict: + """Tokenize a list of strings.""" + tokenized_list = [ + tokenizer( + text, + return_tensors="pt", + padding="longest", + max_length=tokenizer.model_max_length, + truncation=True, + ) for text in strings + ] + input_ids = labels = [ + tokenized.input_ids[0] for tokenized in tokenized_list + ] + input_ids_lens = labels_lens = [ + tokenized.input_ids.ne(tokenizer.pad_token_id).sum().item() + for tokenized in tokenized_list + ] + return dict( + input_ids=input_ids, + labels=labels, + input_ids_lens=input_ids_lens, + labels_lens=labels_lens, + ) + + +def _mask_targets(target, tokenized_lens, speakers): + # cur_idx = 0 + cur_idx = tokenized_lens[0] + tokenized_lens = tokenized_lens[1:] + target[:cur_idx] = IGNORE_INDEX + # TODO: why +2 ? 
+ for tokenized_len, speaker in zip(tokenized_lens, speakers): + if speaker == "human": + target[cur_idx+2:cur_idx + tokenized_len] = IGNORE_INDEX + cur_idx += tokenized_len + + +def _add_speaker_and_signal(header, source, get_conversation=True): + """Add speaker and start/end signal on each round.""" + BEGIN_SIGNAL = "### " + END_SIGNAL = "\n" + conversation = header + for sentence in source: + from_str = sentence["from"] + if from_str.lower() == "human": + from_str = conversation_lib.default_conversation.roles[0] + elif from_str.lower() == "gpt": + from_str = conversation_lib.default_conversation.roles[1] + else: + from_str = 'unknown' + sentence["value"] = (BEGIN_SIGNAL + from_str + ": " + + sentence["value"] + END_SIGNAL) + if get_conversation: + conversation += sentence["value"] + conversation += BEGIN_SIGNAL + return conversation + +def preprocess_multimodal( + sources: Sequence[str], + data_args: DataArguments +) -> Dict: + is_multimodal = data_args.is_multimodal + if not is_multimodal: + return sources + + for source in sources: + for sentence in source: + + if DEFAULT_VIDEO_TOKEN in sentence['value']: + # sentence['value'] = sentence['value'].replace(DEFAULT_IMAGE_TOKEN, '').strip() + # sentence['value'] = DEFAULT_IMAGE_TOKEN + '\n' + sentence['value'] + sentence['value'] = sentence['value'].strip() + if "mmtag" in conversation_lib.default_conversation.version: + raise NotImplementedError + # sentence['value'] = sentence['value'].replace(DEFAULT_IMAGE_TOKEN, '' + DEFAULT_IMAGE_TOKEN + '') + replace_token = DEFAULT_VIDEO_TOKEN + if data_args.mm_use_start_end: + replace_token = DEFAULT_VIDEO_START_TOKEN + replace_token + DEFAULT_VIDEO_END_TOKEN + sentence["value"] = sentence["value"].replace(DEFAULT_VIDEO_TOKEN, replace_token) + + + if DEFAULT_IMAGE_TOKEN in sentence['value']: + # sentence['value'] = sentence['value'].replace(DEFAULT_IMAGE_TOKEN, '').strip() + # sentence['value'] = DEFAULT_IMAGE_TOKEN + '\n' + sentence['value'] + sentence['value'] = 
sentence['value'].strip() + if "mmtag" in conversation_lib.default_conversation.version: + sentence['value'] = sentence['value'].replace(DEFAULT_IMAGE_TOKEN, '' + DEFAULT_IMAGE_TOKEN + '') + replace_token = DEFAULT_IMAGE_TOKEN + if data_args.mm_use_start_end: + replace_token = DEFAULT_IM_START_TOKEN + replace_token + DEFAULT_IM_END_TOKEN + sentence["value"] = sentence["value"].replace(DEFAULT_IMAGE_TOKEN, replace_token) + + return sources + + +def preprocess_llama_2( + sources, + tokenizer: transformers.PreTrainedTokenizer, + has_image: bool = False +) -> Dict: + conv = conversation_lib.default_conversation.copy() + roles = {"human": conv.roles[0], "gpt": conv.roles[1], "model": conv.roles[1]} + + # Apply prompt templates + conversations = [] + for i, source in enumerate(sources): + if roles[source[0]["from"]] != conv.roles[0]: + # Skip the first one if it is not from human + source = source[1:] + + conv.messages = [] + for j, sentence in enumerate(source): + role = roles[sentence["from"]] + assert role == conv.roles[j % 2], f"{i}" + conv.append_message(role, sentence["value"]) + conversations.append(conv.get_prompt()) + + # Tokenize conversations + + if has_image: + input_ids = torch.stack([tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations], dim=0) + else: + input_ids = tokenizer( + conversations, + return_tensors="pt", + padding="longest", + max_length=tokenizer.model_max_length, + truncation=True, + ).input_ids + + targets = input_ids.clone() + + assert conv.sep_style == conversation_lib.SeparatorStyle.LLAMA_2 + + # Mask targets + sep = "[/INST] " + for conversation, target in zip(conversations, targets): + total_len = int(target.ne(tokenizer.pad_token_id).sum()) + + rounds = conversation.split(conv.sep2) + cur_len = 1 + target[:cur_len] = IGNORE_INDEX + for i, rou in enumerate(rounds): + if rou == "": + break + + parts = rou.split(sep) + if len(parts) != 2: + break + parts[0] += sep + + if has_image: + round_len = 
len(tokenizer_image_token(rou, tokenizer)) + instruction_len = len(tokenizer_image_token(parts[0], tokenizer)) - 2 + else: + round_len = len(tokenizer(rou).input_ids) + instruction_len = len(tokenizer(parts[0]).input_ids) - 2 + + target[cur_len : cur_len + instruction_len] = IGNORE_INDEX + + cur_len += round_len + target[cur_len:] = IGNORE_INDEX + + if cur_len < tokenizer.model_max_length: + if cur_len != total_len: + target[:] = IGNORE_INDEX + print( + f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}." + f" (ignored)" + ) + + return dict( + input_ids=input_ids, + labels=targets, + ) + + +def preprocess_v1( + sources, + tokenizer: transformers.PreTrainedTokenizer, + has_image: bool = False +) -> Dict: + conv = conversation_lib.default_conversation.copy() + roles = {"human": conv.roles[0], "gpt": conv.roles[1], "model": conv.roles[1]} + + # Apply prompt templates + conversations = [] + for i, source in enumerate(sources): + if roles[source[0]["from"]] != conv.roles[0]: + # Skip the first one if it is not from human + source = source[1:] + + conv.messages = [] + for j, sentence in enumerate(source): + role = roles[sentence["from"]] + assert role == conv.roles[j % 2], f"{i}" + conv.append_message(role, sentence["value"]) + conversations.append(conv.get_prompt()) + + # Tokenize conversations + + if has_image: + input_ids = torch.stack([tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations], dim=0) + else: + input_ids = tokenizer( + conversations, + return_tensors="pt", + padding="longest", + max_length=tokenizer.model_max_length, + truncation=True, + ).input_ids + + targets = input_ids.clone() + + assert conv.sep_style == conversation_lib.SeparatorStyle.TWO + + # Mask targets + sep = conv.sep + conv.roles[1] + ": " + for conversation, target in zip(conversations, targets): + total_len = int(target.ne(tokenizer.pad_token_id).sum()) + + rounds = conversation.split(conv.sep2) + cur_len = 1 + target[:cur_len] = IGNORE_INDEX + 
for i, rou in enumerate(rounds): + if rou == "": + break + + parts = rou.split(sep) + if len(parts) != 2: + break + parts[0] += sep + + if has_image: + round_len = len(tokenizer_image_token(rou, tokenizer)) + instruction_len = len(tokenizer_image_token(parts[0], tokenizer)) - 2 + else: + round_len = len(tokenizer(rou).input_ids) + instruction_len = len(tokenizer(parts[0]).input_ids) - 2 + + if i != 0 and not tokenizer.legacy and IS_TOKENIZER_GREATER_THAN_0_14: + round_len -= 1 + instruction_len -= 1 + + target[cur_len : cur_len + instruction_len] = IGNORE_INDEX + + cur_len += round_len + target[cur_len:] = IGNORE_INDEX + + if cur_len < tokenizer.model_max_length: + if cur_len != total_len: + target[:] = IGNORE_INDEX + print( + f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}." + f" (ignored)" + ) + + return dict( + input_ids=input_ids, + labels=targets, + ) + + +def preprocess_mpt( + sources, + tokenizer: transformers.PreTrainedTokenizer, + has_image: bool = False +) -> Dict: + conv = conversation_lib.default_conversation.copy() + roles = {"human": conv.roles[0], "gpt": conv.roles[1], "model": conv.roles[1]} + + # Apply prompt templates + conversations = [] + for i, source in enumerate(sources): + if roles[source[0]["from"]] != conv.roles[0]: + # Skip the first one if it is not from human + source = source[1:] + + conv.messages = [] + for j, sentence in enumerate(source): + role = roles[sentence["from"]] + assert role == conv.roles[j % 2], f"{i}" + conv.append_message(role, sentence["value"]) + conversations.append(conv.get_prompt()) + + # Tokenize conversations + + if has_image: + input_ids = torch.stack([tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations], dim=0) + else: + input_ids = tokenizer( + conversations, + return_tensors="pt", + padding="longest", + max_length=tokenizer.model_max_length, + truncation=True, + ).input_ids + + targets = input_ids.clone() + assert conv.sep_style == 
conversation_lib.SeparatorStyle.MPT + + # Mask targets + sep = conv.sep + conv.roles[1] + for conversation, target in zip(conversations, targets): + total_len = int(target.ne(tokenizer.pad_token_id).sum()) + + rounds = conversation.split(conv.sep) + re_rounds = [conv.sep.join(rounds[:3])] # system + user + gpt + for conv_idx in range(3, len(rounds), 2): + re_rounds.append(conv.sep.join(rounds[conv_idx:conv_idx+2])) # user + gpt + cur_len = 0 + target[:cur_len] = IGNORE_INDEX + for i, rou in enumerate(re_rounds): + if rou == "": + break + + parts = rou.split(sep) + if len(parts) != 2: + break + parts[0] += sep + + if has_image: + round_len = len(tokenizer_image_token(rou, tokenizer)) + instruction_len = len(tokenizer_image_token(parts[0], tokenizer)) - 1 + else: + round_len = len(tokenizer(rou).input_ids) + instruction_len = len(tokenizer(parts[0]).input_ids) - 1 + + if i != 0 and getattr(tokenizer, 'legacy', False) and IS_TOKENIZER_GREATER_THAN_0_14: + round_len += 1 + instruction_len += 1 + + target[cur_len : cur_len + instruction_len] = IGNORE_INDEX + + cur_len += round_len + target[cur_len:] = IGNORE_INDEX + + if cur_len < tokenizer.model_max_length: + if cur_len != total_len: + target[:] = IGNORE_INDEX + print( + f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}." 
+ f" (ignored)" + ) + + return dict( + input_ids=input_ids, + labels=targets, + ) + + +def preprocess_plain( + sources: Sequence[str], + tokenizer: transformers.PreTrainedTokenizer, +) -> Dict: + # add end signal and concatenate MM_TOKEN_INDEXtogether + conversations = [] + for source in sources: + assert len(source) == 2 + # assert DEFAULT_IMAGE_TOKEN in source[0]['value'] + # source[0]['value'] = DEFAULT_IMAGE_TOKEN + conversation = source[0]['value'] + source[1]['value'] + conversation_lib.default_conversation.sep + conversations.append(conversation) + # tokenize conversations + input_ids = [tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations] + targets = copy.deepcopy(input_ids) + for target, source in zip(targets, sources): + tokenized_len = len(tokenizer_image_token(source[0]['value'], tokenizer)) + target[:tokenized_len] = IGNORE_INDEX + + return dict(input_ids=input_ids, labels=targets) + + + + +def preprocess_gemma( + sources, + tokenizer: transformers.PreTrainedTokenizer, + has_image: bool = False +) -> Dict: + conv = conversation_lib.default_conversation.copy() + roles = {"human": conv.roles[0], "gpt": conv.roles[1], "model": conv.roles[1]} + + # Apply prompt templates + conversations = [] + for i, source in enumerate(sources): + if roles[source[0]["from"]] != conv.roles[0]: + # Skip the first one if it is not from human + source = source[1:] + + conv.messages = [] + for j, sentence in enumerate(source): + role = roles[sentence["from"]] + assert role == conv.roles[j % 2], f"{i}" + conv.append_message(role, sentence["value"]) + conversations.append(conv.get_prompt(use_chat_template=True, tokenizer=tokenizer)) + + # Tokenize conversations + + if has_image: + input_ids = torch.stack([tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations], dim=0) + else: + input_ids = tokenizer( + conversations, + return_tensors="pt", + padding="longest", + max_length=tokenizer.model_max_length, + 
truncation=True, + ).input_ids + + targets = input_ids.clone() + + # Mask targets + sep = conv.sep + conv.roles[1] + '\n' + sep2 = conv.sep2 + '\n' + conv.sep + conv.roles[0] + for conversation, target in zip(conversations, targets): + total_len = int(target.ne(tokenizer.pad_token_id).sum()) + + rounds = conversation.split(sep2) + cur_len = 1 + target[:cur_len] = IGNORE_INDEX + for i, rou in enumerate(rounds): + if rou == "": + break + if i != len(rounds) - 1: + rou += conv.sep2 + '\n' + if i >= 1 : + rou = conv.sep + conv.roles[0] + rou + + parts = rou.split(sep) + if len(parts) != 2: + break + parts[0] += sep + + if has_image: + round_len = len(tokenizer_image_token(rou, tokenizer)) - 1 + instruction_len = len(tokenizer_image_token(parts[0], tokenizer)) - 1 + else: + raise NotImplementedError + + + target[cur_len : cur_len + instruction_len] = IGNORE_INDEX + + cur_len += round_len + target[cur_len:] = IGNORE_INDEX + + if cur_len < tokenizer.model_max_length: + if cur_len != total_len: + target[:] = IGNORE_INDEX + print( + f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}." 
+ f" (ignored)" + ) + + return dict( + input_ids=input_ids, + labels=targets, + ) + + +def preprocess_mistral( + sources, + tokenizer: transformers.PreTrainedTokenizer, + has_image: bool = False +) -> Dict: + conv = conversation_lib.default_conversation.copy() + roles = {"human": conv.roles[0], "gpt": conv.roles[1], "model": conv.roles[1]} + + # Apply prompt templates + conversations = [] + for i, source in enumerate(sources): + if roles[source[0]["from"]] != conv.roles[0]: + # Skip the first one if it is not from human + source = source[1:] + + conv.messages = [] + for j, sentence in enumerate(source): + role = roles[sentence["from"]] + assert role == conv.roles[j % 2], f"{i}" + conv.append_message(role, sentence["value"]) + conversations.append(conv.get_prompt(use_chat_template=True, tokenizer=tokenizer)) + + # Tokenize conversations + + if has_image: + input_ids = torch.stack([tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations], dim=0) + else: + input_ids = tokenizer( + conversations, + return_tensors="pt", + padding="longest", + max_length=tokenizer.model_max_length, + truncation=True, + ).input_ids + + targets = input_ids.clone() + + # Mask targets + sep = " [/INST]" + for conversation, target in zip(conversations, targets): + total_len = int(target.ne(tokenizer.pad_token_id).sum()) + + rounds = conversation.split(conv.sep2) + cur_len = 1 + target[:cur_len] = IGNORE_INDEX + for i, rou in enumerate(rounds): + if rou == "": + break + parts = rou.split(sep) + if len(parts) != 2: + break + parts[0] += sep + + if has_image: + round_len = len(tokenizer_image_token(rou, tokenizer)) + instruction_len = len(tokenizer_image_token(parts[0], tokenizer)) - 1 + else: + round_len = len(tokenizer(rou).input_ids) + instruction_len = len(tokenizer(parts[0]).input_ids) - 1 + + target[cur_len : cur_len + instruction_len] = IGNORE_INDEX + + cur_len += round_len + target[cur_len:] = IGNORE_INDEX + if rou[-1] == ' ': # ' ' another space after + 
cur_len += 1 + + if cur_len < tokenizer.model_max_length: + if cur_len != total_len: + target[:] = IGNORE_INDEX + print( + f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}." + f" (ignored)" + ) + + return dict( + input_ids=input_ids, + labels=targets, + ) + + +def preprocess_thoth( + sources, + tokenizer: transformers.PreTrainedTokenizer, + has_image: bool = False +) -> Dict: + conv = conversation_lib.default_conversation.copy() + roles = {"human": conv.roles[0], "gpt": conv.roles[1], "model": conv.roles[1]} + + # Apply prompt templates + conversations = [] + for i, source in enumerate(sources): + if roles[source[0]["from"]] != conv.roles[0]: + # Skip the first one if it is not from human + source = source[1:] + + conv.messages = [] + for j, sentence in enumerate(source): + role = roles[sentence["from"]] + assert role == conv.roles[j % 2], f"{i}" + conv.append_message(role, sentence["value"]) + conversations.append(conv.get_prompt()) + + # Tokenize conversations + + if has_image: + input_ids = torch.stack([tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations], dim=0) + else: + input_ids = tokenizer( + conversations, + return_tensors="pt", + padding="longest", + max_length=tokenizer.model_max_length, + truncation=True, + ).input_ids + + targets = input_ids.clone() + + + # Mask targets + sep = conv.sep + conv.roles[1] + ": " + for conversation, target in zip(conversations, targets): + total_len = int(target.ne(tokenizer.pad_token_id).sum()) + + rounds = conversation.split(conv.sep2) + cur_len = 1 + target[:cur_len] = IGNORE_INDEX + for i, rou in enumerate(rounds): + if rou == "": + break + + parts = rou.split(sep) + if len(parts) != 2: + break + parts[0] += sep + + if has_image: + round_len = len(tokenizer_image_token(rou, tokenizer)) + instruction_len = len(tokenizer_image_token(parts[0], tokenizer)) - 2 + else: + round_len = len(tokenizer(rou).input_ids) + instruction_len = len(tokenizer(parts[0]).input_ids) - 2 + + 
target[cur_len: cur_len + instruction_len] = IGNORE_INDEX + cur_len += round_len + 1 + if i == 0: + cur_len -= 1 + target[cur_len:] = IGNORE_INDEX + + if cur_len < tokenizer.model_max_length: + if cur_len != total_len: + target[:] = IGNORE_INDEX + print( + f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}." + f" (ignored)" + ) + + return dict( + input_ids=input_ids, + labels=targets, + ) + + +def preprocess( + sources: Sequence[str], + tokenizer: transformers.PreTrainedTokenizer, + has_image: bool = False +) -> Dict: + """ + Given a list of sources, each is a conversation list. This transform: + 1. Add signal '### ' at the beginning each sentence, with end signal '\n'; + 2. Concatenate conversations together; + 3. Tokenize the concatenated conversation; + 4. Make a deepcopy as the target. Mask human words with IGNORE_INDEX. + """ + if conversation_lib.default_conversation.sep_style == conversation_lib.SeparatorStyle.PLAIN: + return preprocess_plain(sources, tokenizer) + if conversation_lib.default_conversation.sep_style == conversation_lib.SeparatorStyle.LLAMA_2: + return preprocess_llama_2(sources, tokenizer, has_image=has_image) + if conversation_lib.default_conversation.version.startswith("v1"): + return preprocess_v1(sources, tokenizer, has_image=has_image) + if conversation_lib.default_conversation.version == "mpt": + return preprocess_mpt(sources, tokenizer, has_image=has_image) + if conversation_lib.default_conversation.version == 'gemma': + return preprocess_gemma(sources, tokenizer, has_image=has_image) + if conversation_lib.default_conversation.version == 'thoth': + return preprocess_thoth(sources, tokenizer, has_image=has_image) + if conversation_lib.default_conversation.version == 'mistral': + return preprocess_mistral(sources, tokenizer, has_image=has_image) + # add end signal and concatenate together + conversations = [] + for source in sources: + header = f"{conversation_lib.default_conversation.system}\n\n" + conversation = 
_add_speaker_and_signal(header, source) + conversations.append(conversation) + # tokenize conversations + def get_tokenize_len(prompts): + return [len(tokenizer_image_token(prompt, tokenizer)) for prompt in prompts] + + if has_image: + input_ids = [tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations] + else: + conversations_tokenized = _tokenize_fn(conversations, tokenizer) + input_ids = conversations_tokenized["input_ids"] + + targets = copy.deepcopy(input_ids) + for target, source in zip(targets, sources): + if has_image: + tokenized_lens = get_tokenize_len([header] + [s["value"] for s in source]) + else: + tokenized_lens = _tokenize_fn([header] + [s["value"] for s in source], tokenizer)["input_ids_lens"] + speakers = [sentence["from"] for sentence in source] + _mask_targets(target, tokenized_lens, speakers) + + return dict(input_ids=input_ids, labels=targets) diff --git a/app/llava/model/utils.py b/app/llava/model/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..9169ef4b61320f63ea0177fbc5bf7f07698766bf --- /dev/null +++ b/app/llava/model/utils.py @@ -0,0 +1,32 @@ +import torch +import torch.nn as nn +from transformers import AutoConfig + + +def auto_upgrade(config): + cfg = AutoConfig.from_pretrained(config) + if 'llava' in config and 'llava' not in cfg.model_type: + assert cfg.model_type == 'llama' + print("You are using newer LLaVA code base, while the checkpoint of v0 is from older code base.") + print("You must upgrade the checkpoint to the new code base (this can be done automatically).") + confirm = input("Please confirm that you want to upgrade the checkpoint. 
[Y/N]") + if confirm.lower() in ["y", "yes"]: + print("Upgrading checkpoint...") + assert len(cfg.architectures) == 1 + setattr(cfg.__class__, "model_type", "llava") + cfg.architectures[0] = 'LlavaLlamaForCausalLM' + cfg.save_pretrained(config) + print("Checkpoint upgraded.") + else: + print("Checkpoint upgrade aborted.") + exit(1) + + +class LayerNorm(nn.LayerNorm): + """Subclass torch's LayerNorm to handle fp16.""" + + def forward(self, x: torch.Tensor): + with torch.cuda.amp.autocast(dtype=torch.float32): + orig_type = x.dtype + ret = super().forward(x.type(torch.float32)) + return ret.type(orig_type) \ No newline at end of file diff --git a/app/llava/serve/__init__.py b/app/llava/serve/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/app/llava/serve/cli.py b/app/llava/serve/cli.py new file mode 100644 index 0000000000000000000000000000000000000000..ed583278d6661f0bbeb3026d262e6404c093d672 --- /dev/null +++ b/app/llava/serve/cli.py @@ -0,0 +1,128 @@ +import argparse +import torch + +from llava.constants import MM_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN +from llava.conversation import conv_templates, SeparatorStyle +from llava.model.builder import load_pretrained_model +from llava.utils import disable_torch_init +from llava.mm_utils import process_images, tokenizer_image_token, get_model_name_from_path + +from PIL import Image + +import requests +from PIL import Image +from io import BytesIO +from transformers import TextStreamer + + +def load_image(image_file): + if image_file.startswith('http://') or image_file.startswith('https://'): + response = requests.get(image_file) + image = Image.open(BytesIO(response.content)).convert('RGB') + else: + image = Image.open(image_file).convert('RGB') + return image + + +def main(args): + # Model + disable_torch_init() + + model_name = get_model_name_from_path(args.model_path) + tokenizer, model, 
image_processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name, args.load_8bit, args.load_4bit, device=args.device) + + if "llama-2" in model_name.lower(): + conv_mode = "llava_llama_2" + elif "mistral" in model_name.lower(): + conv_mode = "mistral_instruct" + elif "v1.6-34b" in model_name.lower(): + conv_mode = "chatml_direct" + elif "v1" in model_name.lower(): + conv_mode = "llava_v1" + elif "mpt" in model_name.lower(): + conv_mode = "mpt" + else: + conv_mode = "llava_v0" + + if args.conv_mode is not None and conv_mode != args.conv_mode: + print('[WARNING] the auto inferred conversation mode is {}, while `--conv-mode` is {}, using {}'.format(conv_mode, args.conv_mode, args.conv_mode)) + else: + args.conv_mode = conv_mode + + conv = conv_templates[args.conv_mode].copy() + if "mpt" in model_name.lower(): + roles = ('user', 'assistant') + else: + roles = conv.roles + + image = load_image(args.image_file) + image_size = image.size + # Similar operation in model_worker.py + image_tensor = process_images([image], image_processor, model.config) + if type(image_tensor) is list: + image_tensor = [image.to(model.device, dtype=torch.float16) for image in image_tensor] + else: + image_tensor = image_tensor.to(model.device, dtype=torch.float16) + + while True: + try: + inp = input(f"{roles[0]}: ") + except EOFError: + inp = "" + if not inp: + print("exit...") + break + + print(f"{roles[1]}: ", end="") + + if image is not None: + # first message + if model.config.mm_use_start_end: + inp = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + inp + else: + inp = DEFAULT_IMAGE_TOKEN + '\n' + inp + conv.append_message(conv.roles[0], inp) + image = None + else: + # later messages + conv.append_message(conv.roles[0], inp) + conv.append_message(conv.roles[1], None) + prompt = conv.get_prompt() + + input_ids = tokenizer_image_token(prompt, tokenizer, MM_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(model.device) + stop_str 
= conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 + keywords = [stop_str] + streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True) + + with torch.inference_mode(): + output_ids = model.generate( + input_ids, + images=image_tensor, + image_sizes=[image_size], + do_sample=True if args.temperature > 0 else False, + temperature=args.temperature, + max_new_tokens=args.max_new_tokens, + streamer=streamer, + use_cache=True) + + outputs = tokenizer.decode(output_ids[0]).strip() + conv.messages[-1][-1] = outputs + + if args.debug: + print("\n", {"prompt": prompt, "outputs": outputs}, "\n") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--model-path", type=str, default="facebook/opt-350m") + parser.add_argument("--model-base", type=str, default=None) + parser.add_argument("--image-file", type=str, required=True) + parser.add_argument("--device", type=str, default="cuda") + parser.add_argument("--conv-mode", type=str, default=None) + parser.add_argument("--temperature", type=float, default=0.2) + parser.add_argument("--max-new-tokens", type=int, default=512) + parser.add_argument("--load-8bit", action="store_true") + parser.add_argument("--load-4bit", action="store_true") + parser.add_argument("--debug", action="store_true") + args = parser.parse_args() + main(args) diff --git a/app/llava/serve/controller.py b/app/llava/serve/controller.py new file mode 100644 index 0000000000000000000000000000000000000000..d4bf1b4c47ccdb1401b18f8397868ec016d1c43a --- /dev/null +++ b/app/llava/serve/controller.py @@ -0,0 +1,298 @@ +""" +A controller manages distributed workers. +It sends worker addresses to clients. 
+""" +import argparse +import asyncio +import dataclasses +from enum import Enum, auto +import json +import logging +import time +from typing import List, Union +import threading + +from fastapi import FastAPI, Request +from fastapi.responses import StreamingResponse +import numpy as np +import requests +import uvicorn + +from llava.constants import CONTROLLER_HEART_BEAT_EXPIRATION +from llava.utils import build_logger, server_error_msg + + +logger = build_logger("controller", "controller.log") + + +class DispatchMethod(Enum): + LOTTERY = auto() + SHORTEST_QUEUE = auto() + + @classmethod + def from_str(cls, name): + if name == "lottery": + return cls.LOTTERY + elif name == "shortest_queue": + return cls.SHORTEST_QUEUE + else: + raise ValueError(f"Invalid dispatch method") + + +@dataclasses.dataclass +class WorkerInfo: + model_names: List[str] + speed: int + queue_length: int + check_heart_beat: bool + last_heart_beat: str + + +def heart_beat_controller(controller): + while True: + time.sleep(CONTROLLER_HEART_BEAT_EXPIRATION) + controller.remove_stable_workers_by_expiration() + + +class Controller: + def __init__(self, dispatch_method: str): + # Dict[str -> WorkerInfo] + self.worker_info = {} + self.dispatch_method = DispatchMethod.from_str(dispatch_method) + + self.heart_beat_thread = threading.Thread( + target=heart_beat_controller, args=(self,), daemon=True) + self.heart_beat_thread.start() + + logger.info("Init controller") + + def register_worker(self, worker_name: str, check_heart_beat: bool, + worker_status: dict): + if worker_name not in self.worker_info: + logger.info(f"Register a new worker: {worker_name}") + else: + logger.info(f"Register an existing worker: {worker_name}") + + if not worker_status: + worker_status = self.get_worker_status(worker_name) + if not worker_status: + return False + + self.worker_info[worker_name] = WorkerInfo( + worker_status["model_names"], worker_status["speed"], worker_status["queue_length"], + check_heart_beat, time.time()) 
+ + logger.info(f"Register done: {worker_name}, {worker_status}") + return True + + def get_worker_status(self, worker_name: str): + try: + r = requests.post(worker_name + "/worker_get_status", timeout=5) + except requests.exceptions.RequestException as e: + logger.error(f"Get status fails: {worker_name}, {e}") + return None + + if r.status_code != 200: + logger.error(f"Get status fails: {worker_name}, {r}") + return None + + return r.json() + + def remove_worker(self, worker_name: str): + del self.worker_info[worker_name] + + def refresh_all_workers(self): + old_info = dict(self.worker_info) + self.worker_info = {} + + for w_name, w_info in old_info.items(): + if not self.register_worker(w_name, w_info.check_heart_beat, None): + logger.info(f"Remove stale worker: {w_name}") + + def list_models(self): + model_names = set() + + for w_name, w_info in self.worker_info.items(): + model_names.update(w_info.model_names) + + return list(model_names) + + def get_worker_address(self, model_name: str): + if self.dispatch_method == DispatchMethod.LOTTERY: + worker_names = [] + worker_speeds = [] + for w_name, w_info in self.worker_info.items(): + if model_name in w_info.model_names: + worker_names.append(w_name) + worker_speeds.append(w_info.speed) + worker_speeds = np.array(worker_speeds, dtype=np.float32) + norm = np.sum(worker_speeds) + if norm < 1e-4: + return "" + worker_speeds = worker_speeds / norm + if True: # Directly return address + pt = np.random.choice(np.arange(len(worker_names)), + p=worker_speeds) + worker_name = worker_names[pt] + return worker_name + + # Check status before returning + while True: + pt = np.random.choice(np.arange(len(worker_names)), + p=worker_speeds) + worker_name = worker_names[pt] + + if self.get_worker_status(worker_name): + break + else: + self.remove_worker(worker_name) + worker_speeds[pt] = 0 + norm = np.sum(worker_speeds) + if norm < 1e-4: + return "" + worker_speeds = worker_speeds / norm + continue + return worker_name + elif 
self.dispatch_method == DispatchMethod.SHORTEST_QUEUE: + worker_names = [] + worker_qlen = [] + for w_name, w_info in self.worker_info.items(): + if model_name in w_info.model_names: + worker_names.append(w_name) + worker_qlen.append(w_info.queue_length / w_info.speed) + if len(worker_names) == 0: + return "" + min_index = np.argmin(worker_qlen) + w_name = worker_names[min_index] + self.worker_info[w_name].queue_length += 1 + logger.info(f"names: {worker_names}, queue_lens: {worker_qlen}, ret: {w_name}") + return w_name + else: + raise ValueError(f"Invalid dispatch method: {self.dispatch_method}") + + def receive_heart_beat(self, worker_name: str, queue_length: int): + if worker_name not in self.worker_info: + logger.info(f"Receive unknown heart beat. {worker_name}") + return False + + self.worker_info[worker_name].queue_length = queue_length + self.worker_info[worker_name].last_heart_beat = time.time() + logger.info(f"Receive heart beat. {worker_name}") + return True + + def remove_stable_workers_by_expiration(self): + expire = time.time() - CONTROLLER_HEART_BEAT_EXPIRATION + to_delete = [] + for worker_name, w_info in self.worker_info.items(): + if w_info.check_heart_beat and w_info.last_heart_beat < expire: + to_delete.append(worker_name) + + for worker_name in to_delete: + self.remove_worker(worker_name) + + def worker_api_generate_stream(self, params): + worker_addr = self.get_worker_address(params["model"]) + if not worker_addr: + logger.info(f"no worker: {params['model']}") + ret = { + "text": server_error_msg, + "error_code": 2, + } + yield json.dumps(ret).encode() + b"\0" + + try: + response = requests.post(worker_addr + "/worker_generate_stream", + json=params, stream=True, timeout=5) + for chunk in response.iter_lines(decode_unicode=False, delimiter=b"\0"): + if chunk: + yield chunk + b"\0" + except requests.exceptions.RequestException as e: + logger.info(f"worker timeout: {worker_addr}") + ret = { + "text": server_error_msg, + "error_code": 3, + } + 
yield json.dumps(ret).encode() + b"\0" + + + # Let the controller act as a worker to achieve hierarchical + # management. This can be used to connect isolated sub networks. + def worker_api_get_status(self): + model_names = set() + speed = 0 + queue_length = 0 + + for w_name in self.worker_info: + worker_status = self.get_worker_status(w_name) + if worker_status is not None: + model_names.update(worker_status["model_names"]) + speed += worker_status["speed"] + queue_length += worker_status["queue_length"] + + return { + "model_names": list(model_names), + "speed": speed, + "queue_length": queue_length, + } + + +app = FastAPI() + + +@app.post("/register_worker") +async def register_worker(request: Request): + data = await request.json() + controller.register_worker( + data["worker_name"], data["check_heart_beat"], + data.get("worker_status", None)) + + +@app.post("/refresh_all_workers") +async def refresh_all_workers(): + models = controller.refresh_all_workers() + + +@app.post("/list_models") +async def list_models(): + models = controller.list_models() + return {"models": models} + + +@app.post("/get_worker_address") +async def get_worker_address(request: Request): + data = await request.json() + addr = controller.get_worker_address(data["model"]) + return {"address": addr} + + +@app.post("/receive_heart_beat") +async def receive_heart_beat(request: Request): + data = await request.json() + exist = controller.receive_heart_beat( + data["worker_name"], data["queue_length"]) + return {"exist": exist} + + +@app.post("/worker_generate_stream") +async def worker_api_generate_stream(request: Request): + params = await request.json() + generator = controller.worker_api_generate_stream(params) + return StreamingResponse(generator) + + +@app.post("/worker_get_status") +async def worker_api_get_status(request: Request): + return controller.worker_api_get_status() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--host", type=str, 
default="localhost") + parser.add_argument("--port", type=int, default=21001) + parser.add_argument("--dispatch-method", type=str, choices=[ + "lottery", "shortest_queue"], default="shortest_queue") + args = parser.parse_args() + logger.info(f"args: {args}") + + controller = Controller(args.dispatch_method) + uvicorn.run(app, host=args.host, port=args.port, log_level="info") diff --git a/app/llava/serve/gradio_web_server.py b/app/llava/serve/gradio_web_server.py new file mode 100644 index 0000000000000000000000000000000000000000..c07efc122950da37455608b609dcf1f2b4103d56 --- /dev/null +++ b/app/llava/serve/gradio_web_server.py @@ -0,0 +1,479 @@ +import argparse +import datetime +import json +import os +import time + +import gradio as gr +import requests + +from llava.conversation import (default_conversation, conv_templates, + SeparatorStyle) +from llava.constants import LOGDIR +from llava.utils import (build_logger, server_error_msg, + violates_moderation, moderation_msg) +import hashlib + + +logger = build_logger("gradio_web_server", "gradio_web_server.log") + +headers = {"User-Agent": "LLaVA Client"} + +no_change_btn = gr.Button() +enable_btn = gr.Button(interactive=True) +disable_btn = gr.Button(interactive=False) + +priority = { + "vicuna-13b": "aaaaaaa", + "koala-13b": "aaaaaab", +} + + +def get_conv_log_filename(): + t = datetime.datetime.now() + name = os.path.join(LOGDIR, f"{t.year}-{t.month:02d}-{t.day:02d}-conv.json") + return name + + +def get_model_list(): + ret = requests.post(args.controller_url + "/refresh_all_workers") + assert ret.status_code == 200 + ret = requests.post(args.controller_url + "/list_models") + models = ret.json()["models"] + models.sort(key=lambda x: priority.get(x, x)) + logger.info(f"Models: {models}") + return models + + +get_window_url_params = """ +function() { + const params = new URLSearchParams(window.location.search); + url_params = Object.fromEntries(params); + console.log(url_params); + return url_params; + } +""" + + 
+def load_demo(url_params, request: gr.Request): + logger.info(f"load_demo. ip: {request.client.host}. params: {url_params}") + + dropdown_update = gr.Dropdown(visible=True) + if "model" in url_params: + model = url_params["model"] + if model in models: + dropdown_update = gr.Dropdown(value=model, visible=True) + + state = default_conversation.copy() + return state, dropdown_update + + +def load_demo_refresh_model_list(request: gr.Request): + logger.info(f"load_demo. ip: {request.client.host}") + models = get_model_list() + state = default_conversation.copy() + dropdown_update = gr.Dropdown( + choices=models, + value=models[0] if len(models) > 0 else "" + ) + return state, dropdown_update + + +def vote_last_response(state, vote_type, model_selector, request: gr.Request): + with open(get_conv_log_filename(), "a") as fout: + data = { + "tstamp": round(time.time(), 4), + "type": vote_type, + "model": model_selector, + "state": state.dict(), + "ip": request.client.host, + } + fout.write(json.dumps(data) + "\n") + + +def upvote_last_response(state, model_selector, request: gr.Request): + logger.info(f"upvote. ip: {request.client.host}") + vote_last_response(state, "upvote", model_selector, request) + return ("",) + (disable_btn,) * 3 + + +def downvote_last_response(state, model_selector, request: gr.Request): + logger.info(f"downvote. ip: {request.client.host}") + vote_last_response(state, "downvote", model_selector, request) + return ("",) + (disable_btn,) * 3 + + +def flag_last_response(state, model_selector, request: gr.Request): + logger.info(f"flag. ip: {request.client.host}") + vote_last_response(state, "flag", model_selector, request) + return ("",) + (disable_btn,) * 3 + + +def regenerate(state, image_process_mode, request: gr.Request): + logger.info(f"regenerate. 
ip: {request.client.host}") + state.messages[-1][-1] = None + prev_human_msg = state.messages[-2] + if type(prev_human_msg[1]) in (tuple, list): + prev_human_msg[1] = (*prev_human_msg[1][:2], image_process_mode) + state.skip_next = False + return (state, state.to_gradio_chatbot(), "", None) + (disable_btn,) * 5 + + +def clear_history(request: gr.Request): + logger.info(f"clear_history. ip: {request.client.host}") + state = default_conversation.copy() + return (state, state.to_gradio_chatbot(), "", None) + (disable_btn,) * 5 + + +def add_text(state, text, image, image_process_mode, request: gr.Request): + logger.info(f"add_text. ip: {request.client.host}. len: {len(text)}") + if len(text) <= 0 and image is None: + state.skip_next = True + return (state, state.to_gradio_chatbot(), "", None) + (no_change_btn,) * 5 + if args.moderate: + flagged = violates_moderation(text) + if flagged: + state.skip_next = True + return (state, state.to_gradio_chatbot(), moderation_msg, None) + ( + no_change_btn,) * 5 + + text = text[:1536] # Hard cut-off + if image is not None: + text = text[:1200] # Hard cut-off for images + if '' not in text: + # text = '' + text + text = text + '\n' + text = (text, image, image_process_mode) + state = default_conversation.copy() + state.append_message(state.roles[0], text) + state.append_message(state.roles[1], None) + state.skip_next = False + return (state, state.to_gradio_chatbot(), "", None) + (disable_btn,) * 5 + + +def http_bot(state, model_selector, temperature, top_p, max_new_tokens, request: gr.Request): + logger.info(f"http_bot. 
ip: {request.client.host}") + start_tstamp = time.time() + model_name = model_selector + + if state.skip_next: + # This generate call is skipped due to invalid inputs + yield (state, state.to_gradio_chatbot()) + (no_change_btn,) * 5 + return + + if len(state.messages) == state.offset + 2: + # First round of conversation + if "llava" in model_name.lower(): + if 'llama-2' in model_name.lower(): + template_name = "llava_llama_2" + elif "mistral" in model_name.lower() or "mixtral" in model_name.lower(): + if 'orca' in model_name.lower(): + template_name = "mistral_orca" + elif 'hermes' in model_name.lower(): + template_name = "chatml_direct" + else: + template_name = "mistral_instruct" + elif 'llava-v1.6-34b' in model_name.lower(): + template_name = "chatml_direct" + elif "v1" in model_name.lower(): + if 'mmtag' in model_name.lower(): + template_name = "v1_mmtag" + elif 'plain' in model_name.lower() and 'finetune' not in model_name.lower(): + template_name = "v1_mmtag" + else: + template_name = "llava_v1" + elif "mpt" in model_name.lower(): + template_name = "mpt" + else: + if 'mmtag' in model_name.lower(): + template_name = "v0_mmtag" + elif 'plain' in model_name.lower() and 'finetune' not in model_name.lower(): + template_name = "v0_mmtag" + else: + template_name = "llava_v0" + elif "mpt" in model_name: + template_name = "mpt_text" + elif "llama-2" in model_name: + template_name = "llama_2" + else: + template_name = "vicuna_v1" + new_state = conv_templates[template_name].copy() + new_state.append_message(new_state.roles[0], state.messages[-2][1]) + new_state.append_message(new_state.roles[1], None) + state = new_state + + # Query worker address + controller_url = args.controller_url + ret = requests.post(controller_url + "/get_worker_address", + json={"model": model_name}) + worker_addr = ret.json()["address"] + logger.info(f"model_name: {model_name}, worker_addr: {worker_addr}") + + # No available worker + if worker_addr == "": + state.messages[-1][-1] = 
server_error_msg + yield (state, state.to_gradio_chatbot(), disable_btn, disable_btn, disable_btn, enable_btn, enable_btn) + return + + # Construct prompt + prompt = state.get_prompt() + + all_images = state.get_images(return_pil=True) + all_image_hash = [hashlib.md5(image.tobytes()).hexdigest() for image in all_images] + for image, hash in zip(all_images, all_image_hash): + t = datetime.datetime.now() + filename = os.path.join(LOGDIR, "serve_images", f"{t.year}-{t.month:02d}-{t.day:02d}", f"{hash}.jpg") + if not os.path.isfile(filename): + os.makedirs(os.path.dirname(filename), exist_ok=True) + image.save(filename) + + # Make requests + pload = { + "model": model_name, + "prompt": prompt, + "temperature": float(temperature), + "top_p": float(top_p), + "max_new_tokens": min(int(max_new_tokens), 1536), + "stop": state.sep if state.sep_style in [SeparatorStyle.SINGLE, SeparatorStyle.MPT] else state.sep2, + "images": f'List of {len(state.get_images())} images: {all_image_hash}', + } + logger.info(f"==== request ====\n{pload}") + + pload['images'] = state.get_images() + + state.messages[-1][-1] = "▌" + yield (state, state.to_gradio_chatbot()) + (disable_btn,) * 5 + + try: + # Stream output + response = requests.post(worker_addr + "/worker_generate_stream", + headers=headers, json=pload, stream=True, timeout=10) + for chunk in response.iter_lines(decode_unicode=False, delimiter=b"\0"): + if chunk: + data = json.loads(chunk.decode()) + if data["error_code"] == 0: + output = data["text"][len(prompt):].strip() + state.messages[-1][-1] = output + "▌" + yield (state, state.to_gradio_chatbot()) + (disable_btn,) * 5 + else: + output = data["text"] + f" (error_code: {data['error_code']})" + state.messages[-1][-1] = output + yield (state, state.to_gradio_chatbot()) + (disable_btn, disable_btn, disable_btn, enable_btn, enable_btn) + return + time.sleep(0.03) + except requests.exceptions.RequestException as e: + state.messages[-1][-1] = server_error_msg + yield (state, 
state.to_gradio_chatbot()) + (disable_btn, disable_btn, disable_btn, enable_btn, enable_btn) + return + + state.messages[-1][-1] = state.messages[-1][-1][:-1] + yield (state, state.to_gradio_chatbot()) + (enable_btn,) * 5 + + finish_tstamp = time.time() + logger.info(f"{output}") + + with open(get_conv_log_filename(), "a") as fout: + data = { + "tstamp": round(finish_tstamp, 4), + "type": "chat", + "model": model_name, + "start": round(start_tstamp, 4), + "finish": round(finish_tstamp, 4), + "state": state.dict(), + "images": all_image_hash, + "ip": request.client.host, + } + fout.write(json.dumps(data) + "\n") + +title_markdown = (""" +# 🌋 LLaVA: Large Language and Vision Assistant +[[Project Page](https://llava-vl.github.io)] [[Code](https://github.com/haotian-liu/LLaVA)] [[Model](https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md)] | 📚 [[LLaVA](https://arxiv.org/abs/2304.08485)] [[LLaVA-v1.5](https://arxiv.org/abs/2310.03744)] [[LLaVA-v1.6](https://llava-vl.github.io/blog/2024-01-30-llava-1-6/)] +""") + +tos_markdown = (""" +### Terms of use +By using this service, users are required to agree to the following terms: +The service is a research preview intended for non-commercial use only. It only provides limited safety measures and may generate offensive content. It must not be used for any illegal, harmful, violent, racist, or sexual purposes. The service may collect user dialogue data for future research. +Please click the "Flag" button if you get any inappropriate answer! We will collect those to keep improving our moderator. +For an optimal experience, please use desktop computers for this demo, as mobile devices may compromise its quality. 
+""") + + +learn_more_markdown = (""" +### License +The service is a research preview intended for non-commercial use only, subject to the model [License](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) of LLaMA, [Terms of Use](https://openai.com/policies/terms-of-use) of the data generated by OpenAI, and [Privacy Practices](https://chrome.google.com/webstore/detail/sharegpt-share-your-chatg/daiacboceoaocpibfodeljbdfacokfjb) of ShareGPT. Please contact us if you find any potential violation. +""") + +block_css = """ + +#buttons button { + min-width: min(120px,100%); +} + +""" + +def build_demo(embed_mode, cur_dir=None, concurrency_count=10): + textbox = gr.Textbox(show_label=False, placeholder="Enter text and press ENTER", container=False) + with gr.Blocks(title="LLaVA", theme=gr.themes.Default(), css=block_css) as demo: + state = gr.State() + + if not embed_mode: + gr.Markdown(title_markdown) + + with gr.Row(): + with gr.Column(scale=3): + with gr.Row(elem_id="model_selector_row"): + model_selector = gr.Dropdown( + choices=models, + value=models[0] if len(models) > 0 else "", + interactive=True, + show_label=False, + container=False) + + imagebox = gr.Image(type="pil") + image_process_mode = gr.Radio( + ["Crop", "Resize", "Pad", "Default"], + value="Default", + label="Preprocess for non-square image", visible=False) + + if cur_dir is None: + cur_dir = os.path.dirname(os.path.abspath(__file__)) + gr.Examples(examples=[ + [f"{cur_dir}/examples/extreme_ironing.jpg", "What is unusual about this image?"], + [f"{cur_dir}/examples/waterview.jpg", "What are the things I should be cautious about when I visit here?"], + ], inputs=[imagebox, textbox]) + + with gr.Accordion("Parameters", open=False) as parameter_row: + temperature = gr.Slider(minimum=0.0, maximum=1.0, value=0.2, step=0.1, interactive=True, label="Temperature",) + top_p = gr.Slider(minimum=0.0, maximum=1.0, value=0.7, step=0.1, interactive=True, label="Top P",) + max_output_tokens = 
gr.Slider(minimum=0, maximum=1024, value=512, step=64, interactive=True, label="Max output tokens",) + + with gr.Column(scale=8): + chatbot = gr.Chatbot( + elem_id="chatbot", + label="LLaVA Chatbot", + height=650, + layout="panel", + ) + with gr.Row(): + with gr.Column(scale=8): + textbox.render() + with gr.Column(scale=1, min_width=50): + submit_btn = gr.Button(value="Send", variant="primary") + with gr.Row(elem_id="buttons") as button_row: + upvote_btn = gr.Button(value="👍 Upvote", interactive=False) + downvote_btn = gr.Button(value="👎 Downvote", interactive=False) + flag_btn = gr.Button(value="⚠️ Flag", interactive=False) + #stop_btn = gr.Button(value="⏹️ Stop Generation", interactive=False) + regenerate_btn = gr.Button(value="🔄 Regenerate", interactive=False) + clear_btn = gr.Button(value="🗑️ Clear", interactive=False) + + if not embed_mode: + gr.Markdown(tos_markdown) + gr.Markdown(learn_more_markdown) + url_params = gr.JSON(visible=False) + + # Register listeners + btn_list = [upvote_btn, downvote_btn, flag_btn, regenerate_btn, clear_btn] + upvote_btn.click( + upvote_last_response, + [state, model_selector], + [textbox, upvote_btn, downvote_btn, flag_btn] + ) + downvote_btn.click( + downvote_last_response, + [state, model_selector], + [textbox, upvote_btn, downvote_btn, flag_btn] + ) + flag_btn.click( + flag_last_response, + [state, model_selector], + [textbox, upvote_btn, downvote_btn, flag_btn] + ) + + regenerate_btn.click( + regenerate, + [state, image_process_mode], + [state, chatbot, textbox, imagebox] + btn_list + ).then( + http_bot, + [state, model_selector, temperature, top_p, max_output_tokens], + [state, chatbot] + btn_list, + concurrency_limit=concurrency_count + ) + + clear_btn.click( + clear_history, + None, + [state, chatbot, textbox, imagebox] + btn_list, + queue=False + ) + + textbox.submit( + add_text, + [state, textbox, imagebox, image_process_mode], + [state, chatbot, textbox, imagebox] + btn_list, + queue=False + ).then( + http_bot, + 
[state, model_selector, temperature, top_p, max_output_tokens], + [state, chatbot] + btn_list, + concurrency_limit=concurrency_count + ) + + submit_btn.click( + add_text, + [state, textbox, imagebox, image_process_mode], + [state, chatbot, textbox, imagebox] + btn_list + ).then( + http_bot, + [state, model_selector, temperature, top_p, max_output_tokens], + [state, chatbot] + btn_list, + concurrency_limit=concurrency_count + ) + + if args.model_list_mode == "once": + demo.load( + load_demo, + [url_params], + [state, model_selector], + js=get_window_url_params + ) + elif args.model_list_mode == "reload": + demo.load( + load_demo_refresh_model_list, + None, + [state, model_selector], + queue=False + ) + else: + raise ValueError(f"Unknown model list mode: {args.model_list_mode}") + + return demo + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--host", type=str, default="0.0.0.0") + parser.add_argument("--port", type=int) + parser.add_argument("--controller-url", type=str, default="http://localhost:21001") + parser.add_argument("--concurrency-count", type=int, default=16) + parser.add_argument("--model-list-mode", type=str, default="once", + choices=["once", "reload"]) + parser.add_argument("--share", action="store_true") + parser.add_argument("--moderate", action="store_true") + parser.add_argument("--embed", action="store_true") + args = parser.parse_args() + logger.info(f"args: {args}") + + models = get_model_list() + + logger.info(args) + demo = build_demo(args.embed, concurrency_count=args.concurrency_count) + demo.queue( + api_open=False + ).launch( + server_name=args.host, + server_port=args.port, + share=args.share + ) diff --git a/app/llava/serve/model_worker.py b/app/llava/serve/model_worker.py new file mode 100644 index 0000000000000000000000000000000000000000..98885cb36103dff333ec0f1415cea5468a947e14 --- /dev/null +++ b/app/llava/serve/model_worker.py @@ -0,0 +1,288 @@ +""" +A model worker executes the model. 
+""" +import argparse +import asyncio +import json +import time +import threading +import uuid + +from fastapi import FastAPI, Request, BackgroundTasks +from fastapi.responses import StreamingResponse +import requests +import torch +import uvicorn +from functools import partial + +from llava.constants import WORKER_HEART_BEAT_INTERVAL +from llava.utils import (build_logger, server_error_msg, + pretty_print_semaphore) +from llava.model.builder import load_pretrained_model +from llava.mm_utils import process_images, load_image_from_base64, tokenizer_image_token +from llava.constants import MM_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN +from transformers import TextIteratorStreamer +from threading import Thread + + +GB = 1 << 30 + +worker_id = str(uuid.uuid4())[:6] +logger = build_logger("model_worker", f"model_worker_{worker_id}.log") +global_counter = 0 + +model_semaphore = None + + +def heart_beat_worker(controller): + + while True: + time.sleep(WORKER_HEART_BEAT_INTERVAL) + controller.send_heart_beat() + + +class ModelWorker: + def __init__(self, controller_addr, worker_addr, + worker_id, no_register, + model_path, model_base, model_name, + load_8bit, load_4bit, device, use_flash_attn=False): + self.controller_addr = controller_addr + self.worker_addr = worker_addr + self.worker_id = worker_id + if model_path.endswith("/"): + model_path = model_path[:-1] + if model_name is None: + model_paths = model_path.split("/") + if model_paths[-1].startswith('checkpoint-'): + self.model_name = model_paths[-2] + "_" + model_paths[-1] + else: + self.model_name = model_paths[-1] + else: + self.model_name = model_name + + self.device = device + logger.info(f"Loading the model {self.model_name} on worker {worker_id} ...") + self.tokenizer, self.model, self.image_processor, self.context_len = load_pretrained_model( + model_path, model_base, self.model_name, load_8bit, load_4bit, device=self.device, use_flash_attn=use_flash_attn) + 
self.is_multimodal = 'llava' in self.model_name.lower() + + if not no_register: + self.register_to_controller() + self.heart_beat_thread = threading.Thread( + target=heart_beat_worker, args=(self,), daemon=True) + self.heart_beat_thread.start() + + def register_to_controller(self): + logger.info("Register to controller") + + url = self.controller_addr + "/register_worker" + data = { + "worker_name": self.worker_addr, + "check_heart_beat": True, + "worker_status": self.get_status() + } + r = requests.post(url, json=data) + assert r.status_code == 200 + + def send_heart_beat(self): + logger.info(f"Send heart beat. Models: {[self.model_name]}. " + f"Semaphore: {pretty_print_semaphore(model_semaphore)}. " + f"global_counter: {global_counter}") + + url = self.controller_addr + "/receive_heart_beat" + + while True: + try: + ret = requests.post(url, json={ + "worker_name": self.worker_addr, + "queue_length": self.get_queue_length()}, timeout=5) + exist = ret.json()["exist"] + break + except requests.exceptions.RequestException as e: + logger.error(f"heart beat error: {e}") + time.sleep(5) + + if not exist: + self.register_to_controller() + + def get_queue_length(self): + if model_semaphore is None: + return 0 + else: + return args.limit_model_concurrency - model_semaphore._value + (len( + model_semaphore._waiters) if model_semaphore._waiters is not None else 0) + + def get_status(self): + return { + "model_names": [self.model_name], + "speed": 1, + "queue_length": self.get_queue_length(), + } + + @torch.inference_mode() + def generate_stream(self, params): + tokenizer, model, image_processor = self.tokenizer, self.model, self.image_processor + + prompt = params["prompt"] + ori_prompt = prompt + images = params.get("images", None) + num_image_tokens = 0 + if images is not None and len(images) > 0 and self.is_multimodal: + if len(images) > 0: + if len(images) != prompt.count(DEFAULT_IMAGE_TOKEN): + raise ValueError("Number of images does not match number of tokens in 
prompt") + + images = [load_image_from_base64(image) for image in images] + image_sizes = [image.size for image in images] + images = process_images(images, image_processor, model.config) + + if type(images) is list: + images = [image.to(self.model.device, dtype=torch.float16) for image in images] + else: + images = images.to(self.model.device, dtype=torch.float16) + + replace_token = DEFAULT_IMAGE_TOKEN + if getattr(self.model.config, 'mm_use_start_end', False): + replace_token = DEFAULT_IM_START_TOKEN + replace_token + DEFAULT_IM_END_TOKEN + prompt = prompt.replace(DEFAULT_IMAGE_TOKEN, replace_token) + + num_image_tokens = prompt.count(replace_token) * model.get_vision_tower().num_patches + else: + images = None + image_sizes = None + image_args = {"images": images, "image_sizes": image_sizes} + else: + images = None + image_args = {} + + temperature = float(params.get("temperature", 1.0)) + top_p = float(params.get("top_p", 1.0)) + max_context_length = getattr(model.config, 'max_position_embeddings', 2048) + max_new_tokens = min(int(params.get("max_new_tokens", 256)), 1024) + stop_str = params.get("stop", None) + do_sample = True if temperature > 0.001 else False + + input_ids = tokenizer_image_token(prompt, tokenizer, MM_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(self.device) + keywords = [stop_str] + # stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids) + streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=15) + + max_new_tokens = min(max_new_tokens, max_context_length - input_ids.shape[-1] - num_image_tokens) + + if max_new_tokens < 1: + yield json.dumps({"text": ori_prompt + "Exceeds max token length. 
Please start a new conversation, thanks.", "error_code": 0}).encode() + b"\0" + return + + thread = Thread(target=model.generate, kwargs=dict( + inputs=input_ids, + do_sample=do_sample, + temperature=temperature, + top_p=top_p, + max_new_tokens=max_new_tokens, + streamer=streamer, + use_cache=True, + **image_args + )) + thread.start() + + generated_text = ori_prompt + for new_text in streamer: + generated_text += new_text + if generated_text.endswith(stop_str): + generated_text = generated_text[:-len(stop_str)] + yield json.dumps({"text": generated_text, "error_code": 0}).encode() + b"\0" + + def generate_stream_gate(self, params): + try: + for x in self.generate_stream(params): + yield x + except ValueError as e: + print("Caught ValueError:", e) + ret = { + "text": server_error_msg, + "error_code": 1, + } + yield json.dumps(ret).encode() + b"\0" + except torch.cuda.CudaError as e: + print("Caught torch.cuda.CudaError:", e) + ret = { + "text": server_error_msg, + "error_code": 1, + } + yield json.dumps(ret).encode() + b"\0" + except Exception as e: + print("Caught Unknown Error", e) + ret = { + "text": server_error_msg, + "error_code": 1, + } + yield json.dumps(ret).encode() + b"\0" + + +app = FastAPI() + + +def release_model_semaphore(fn=None): + model_semaphore.release() + if fn is not None: + fn() + + +@app.post("/worker_generate_stream") +async def generate_stream(request: Request): + global model_semaphore, global_counter + global_counter += 1 + params = await request.json() + + if model_semaphore is None: + model_semaphore = asyncio.Semaphore(args.limit_model_concurrency) + await model_semaphore.acquire() + worker.send_heart_beat() + generator = worker.generate_stream_gate(params) + background_tasks = BackgroundTasks() + background_tasks.add_task(partial(release_model_semaphore, fn=worker.send_heart_beat)) + return StreamingResponse(generator, background=background_tasks) + + +@app.post("/worker_get_status") +async def get_status(request: Request): + return 
worker.get_status() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--host", type=str, default="localhost") + parser.add_argument("--port", type=int, default=21002) + parser.add_argument("--worker-address", type=str, + default="http://localhost:21002") + parser.add_argument("--controller-address", type=str, + default="http://localhost:21001") + parser.add_argument("--model-path", type=str, default="facebook/opt-350m") + parser.add_argument("--model-base", type=str, default=None) + parser.add_argument("--model-name", type=str) + parser.add_argument("--device", type=str, default="cuda") + parser.add_argument("--multi-modal", action="store_true", help="Multimodal mode is automatically detected with model name, please make sure `llava` is included in the model path.") + parser.add_argument("--limit-model-concurrency", type=int, default=5) + parser.add_argument("--stream-interval", type=int, default=1) + parser.add_argument("--no-register", action="store_true") + parser.add_argument("--load-8bit", action="store_true") + parser.add_argument("--load-4bit", action="store_true") + parser.add_argument("--use-flash-attn", action="store_true") + args = parser.parse_args() + logger.info(f"args: {args}") + + if args.multi_modal: + logger.warning("Multimodal mode is automatically detected with model name, please make sure `llava` is included in the model path.") + + worker = ModelWorker(args.controller_address, + args.worker_address, + worker_id, + args.no_register, + args.model_path, + args.model_base, + args.model_name, + args.load_8bit, + args.load_4bit, + args.device, + use_flash_attn=args.use_flash_attn) + uvicorn.run(app, host=args.host, port=args.port, log_level="info") diff --git a/app/llava/serve/register_worker.py b/app/llava/serve/register_worker.py new file mode 100644 index 0000000000000000000000000000000000000000..2c2c40295e0351f25709ba25554c9329f15bf0d2 --- /dev/null +++ b/app/llava/serve/register_worker.py @@ -0,0 +1,26 
@@ +""" +Manually register workers. + +Usage: +python3 -m fastchat.serve.register_worker --controller http://localhost:21001 --worker-name http://localhost:21002 +""" + +import argparse + +import requests + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--controller-address", type=str) + parser.add_argument("--worker-name", type=str) + parser.add_argument("--check-heart-beat", action="store_true") + args = parser.parse_args() + + url = args.controller_address + "/register_worker" + data = { + "worker_name": args.worker_name, + "check_heart_beat": args.check_heart_beat, + "worker_status": None, + } + r = requests.post(url, json=data) + assert r.status_code == 200 diff --git a/app/llava/serve/sglang_worker.py b/app/llava/serve/sglang_worker.py new file mode 100644 index 0000000000000000000000000000000000000000..ae37504c698740901218089369872c46b2f78aa0 --- /dev/null +++ b/app/llava/serve/sglang_worker.py @@ -0,0 +1,244 @@ +""" +A model worker executes the model. 
+""" +import argparse +import asyncio +from concurrent.futures import ThreadPoolExecutor +import json +import time +import threading +import uuid + +from fastapi import FastAPI, Request, BackgroundTasks +from fastapi.responses import StreamingResponse +import requests +import re +import uvicorn +from functools import partial + +from llava.constants import WORKER_HEART_BEAT_INTERVAL +from llava.utils import (build_logger, server_error_msg, + pretty_print_semaphore) +from llava.mm_utils import process_images, load_image_from_base64, tokenizer_image_token, expand2square +from llava.constants import DEFAULT_IMAGE_TOKEN + +import sglang as sgl +from sglang.backend.runtime_endpoint import RuntimeEndpoint + + +GB = 1 << 30 + +worker_id = str(uuid.uuid4())[:6] +logger = build_logger("model_worker", f"model_worker_{worker_id}.log") +global_counter = 0 + +model_semaphore = None + + +def heart_beat_worker(controller): + while True: + time.sleep(WORKER_HEART_BEAT_INTERVAL) + controller.send_heart_beat() + + +@sgl.function +def pipeline(s, prompt, max_tokens): + for p in prompt: + if type(p) is str: + s += p + else: + s += sgl.image(p) + s += sgl.gen("response", max_tokens=max_tokens) + + +class ModelWorker: + def __init__(self, controller_addr, worker_addr, sgl_endpoint, + worker_id, no_register, model_name): + self.controller_addr = controller_addr + self.worker_addr = worker_addr + self.worker_id = worker_id + + # Select backend + backend = RuntimeEndpoint(sgl_endpoint) + sgl.set_default_backend(backend) + model_path = backend.model_info["model_path"] + + if model_path.endswith("/"): + model_path = model_path[:-1] + if model_name is None: + model_paths = model_path.split("/") + if model_paths[-1].startswith('checkpoint-'): + self.model_name = model_paths[-2] + "_" + model_paths[-1] + else: + self.model_name = model_paths[-1] + else: + self.model_name = model_name + + logger.info(f"Loading the SGLANG model {self.model_name} on worker {worker_id} ...") + + if not no_register: 
+ self.register_to_controller() + self.heart_beat_thread = threading.Thread( + target=heart_beat_worker, args=(self,), daemon=True) + self.heart_beat_thread.start() + + def register_to_controller(self): + logger.info("Register to controller") + + url = self.controller_addr + "/register_worker" + data = { + "worker_name": self.worker_addr, + "check_heart_beat": True, + "worker_status": self.get_status() + } + r = requests.post(url, json=data) + assert r.status_code == 200 + + def send_heart_beat(self): + logger.info(f"Send heart beat. Models: {[self.model_name]}. " + f"Semaphore: {pretty_print_semaphore(model_semaphore)}. " + f"global_counter: {global_counter}") + + url = self.controller_addr + "/receive_heart_beat" + + while True: + try: + ret = requests.post(url, json={ + "worker_name": self.worker_addr, + "queue_length": self.get_queue_length()}, timeout=5) + exist = ret.json()["exist"] + break + except requests.exceptions.RequestException as e: + logger.error(f"heart beat error: {e}") + time.sleep(5) + + if not exist: + self.register_to_controller() + + def get_queue_length(self): + if model_semaphore is None: + return 0 + else: + return args.limit_model_concurrency - model_semaphore._value + (len( + model_semaphore._waiters) if model_semaphore._waiters is not None else 0) + + def get_status(self): + return { + "model_names": [self.model_name], + "speed": 1, + "queue_length": self.get_queue_length(), + } + + async def generate_stream(self, params): + ori_prompt = prompt = params["prompt"] + images = params.get("images", None) + if images is not None and len(images) > 0: + if len(images) > 0: + if len(images) != prompt.count(DEFAULT_IMAGE_TOKEN): + raise ValueError("Number of images does not match number of tokens in prompt") + + images = [load_image_from_base64(image) for image in images] + + # FIXME: for image-start/end token + # replace_token = DEFAULT_IMAGE_TOKEN + # if getattr(self.model.config, 'mm_use_start_end', False): + # replace_token = 
DEFAULT_IM_START_TOKEN + replace_token + DEFAULT_IM_END_TOKEN + # prompt = prompt.replace(DEFAULT_IMAGE_TOKEN, replace_token) + prompt = prompt.replace(' ' + DEFAULT_IMAGE_TOKEN + '\n', DEFAULT_IMAGE_TOKEN) + prompt_split = prompt.split(DEFAULT_IMAGE_TOKEN) + prompt = [] + for i in range(len(prompt_split)): + prompt.append(prompt_split[i]) + if i < len(images): + prompt.append(images[i]) + else: + prompt = [prompt] + + temperature = float(params.get("temperature", 1.0)) + top_p = float(params.get("top_p", 1.0)) + # max_context_length = getattr(model.config, 'max_position_embeddings', 2048) + max_new_tokens = min(int(params.get("max_new_tokens", 256)), 1024) + stop_str = params.get("stop", None) + stop_str = [stop_str] if stop_str is not None else None + + print({'prompt': prompt, 'max_new_tokens': max_new_tokens, 'temperature': temperature, 'top_p': top_p}) + state = pipeline.run(prompt, max_new_tokens, temperature=temperature, top_p=top_p, stream=True) + + generated_text = ori_prompt + async for text_outputs in state.text_async_iter(var_name="response"): + generated_text += text_outputs + yield json.dumps({"text": generated_text, "error_code": 0}).encode() + b"\0" + + async def generate_stream_gate(self, params): + try: + async for x in self.generate_stream(params): + yield x + except ValueError as e: + print("Caught ValueError:", e) + ret = { + "text": server_error_msg, + "error_code": 1, + } + yield json.dumps(ret).encode() + b"\0" + except Exception as e: + print("Caught Unknown Error", e) + ret = { + "text": server_error_msg, + "error_code": 1, + } + yield json.dumps(ret).encode() + b"\0" + + +app = FastAPI() + + +def release_model_semaphore(fn=None): + model_semaphore.release() + if fn is not None: + fn() + + +@app.post("/worker_generate_stream") +async def generate_stream(request: Request): + global model_semaphore, global_counter + global_counter += 1 + params = await request.json() + + if model_semaphore is None: + model_semaphore = 
asyncio.Semaphore(args.limit_model_concurrency) + await model_semaphore.acquire() + worker.send_heart_beat() + generator = worker.generate_stream_gate(params) + background_tasks = BackgroundTasks() + background_tasks.add_task(partial(release_model_semaphore, fn=worker.send_heart_beat)) + return StreamingResponse(generator, background=background_tasks) + + +@app.post("/worker_get_status") +async def get_status(request: Request): + return worker.get_status() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--host", type=str, default="localhost") + parser.add_argument("--port", type=int, default=21002) + parser.add_argument("--worker-address", type=str, + default="http://localhost:21002") + parser.add_argument("--controller-address", type=str, + default="http://localhost:21001") + parser.add_argument("--model-name", type=str) + parser.add_argument("--sgl-endpoint", type=str) + parser.add_argument("--limit-model-concurrency", type=int, default=5) + parser.add_argument("--stream-interval", type=int, default=1) + parser.add_argument("--no-register", action="store_true") + args = parser.parse_args() + logger.info(f"args: {args}") + + worker = ModelWorker(args.controller_address, + args.worker_address, + args.sgl_endpoint, + worker_id, + args.no_register, + args.model_name) + uvicorn.run(app, host=args.host, port=args.port, log_level="info") diff --git a/app/llava/serve/test_message.py b/app/llava/serve/test_message.py new file mode 100644 index 0000000000000000000000000000000000000000..6b090faed0e630b03b2294545050f1f4f5032cad --- /dev/null +++ b/app/llava/serve/test_message.py @@ -0,0 +1,62 @@ +import argparse +import json + +import requests + +from llava.conversation import default_conversation + + +def main(): + if args.worker_address: + worker_addr = args.worker_address + else: + controller_addr = args.controller_address + ret = requests.post(controller_addr + "/refresh_all_workers") + ret = requests.post(controller_addr + 
"/list_models") + models = ret.json()["models"] + models.sort() + print(f"Models: {models}") + + ret = requests.post(controller_addr + "/get_worker_address", + json={"model": args.model_name}) + worker_addr = ret.json()["address"] + print(f"worker_addr: {worker_addr}") + + if worker_addr == "": + return + + conv = default_conversation.copy() + conv.append_message(conv.roles[0], args.message) + prompt = conv.get_prompt() + + headers = {"User-Agent": "LLaVA Client"} + pload = { + "model": args.model_name, + "prompt": prompt, + "max_new_tokens": args.max_new_tokens, + "temperature": 0.7, + "stop": conv.sep, + } + response = requests.post(worker_addr + "/worker_generate_stream", headers=headers, + json=pload, stream=True) + + print(prompt.replace(conv.sep, "\n"), end="") + for chunk in response.iter_lines(chunk_size=8192, decode_unicode=False, delimiter=b"\0"): + if chunk: + data = json.loads(chunk.decode("utf-8")) + output = data["text"].split(conv.sep)[-1] + print(output, end="\r") + print("") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--controller-address", type=str, default="http://localhost:21001") + parser.add_argument("--worker-address", type=str) + parser.add_argument("--model-name", type=str, default="facebook/opt-350m") + parser.add_argument("--max-new-tokens", type=int, default=32) + parser.add_argument("--message", type=str, default= + "Tell me a story with more than 1000 words.") + args = parser.parse_args() + + main() diff --git a/app/llava/train/arguments.py b/app/llava/train/arguments.py new file mode 100644 index 0000000000000000000000000000000000000000..a9fe77ed80ab45e4a3f2d40b0043852c11689c38 --- /dev/null +++ b/app/llava/train/arguments.py @@ -0,0 +1,85 @@ +import transformers + +from typing import Dict, Optional, Sequence, List +from dataclasses import dataclass, field + +@dataclass +class ModelArguments: + model_name_or_path: Optional[str] = field(default="facebook/opt-125m") + version: 
Optional[str] = field(default="v0") + freeze_backbone: bool = field(default=False) + tune_mm_mlp_adapter: bool = field(default=False) + vision_tower: Optional[str] = field(default=None) + mm_vision_select_layer: Optional[int] = field(default=-1) # default to the last layer + pretrain_mm_mlp_adapter: Optional[str] = field(default=None) + mm_projector_type: Optional[str] = field(default='linear') + mm_use_start_end: bool = field(default=False) + mm_use_patch_token: bool = field(default=True) + mm_patch_merge_type: Optional[str] = field(default='flat') + mm_vision_select_feature: Optional[str] = field(default="patch") + image_grid_pinpoints: Optional[str] = field(default="[(448, 448)]") + + img_size: int = 224 + drop_path_rate: float = 0. + vit_precision: Optional[str] = field(default="fp16") + vit_model_path: Optional[str] = field(default=None) + qformer_model_path: Optional[str] = field(default=None) + num_query_token: int = 32 + + adapter_module_name: Optional[str] = field(default=None) + adapter_module_path: Optional[str] = field(default=None) + +@dataclass +class DataArguments: + dataset_config: str = field(default="", + metadata={"help": "Training dataset config path"}) + # data_path: str = field(default=None, + # metadata={"help": "Path to the training data."}) + lazy_preprocess: bool = False + is_multimodal: bool = False + # image_folder: Optional[str] = field(default=None) + image_aspect_ratio: str = 'square' + # num_segments: int = 10 + num_segments: int = 10 + sample_strategy: str = 'fps0.5' + external_args: dict = None + num_token_per_image: Optional[int] = field(default=32) + + +@dataclass +class TrainingArguments(transformers.TrainingArguments): + cache_dir: Optional[str] = field(default=None) + optim: str = field(default="adamw_torch") + remove_unused_columns: bool = field(default=False) + freeze_mm_mlp_adapter: bool = field(default=False) + freeze_qformer: bool = field(default=True) + freeze_adapter: bool = field(default=False) + mpt_attn_impl: 
Optional[str] = field(default="triton") + model_max_length: int = field( + default=512, + metadata={ + "help": + "Maximum sequence length. Sequences will be right padded (and possibly truncated)." + }, + ) + double_quant: bool = field( + default=True, + metadata={"help": "Compress the quantization statistics through double quantization."} + ) + quant_type: str = field( + default="nf4", + metadata={"help": "Quantization data type to use. Should be one of `fp4` or `nf4`."} + ) + bits: int = field( + default=16, + metadata={"help": "How many bits to use."} + ) + lora_enable: bool = False + lora_r: int = 64 + lora_alpha: int = 16 + lora_dropout: float = 0.05 + lora_weight_path: str = "" + lora_bias: str = "none" + mm_projector_lr: Optional[float] = None + lora_lr: Optional[float] = None + group_by_modality_length: bool = field(default=False) \ No newline at end of file diff --git a/app/llava/train/llama_flash_attn_monkey_patch.py b/app/llava/train/llama_flash_attn_monkey_patch.py new file mode 100644 index 0000000000000000000000000000000000000000..31db2eff8d1c4b3ae645583dfc5e156e818b6f1c --- /dev/null +++ b/app/llava/train/llama_flash_attn_monkey_patch.py @@ -0,0 +1,115 @@ +from typing import Optional, Tuple +import warnings + +import torch + +import transformers +from transformers.models.llama.modeling_llama import apply_rotary_pos_emb, repeat_kv + +try: + from flash_attn.flash_attn_interface import flash_attn_unpadded_qkvpacked_func +except ImportError: + from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func as flash_attn_unpadded_qkvpacked_func +from flash_attn.bert_padding import unpad_input, pad_input + + +def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, +) -> Tuple[torch.Tensor, Optional[torch.Tensor], 
Optional[Tuple[torch.Tensor]]]: + if output_attentions: + warnings.warn( + "Output attentions is not supported for patched `LlamaAttention`, returning `None` instead." + ) + + bsz, q_len, _ = hidden_states.size() + + query_states = ( + self.q_proj(hidden_states) + .view(bsz, q_len, self.num_heads, self.head_dim) + .transpose(1, 2) + ) + key_states = ( + self.k_proj(hidden_states) + .view(bsz, q_len, self.num_key_value_heads, self.head_dim) + .transpose(1, 2) + ) + value_states = ( + self.v_proj(hidden_states) + .view(bsz, q_len, self.num_key_value_heads, self.head_dim) + .transpose(1, 2) + ) # shape: (b, num_heads, s, head_dim) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[-2] + + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb( + query_states, key_states, cos, sin, position_ids + ) + + if past_key_value is not None: + # reuse k, v + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + + past_key_value = (key_states, value_states) if use_cache else None + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + # Transform the data into the format required by flash attention + qkv = torch.stack([query_states, key_states, value_states], dim=2) + qkv = qkv.transpose(1, 3) # shape: [b, s, 3, num_heads, head_dim] + key_padding_mask = attention_mask + + if key_padding_mask is None: + qkv = qkv.reshape(-1, 3, self.num_heads, self.head_dim) + cu_q_lens = torch.arange( + 0, (bsz + 1) * q_len, step=q_len, dtype=torch.int32, device=qkv.device + ) + max_s = q_len + output = flash_attn_unpadded_qkvpacked_func( + qkv, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True + ) + output = output.view(bsz, q_len, -1) + else: + qkv = qkv.reshape(bsz, 
q_len, -1) + qkv, indices, cu_q_lens, max_s = unpad_input(qkv, key_padding_mask) + qkv = qkv.view(-1, 3, self.num_heads, self.head_dim) + output_unpad = flash_attn_unpadded_qkvpacked_func( + qkv, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True + ) + output_unpad = output_unpad.reshape(-1, self.num_heads * self.head_dim) + output = pad_input(output_unpad, indices, bsz, q_len) + + return self.o_proj(output), None, past_key_value + + +# Disable the transformation of the attention mask in LlamaModel as the flash attention +# requires the attention mask to be the same as the key_padding_mask +def _prepare_decoder_attention_mask( + self, attention_mask, input_shape, inputs_embeds, past_key_values_length +): + # [bsz, seq_len] + return attention_mask + + +def replace_llama_attn_with_flash_attn(): + cuda_major, cuda_minor = torch.cuda.get_device_capability() + if cuda_major < 8: + warnings.warn( + "Flash attention is only supported on A100 or H100 GPU during training due to head dim > 64 backward." 
+ "ref: https://github.com/HazyResearch/flash-attention/issues/190#issuecomment-1523359593" + ) + transformers.models.llama.modeling_llama.LlamaModel._prepare_decoder_attention_mask = ( + _prepare_decoder_attention_mask + ) + transformers.models.llama.modeling_llama.LlamaAttention.forward = forward diff --git a/app/llava/train/llama_xformers_attn_monkey_patch.py b/app/llava/train/llama_xformers_attn_monkey_patch.py new file mode 100644 index 0000000000000000000000000000000000000000..f8351e41ccd4a64dca237bd8f8be0702b23989dc --- /dev/null +++ b/app/llava/train/llama_xformers_attn_monkey_patch.py @@ -0,0 +1,129 @@ +""" +Directly copied the code from https://raw.githubusercontent.com/oobabooga/text-generation-webui/main/modules/llama_attn_hijack.py and made some adjustments +""" + +import logging +import math +from typing import Optional, Tuple + +import torch +import transformers.models.llama.modeling_llama +from torch import nn + +try: + import xformers.ops +except ImportError: + logging.error("xformers not found! 
Please install it before trying to use it.") + + +def replace_llama_attn_with_xformers_attn(): + transformers.models.llama.modeling_llama.LlamaAttention.forward = xformers_forward + + +def xformers_forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, +) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + # pylint: disable=duplicate-code + bsz, q_len, _ = hidden_states.size() + + query_states = ( + self.q_proj(hidden_states) + .view(bsz, q_len, self.num_heads, self.head_dim) + .transpose(1, 2) + ) + key_states = ( + self.k_proj(hidden_states) + .view(bsz, q_len, self.num_heads, self.head_dim) + .transpose(1, 2) + ) + value_states = ( + self.v_proj(hidden_states) + .view(bsz, q_len, self.num_heads, self.head_dim) + .transpose(1, 2) + ) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[-2] + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + ( + query_states, + key_states, + ) = transformers.models.llama.modeling_llama.apply_rotary_pos_emb( + query_states, key_states, cos, sin, position_ids + ) + # [bsz, nh, t, hd] + + if past_key_value is not None: + # reuse k, v, self_attention + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + + past_key_value = (key_states, value_states) if use_cache else None + + # We only apply xformers optimizations if we don't need to output the whole attention matrix + if not output_attentions: + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + # This is a nasty hack. We know attention_mask in transformers is either LowerTriangular or all Zeros. 
+ # We therefore check if one element in the upper triangular portion is zero. If it is, then the mask is all zeros. + if attention_mask is None or attention_mask[0, 0, 0, 1] == 0: + # input and output should be of form (bsz, q_len, num_heads, head_dim) + attn_output = xformers.ops.memory_efficient_attention( + query_states, key_states, value_states, attn_bias=None + ) + else: + # input and output should be of form (bsz, q_len, num_heads, head_dim) + attn_output = xformers.ops.memory_efficient_attention( + query_states, + key_states, + value_states, + attn_bias=xformers.ops.LowerTriangularMask(), + ) + attn_weights = None + else: + attn_weights = torch.matmul( + query_states, key_states.transpose(2, 3) + ) / math.sqrt(self.head_dim) + + if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, q_len, kv_seq_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights + attention_mask + attn_weights = torch.max( + attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min) + ) + + # upcast attention to fp32 + attn_weights = nn.functional.softmax( + attn_weights, dim=-1, dtype=torch.float32 + ).to(query_states.dtype) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2) + + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + attn_output = self.o_proj(attn_output) + return attn_output, attn_weights, past_key_value diff --git a/app/llava/train/llava_trainer.py 
b/app/llava/train/llava_trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..9fb0c1d52ccc8ffa45b6952204d79b82edd05d27 --- /dev/null +++ b/app/llava/train/llava_trainer.py @@ -0,0 +1,287 @@ +import os +import torch +import torch.nn as nn + +from torch.utils.data import Sampler + +from transformers import Trainer +from transformers.trainer import ( + is_sagemaker_mp_enabled, + get_parameter_names, + has_length, + ALL_LAYERNORM_LAYERS, + logger, +) +from typing import List, Optional + + +def maybe_zero_3(param, ignore_status=False, name=None): + from deepspeed import zero + from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus + if hasattr(param, "ds_id"): + if param.ds_status == ZeroParamStatus.NOT_AVAILABLE: + if not ignore_status: + print(name, 'no ignore status') + with zero.GatheredParameters([param]): + param = param.data.detach().cpu().clone() + else: + param = param.detach().cpu().clone() + return param + + +def get_mm_adapter_state_maybe_zero_3(named_params, keys_to_match): + to_return = {k: t for k, t in named_params if any(key_match in k for key_match in keys_to_match)} + to_return = {k: maybe_zero_3(v, ignore_status=True, name=k).cpu() for k, v in to_return.items()} + return to_return + + +def split_to_even_chunks(indices, lengths, num_chunks): + """ + Split a list of indices into `chunks` chunks of roughly equal lengths. 
+ """ + + if len(indices) % num_chunks != 0: + return [indices[i::num_chunks] for i in range(num_chunks)] + + num_indices_per_chunk = len(indices) // num_chunks + + chunks = [[] for _ in range(num_chunks)] + chunks_lengths = [0 for _ in range(num_chunks)] + for index in indices: + shortest_chunk = chunks_lengths.index(min(chunks_lengths)) + chunks[shortest_chunk].append(index) + chunks_lengths[shortest_chunk] += lengths[index] + if len(chunks[shortest_chunk]) == num_indices_per_chunk: + chunks_lengths[shortest_chunk] = float("inf") + + return chunks + + +def get_modality_length_grouped_indices(lengths, batch_size, world_size, generator=None): + # We need to use torch for the random part as a distributed sampler will set the random seed for torch. + assert all(l != 0 for l in lengths), "Should not have zero length." + if all(l > 0 for l in lengths) or all(l < 0 for l in lengths): + # all samples are in the same modality + return get_length_grouped_indices(lengths, batch_size, world_size, generator=generator) + mm_indices, mm_lengths = zip(*[(i, l) for i, l in enumerate(lengths) if l > 0]) + lang_indices, lang_lengths = zip(*[(i, -l) for i, l in enumerate(lengths) if l < 0]) + + mm_shuffle = [mm_indices[i] for i in get_length_grouped_indices(mm_lengths, batch_size, world_size, generator=None)] + lang_shuffle = [lang_indices[i] for i in get_length_grouped_indices(lang_lengths, batch_size, world_size, generator=None)] + megabatch_size = world_size * batch_size + mm_megabatches = [mm_shuffle[i : i + megabatch_size] for i in range(0, len(mm_shuffle), megabatch_size)] + lang_megabatches = [lang_shuffle[i : i + megabatch_size] for i in range(0, len(lang_shuffle), megabatch_size)] + + last_mm = mm_megabatches[-1] + last_lang = lang_megabatches[-1] + additional_batch = last_mm + last_lang + megabatches = mm_megabatches[:-1] + lang_megabatches[:-1] + megabatch_indices = torch.randperm(len(megabatches), generator=generator) + megabatches = [megabatches[i] for i in 
megabatch_indices] + + if len(additional_batch) > 0: + megabatches.append(sorted(additional_batch)) + + return [i for megabatch in megabatches for i in megabatch] + + +def get_length_grouped_indices(lengths, batch_size, world_size, generator=None, merge=True): + # We need to use torch for the random part as a distributed sampler will set the random seed for torch. + indices = torch.randperm(len(lengths), generator=generator) + megabatch_size = world_size * batch_size + megabatches = [indices[i : i + megabatch_size].tolist() for i in range(0, len(lengths), megabatch_size)] + megabatches = [sorted(megabatch, key=lambda i: lengths[i], reverse=True) for megabatch in megabatches] + megabatches = [split_to_even_chunks(megabatch, lengths, world_size) for megabatch in megabatches] + + return [i for megabatch in megabatches for batch in megabatch for i in batch] + + +class LengthGroupedSampler(Sampler): + r""" + Sampler that samples indices in a way that groups together features of the dataset of roughly the same length while + keeping a bit of randomness. 
+ """ + + def __init__( + self, + batch_size: int, + world_size: int, + lengths: Optional[List[int]] = None, + generator=None, + group_by_modality: bool = False, + ): + if lengths is None: + raise ValueError("Lengths must be provided.") + + self.batch_size = batch_size + self.world_size = world_size + self.lengths = lengths + self.generator = generator + self.group_by_modality = group_by_modality + + def __len__(self): + return len(self.lengths) + + def __iter__(self): + if self.group_by_modality: + indices = get_modality_length_grouped_indices(self.lengths, self.batch_size, self.world_size, generator=self.generator) + else: + indices = get_length_grouped_indices(self.lengths, self.batch_size, self.world_size, generator=self.generator) + return iter(indices) + + +class LLaVATrainer(Trainer): + + def _get_train_sampler(self) -> Optional[torch.utils.data.Sampler]: + if self.train_dataset is None or not has_length(self.train_dataset): + return None + + if self.args.group_by_modality_length: + lengths = self.train_dataset.modality_lengths + return LengthGroupedSampler( + self.args.train_batch_size, + world_size=self.args.world_size * self.args.gradient_accumulation_steps, + lengths=lengths, + group_by_modality=True, + ) + else: + return super()._get_train_sampler() + + def create_optimizer(self): + """ + Setup the optimizer. + + We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the + Trainer's init through `optimizers`, or subclass and override this method in a subclass. 
+ """ + if is_sagemaker_mp_enabled(): + return super().create_optimizer() + + opt_model = self.model + + if self.optimizer is None: + decay_parameters = get_parameter_names(opt_model, ALL_LAYERNORM_LAYERS) + decay_parameters = [name for name in decay_parameters if "bias" not in name] + # if self.args.mm_projector_lr is not None: + # projector_parameters = [name for name, _ in opt_model.named_parameters() if "mm_projector" in name] + # optimizer_grouped_parameters = [ + # { + # "params": [ + # p for n, p in opt_model.named_parameters() if (n in decay_parameters and n not in projector_parameters and p.requires_grad) + # ], + # "weight_decay": self.args.weight_decay, + # }, + # { + # "params": [ + # p for n, p in opt_model.named_parameters() if (n not in decay_parameters and n not in projector_parameters and p.requires_grad) + # ], + # "weight_decay": 0.0, + # }, + # { + # "params": [ + # p for n, p in opt_model.named_parameters() if (n in decay_parameters and n in projector_parameters and p.requires_grad) + # ], + # "weight_decay": self.args.weight_decay, + # "lr": self.args.mm_projector_lr, + # }, + # { + # "params": [ + # p for n, p in opt_model.named_parameters() if (n not in decay_parameters and n in projector_parameters and p.requires_grad) + # ], + # "weight_decay": 0.0, + # "lr": self.args.mm_projector_lr, + # }, + # ] + if self.args.lora_lr is not None: + lora_parameters = [name for name, _ in opt_model.named_parameters() if "lora" in name] + optimizer_grouped_parameters = [ + { + "params": [ + p for n, p in opt_model.named_parameters() if (n in decay_parameters and n not in lora_parameters and p.requires_grad) + ], + "weight_decay": self.args.weight_decay, + }, + { + "params": [ + p for n, p in opt_model.named_parameters() if (n not in decay_parameters and n not in lora_parameters and p.requires_grad) + ], + "weight_decay": 0.0, + }, + { + "params": [ + p for n, p in opt_model.named_parameters() if (n in decay_parameters and n in lora_parameters and 
p.requires_grad) + ], + "weight_decay": self.args.weight_decay, + "lr": self.args.lora_lr, + }, + { + "params": [ + p for n, p in opt_model.named_parameters() if (n not in decay_parameters and n in lora_parameters and p.requires_grad) + ], + "weight_decay": 0.0, + "lr": self.args.lora_lr, + }, + ] + else: + optimizer_grouped_parameters = [ + { + "params": [ + p for n, p in opt_model.named_parameters() if (n in decay_parameters and p.requires_grad) + ], + "weight_decay": self.args.weight_decay, + }, + { + "params": [ + p for n, p in opt_model.named_parameters() if (n not in decay_parameters and p.requires_grad) + ], + "weight_decay": 0.0, + }, + ] + + optimizer_cls, optimizer_kwargs = Trainer.get_optimizer_cls_and_kwargs(self.args) + + self.optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs) + if optimizer_cls.__name__ == "Adam8bit": + import bitsandbytes + + manager = bitsandbytes.optim.GlobalOptimManager.get_instance() + + skipped = 0 + for module in opt_model.modules(): + if isinstance(module, nn.Embedding): + skipped += sum({p.data_ptr(): p.numel() for p in module.parameters()}.values()) + logger.info(f"skipped {module}: {skipped/2**20}M params") + manager.register_module_override(module, "weight", {"optim_bits": 32}) + logger.debug(f"bitsandbytes: will optimize {module} in fp32") + logger.info(f"skipped: {skipped/2**20}M params") + + return self.optimizer + + def _save_checkpoint(self, model, trial, metrics=None): + if getattr(self.args, 'tune_mm_mlp_adapter', False): + from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR + checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}" + + run_dir = self._get_output_dir(trial=trial) + output_dir = os.path.join(run_dir, checkpoint_folder) + + # Only save Adapter + keys_to_match = ['mm_projector', 'vision_resampler', 'frame_position_encoding', 'adapter_module'] + if getattr(self.args, "use_im_start_end", False): + keys_to_match.extend(['embed_tokens', 'embed_in', 'wte']) + 
if not getattr(self.args, 'freeze_qformer', True): + keys_to_match.extend(['Qformer', 'query_tokens']) + + weight_to_save = get_mm_adapter_state_maybe_zero_3(self.model.named_parameters(), keys_to_match) + + if self.args.local_rank == 0 or self.args.local_rank == -1: + self.model.config.save_pretrained(output_dir) + torch.save(weight_to_save, os.path.join(output_dir, f'mm_projector.bin')) + else: + super(LLaVATrainer, self)._save_checkpoint(model, trial, metrics) + + def _save(self, output_dir: Optional[str] = None, state_dict=None): + if getattr(self.args, 'tune_mm_mlp_adapter', False): + pass + else: + super(LLaVATrainer, self)._save(output_dir, state_dict) diff --git a/app/llava/train/train.py b/app/llava/train/train.py new file mode 100644 index 0000000000000000000000000000000000000000..f45992ce0e78eb031f56ee33e8e6586793030dde --- /dev/null +++ b/app/llava/train/train.py @@ -0,0 +1,474 @@ +# Adopted from https://github.com/lm-sys/FastChat. Below is the original copyright: +# Adopted from tatsu-lab@stanford_alpaca. Below is the original copyright: +# Copyright 2023 Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import copy +from dataclasses import dataclass, field +import json +import logging +import pathlib +from typing import Dict, Optional, Sequence, List +from webbrowser import get + +import torch + +import transformers +import tokenizers + +# from llava.constants import IGNORE_INDEX, MM_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN +# from torch.utils.data import Dataset +from llava.train.llava_trainer import LLaVATrainer +from llava.train.arguments import ModelArguments, TrainingArguments, DataArguments +from llava.datasets.super_dataset import make_supervised_data_module, make_supervised_data_module_concatdataset +from llava import conversation as conversation_lib +from llava.model import * +# from llava.mm_utils import tokenizer_image_token +# from llava.model.preprocessor import preprocess, preprocess_multimodal +# from PIL import Image + + +local_rank = None + + +def rank0_print(*args): + if local_rank == 0: + print(*args) + + +# from packaging import version +# IS_TOKENIZER_GREATER_THAN_0_14 = version.parse(tokenizers.__version__) >= version.parse('0.14') + + + +def maybe_zero_3(param, ignore_status=False, name=None): + from deepspeed import zero + from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus + if hasattr(param, "ds_id"): + if param.ds_status == ZeroParamStatus.NOT_AVAILABLE: + if not ignore_status: + logging.warning(f"{name}: param.ds_status != ZeroParamStatus.NOT_AVAILABLE: {param.ds_status}") + with zero.GatheredParameters([param]): + param = param.data.detach().cpu().clone() + else: + param = param.detach().cpu().clone() + return param + + +# Borrowed from peft.utils.get_peft_model_state_dict +def get_peft_state_maybe_zero_3(named_params, bias): + if bias == "none": + to_return = {k: t for k, t in named_params if "lora_" in k} + elif bias == "all": + to_return = {k: t for k, t in named_params if "lora_" in k or "bias" in k} + elif bias == "lora_only": + to_return = {} + 
maybe_lora_bias = {} + lora_bias_names = set() + for k, t in named_params: + if "lora_" in k: + to_return[k] = t + bias_name = k.split("lora_")[0] + "bias" + lora_bias_names.add(bias_name) + elif "bias" in k: + maybe_lora_bias[k] = t + for k, t in maybe_lora_bias: + if bias_name in lora_bias_names: + to_return[bias_name] = t + else: + raise NotImplementedError + to_return = {k: maybe_zero_3(v, ignore_status=True) for k, v in to_return.items()} + return to_return + + +def get_peft_state_non_lora_maybe_zero_3(named_params, require_grad_only=True): + to_return = {k: t for k, t in named_params if "lora_" not in k} + if require_grad_only: + to_return = {k: t for k, t in to_return.items() if t.requires_grad} + to_return = {k: maybe_zero_3(v, ignore_status=True).cpu() for k, v in to_return.items()} + return to_return + + +def get_mm_adapter_state_maybe_zero_3(named_params, keys_to_match): + to_return = {k: t for k, t in named_params if any(key_match in k for key_match in keys_to_match)} + to_return = {k: maybe_zero_3(v, ignore_status=True).cpu() for k, v in to_return.items()} + return to_return + + +def find_all_linear_names(model): + cls = torch.nn.Linear + lora_module_names = set() + multimodal_keywords = ['mm_projector', 'vision_tower', 'vision_resampler'] + for name, module in model.named_modules(): + if any(mm_keyword in name for mm_keyword in multimodal_keywords): + continue + if isinstance(module, cls): + names = name.split('.') + lora_module_names.add(names[0] if len(names) == 1 else names[-1]) + + if 'lm_head' in lora_module_names: # needed for 16-bit + lora_module_names.remove('lm_head') + return list(lora_module_names) + + +def safe_save_model_for_hf_trainer(trainer: transformers.Trainer, + output_dir: str): + """Collects the state dict and dump to disk.""" + + if getattr(trainer.args, "tune_mm_mlp_adapter", False): + # Only save Adapter + keys_to_match = ['mm_projector', 'frame_position_encoding', 'adapter_module'] + if getattr(trainer.args, 
"use_im_start_end", False): + keys_to_match.extend(['embed_tokens', 'embed_in', 'wte']) + if not getattr(trainer.args, 'freeze_qformer', True): + keys_to_match.extend(['Qformer', 'query_tokens']) + + + weight_to_save = get_mm_adapter_state_maybe_zero_3(trainer.model.named_parameters(), keys_to_match) + trainer.model.config.save_pretrained(output_dir) + + current_folder = output_dir.split('/')[-1] + parent_folder = os.path.dirname(output_dir) + if trainer.args.local_rank == 0 or trainer.args.local_rank == -1: + if current_folder.startswith('checkpoint-'): + mm_projector_folder = os.path.join(parent_folder, "mm_projector") + os.makedirs(mm_projector_folder, exist_ok=True) + torch.save(weight_to_save, os.path.join(mm_projector_folder, f'{current_folder}.bin')) + else: + torch.save(weight_to_save, os.path.join(output_dir, f'mm_projector.bin')) + return + + if trainer.deepspeed: + torch.cuda.synchronize() + trainer.save_model(output_dir) + return + + state_dict = trainer.model.state_dict() + if trainer.args.should_save: + cpu_state_dict = { + key: value.cpu() + for key, value in state_dict.items() + } + del state_dict + trainer._save(output_dir, state_dict=cpu_state_dict) # noqa + + +def smart_tokenizer_and_embedding_resize( + special_tokens_dict: Dict, + tokenizer: transformers.PreTrainedTokenizer, + model: transformers.PreTrainedModel, +): + """Resize tokenizer and embedding. + + Note: This is the unoptimized version that may make your embedding size not be divisible by 64. 
+ """ + num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict) + model.resize_token_embeddings(len(tokenizer)) + + if num_new_tokens > 0: + input_embeddings = model.get_input_embeddings().weight.data + output_embeddings = model.get_output_embeddings().weight.data + + input_embeddings_avg = input_embeddings[:-num_new_tokens].mean( + dim=0, keepdim=True) + output_embeddings_avg = output_embeddings[:-num_new_tokens].mean( + dim=0, keepdim=True) + + input_embeddings[-num_new_tokens:] = input_embeddings_avg + output_embeddings[-num_new_tokens:] = output_embeddings_avg + + +def train(attn_implementation=None): + global local_rank + + parser = transformers.HfArgumentParser( + (ModelArguments, DataArguments, TrainingArguments)) + model_args, data_args, training_args = parser.parse_args_into_dataclasses() + local_rank = training_args.local_rank + compute_dtype = (torch.float16 if training_args.fp16 else (torch.bfloat16 if training_args.bf16 else torch.float32)) + + bnb_model_from_pretrained_args = {} + if training_args.bits in [4, 8]: + from transformers import BitsAndBytesConfig + bnb_model_from_pretrained_args.update(dict( + device_map={"": training_args.device}, + load_in_4bit=training_args.bits == 4, + load_in_8bit=training_args.bits == 8, + quantization_config=BitsAndBytesConfig( + load_in_4bit=training_args.bits == 4, + load_in_8bit=training_args.bits == 8, + llm_int8_skip_modules=["mm_projector"], + llm_int8_threshold=6.0, + llm_int8_has_fp16_weight=False, + bnb_4bit_compute_dtype=compute_dtype, + bnb_4bit_use_double_quant=training_args.double_quant, + bnb_4bit_quant_type=training_args.quant_type # {'fp4', 'nf4'} + ) + )) + + if model_args.vision_tower is not None: + if 'mpt' in model_args.model_name_or_path: + config = transformers.AutoConfig.from_pretrained(model_args.model_name_or_path, trust_remote_code=True) + config.attn_config['attn_impl'] = training_args.mpt_attn_impl + model = LlavaMptForCausalLM.from_pretrained( + model_args.model_name_or_path, 
+ config=config, + cache_dir=training_args.cache_dir, + **bnb_model_from_pretrained_args + ) + elif 'mistral' in model_args.model_name_or_path.lower(): + model = LlavaMistralForCausalLM.from_pretrained( + model_args.model_name_or_path, + cache_dir=training_args.cache_dir, + attn_implementation=attn_implementation, + torch_dtype=(torch.bfloat16 if training_args.bf16 else None), + **bnb_model_from_pretrained_args + ) + elif 'gemma' in model_args.model_name_or_path.lower(): + model = LlavaGemmaForCausalLM.from_pretrained( + model_args.model_name_or_path, + cache_dir=training_args.cache_dir, + attn_implementation=attn_implementation, + torch_dtype=(torch.bfloat16 if training_args.bf16 else None), + **bnb_model_from_pretrained_args + ) + elif 'thoth' in model_args.model_name_or_path.lower(): + model = LlavaThothForCausalLM.from_pretrained( + model_args.model_name_or_path, + cache_dir=training_args.cache_dir, + attn_implementation=attn_implementation, + torch_dtype=(torch.bfloat16 if training_args.bf16 else None), + **bnb_model_from_pretrained_args + ) + else: + model = LlavaLlamaForCausalLM.from_pretrained( + model_args.model_name_or_path, + cache_dir=training_args.cache_dir, + attn_implementation=attn_implementation, + torch_dtype=(torch.bfloat16 if training_args.bf16 else None), + **bnb_model_from_pretrained_args + ) + else: + model = transformers.LlamaForCausalLM.from_pretrained( + model_args.model_name_or_path, + cache_dir=training_args.cache_dir, + attn_implementation=attn_implementation, + torch_dtype=(torch.bfloat16 if training_args.bf16 else None), + **bnb_model_from_pretrained_args + ) + model.config.use_cache = False + + if model_args.freeze_backbone: + model.model.requires_grad_(False) + + if training_args.bits in [4, 8]: + from peft import prepare_model_for_kbit_training + model.config.torch_dtype=(torch.float32 if training_args.fp16 else (torch.bfloat16 if training_args.bf16 else torch.float32)) + model = prepare_model_for_kbit_training(model, 
use_gradient_checkpointing=training_args.gradient_checkpointing) + + if training_args.gradient_checkpointing: + if hasattr(model, "enable_input_require_grads"): + model.enable_input_require_grads() + else: + def make_inputs_require_grad(module, input, output): + output.requires_grad_(True) + model.get_input_embeddings().register_forward_hook(make_inputs_require_grad) + + if training_args.lora_enable: + from peft import LoraConfig, get_peft_model + lora_config = LoraConfig( + r=training_args.lora_r, + lora_alpha=training_args.lora_alpha, + target_modules=find_all_linear_names(model), + lora_dropout=training_args.lora_dropout, + bias=training_args.lora_bias, + task_type="CAUSAL_LM", + ) + if training_args.bits == 16: + if training_args.bf16: + model.to(torch.bfloat16) + if training_args.fp16: + model.to(torch.float16) + rank0_print("Adding LoRA adapters...") + model = get_peft_model(model, lora_config) + + if 'mpt' in model_args.model_name_or_path: + tokenizer = transformers.AutoTokenizer.from_pretrained( + model_args.model_name_or_path, + cache_dir=training_args.cache_dir, + model_max_length=training_args.model_max_length, + padding_side="right" + ) + elif 'thoth' in model_args.model_name_or_path: + tokenizer = transformers.AutoTokenizer.from_pretrained( + model_args.model_name_or_path, + cache_dir=training_args.cache_dir, + model_max_length=training_args.model_max_length, + use_fast=True + ) + else: + tokenizer = transformers.AutoTokenizer.from_pretrained( + model_args.model_name_or_path, + cache_dir=training_args.cache_dir, + model_max_length=training_args.model_max_length, + padding_side="right", + use_fast=False, + ) + + if model_args.version == "v0": + if tokenizer.pad_token is None: + smart_tokenizer_and_embedding_resize( + special_tokens_dict=dict(pad_token="[PAD]"), + tokenizer=tokenizer, + model=model, + ) + elif model_args.version == "v0.5": + tokenizer.pad_token = tokenizer.unk_token + else: + if 'thoth' not in model_args.model_name_or_path: + 
tokenizer.pad_token = tokenizer.unk_token + if model_args.version in conversation_lib.conv_templates: + conversation_lib.default_conversation = conversation_lib.conv_templates[model_args.version] + else: + conversation_lib.default_conversation = conversation_lib.conv_templates["vicuna_v1"] + + model_args.max_num_segments = data_args.num_segments + if model_args.vision_tower is not None: + model.get_model().initialize_vision_modules( + model_args=model_args, + fsdp=training_args.fsdp + ) + + vision_tower = model.get_vision_tower() + + data_args.image_processor = vision_tower.image_processor + data_args.is_multimodal = True + + model.config.image_aspect_ratio = data_args.image_aspect_ratio + model.config.tokenizer_padding_side = tokenizer.padding_side + model.config.tokenizer_model_max_length = tokenizer.model_max_length + + model.config.tune_mm_mlp_adapter = training_args.tune_mm_mlp_adapter = model_args.tune_mm_mlp_adapter + if model_args.tune_mm_mlp_adapter: + model.requires_grad_(False) + for p in model.get_model().mm_projector.parameters(): + p.requires_grad = True + + model.config.freeze_mm_mlp_adapter = training_args.freeze_mm_mlp_adapter + if training_args.freeze_mm_mlp_adapter: + for p in model.get_model().mm_projector.parameters(): + p.requires_grad = False + + # frame position encoding always train + if model.get_model().get_frame_position_encoding(): + model.get_frame_position_encoding().weight.requires_grad = True + + if training_args.bits in [4, 8]: + model.get_model().mm_projector.to(dtype=compute_dtype, device=training_args.device) + + model.config.mm_use_start_end = data_args.mm_use_start_end = model_args.mm_use_start_end + model.config.mm_projector_lr = training_args.mm_projector_lr + model.config.lora_lr = training_args.lora_lr + training_args.use_im_start_end = model_args.mm_use_start_end + model.config.mm_use_patch_token = model_args.mm_use_patch_token + model.initialize_vision_tokenizer(model_args, tokenizer=tokenizer) + + + if 
getattr(training_args, "freeze_vision_encoder", True): + for p in model.get_vision_tower().parameters(): + p.requires_grad = False + else: + for p in model.get_vision_tower().parameters(): + p.requires_grad = True + + + if getattr(model_args, 'qformer_model_path', None): + if getattr(training_args, "freeze_qformer", True): + for p in model.get_qformer().parameters(): + p.requires_grad = False + for p in model.get_ln_vision().parameters(): + p.requires_grad = False + model.get_query_tokens().requires_grad = False + else: + for p in model.get_qformer().parameters(): + p.requires_grad = True + for p in model.get_ln_vision().parameters(): + p.requires_grad = True + model.get_query_tokens().requires_grad = True + + if getattr(model_args, 'adapter_module_name', None): + model.get_adapter_module().freeze_adapter_module(getattr(training_args, "freeze_adapter", False)) + + + # deepspeed will handle fp16/bf16 automatically + + trainable_params = [name for (name, param) in model.named_parameters() if param.requires_grad == True] + + rank0_print(f"==> Trainable parameters: {trainable_params}") + + if training_args.bits in [4, 8]: + from peft.tuners.lora import LoraLayer + for name, module in model.named_modules(): + if isinstance(module, LoraLayer): + if training_args.bf16: + module = module.to(torch.bfloat16) + if 'norm' in name: + module = module.to(torch.float32) + if 'lm_head' in name or 'embed_tokens' in name: + if hasattr(module, 'weight'): + if training_args.bf16 and module.weight.dtype == torch.float32: + module = module.to(torch.bfloat16) + + data_args.image_grid_pinpoints = model_args.image_grid_pinpoints + if not training_args.group_by_modality_length: + data_module = make_supervised_data_module(tokenizer=tokenizer, + data_args=data_args, + num_workers=training_args.dataloader_num_workers) + else: + data_module = make_supervised_data_module_concatdataset(tokenizer=tokenizer, + data_args=data_args, + num_workers=training_args.dataloader_num_workers) + + trainer = 
LLaVATrainer(model=model, + tokenizer=tokenizer, + args=training_args, + **data_module) + + if list(pathlib.Path(training_args.output_dir).glob("checkpoint-*")): + trainer.train(resume_from_checkpoint=True) + else: + trainer.train() + trainer.save_state() + + model.config.use_cache = True + + if training_args.lora_enable: + state_dict = get_peft_state_maybe_zero_3( + model.named_parameters(), training_args.lora_bias + ) + non_lora_state_dict = get_peft_state_non_lora_maybe_zero_3( + model.named_parameters() + ) + if training_args.local_rank == 0 or training_args.local_rank == -1: + model.config.save_pretrained(training_args.output_dir) + model.save_pretrained(training_args.output_dir, state_dict=state_dict) + torch.save(non_lora_state_dict, os.path.join(training_args.output_dir, 'non_lora_trainables.bin')) + else: + safe_save_model_for_hf_trainer(trainer=trainer, + output_dir=training_args.output_dir) + + +if __name__ == "__main__": + train() diff --git a/app/llava/train/train_xformers.py b/app/llava/train/train_xformers.py new file mode 100644 index 0000000000000000000000000000000000000000..23a59bf4ee0f365de9fbf3838836b170058126d6 --- /dev/null +++ b/app/llava/train/train_xformers.py @@ -0,0 +1,13 @@ +# Make it more memory efficient by monkey patching the LLaMA model with xformers attention. + +# Need to call this before importing transformers. 
+from llava.train.llama_xformers_attn_monkey_patch import ( + replace_llama_attn_with_xformers_attn, +) + +replace_llama_attn_with_xformers_attn() + +from llava.train.train import train + +if __name__ == "__main__": + train() diff --git a/app/llava/utils.py b/app/llava/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..97765d93f018a5ab17b01f1ee7b511f64c664fd4 --- /dev/null +++ b/app/llava/utils.py @@ -0,0 +1,185 @@ +import datetime +import logging +import logging.handlers +import os +import sys +import math +import random +import requests +import torch.distributed as dist + +from llava.constants import LOGDIR + +server_error_msg = "**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.**" +moderation_msg = "YOUR INPUT VIOLATES OUR CONTENT MODERATION GUIDELINES. PLEASE TRY AGAIN." + +handler = None + + +def build_logger(logger_name, logger_filename): + global handler + + formatter = logging.Formatter( + fmt="%(asctime)s | %(levelname)s | %(name)s | %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + ) + + # Set the format of root handlers + if not logging.getLogger().handlers: + logging.basicConfig(level=logging.INFO) + logging.getLogger().handlers[0].setFormatter(formatter) + + # Redirect stdout and stderr to loggers + stdout_logger = logging.getLogger("stdout") + stdout_logger.setLevel(logging.INFO) + sl = StreamToLogger(stdout_logger, logging.INFO) + sys.stdout = sl + + stderr_logger = logging.getLogger("stderr") + stderr_logger.setLevel(logging.ERROR) + sl = StreamToLogger(stderr_logger, logging.ERROR) + sys.stderr = sl + + # Get logger + logger = logging.getLogger(logger_name) + logger.setLevel(logging.INFO) + + # Add a file handler for all loggers + if handler is None: + os.makedirs(LOGDIR, exist_ok=True) + filename = os.path.join(LOGDIR, logger_filename) + handler = logging.handlers.TimedRotatingFileHandler( + filename, when='D', utc=True, encoding='UTF-8') + handler.setFormatter(formatter) + + for name, item in 
logging.root.manager.loggerDict.items(): + if isinstance(item, logging.Logger): + item.addHandler(handler) + + return logger + + +class StreamToLogger(object): + """ + Fake file-like stream object that redirects writes to a logger instance. + """ + def __init__(self, logger, log_level=logging.INFO): + self.terminal = sys.stdout + self.logger = logger + self.log_level = log_level + self.linebuf = '' + + def __getattr__(self, attr): + return getattr(self.terminal, attr) + + def write(self, buf): + temp_linebuf = self.linebuf + buf + self.linebuf = '' + for line in temp_linebuf.splitlines(True): + # From the io.TextIOWrapper docs: + # On output, if newline is None, any '\n' characters written + # are translated to the system default line separator. + # By default sys.stdout.write() expects '\n' newlines and then + # translates them so this is still cross platform. + if line[-1] == '\n': + self.logger.log(self.log_level, line.rstrip()) + else: + self.linebuf += line + + def flush(self): + if self.linebuf != '': + self.logger.log(self.log_level, self.linebuf.rstrip()) + self.linebuf = '' + + +def disable_torch_init(): + """ + Disable the redundant torch default initialization to accelerate model creation. + """ + import torch + setattr(torch.nn.Linear, "reset_parameters", lambda self: None) + setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None) + + +def violates_moderation(text): + """ + Check whether the text violates OpenAI moderation API. 
+ """ + url = "https://api.openai.com/v1/moderations" + headers = {"Content-Type": "application/json", + "Authorization": "Bearer " + os.environ["OPENAI_API_KEY"]} + text = text.replace("\n", "") + data = "{" + '"input": ' + f'"{text}"' + "}" + data = data.encode("utf-8") + try: + ret = requests.post(url, headers=headers, data=data, timeout=5) + flagged = ret.json()["results"][0]["flagged"] + except requests.exceptions.RequestException as e: + flagged = False + except KeyError as e: + flagged = False + + return flagged + + +def pretty_print_semaphore(semaphore): + if semaphore is None: + return "None" + return f"Semaphore(value={semaphore._value}, locked={semaphore.locked()})" + +def master_print(*args): + import torch + if torch.cuda.current_device() == 0: + print(*args) + +def is_dist_avail_and_initialized(): + if not dist.is_available(): + return False + if not dist.is_initialized(): + return False + return True + +def get_world_size(): + if not is_dist_avail_and_initialized(): + return 1 + return dist.get_world_size() + + +def get_rank(): + if not is_dist_avail_and_initialized(): + return 0 + return dist.get_rank() + +def is_main_process(): + return get_rank() == 0 + + +class DatasetIter(object): + def __init__(self, size, world_size, local_rank, num_workers=1): + self.size = size + self.world_size = world_size + self.local_rank = local_rank + # self.num_workers = 1 if num_workers == 0 else num_workers + assert num_workers == 1, 'num workers must be 1' + self.num_workers = num_workers + self.per_worker = int(math.floor(self.size / float(self.world_size * self.num_workers))) + self.worker_indexs = dict() + + for worker_id in range(self.num_workers): + self.init_worker_index(worker_id) + def init_worker_index(self, worker_id): + + start = self.per_worker * (self.local_rank * self.num_workers + worker_id) + end = min(start + self.per_worker, self.size) + rank_indexs = list(range(start, end)) + random.shuffle(rank_indexs) + + self.worker_indexs[worker_id] = 
rank_indexs + + def increment(self, worker_id): + + if len(self.worker_indexs[worker_id]) == 0: + self.init_worker_index(worker_id) + + next_iter, self.worker_indexs[worker_id] = self.worker_indexs[worker_id][0], self.worker_indexs[worker_id][1:] + return next_iter \ No newline at end of file diff --git a/app/log-neuron-cc.txt b/app/log-neuron-cc.txt new file mode 100644 index 0000000000000000000000000000000000000000..8366b5977fa5f0706a0c59b49bdbdb56b4b6f636 --- /dev/null +++ b/app/log-neuron-cc.txt @@ -0,0 +1,6583 @@ +2024-06-01T05:59:59Z INFO 2398673 [root]: /root/anaconda3/envs/masp_fastapi/bin/neuronx-cc --target=trn1 compile --framework XLA /tmp/root/neuroncc_compile_workdir/6dc0ff72-9752-4e1f-8880-eae65c0e6f3a/model.MODULE_3143bb21695f957f3b75+2c9e451d.hlo.pb --output /tmp/root/neuroncc_compile_workdir/6dc0ff72-9752-4e1f-8880-eae65c0e6f3a/model.MODULE_3143bb21695f957f3b75+2c9e451d.neff --enable-experimental-spmd '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2' --model-type=transformer --auto-cast=none --verbose=35 +2024-06-01T05:59:59Z INFO 2398881 [root]: XLA detected +2024-06-01T05:59:59Z INFO 2398881 [root]: Pipeline: Frontend HHChecker WalrusDriver BIRLinker Kelper +2024-06-01T05:59:59Z INFO 2398881 [root]: Intermediate files stored in /root/llava_mistral_0531/app/neuronxcc-42xre48h, output in /root/llava_mistral_0531/app +2024-06-01T05:59:59Z INFO 2398881 [pipeline.Pipeline.0]: Job Pipeline len(in_states) 1 +2024-06-01T05:59:59Z INFO 2398881 [pipeline.Pipeline.0]: Processing input #0 +2024-06-01T05:59:59Z INFO 2398881 [pipeline.Pipeline.0]: Running pipeline Pipeline.0 +2024-06-01T05:59:59Z INFO 2398881 [pipeline.Pipeline.0]: Starting job job.Frontend.0 +2024-06-01T05:59:59Z INFO 2398881 [job.Frontend.0]: Job Frontend len(in_states) 1 +2024-06-01T05:59:59Z INFO 2398881 [job.Frontend.0]: Processing input #0 +2024-06-01T05:59:59Z INFO 2398881 [job.Frontend.0]: Start model loading +2024-06-01T05:59:59Z INFO 2398881 
[job.Frontend.0]: IR signature: 5857e7e9c4c9db30eb0cc18d2b5420e2e92674c2b00fad0af990aea9edbf34fb for model.MODULE_3143bb21695f957f3b75+2c9e451d.hlo.pb +2024-06-01T05:59:59Z INFO 2398881 [job.Frontend.0]: Executing: /root/anaconda3/envs/masp_fastapi/lib/python3.9/site-packages/neuronxcc/starfish/bin/hlo2penguin --input /tmp/root/neuroncc_compile_workdir/6dc0ff72-9752-4e1f-8880-eae65c0e6f3a/model.MODULE_3143bb21695f957f3b75+2c9e451d.hlo.pb --out-dir ./ --output penguin.py --remat --max-costly-ops=2 --max-live-in-size=5 --max-remat-chain-size=10 --max-mem-multiple=1.8 --min-def-use-distance=500 --remat-policy=transformer --allow-same-pass-remat=true --split-abc --layers-per-module=1 --coalesce-all-gathers=false --coalesce-reduce-scatters=false --coalesce-all-reduces=false --spmd --emit-tensor-level-dropout-ops --emit-tensor-level-rng-ops --native-to-custom-softmax --partitioner-opts='--transformer' +2024-06-01T06:00:00Z INFO 2398881 [job.Frontend.0]: DEBUG: needsModular_PreSplit? Yes. macCnt 69397231385664 threshold 4398046511104 +INFO: Found compute bound graph + +Pre-Partition Pre-Opt Histogram: +total HLO instructions: 3956 + broadcast 618 15.62% ################################################################ + multiply 580 14.66% ############################################################ + parameter 359 9.07% ##################################### + custom-call 355 8.97% #################################### + constant 325 8.22% ################################# + reshape 324 8.19% ################################# + dot 290 7.33% ############################## + add 193 4.88% ################### + convert 133 3.36% ############# + reduce 129 3.26% ############# + slice 128 3.24% ############# + subtract 96 2.43% ######### + rsqrt 65 1.64% ###### + transpose 65 1.64% ###### + divide 64 1.62% ###### + concatenate 64 1.62% ###### + all-reduce 64 1.62% ###### + select 32 0.81% ### + exponential 32 0.81% ### + logistic 32 0.81% ### + compare 2 0.05% + iota 1 0.03% + 
dynamic-slice 1 0.03% + sine 1 0.03% + cosine 1 0.03% + and 1 0.03% + tuple 1 0.03% + +INFO: IoStatistics: total inputs: 359 +INFO: IoStatistics: total outputs: 65 +INFO: IoStatistics: total passthrough tensors: 0 +INFO: IoStatistics: total outputs read from: 64 +INFO: IoStatistics: total redundant outputs: 0 +INFO: IoStatistics: total ifmap size (KiB): 4937889 +INFO: IoStatistics: total ofmap size (KiB): 1172218 +INFO: IoStatistics: total must-alias size (KiB): 1171968 +INFO: IoStatistics: total may-alias size (KiB): 0 +INFO: HloMacCount has found 69397231239168 +INFO: Traffic has found 43459703100 +INFO: AIF 3193.64 + +Pre-Partition Post-Op Histogram: +total HLO instructions: 3988 + broadcast 618 15.50% ################################################################ + multiply 613 15.37% ############################################################### + parameter 359 9.00% ##################################### + constant 325 8.15% ################################# + reshape 323 8.10% ################################# + custom-call 291 7.30% ############################## + dot 289 7.25% ############################# + add 193 4.84% ################### + transpose 161 4.04% ################ + convert 133 3.34% ############# + reduce 129 3.23% ############# + slice 128 3.21% ############# + subtract 96 2.41% ######### + rsqrt 65 1.63% ###### + concatenate 64 1.60% ###### + all-reduce 64 1.60% ###### + exponential 32 0.80% ### + select 32 0.80% ### + logistic 32 0.80% ### + divide 32 0.80% ### + iota 2 0.05% + compare 2 0.05% + dynamic-slice 1 0.03% + sine 1 0.03% + cosine 1 0.03% + and 1 0.03% + tuple 1 0.03% + +DEBUG: needsModular_PreSplit? Yes. macCnt 69397231239168 threshold 4398046511104 +DEBUG: transformer model +INFO: Partitioner configs:DefaultFlow GT BO LBL SA MaxDisj:2 MaxSep:4 LPM:1 +Potential split-points stats: #CC 64 #AR 64 #AG 0 #BN 0 nClamp 0 +DEBUG: needsModular_SplitFinder? Yes. 
+Num of unique Module Definitions: 4 +DefMap: 0 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 63 +New disjoint wave: start 1 len 62 NumReps: 31 +INFO: Number of splitPoints: 33 +INFO: Total num elems for IntermediateOutput 9534875280 +DefMap: 0 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 32 +Wrote HLO netlist to hlo_netlist.json +Processing partition 0 +INFO: COLLECTIVE ctype=AllReduce op=add dtype=bfloat16 num_elements=150011904 replica_groups={{0,1,2,3}} +Replaced 0 dropout sequences with OffloadedDropout +INFO: HloMacCount has found 555719098368 +INFO: Traffic has found 1343013928 +INFO: AIF 827.57 +HLO Ops used in computation: add all-reduce and broadcast compare concatenate constant convert cosine custom-call divide dot exponential iota multiply parameter reduce reshape rsqrt select sine slice subtract transpose tuple +INFO: Number of Native SoftmaxDx's detected and replaced: 0 +INFO: Number of Native Softmax's detected and replaced: 0 +INFO: Number of Native SoftmaxDx's detected and replaced: 0 +INFO: Number of Native Softmax's detected and replaced: 0 +INFO: Number of Native SoftmaxDx's detected and replaced: 0 +INFO: Number of Native Softmax's detected and replaced: 0 +INFO: Number of Native SoftmaxDx's detected and replaced: 0 +INFO: Number of Native Softmax's detected and replaced: 1 +Processing partition 1 +INFO: COLLECTIVE ctype=AllReduce op=add dtype=bfloat16 num_elements=150011904 replica_groups={{0,1,2,3}} +INFO: COLLECTIVE ctype=AllReduce op=add dtype=bfloat16 num_elements=150011904 replica_groups={{0,1,2,3}} +Replaced 0 dropout sequences with OffloadedDropout +INFO: HloMacCount has found 2168647090176 +INFO: Traffic has found 2631196452 +INFO: AIF 1648.41 +HLO Ops used in computation: add all-reduce broadcast concatenate constant convert custom-call divide dot exponential logistic multiply parameter reduce reshape rsqrt select slice subtract transpose 
tuple +INFO: Number of Native SoftmaxDx's detected and replaced: 0 +INFO: Number of Native Softmax's detected and replaced: 0 +INFO: Number of Native SoftmaxDx's detected and replaced: 0 +INFO: Number of Native Softmax's detected and replaced: 0 +INFO: Number of Native SoftmaxDx's detected and replaced: 0 +INFO: Number of Native Softmax's detected and replaced: 0 +INFO: Number of Native SoftmaxDx's detected and replaced: 0 +INFO: Number of Native Softmax's detected and replaced: 1 +Processing partition 2 +INFO: COLLECTIVE ctype=AllReduce op=add dtype=bfloat16 num_elements=150011904 replica_groups={{0,1,2,3}} +INFO: COLLECTIVE ctype=AllReduce op=add dtype=bfloat16 num_elements=150011904 replica_groups={{0,1,2,3}} +Replaced 0 dropout sequences with OffloadedDropout +INFO: HloMacCount has found 2168647090176 +INFO: Traffic has found 2631196452 +INFO: AIF 1648.41 +HLO Ops used in computation: add all-reduce broadcast concatenate constant convert custom-call divide dot exponential logistic multiply parameter reduce reshape rsqrt select slice subtract transpose tuple +INFO: Number of Native SoftmaxDx's detected and replaced: 0 +INFO: Number of Native Softmax's detected and replaced: 0 +INFO: Number of Native SoftmaxDx's detected and replaced: 0 +INFO: Number of Native Softmax's detected and replaced: 0 +INFO: Number of Native SoftmaxDx's detected and replaced: 0 +INFO: Number of Native Softmax's detected and replaced: 0 +INFO: Number of Native SoftmaxDx's detected and replaced: 0 +INFO: Number of Native Softmax's detected and replaced: 1 +Processing partition 3 +INFO: COLLECTIVE ctype=AllReduce op=add dtype=bfloat16 num_elements=150011904 replica_groups={{0,1,2,3}} +Replaced 0 dropout sequences with OffloadedDropout +INFO: HloMacCount has found 1613452345344 +INFO: Traffic has found 1354008628 +INFO: AIF 2383.22 +HLO Ops used in computation: add all-reduce broadcast constant convert custom-call dot dynamic-slice logistic multiply parameter reduce reshape rsqrt transpose 
tuple +INFO: Number of Native SoftmaxDx's detected and replaced: 0 +INFO: Number of Native Softmax's detected and replaced: 0 +INFO: Number of Native SoftmaxDx's detected and replaced: 0 +INFO: Number of Native Softmax's detected and replaced: 0 +INFO: Number of Native SoftmaxDx's detected and replaced: 0 +INFO: Number of Native Softmax's detected and replaced: 0 +Invoking RemoveOptimizationBarriers pass +Invoking Mark No Token pass. +Possible Remats = 13 +Good Remat Candidates = 0 +Number of chains (re) materialized = 0 +Number of instructions (re) materialized = 0 +Invoking RemoveOptimizationBarriers pass +Invoking Mark No Token pass. +Possible Remats = 17 +Good Remat Candidates = 0 +Number of chains (re) materialized = 0 +Number of instructions (re) materialized = 0 +Invoking RemoveOptimizationBarriers pass +Invoking Mark No Token pass. +Possible Remats = 17 +Good Remat Candidates = 0 +Number of chains (re) materialized = 0 +Number of instructions (re) materialized = 0 +Invoking RemoveOptimizationBarriers pass +Invoking Mark No Token pass. +Possible Remats = 2 +Good Remat Candidates = 0 +Number of chains (re) materialized = 0 +Number of instructions (re) materialized = 0 + +2024-06-01T06:00:00Z USER 2398881 [job.Frontend.0]: Compilation is optimized for faster compilation time. Please refer to neuron documentation for additional details +2024-06-01T06:00:00Z INFO 2398881 [job.Frontend.0]: Start tensorization +2024-06-01T06:00:00Z WARNING 2398881 [job.Frontend.0]: TVM not detected. +2024-06-01T06:00:00Z INFO 2398881 [job.Frontend.0]: Num parallel jobs: 8 +2024-06-01T06:00:00Z INFO 2398881 [root/Tensorizer/All]: Enter time region +2024-06-01T06:00:00Z INFO 2398881 [Tensorizer]: Max workers: 4 +2024-06-01T06:00:00Z INFO 2399014 [Tensorizer]: Building model from Penguin script "penguin.py.000000"... +2024-06-01T06:00:00Z INFO 2399017 [Tensorizer]: Building model from Penguin script "penguin.py.000001"... 
+2024-06-01T06:00:00Z INFO 2399018 [Tensorizer]: Building model from Penguin script "penguin.py.000002"... +2024-06-01T06:00:00Z INFO 2399019 [Tensorizer]: Building model from Penguin script "penguin.py.000003"... +2024-06-01T06:00:00Z INFO 2399014 [Tensorizer]: Tensorizer options: --enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --enable-dse-after-mask-propagation --run-pg-layout-and-tiling --spmd --disable-bitcasted-transpose --dont-verify-after-all --fp32-cast=none --mm-transpose-type=fp32 --disable-expensive-checks --disable-max-stride-tiling --enable-replication --max-local-tensor-tile-size-in-bytes=32768 --tensor-layout-p-order=0 --tensor-layout-b-order=1 --enable-advanced-delinearization --weight-coalescing-threshold=512 --enable-bir-converter=enable --sunda-batchnorm --enable-tritium-loopfusion --enable-softmax-kernel --model-type-transformer --enable-isl-in-injective-check +2024-06-01T06:00:00Z INFO 2399014 [Tensorizer]: Building model from Penguin script "penguin.py.000000"... +2024-06-01T06:00:00Z INFO 2399019 [Tensorizer]: Tensorizer options: --enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --enable-dse-after-mask-propagation --run-pg-layout-and-tiling --spmd --disable-bitcasted-transpose --dont-verify-after-all --fp32-cast=none --mm-transpose-type=fp32 --disable-expensive-checks --disable-max-stride-tiling --enable-replication --max-local-tensor-tile-size-in-bytes=32768 --tensor-layout-p-order=0 --tensor-layout-b-order=1 --enable-advanced-delinearization --weight-coalescing-threshold=512 --enable-bir-converter=enable --sunda-batchnorm --enable-tritium-loopfusion --enable-softmax-kernel --model-type-transformer --enable-isl-in-injective-check +2024-06-01T06:00:00Z INFO 2399019 [Tensorizer]: Building model from Penguin script "penguin.py.000003"... 
+2024-06-01T06:00:00Z INFO 2399017 [Tensorizer]: Tensorizer options: --enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --enable-dse-after-mask-propagation --run-pg-layout-and-tiling --spmd --disable-bitcasted-transpose --dont-verify-after-all --fp32-cast=none --mm-transpose-type=fp32 --disable-expensive-checks --disable-max-stride-tiling --enable-replication --max-local-tensor-tile-size-in-bytes=32768 --tensor-layout-p-order=0 --tensor-layout-b-order=1 --enable-advanced-delinearization --weight-coalescing-threshold=512 --enable-bir-converter=enable --sunda-batchnorm --enable-tritium-loopfusion --enable-softmax-kernel --model-type-transformer --enable-isl-in-injective-check +2024-06-01T06:00:00Z INFO 2399017 [Tensorizer]: Building model from Penguin script "penguin.py.000001"... +2024-06-01T06:00:00Z INFO 2399019 [Tensorizer]: Successfully built model. +2024-06-01T06:00:00Z INFO 2399018 [Tensorizer]: Tensorizer options: --enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --enable-dse-after-mask-propagation --run-pg-layout-and-tiling --spmd --disable-bitcasted-transpose --dont-verify-after-all --fp32-cast=none --mm-transpose-type=fp32 --disable-expensive-checks --disable-max-stride-tiling --enable-replication --max-local-tensor-tile-size-in-bytes=32768 --tensor-layout-p-order=0 --tensor-layout-b-order=1 --enable-advanced-delinearization --weight-coalescing-threshold=512 --enable-bir-converter=enable --sunda-batchnorm --enable-tritium-loopfusion --enable-softmax-kernel --model-type-transformer --enable-isl-in-injective-check +2024-06-01T06:00:00Z INFO 2399018 [Tensorizer]: Building model from Penguin script "penguin.py.000002"... +2024-06-01T06:00:00Z USER 2399019 [sg0003/Tensorizer/DoNothing]: Running DoNothing +2024-06-01T06:00:00Z INFO 2399019 [DoNothing]: Finished (changed=True) +2024-06-01T06:00:00Z INFO 2399014 [Tensorizer]: Successfully built model. 
+2024-06-01T06:00:00Z USER 2399014 [sg0000/Tensorizer/DoNothing]: Running DoNothing +2024-06-01T06:00:00Z INFO 2399014 [DoNothing]: Finished (changed=True) +2024-06-01T06:00:00Z INFO 2399017 [Tensorizer]: Successfully built model. +2024-06-01T06:00:00Z USER 2399017 [sg0001/Tensorizer/DoNothing]: Running DoNothing +2024-06-01T06:00:00Z INFO 2399017 [DoNothing]: Finished (changed=True) +2024-06-01T06:00:00Z USER 2399019 [sg0003/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2024-06-01T06:00:00Z USER 2399019 [sg0003/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2024-06-01T06:00:00Z INFO 2399019 [AliasDependencyInduction]: Finished (changed=False) +2024-06-01T06:00:00Z INFO 2399018 [Tensorizer]: Successfully built model. +2024-06-01T06:00:00Z USER 2399018 [sg0002/Tensorizer/DoNothing]: Running DoNothing +2024-06-01T06:00:00Z INFO 2399018 [DoNothing]: Finished (changed=True) +2024-06-01T06:00:00Z USER 2399014 [sg0000/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2024-06-01T06:00:00Z USER 2399014 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2024-06-01T06:00:00Z INFO 2399014 [AliasDependencyInduction]: Finished (changed=False) +2024-06-01T06:00:00Z USER 2399018 [sg0002/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2024-06-01T06:00:00Z USER 2399018 [sg0002/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2024-06-01T06:00:00Z INFO 2399018 [AliasDependencyInduction]: Finished (changed=False) +2024-06-01T06:00:00Z USER 2399018 [sg0002/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.000 seconds +2024-06-01T06:00:00Z USER 2399018 [sg0002/Tensorizer/CanonicalizeIR]: Running CanonicalizeIR +2024-06-01T06:00:00Z INFO 2399018 [CanonicalizeIR]: Finished (changed=False) +2024-06-01T06:00:00Z USER 2399019 [sg0003/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.000 seconds 
+2024-06-01T06:00:00Z USER 2399019 [sg0003/Tensorizer/CanonicalizeIR]: Running CanonicalizeIR +2024-06-01T06:00:00Z INFO 2399019 [CanonicalizeIR]: Finished (changed=False) +2024-06-01T06:00:00Z USER 2399019 [sg0003/Tensorizer/CanonicalizeIR]: CanonicalizeIR finished after 0.001 seconds +2024-06-01T06:00:00Z USER 2399019 [sg0003/Tensorizer/LegalizeCCOpLayout]: Running LegalizeCCOpLayout +2024-06-01T06:00:00Z INFO 2399019 [LegalizeCCOpLayout]: Finished (changed=False) +2024-06-01T06:00:00Z USER 2399019 [sg0003/Tensorizer/LegalizeCCOpLayout]: LegalizeCCOpLayout finished after 0.001 seconds +2024-06-01T06:00:00Z USER 2399019 [sg0003/Tensorizer/ResolveComplicatePredicates]: Running ResolveComplicatePredicates +2024-06-01T06:00:00Z INFO 2399019 [ResolveComplicatePredicates]: Finished (changed=False) +2024-06-01T06:00:00Z USER 2399019 [sg0003/Tensorizer/ResolveComplicatePredicates]: ResolveComplicatePredicates finished after 0.001 seconds +2024-06-01T06:00:00Z USER 2399019 [sg0003/Tensorizer/AffinePredicateResolution]: Running AffinePredicateResolution +2024-06-01T06:00:00Z INFO 2399019 [AffinePredicateResolution]: Finished (changed=False) +2024-06-01T06:00:00Z USER 2399019 [sg0003/Tensorizer/AffinePredicateResolution]: AffinePredicateResolution finished after 0.001 seconds +2024-06-01T06:00:00Z USER 2399019 [sg0003/Tensorizer/EliminateDivs]: Running EliminateDivs +2024-06-01T06:00:00Z INFO 2399019 [EliminateDivs]: Finished (changed=False) +2024-06-01T06:00:00Z USER 2399019 [sg0003/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.001 seconds +2024-06-01T06:00:00Z USER 2399019 [sg0003/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2024-06-01T06:00:00Z INFO 2399019 [PerfectLoopNest]: Finished (changed=False) +2024-06-01T06:00:00Z USER 2399019 [sg0003/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.001 seconds +2024-06-01T06:00:00Z USER 2399019 [sg0003/Tensorizer/Simplifier]: Running Simplifier +2024-06-01T06:00:00Z INFO 2399019 [Simplifier]: 
Finished (changed=True) +2024-06-01T06:00:00Z USER 2399014 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.000 seconds +2024-06-01T06:00:00Z USER 2399014 [sg0000/Tensorizer/CanonicalizeIR]: Running CanonicalizeIR +2024-06-01T06:00:00Z INFO 2399014 [CanonicalizeIR]: Finished (changed=False) +2024-06-01T06:00:00Z USER 2399014 [sg0000/Tensorizer/CanonicalizeIR]: CanonicalizeIR finished after 0.001 seconds +2024-06-01T06:00:00Z USER 2399014 [sg0000/Tensorizer/LegalizeCCOpLayout]: Running LegalizeCCOpLayout +2024-06-01T06:00:00Z INFO 2399014 [LegalizeCCOpLayout]: Finished (changed=False) +2024-06-01T06:00:00Z USER 2399014 [sg0000/Tensorizer/LegalizeCCOpLayout]: LegalizeCCOpLayout finished after 0.001 seconds +2024-06-01T06:00:00Z USER 2399014 [sg0000/Tensorizer/ResolveComplicatePredicates]: Running ResolveComplicatePredicates +2024-06-01T06:00:00Z INFO 2399014 [ResolveComplicatePredicates]: Finished (changed=False) +2024-06-01T06:00:00Z USER 2399014 [sg0000/Tensorizer/ResolveComplicatePredicates]: ResolveComplicatePredicates finished after 0.001 seconds +2024-06-01T06:00:00Z USER 2399014 [sg0000/Tensorizer/AffinePredicateResolution]: Running AffinePredicateResolution +2024-06-01T06:00:00Z INFO 2399014 [AffinePredicateResolution]: Finished (changed=False) +2024-06-01T06:00:00Z USER 2399014 [sg0000/Tensorizer/AffinePredicateResolution]: AffinePredicateResolution finished after 0.001 seconds +2024-06-01T06:00:00Z USER 2399014 [sg0000/Tensorizer/EliminateDivs]: Running EliminateDivs +2024-06-01T06:00:00Z INFO 2399014 [EliminateDivs]: Finished (changed=False) +2024-06-01T06:00:00Z USER 2399014 [sg0000/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.001 seconds +2024-06-01T06:00:00Z USER 2399014 [sg0000/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2024-06-01T06:00:00Z INFO 2399014 [PerfectLoopNest]: Finished (changed=False) +2024-06-01T06:00:00Z USER 2399014 [sg0000/Tensorizer/PerfectLoopNest]: PerfectLoopNest 
finished after 0.001 seconds +2024-06-01T06:00:00Z USER 2399014 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2024-06-01T06:00:00Z INFO 2399014 [Simplifier]: Finished (changed=True) +2024-06-01T06:00:00Z USER 2399014 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.003 seconds +2024-06-01T06:00:00Z USER 2399014 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2024-06-01T06:00:00Z INFO 2399014 [GenericAccessSimplifier]: Finished (changed=False) +2024-06-01T06:00:00Z USER 2399014 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2024-06-01T06:00:00Z USER 2399014 [sg0000/Tensorizer/SpmdDCE]: Running SpmdDCE +2024-06-01T06:00:00Z INFO 2399014 [SpmdDCE]: Finished (changed=False) +2024-06-01T06:00:00Z USER 2399014 [sg0000/Tensorizer/SpmdDCE]: SpmdDCE finished after 0.001 seconds +2024-06-01T06:00:00Z USER 2399014 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2024-06-01T06:00:00Z INFO 2399014 [TCTransform]: Finished (changed=False) +2024-06-01T06:00:00Z USER 2399017 [sg0001/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2024-06-01T06:00:00Z USER 2399017 [sg0001/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2024-06-01T06:00:00Z INFO 2399017 [AliasDependencyInduction]: Finished (changed=False) +2024-06-01T06:00:00Z USER 2399017 [sg0001/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.001 seconds +2024-06-01T06:00:00Z USER 2399017 [sg0001/Tensorizer/CanonicalizeIR]: Running CanonicalizeIR +2024-06-01T06:00:00Z INFO 2399017 [CanonicalizeIR]: Finished (changed=False) +2024-06-01T06:00:00Z USER 2399017 [sg0001/Tensorizer/CanonicalizeIR]: CanonicalizeIR finished after 0.001 seconds +2024-06-01T06:00:00Z USER 2399017 [sg0001/Tensorizer/LegalizeCCOpLayout]: Running LegalizeCCOpLayout +2024-06-01T06:00:00Z INFO 2399017 [LegalizeCCOpLayout]: Finished (changed=False) +2024-06-01T06:00:00Z USER 2399017 
[sg0001/Tensorizer/LegalizeCCOpLayout]: LegalizeCCOpLayout finished after 0.001 seconds +2024-06-01T06:00:00Z USER 2399017 [sg0001/Tensorizer/ResolveComplicatePredicates]: Running ResolveComplicatePredicates +2024-06-01T06:00:00Z INFO 2399017 [ResolveComplicatePredicates]: Finished (changed=False) +2024-06-01T06:00:00Z USER 2399017 [sg0001/Tensorizer/ResolveComplicatePredicates]: ResolveComplicatePredicates finished after 0.001 seconds +2024-06-01T06:00:00Z USER 2399017 [sg0001/Tensorizer/AffinePredicateResolution]: Running AffinePredicateResolution +2024-06-01T06:00:00Z INFO 2399017 [AffinePredicateResolution]: Finished (changed=False) +2024-06-01T06:00:00Z USER 2399017 [sg0001/Tensorizer/AffinePredicateResolution]: AffinePredicateResolution finished after 0.001 seconds +2024-06-01T06:00:00Z USER 2399017 [sg0001/Tensorizer/EliminateDivs]: Running EliminateDivs +2024-06-01T06:00:00Z INFO 2399017 [EliminateDivs]: Finished (changed=False) +2024-06-01T06:00:00Z USER 2399017 [sg0001/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.001 seconds +2024-06-01T06:00:00Z USER 2399017 [sg0001/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2024-06-01T06:00:00Z INFO 2399017 [PerfectLoopNest]: Finished (changed=False) +2024-06-01T06:00:00Z USER 2399017 [sg0001/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.001 seconds +2024-06-01T06:00:00Z USER 2399017 [sg0001/Tensorizer/Simplifier]: Running Simplifier +2024-06-01T06:00:00Z INFO 2399017 [Simplifier]: Finished (changed=True) +2024-06-01T06:00:00Z USER 2399017 [sg0001/Tensorizer/Simplifier]: Simplifier finished after 0.004 seconds +2024-06-01T06:00:00Z USER 2399017 [sg0001/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2024-06-01T06:00:00Z INFO 2399017 [GenericAccessSimplifier]: Finished (changed=False) +2024-06-01T06:00:00Z USER 2399019 [sg0003/Tensorizer/Simplifier]: Simplifier finished after 0.003 seconds +2024-06-01T06:00:00Z USER 2399019 
[sg0003/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2024-06-01T06:00:00Z INFO 2399019 [GenericAccessSimplifier]: Finished (changed=False) +2024-06-01T06:00:00Z USER 2399019 [sg0003/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2024-06-01T06:00:00Z USER 2399019 [sg0003/Tensorizer/SpmdDCE]: Running SpmdDCE +2024-06-01T06:00:00Z INFO 2399019 [SpmdDCE]: Finished (changed=False) +2024-06-01T06:00:00Z USER 2399019 [sg0003/Tensorizer/SpmdDCE]: SpmdDCE finished after 0.001 seconds +2024-06-01T06:00:00Z USER 2399019 [sg0003/Tensorizer/TCTransform]: Running TCTransform +2024-06-01T06:00:00Z INFO 2399019 [TCTransform]: Finished (changed=False) +2024-06-01T06:00:00Z USER 2399019 [sg0003/Tensorizer/TCTransform]: TCTransform finished after 0.001 seconds +2024-06-01T06:00:00Z USER 2399019 [sg0003/Tensorizer/CommuteConcat]: Running CommuteConcat +2024-06-01T06:00:00Z INFO 2399019 [CommuteConcat]: Finished (changed=False) +2024-06-01T06:00:00Z USER 2399019 [sg0003/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.001 seconds +2024-06-01T06:00:00Z USER 2399019 [sg0003/Tensorizer/LowerTensorOp]: Running LowerTensorOp +2024-06-01T06:00:00Z USER 2399018 [sg0002/Tensorizer/CanonicalizeIR]: CanonicalizeIR finished after 0.001 seconds +2024-06-01T06:00:00Z USER 2399018 [sg0002/Tensorizer/LegalizeCCOpLayout]: Running LegalizeCCOpLayout +2024-06-01T06:00:00Z INFO 2399019 [LowerTensorOp]: Finished (changed=True) +2024-06-01T06:00:00Z INFO 2399018 [LegalizeCCOpLayout]: Finished (changed=False) +2024-06-01T06:00:00Z USER 2399019 [sg0003/Tensorizer/LowerTensorOp]: LowerTensorOp finished after 0.008 seconds +2024-06-01T06:00:00Z USER 2399019 [sg0003/Tensorizer/ExpandBatchNorm]: Running ExpandBatchNorm +2024-06-01T06:00:00Z INFO 2399019 [ExpandBatchNorm]: Finished (changed=False) +2024-06-01T06:00:00Z USER 2399019 [sg0003/Tensorizer/ExpandBatchNorm]: ExpandBatchNorm finished after 0.002 seconds +2024-06-01T06:00:00Z 
USER 2399019 [sg0003/Tensorizer/TCTransform]: Running TCTransform +2024-06-01T06:00:00Z INFO 2399019 [TCTransform]: Finished (changed=True) +2024-06-01T06:00:00Z USER 2399019 [sg0003/Tensorizer/TCTransform]: TCTransform finished after 0.003 seconds +2024-06-01T06:00:00Z USER 2399019 [sg0003/Tensorizer/EliminateDivs]: Running EliminateDivs +2024-06-01T06:00:00Z INFO 2399019 [EliminateDivs]: Finished (changed=False) +2024-06-01T06:00:00Z USER 2399019 [sg0003/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.002 seconds +2024-06-01T06:00:00Z USER 2399019 [sg0003/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2024-06-01T06:00:00Z INFO 2399019 [GenericAccessSimplifier]: Finished (changed=False) +2024-06-01T06:00:00Z USER 2399019 [sg0003/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.002 seconds +2024-06-01T06:00:00Z USER 2399019 [sg0003/Tensorizer/CanonicalizeIR]: Running CanonicalizeIR +2024-06-01T06:00:00Z INFO 2399019 [CanonicalizeIR]: Finished (changed=False) +2024-06-01T06:00:00Z USER 2399019 [sg0003/Tensorizer/CanonicalizeIR]: CanonicalizeIR finished after 0.002 seconds +2024-06-01T06:00:00Z USER 2399019 [sg0003/Tensorizer/TensorOpFusion]: Running TensorOpFusion +2024-06-01T06:00:00Z INFO 2399019 [TensorOpFusion]: Finished (changed=True) +2024-06-01T06:00:00Z USER 2399017 [sg0001/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2024-06-01T06:00:00Z USER 2399017 [sg0001/Tensorizer/SpmdDCE]: Running SpmdDCE +2024-06-01T06:00:00Z INFO 2399017 [SpmdDCE]: Finished (changed=False) +2024-06-01T06:00:00Z USER 2399017 [sg0001/Tensorizer/SpmdDCE]: SpmdDCE finished after 0.001 seconds +2024-06-01T06:00:00Z USER 2399017 [sg0001/Tensorizer/TCTransform]: Running TCTransform +2024-06-01T06:00:00Z INFO 2399017 [TCTransform]: Finished (changed=False) +2024-06-01T06:00:00Z USER 2399018 [sg0002/Tensorizer/LegalizeCCOpLayout]: LegalizeCCOpLayout finished after 0.001 seconds 
+2024-06-01T06:00:00Z USER 2399018 [sg0002/Tensorizer/ResolveComplicatePredicates]: Running ResolveComplicatePredicates +2024-06-01T06:00:00Z INFO 2399018 [ResolveComplicatePredicates]: Finished (changed=False) +2024-06-01T06:00:00Z USER 2399018 [sg0002/Tensorizer/ResolveComplicatePredicates]: ResolveComplicatePredicates finished after 0.001 seconds +2024-06-01T06:00:00Z USER 2399018 [sg0002/Tensorizer/AffinePredicateResolution]: Running AffinePredicateResolution +2024-06-01T06:00:00Z INFO 2399018 [AffinePredicateResolution]: Finished (changed=False) +2024-06-01T06:00:00Z USER 2399018 [sg0002/Tensorizer/AffinePredicateResolution]: AffinePredicateResolution finished after 0.001 seconds +2024-06-01T06:00:00Z USER 2399018 [sg0002/Tensorizer/EliminateDivs]: Running EliminateDivs +2024-06-01T06:00:00Z INFO 2399018 [EliminateDivs]: Finished (changed=False) +2024-06-01T06:00:00Z USER 2399018 [sg0002/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.001 seconds +2024-06-01T06:00:00Z USER 2399018 [sg0002/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2024-06-01T06:00:00Z INFO 2399018 [PerfectLoopNest]: Finished (changed=False) +2024-06-01T06:00:00Z USER 2399018 [sg0002/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.001 seconds +2024-06-01T06:00:00Z USER 2399018 [sg0002/Tensorizer/Simplifier]: Running Simplifier +2024-06-01T06:00:00Z INFO 2399018 [Simplifier]: Finished (changed=True) +2024-06-01T06:00:00Z USER 2399014 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.001 seconds +2024-06-01T06:00:00Z USER 2399014 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat +2024-06-01T06:00:00Z INFO 2399014 [CommuteConcat]: Finished (changed=False) +2024-06-01T06:00:00Z USER 2399014 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.001 seconds +2024-06-01T06:00:00Z USER 2399014 [sg0000/Tensorizer/LowerTensorOp]: Running LowerTensorOp +2024-06-01T06:00:00Z USER 2399019 [sg0003/Tensorizer/TensorOpFusion]: TensorOpFusion 
finished after 0.002 seconds +2024-06-01T06:00:00Z USER 2399019 [sg0003/Tensorizer/TensorOpTransform]: Running TensorOpTransform +2024-06-01T06:00:00Z INFO 2399019 [TensorOpTransform]: Finished (changed=True) +2024-06-01T06:00:00Z USER 2399018 [sg0002/Tensorizer/Simplifier]: Simplifier finished after 0.004 seconds +2024-06-01T06:00:00Z USER 2399018 [sg0002/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2024-06-01T06:00:00Z INFO 2399018 [GenericAccessSimplifier]: Finished (changed=False) +2024-06-01T06:00:01Z INFO 2399014 [LowerTensorOp]: Finished (changed=True) +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/TensorOpTransform]: TensorOpTransform finished after 0.006 seconds +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/LateLowerTensorOp]: Running LateLowerTensorOp +2024-06-01T06:00:01Z INFO 2399019 [LateLowerTensorOp]: Finished (changed=False) +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/LateLowerTensorOp]: LateLowerTensorOp finished after 0.002 seconds +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/MemcpyElimination]: Running MemcpyElimination +2024-06-01T06:00:01Z USER 2399018 [sg0002/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2024-06-01T06:00:01Z USER 2399018 [sg0002/Tensorizer/SpmdDCE]: Running SpmdDCE +2024-06-01T06:00:01Z INFO 2399018 [SpmdDCE]: Finished (changed=False) +2024-06-01T06:00:01Z USER 2399017 [sg0001/Tensorizer/TCTransform]: TCTransform finished after 0.001 seconds +2024-06-01T06:00:01Z USER 2399017 [sg0001/Tensorizer/CommuteConcat]: Running CommuteConcat +2024-06-01T06:00:01Z INFO 2399017 [CommuteConcat]: Finished (changed=False) +2024-06-01T06:00:01Z USER 2399017 [sg0001/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.001 seconds +2024-06-01T06:00:01Z USER 2399017 [sg0001/Tensorizer/LowerTensorOp]: Running LowerTensorOp +2024-06-01T06:00:01Z USER 2399018 [sg0002/Tensorizer/SpmdDCE]: SpmdDCE finished after 0.002 seconds 
+2024-06-01T06:00:01Z USER 2399018 [sg0002/Tensorizer/TCTransform]: Running TCTransform +2024-06-01T06:00:01Z INFO 2399018 [TCTransform]: Finished (changed=False) +2024-06-01T06:00:01Z INFO 2399019 [MemcpyElimination]: Finished (changed=True) +2024-06-01T06:00:01Z INFO 2399017 [LowerTensorOp]: Finished (changed=True) +2024-06-01T06:00:01Z USER 2399018 [sg0002/Tensorizer/TCTransform]: TCTransform finished after 0.001 seconds +2024-06-01T06:00:01Z USER 2399018 [sg0002/Tensorizer/CommuteConcat]: Running CommuteConcat +2024-06-01T06:00:01Z INFO 2399018 [CommuteConcat]: Finished (changed=False) +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/MemcpyElimination]: MemcpyElimination finished after 0.041 seconds +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/LoopFusion]: Running LoopFusion +2024-06-01T06:00:01Z USER 2399017 [sg0001/Tensorizer/LowerTensorOp]: LowerTensorOp finished after 0.016 seconds +2024-06-01T06:00:01Z USER 2399017 [sg0001/Tensorizer/ExpandBatchNorm]: Running ExpandBatchNorm +2024-06-01T06:00:01Z INFO 2399017 [ExpandBatchNorm]: Finished (changed=False) +2024-06-01T06:00:01Z USER 2399017 [sg0001/Tensorizer/ExpandBatchNorm]: ExpandBatchNorm finished after 0.004 seconds +2024-06-01T06:00:01Z USER 2399017 [sg0001/Tensorizer/TCTransform]: Running TCTransform +2024-06-01T06:00:01Z INFO 2399017 [TCTransform]: Finished (changed=True) +2024-06-01T06:00:01Z USER 2399017 [sg0001/Tensorizer/TCTransform]: TCTransform finished after 0.006 seconds +2024-06-01T06:00:01Z USER 2399017 [sg0001/Tensorizer/EliminateDivs]: Running EliminateDivs +2024-06-01T06:00:01Z INFO 2399017 [EliminateDivs]: Finished (changed=False) +2024-06-01T06:00:01Z USER 2399014 [sg0000/Tensorizer/LowerTensorOp]: LowerTensorOp finished after 0.016 seconds +2024-06-01T06:00:01Z USER 2399014 [sg0000/Tensorizer/ExpandBatchNorm]: Running ExpandBatchNorm +2024-06-01T06:00:01Z INFO 2399019 [LoopFusion]: Finished (changed=True) +2024-06-01T06:00:01Z INFO 2399014 [ExpandBatchNorm]: Finished 
(changed=False) +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/LoopFusion]: LoopFusion finished after 0.034 seconds +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/Rematerialization]: Running Rematerialization +2024-06-01T06:00:01Z INFO 2399019 [Rematerialization]: Finished (changed=False) +2024-06-01T06:00:01Z USER 2399017 [sg0001/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.004 seconds +2024-06-01T06:00:01Z USER 2399017 [sg0001/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2024-06-01T06:00:01Z INFO 2399017 [GenericAccessSimplifier]: Finished (changed=False) +2024-06-01T06:00:01Z USER 2399017 [sg0001/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.004 seconds +2024-06-01T06:00:01Z USER 2399017 [sg0001/Tensorizer/CanonicalizeIR]: Running CanonicalizeIR +2024-06-01T06:00:01Z INFO 2399017 [CanonicalizeIR]: Finished (changed=False) +2024-06-01T06:00:01Z USER 2399017 [sg0001/Tensorizer/CanonicalizeIR]: CanonicalizeIR finished after 0.003 seconds +2024-06-01T06:00:01Z USER 2399017 [sg0001/Tensorizer/TensorOpFusion]: Running TensorOpFusion +2024-06-01T06:00:01Z INFO 2399017 [TensorOpFusion]: Finished (changed=True) +2024-06-01T06:00:01Z USER 2399014 [sg0000/Tensorizer/ExpandBatchNorm]: ExpandBatchNorm finished after 0.004 seconds +2024-06-01T06:00:01Z USER 2399014 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2024-06-01T06:00:01Z INFO 2399014 [TCTransform]: Finished (changed=True) +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/Rematerialization]: Rematerialization finished after 0.004 seconds +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/Simplifier]: Running Simplifier +2024-06-01T06:00:01Z USER 2399017 [sg0001/Tensorizer/TensorOpFusion]: TensorOpFusion finished after 0.005 seconds +2024-06-01T06:00:01Z USER 2399017 [sg0001/Tensorizer/TensorOpTransform]: Running TensorOpTransform +2024-06-01T06:00:01Z INFO 2399019 [Simplifier]: Finished (changed=True) 
+2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/Simplifier]: Simplifier finished after 0.010 seconds +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/Delinearization]: Running Delinearization +2024-06-01T06:00:01Z INFO 2399019 [Delinearization]: Finished (changed=True) +2024-06-01T06:00:01Z INFO 2399017 [TensorOpTransform]: Finished (changed=True) +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/Delinearization]: Delinearization finished after 0.005 seconds +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2024-06-01T06:00:01Z INFO 2399019 [AliasDependencyElimination]: Finished (changed=False) +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.001 seconds +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2024-06-01T06:00:01Z USER 2399018 [sg0002/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.001 seconds +2024-06-01T06:00:01Z USER 2399018 [sg0002/Tensorizer/LowerTensorOp]: Running LowerTensorOp +2024-06-01T06:00:01Z USER 2399014 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.005 seconds +2024-06-01T06:00:01Z USER 2399014 [sg0000/Tensorizer/EliminateDivs]: Running EliminateDivs +2024-06-01T06:00:01Z INFO 2399014 [EliminateDivs]: Finished (changed=False) +2024-06-01T06:00:01Z INFO 2399018 [LowerTensorOp]: Finished (changed=True) +2024-06-01T06:00:01Z USER 2399017 [sg0001/Tensorizer/TensorOpTransform]: TensorOpTransform finished after 0.014 seconds +2024-06-01T06:00:01Z USER 2399017 [sg0001/Tensorizer/LateLowerTensorOp]: Running LateLowerTensorOp +2024-06-01T06:00:01Z INFO 2399017 [LateLowerTensorOp]: Finished (changed=True) +2024-06-01T06:00:01Z USER 2399014 [sg0000/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.004 seconds +2024-06-01T06:00:01Z USER 2399014 [sg0000/Tensorizer/GenericAccessSimplifier]: Running 
GenericAccessSimplifier +2024-06-01T06:00:01Z INFO 2399014 [GenericAccessSimplifier]: Finished (changed=False) +2024-06-01T06:00:01Z USER 2399017 [sg0001/Tensorizer/LateLowerTensorOp]: LateLowerTensorOp finished after 0.006 seconds +2024-06-01T06:00:01Z USER 2399017 [sg0001/Tensorizer/MemcpyElimination]: Running MemcpyElimination +2024-06-01T06:00:01Z USER 2399014 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.003 seconds +2024-06-01T06:00:01Z USER 2399014 [sg0000/Tensorizer/CanonicalizeIR]: Running CanonicalizeIR +2024-06-01T06:00:01Z INFO 2399014 [CanonicalizeIR]: Finished (changed=False) +2024-06-01T06:00:01Z INFO 2399019 [DeadStoreElimination]: Finished (changed=False) +2024-06-01T06:00:01Z USER 2399014 [sg0000/Tensorizer/CanonicalizeIR]: CanonicalizeIR finished after 0.003 seconds +2024-06-01T06:00:01Z USER 2399014 [sg0000/Tensorizer/TensorOpFusion]: Running TensorOpFusion +2024-06-01T06:00:01Z INFO 2399014 [TensorOpFusion]: Finished (changed=True) +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.060 seconds +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2024-06-01T06:00:01Z INFO 2399019 [AliasDependencyInduction]: Finished (changed=False) +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.000 seconds +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/Simplifier]: Running Simplifier +2024-06-01T06:00:01Z INFO 2399019 [Simplifier]: Finished (changed=False) +2024-06-01T06:00:01Z USER 2399018 [sg0002/Tensorizer/LowerTensorOp]: LowerTensorOp finished after 0.017 seconds +2024-06-01T06:00:01Z USER 2399018 [sg0002/Tensorizer/ExpandBatchNorm]: Running ExpandBatchNorm +2024-06-01T06:00:01Z INFO 2399018 [ExpandBatchNorm]: Finished (changed=False) +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/Simplifier]: Simplifier 
finished after 0.003 seconds +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/LICM]: Running LICM +2024-06-01T06:00:01Z INFO 2399019 [LICM]: Finished (changed=True) +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/LICM]: LICM finished after 0.002 seconds +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/Delinearization]: Running Delinearization +2024-06-01T06:00:01Z INFO 2399019 [Delinearization]: Finished (changed=False) +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/Delinearization]: Delinearization finished after 0.002 seconds +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/LoopFusion]: Running LoopFusion +2024-06-01T06:00:01Z INFO 2399019 [LoopFusion]: Finished (changed=False) +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/LoopFusion]: LoopFusion finished after 0.008 seconds +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/SimplifySlice]: Running SimplifySlice +2024-06-01T06:00:01Z INFO 2399019 [SimplifySlice]: Finished (changed=False) +2024-06-01T06:00:01Z USER 2399018 [sg0002/Tensorizer/ExpandBatchNorm]: ExpandBatchNorm finished after 0.004 seconds +2024-06-01T06:00:01Z USER 2399018 [sg0002/Tensorizer/TCTransform]: Running TCTransform +2024-06-01T06:00:01Z INFO 2399018 [TCTransform]: Finished (changed=True) +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/SimplifySlice]: SimplifySlice finished after 0.001 seconds +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/LICM]: Running LICM +2024-06-01T06:00:01Z INFO 2399019 [LICM]: Finished (changed=False) +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/LICM]: LICM finished after 0.001 seconds +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/Simplifier]: Running Simplifier +2024-06-01T06:00:01Z INFO 2399019 [Simplifier]: Finished (changed=False) +2024-06-01T06:00:01Z USER 2399014 [sg0000/Tensorizer/TensorOpFusion]: TensorOpFusion finished after 0.004 seconds +2024-06-01T06:00:01Z USER 2399014 [sg0000/Tensorizer/TensorOpTransform]: Running TensorOpTransform 
+2024-06-01T06:00:01Z USER 2399018 [sg0002/Tensorizer/TCTransform]: TCTransform finished after 0.006 seconds +2024-06-01T06:00:01Z USER 2399018 [sg0002/Tensorizer/EliminateDivs]: Running EliminateDivs +2024-06-01T06:00:01Z INFO 2399018 [EliminateDivs]: Finished (changed=False) +2024-06-01T06:00:01Z INFO 2399014 [TensorOpTransform]: Finished (changed=True) +2024-06-01T06:00:01Z USER 2399018 [sg0002/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.004 seconds +2024-06-01T06:00:01Z USER 2399018 [sg0002/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2024-06-01T06:00:01Z INFO 2399018 [GenericAccessSimplifier]: Finished (changed=False) +2024-06-01T06:00:01Z USER 2399014 [sg0000/Tensorizer/TensorOpTransform]: TensorOpTransform finished after 0.011 seconds +2024-06-01T06:00:01Z USER 2399014 [sg0000/Tensorizer/LateLowerTensorOp]: Running LateLowerTensorOp +2024-06-01T06:00:01Z INFO 2399014 [LateLowerTensorOp]: Finished (changed=True) +2024-06-01T06:00:01Z USER 2399018 [sg0002/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.004 seconds +2024-06-01T06:00:01Z USER 2399018 [sg0002/Tensorizer/CanonicalizeIR]: Running CanonicalizeIR +2024-06-01T06:00:01Z INFO 2399018 [CanonicalizeIR]: Finished (changed=False) +2024-06-01T06:00:01Z USER 2399018 [sg0002/Tensorizer/CanonicalizeIR]: CanonicalizeIR finished after 0.003 seconds +2024-06-01T06:00:01Z USER 2399018 [sg0002/Tensorizer/TensorOpFusion]: Running TensorOpFusion +2024-06-01T06:00:01Z INFO 2399018 [TensorOpFusion]: Finished (changed=True) +2024-06-01T06:00:01Z USER 2399018 [sg0002/Tensorizer/TensorOpFusion]: TensorOpFusion finished after 0.005 seconds +2024-06-01T06:00:01Z USER 2399018 [sg0002/Tensorizer/TensorOpTransform]: Running TensorOpTransform +2024-06-01T06:00:01Z INFO 2399018 [TensorOpTransform]: Finished (changed=True) +2024-06-01T06:00:01Z USER 2399014 [sg0000/Tensorizer/LateLowerTensorOp]: LateLowerTensorOp finished after 0.004 seconds +2024-06-01T06:00:01Z 
USER 2399014 [sg0000/Tensorizer/MemcpyElimination]: Running MemcpyElimination +2024-06-01T06:00:01Z USER 2399018 [sg0002/Tensorizer/TensorOpTransform]: TensorOpTransform finished after 0.015 seconds +2024-06-01T06:00:01Z USER 2399018 [sg0002/Tensorizer/LateLowerTensorOp]: Running LateLowerTensorOp +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/Simplifier]: Simplifier finished after 0.002 seconds +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/ValueNumbering]: Running ValueNumbering +2024-06-01T06:00:01Z INFO 2399018 [LateLowerTensorOp]: Finished (changed=True) +2024-06-01T06:00:01Z INFO 2399019 [ValueNumbering]: Finished (changed=True) +2024-06-01T06:00:01Z USER 2399018 [sg0002/Tensorizer/LateLowerTensorOp]: LateLowerTensorOp finished after 0.006 seconds +2024-06-01T06:00:01Z USER 2399018 [sg0002/Tensorizer/MemcpyElimination]: Running MemcpyElimination +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.002 seconds +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/LICM]: Running LICM +2024-06-01T06:00:01Z INFO 2399019 [LICM]: Finished (changed=False) +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/LICM]: LICM finished after 0.001 seconds +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/PadElimination]: Running PadElimination +2024-06-01T06:00:01Z INFO 2399019 [PadElimination]: Finished (changed=False) +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/PadElimination]: PadElimination finished after 0.000 seconds +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/Delinearization]: Running Delinearization +2024-06-01T06:00:01Z INFO 2399019 [Delinearization]: Finished (changed=False) +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/Delinearization]: Delinearization finished after 0.001 seconds +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/LoopFusion]: Running LoopFusion +2024-06-01T06:00:01Z INFO 2399019 [LoopFusion]: Finished (changed=False) +2024-06-01T06:00:01Z USER 
2399019 [sg0003/Tensorizer/LoopFusion]: LoopFusion finished after 0.005 seconds +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2024-06-01T06:00:01Z INFO 2399019 [GenericAccessSimplifier]: Finished (changed=False) +2024-06-01T06:00:01Z INFO 2399017 [MemcpyElimination]: Finished (changed=True) +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/Simplifier]: Running Simplifier +2024-06-01T06:00:01Z INFO 2399019 [Simplifier]: Finished (changed=False) +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/Simplifier]: Simplifier finished after 0.002 seconds +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/LICM]: Running LICM +2024-06-01T06:00:01Z INFO 2399019 [LICM]: Finished (changed=False) +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/LICM]: LICM finished after 0.001 seconds +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/ValueNumbering]: Running ValueNumbering +2024-06-01T06:00:01Z INFO 2399019 [ValueNumbering]: Finished (changed=False) +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.001 seconds +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/TCTransform]: Running TCTransform +2024-06-01T06:00:01Z INFO 2399019 [TCTransform]: Finished (changed=False) +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/TCTransform]: TCTransform finished after 0.001 seconds +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/CommuteConcat]: Running CommuteConcat +2024-06-01T06:00:01Z INFO 2399019 [CommuteConcat]: Finished (changed=False) +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.001 seconds +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom +2024-06-01T06:00:01Z INFO 2399019 [RecognizeOpIdiom]: Finished 
(changed=False) +2024-06-01T06:00:01Z USER 2399017 [sg0001/Tensorizer/MemcpyElimination]: MemcpyElimination finished after 0.231 seconds +2024-06-01T06:00:01Z USER 2399017 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom finished after 0.004 seconds +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/MaskPropagation]: Running MaskPropagation +2024-06-01T06:00:01Z INFO 2399019 [MaskPropagation]: Finished (changed=False) +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.003 seconds +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2024-06-01T06:00:01Z INFO 2399019 [DeadStoreElimination]: Finished (changed=False) +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.051 seconds +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/Recompute]: Running Recompute +2024-06-01T06:00:01Z INFO 2399019 [Recompute]: Finished (changed=False) +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/Recompute]: Recompute finished after 0.000 seconds +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2024-06-01T06:00:01Z INFO 2399019 [DeadCodeElimination]: Finished (changed=False) +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.001 seconds +2024-06-01T06:00:01Z INFO 2399019 [Tensorizer]: After optimization: 12 statements +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/DoNothing]: Running DoNothing +2024-06-01T06:00:01Z INFO 2399019 [DoNothing]: Finished (changed=True) +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/MutateDataType]: Running MutateDataType +2024-06-01T06:00:01Z INFO 2399019 
[MutateDataType]: Finished (changed=True) +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/MutateDataType]: MutateDataType finished after 0.001 seconds +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/AutoCastTCInputs]: Running AutoCastTCInputs +2024-06-01T06:00:01Z INFO 2399019 [AutoCastTCInputs]: Finished (changed=True) +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/AutoCastTCInputs]: AutoCastTCInputs finished after 0.002 seconds +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2024-06-01T06:00:01Z INFO 2399019 [GenericAccessSimplifier]: Finished (changed=False) +2024-06-01T06:00:01Z INFO 2399014 [MemcpyElimination]: Finished (changed=True) +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/Simplifier]: Running Simplifier +2024-06-01T06:00:01Z INFO 2399019 [Simplifier]: Finished (changed=False) +2024-06-01T06:00:01Z USER 2399014 [sg0000/Tensorizer/MemcpyElimination]: MemcpyElimination finished after 0.205 seconds +2024-06-01T06:00:01Z USER 2399014 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2024-06-01T06:00:01Z INFO 2399018 [MemcpyElimination]: Finished (changed=True) +2024-06-01T06:00:01Z USER 2399018 [sg0002/Tensorizer/MemcpyElimination]: MemcpyElimination finished after 0.197 seconds +2024-06-01T06:00:01Z USER 2399018 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/Simplifier]: Simplifier finished after 0.003 seconds +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2024-06-01T06:00:01Z INFO 2399019 [AliasDependencyElimination]: Finished (changed=False) +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.001 seconds +2024-06-01T06:00:01Z USER 
2399019 [sg0003/Tensorizer/TileCCOps]: Running TileCCOps +2024-06-01T06:00:01Z INFO 2399019 [TileCCOps]: Finished (changed=True) +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/TileCCOps]: TileCCOps finished after 0.004 seconds +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/DelinearIndices]: Running DelinearIndices +2024-06-01T06:00:01Z INFO 2399019 [DelinearIndices]: Finished (changed=True) +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.016 seconds +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/Delinearization]: Running Delinearization +2024-06-01T06:00:01Z INFO 2399019 [Delinearization]: Finished (changed=False) +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/Delinearization]: Delinearization finished after 0.002 seconds +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/DelinearIndices]: Running DelinearIndices +2024-06-01T06:00:01Z INFO 2399019 [DelinearIndices]: Finished (changed=False) +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.004 seconds +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2024-06-01T06:00:01Z INFO 2399019 [DeadCodeElimination]: Finished (changed=False) +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.001 seconds +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/InferIntrinsicOnCC]: Running InferIntrinsicOnCC +2024-06-01T06:00:01Z INFO 2399019 [InferIntrinsicOnCC]: Finished (changed=True) +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/InferIntrinsicOnCC]: InferIntrinsicOnCC finished after 0.010 seconds +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/ResolveAccessConflict]: Running ResolveAccessConflict +2024-06-01T06:00:01Z INFO 2399019 [ResolveAccessConflict]: Finished (changed=True) +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/ResolveAccessConflict]: 
ResolveAccessConflict finished after 0.003 seconds +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/LICM]: Running LICM +2024-06-01T06:00:01Z INFO 2399019 [LICM]: Finished (changed=True) +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/LICM]: LICM finished after 0.001 seconds +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/LocalLayoutOpt]: Running LocalLayoutOpt +2024-06-01T06:00:01Z INFO 2399019 [LocalLayoutOpt]: Finished (changed=True) +2024-06-01T06:00:01Z INFO 2399017 [LoopFusion]: Finished (changed=True) +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/LocalLayoutOpt]: LocalLayoutOpt finished after 0.008 seconds +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/DelinearIndices]: Running DelinearIndices +2024-06-01T06:00:01Z INFO 2399019 [DelinearIndices]: Finished (changed=False) +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.004 seconds +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/PGLayoutTilingPipeline]: Running PGLayoutTilingPipeline +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/LayoutPreprocessingAndAnalysis]: Running LayoutPreprocessingAndAnalysis +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/LayoutPreprocessing]: Running LayoutPreprocessing +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/Delinearization]: Running Delinearization +2024-06-01T06:00:01Z INFO 2399019 [Delinearization]: Finished (changed=False) +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/Delinearization]: Delinearization finished after 0.002 seconds +2024-06-01T06:00:01Z INFO 2399019 [LayoutPreprocessing]: Finished (changed=True) +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/LayoutPreprocessing]: LayoutPreprocessing finished after 0.010 seconds +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/LayoutRequirementAnalysis]: Running LayoutRequirementAnalysis +2024-06-01T06:00:01Z USER 2399017 [sg0001/Tensorizer/LoopFusion]: LoopFusion finished after 0.230 seconds 
+2024-06-01T06:00:01Z USER 2399017 [sg0001/Tensorizer/Rematerialization]: Running Rematerialization +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/LayoutRequirementAnalysis]: LayoutRequirementAnalysis finished after 0.007 seconds +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/LayoutPreprocessingAndAnalysis]: LayoutPreprocessingAndAnalysis finished after 0.034 seconds +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/PAGLayoutOpt]: Running PAGLayoutOpt +2024-06-01T06:00:01Z INFO 2399017 [Rematerialization]: Finished (changed=False) +2024-06-01T06:00:01Z USER 2399017 [sg0001/Tensorizer/Rematerialization]: Rematerialization finished after 0.013 seconds +2024-06-01T06:00:01Z USER 2399017 [sg0001/Tensorizer/Simplifier]: Running Simplifier +2024-06-01T06:00:01Z INFO 2399019 [PAGLayoutOpt]: Finished (changed=True) +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/PAGLayoutOpt]: PAGLayoutOpt finished after 0.013 seconds +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/MaskPropagation]: Running MaskPropagation +2024-06-01T06:00:01Z INFO 2399019 [MaskPropagation]: Finished (changed=False) +2024-06-01T06:00:01Z INFO 2399017 [Simplifier]: Finished (changed=True) +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.002 seconds +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2024-06-01T06:00:01Z INFO 2399019 [LowerCCOpBlockAxis]: Finished (changed=True) +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.004 seconds +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/CanonicalizeDAGForPGTiling]: Running CanonicalizeDAGForPGTiling +2024-06-01T06:00:01Z INFO 2399019 [CanonicalizeDAGForPGTiling]: Finished (changed=True) +2024-06-01T06:00:01Z USER 2399017 [sg0001/Tensorizer/Simplifier]: Simplifier finished after 0.015 seconds +2024-06-01T06:00:01Z USER 2399017 
[sg0001/Tensorizer/Delinearization]: Running Delinearization +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/CanonicalizeDAGForPGTiling]: CanonicalizeDAGForPGTiling finished after 0.005 seconds +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/PGTiling]: Running PGTiling +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/AGOrderingAnalysisPass]: Running AGOrderingAnalysisPass +2024-06-01T06:00:01Z INFO 2399017 [Delinearization]: Finished (changed=True) +2024-06-01T06:00:01Z USER 2399017 [sg0001/Tensorizer/Delinearization]: Delinearization finished after 0.016 seconds +2024-06-01T06:00:01Z USER 2399017 [sg0001/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2024-06-01T06:00:01Z INFO 2399017 [AliasDependencyElimination]: Finished (changed=False) +2024-06-01T06:00:01Z USER 2399017 [sg0001/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.002 seconds +2024-06-01T06:00:01Z USER 2399017 [sg0001/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2024-06-01T06:00:01Z INFO 2399018 [LoopFusion]: Finished (changed=True) +2024-06-01T06:00:01Z INFO 2399014 [LoopFusion]: Finished (changed=True) +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/AGOrderingAnalysisPass]: AGOrderingAnalysisPass finished after 0.017 seconds +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/CuttingAndMacroGeneration]: Running CuttingAndMacroGeneration +2024-06-01T06:00:01Z USER 2399014 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 0.246 seconds +2024-06-01T06:00:01Z USER 2399014 [sg0000/Tensorizer/Rematerialization]: Running Rematerialization +2024-06-01T06:00:01Z USER 2399018 [sg0002/Tensorizer/LoopFusion]: LoopFusion finished after 0.235 seconds +2024-06-01T06:00:01Z USER 2399018 [sg0002/Tensorizer/Rematerialization]: Running Rematerialization +2024-06-01T06:00:01Z INFO 2399014 [Rematerialization]: Finished (changed=False) +2024-06-01T06:00:01Z USER 2399014 
[sg0000/Tensorizer/Rematerialization]: Rematerialization finished after 0.013 seconds +2024-06-01T06:00:01Z USER 2399014 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2024-06-01T06:00:01Z INFO 2399014 [Simplifier]: Finished (changed=True) +2024-06-01T06:00:01Z INFO 2399018 [Rematerialization]: Finished (changed=False) +2024-06-01T06:00:01Z USER 2399014 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.009 seconds +2024-06-01T06:00:01Z USER 2399014 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2024-06-01T06:00:01Z INFO 2399019 [CuttingAndMacroGeneration]: Finished (changed=True) +2024-06-01T06:00:01Z USER 2399018 [sg0002/Tensorizer/Rematerialization]: Rematerialization finished after 0.015 seconds +2024-06-01T06:00:01Z USER 2399018 [sg0002/Tensorizer/Simplifier]: Running Simplifier +2024-06-01T06:00:01Z INFO 2399014 [Delinearization]: Finished (changed=True) +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/CuttingAndMacroGeneration]: CuttingAndMacroGeneration finished after 0.045 seconds +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/PGTiling]: PGTiling finished after 0.090 seconds +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/InsertIOTransposes]: Running InsertIOTransposes +2024-06-01T06:00:01Z INFO 2399018 [Simplifier]: Finished (changed=True) +2024-06-01T06:00:01Z USER 2399018 [sg0002/Tensorizer/Simplifier]: Simplifier finished after 0.017 seconds +2024-06-01T06:00:01Z USER 2399018 [sg0002/Tensorizer/Delinearization]: Running Delinearization +2024-06-01T06:00:01Z INFO 2399019 [InsertIOTransposes]: Finished (changed=True) +2024-06-01T06:00:01Z USER 2399014 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.017 seconds +2024-06-01T06:00:01Z USER 2399014 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2024-06-01T06:00:01Z INFO 2399014 [AliasDependencyElimination]: Finished (changed=False) +2024-06-01T06:00:01Z USER 2399014 
[sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.002 seconds +2024-06-01T06:00:01Z USER 2399014 [sg0000/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2024-06-01T06:00:01Z INFO 2399018 [Delinearization]: Finished (changed=True) +2024-06-01T06:00:01Z USER 2399018 [sg0002/Tensorizer/Delinearization]: Delinearization finished after 0.019 seconds +2024-06-01T06:00:01Z USER 2399018 [sg0002/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2024-06-01T06:00:01Z INFO 2399018 [AliasDependencyElimination]: Finished (changed=False) +2024-06-01T06:00:01Z USER 2399018 [sg0002/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.002 seconds +2024-06-01T06:00:01Z USER 2399018 [sg0002/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/InsertIOTransposes]: InsertIOTransposes finished after 0.010 seconds +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/PGLayoutTilingPipeline]: PGLayoutTilingPipeline finished after 0.236 seconds +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/TilingProfiler]: Running TilingProfiler +2024-06-01T06:00:01Z INFO 2399019 [TilingProfiler]: Finished (changed=False) +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/TilingProfiler]: TilingProfiler finished after 0.006 seconds +2024-06-01T06:00:01Z USER 2399019 [sg0003/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2024-06-01T06:00:02Z INFO 2399019 [FlattenMacroLoop]: Finished (changed=True) +2024-06-01T06:00:02Z USER 2399019 [sg0003/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.009 seconds +2024-06-01T06:00:02Z USER 2399019 [sg0003/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor +2024-06-01T06:00:02Z INFO 2399019 [InferNeuronTensor]: Finished (changed=True) +2024-06-01T06:00:02Z USER 2399019 [sg0003/Tensorizer/InferNeuronTensor]: InferNeuronTensor finished after 0.025 seconds 
+2024-06-01T06:00:02Z USER 2399019 [sg0003/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2024-06-01T06:00:02Z INFO 2399017 [DeadStoreElimination]: Finished (changed=False) +2024-06-01T06:00:02Z INFO 2399019 [NeuronSimplifier]: Finished (changed=False) +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.190 seconds +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2024-06-01T06:00:02Z INFO 2399017 [AliasDependencyInduction]: Finished (changed=False) +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.000 seconds +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/Simplifier]: Running Simplifier +2024-06-01T06:00:02Z INFO 2399017 [Simplifier]: Finished (changed=False) +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/Simplifier]: Simplifier finished after 0.005 seconds +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/LICM]: Running LICM +2024-06-01T06:00:02Z INFO 2399017 [LICM]: Finished (changed=True) +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/LICM]: LICM finished after 0.004 seconds +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/Delinearization]: Running Delinearization +2024-06-01T06:00:02Z INFO 2399017 [Delinearization]: Finished (changed=False) +2024-06-01T06:00:02Z USER 2399019 [sg0003/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.007 seconds +2024-06-01T06:00:02Z USER 2399019 [sg0003/Tensorizer/LICM]: Running LICM +2024-06-01T06:00:02Z INFO 2399019 [LICM]: Finished (changed=True) +2024-06-01T06:00:02Z USER 2399019 [sg0003/Tensorizer/LICM]: LICM finished after 0.003 seconds +2024-06-01T06:00:02Z USER 2399019 [sg0003/Tensorizer/RewriteReplicationMatmul]: Running RewriteReplicationMatmul +2024-06-01T06:00:02Z INFO 2399019 [RewriteReplicationMatmul]: Finished (changed=False) +2024-06-01T06:00:02Z USER 2399019 
[sg0003/Tensorizer/RewriteReplicationMatmul]: RewriteReplicationMatmul finished after 0.002 seconds +2024-06-01T06:00:02Z USER 2399019 [sg0003/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2024-06-01T06:00:02Z INFO 2399019 [FlattenMacroLoop]: Finished (changed=True) +2024-06-01T06:00:02Z USER 2399019 [sg0003/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.006 seconds +2024-06-01T06:00:02Z USER 2399019 [sg0003/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/Delinearization]: Delinearization finished after 0.005 seconds +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion +2024-06-01T06:00:02Z INFO 2399014 [DeadStoreElimination]: Finished (changed=False) +2024-06-01T06:00:02Z USER 2399014 [sg0000/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.180 seconds +2024-06-01T06:00:02Z USER 2399014 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2024-06-01T06:00:02Z INFO 2399014 [AliasDependencyInduction]: Finished (changed=False) +2024-06-01T06:00:02Z USER 2399014 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.000 seconds +2024-06-01T06:00:02Z USER 2399014 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2024-06-01T06:00:02Z INFO 2399014 [Simplifier]: Finished (changed=False) +2024-06-01T06:00:02Z USER 2399014 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.004 seconds +2024-06-01T06:00:02Z USER 2399014 [sg0000/Tensorizer/LICM]: Running LICM +2024-06-01T06:00:02Z INFO 2399014 [LICM]: Finished (changed=True) +2024-06-01T06:00:02Z USER 2399014 [sg0000/Tensorizer/LICM]: LICM finished after 0.003 seconds +2024-06-01T06:00:02Z USER 2399014 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2024-06-01T06:00:02Z INFO 2399018 [DeadStoreElimination]: Finished (changed=False) +2024-06-01T06:00:02Z INFO 2399014 [Delinearization]: 
Finished (changed=False) +2024-06-01T06:00:02Z USER 2399018 [sg0002/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.195 seconds +2024-06-01T06:00:02Z USER 2399018 [sg0002/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2024-06-01T06:00:02Z INFO 2399018 [AliasDependencyInduction]: Finished (changed=False) +2024-06-01T06:00:02Z USER 2399018 [sg0002/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.000 seconds +2024-06-01T06:00:02Z USER 2399018 [sg0002/Tensorizer/Simplifier]: Running Simplifier +2024-06-01T06:00:02Z INFO 2399018 [Simplifier]: Finished (changed=False) +2024-06-01T06:00:02Z INFO 2399017 [LoopFusion]: Finished (changed=False) +2024-06-01T06:00:02Z USER 2399018 [sg0002/Tensorizer/Simplifier]: Simplifier finished after 0.005 seconds +2024-06-01T06:00:02Z USER 2399018 [sg0002/Tensorizer/LICM]: Running LICM +2024-06-01T06:00:02Z INFO 2399018 [LICM]: Finished (changed=True) +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/LoopFusion]: LoopFusion finished after 0.062 seconds +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/SimplifySlice]: Running SimplifySlice +2024-06-01T06:00:02Z INFO 2399017 [SimplifySlice]: Finished (changed=False) +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/SimplifySlice]: SimplifySlice finished after 0.002 seconds +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/LICM]: Running LICM +2024-06-01T06:00:02Z INFO 2399017 [LICM]: Finished (changed=False) +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/LICM]: LICM finished after 0.003 seconds +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/Simplifier]: Running Simplifier +2024-06-01T06:00:02Z INFO 2399017 [Simplifier]: Finished (changed=False) +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/Simplifier]: Simplifier finished after 0.005 seconds +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/ValueNumbering]: Running ValueNumbering +2024-06-01T06:00:02Z INFO 2399017 
[ValueNumbering]: Finished (changed=True) +2024-06-01T06:00:02Z INFO 2399019 [SimplifyMacroPredicates]: Finished (changed=True) +2024-06-01T06:00:02Z USER 2399018 [sg0002/Tensorizer/LICM]: LICM finished after 0.004 seconds +2024-06-01T06:00:02Z USER 2399018 [sg0002/Tensorizer/Delinearization]: Running Delinearization +2024-06-01T06:00:02Z INFO 2399018 [Delinearization]: Finished (changed=False) +2024-06-01T06:00:02Z USER 2399019 [sg0003/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.114 seconds +2024-06-01T06:00:02Z USER 2399019 [sg0003/Tensorizer/DataLocalityOpt]: Running DataLocalityOpt +2024-06-01T06:00:02Z USER 2399014 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.004 seconds +2024-06-01T06:00:02Z USER 2399014 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2024-06-01T06:00:02Z USER 2399018 [sg0002/Tensorizer/Delinearization]: Delinearization finished after 0.005 seconds +2024-06-01T06:00:02Z USER 2399018 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.005 seconds +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/LICM]: Running LICM +2024-06-01T06:00:02Z INFO 2399017 [LICM]: Finished (changed=False) +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/LICM]: LICM finished after 0.002 seconds +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/PadElimination]: Running PadElimination +2024-06-01T06:00:02Z INFO 2399017 [PadElimination]: Finished (changed=False) +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/PadElimination]: PadElimination finished after 0.000 seconds +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/Delinearization]: Running Delinearization +2024-06-01T06:00:02Z INFO 2399017 [Delinearization]: Finished (changed=False) +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/Delinearization]: Delinearization finished after 0.004 seconds +2024-06-01T06:00:02Z USER 2399017 
[sg0001/Tensorizer/LoopFusion]: Running LoopFusion +2024-06-01T06:00:02Z INFO 2399019 [DataLocalityOpt]: Finished (changed=True) +2024-06-01T06:00:02Z USER 2399019 [sg0003/Tensorizer/DataLocalityOpt]: DataLocalityOpt finished after 0.093 seconds +2024-06-01T06:00:02Z USER 2399019 [sg0003/Tensorizer/DMATilingProfiler]: Running DMATilingProfiler +2024-06-01T06:00:02Z INFO 2399018 [LoopFusion]: Finished (changed=False) +2024-06-01T06:00:02Z INFO 2399014 [LoopFusion]: Finished (changed=False) +2024-06-01T06:00:02Z INFO 2399019 [DMATilingProfiler]: Finished (changed=False) +2024-06-01T06:00:02Z USER 2399018 [sg0002/Tensorizer/LoopFusion]: LoopFusion finished after 0.079 seconds +2024-06-01T06:00:02Z USER 2399018 [sg0002/Tensorizer/SimplifySlice]: Running SimplifySlice +2024-06-01T06:00:02Z INFO 2399018 [SimplifySlice]: Finished (changed=False) +2024-06-01T06:00:02Z INFO 2399017 [LoopFusion]: Finished (changed=False) +2024-06-01T06:00:02Z USER 2399014 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 0.091 seconds +2024-06-01T06:00:02Z USER 2399014 [sg0000/Tensorizer/SimplifySlice]: Running SimplifySlice +2024-06-01T06:00:02Z INFO 2399014 [SimplifySlice]: Finished (changed=False) +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/LoopFusion]: LoopFusion finished after 0.059 seconds +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2024-06-01T06:00:02Z INFO 2399017 [GenericAccessSimplifier]: Finished (changed=False) +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.002 seconds +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/Simplifier]: Running Simplifier +2024-06-01T06:00:02Z INFO 2399017 [Simplifier]: Finished (changed=False) +2024-06-01T06:00:02Z USER 2399018 [sg0002/Tensorizer/SimplifySlice]: SimplifySlice finished after 0.004 seconds +2024-06-01T06:00:02Z USER 2399018 [sg0002/Tensorizer/LICM]: Running LICM 
+2024-06-01T06:00:02Z INFO 2399018 [LICM]: Finished (changed=False) +2024-06-01T06:00:02Z USER 2399019 [sg0003/Tensorizer/DMATilingProfiler]: DMATilingProfiler finished after 0.003 seconds +2024-06-01T06:00:02Z USER 2399019 [sg0003/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/Simplifier]: Simplifier finished after 0.003 seconds +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/LICM]: Running LICM +2024-06-01T06:00:02Z INFO 2399019 [NeuronSimplifier]: Finished (changed=False) +2024-06-01T06:00:02Z INFO 2399017 [LICM]: Finished (changed=False) +2024-06-01T06:00:02Z USER 2399014 [sg0000/Tensorizer/SimplifySlice]: SimplifySlice finished after 0.002 seconds +2024-06-01T06:00:02Z USER 2399014 [sg0000/Tensorizer/LICM]: Running LICM +2024-06-01T06:00:02Z INFO 2399014 [LICM]: Finished (changed=False) +2024-06-01T06:00:02Z USER 2399019 [sg0003/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.008 seconds +2024-06-01T06:00:02Z USER 2399019 [sg0003/Tensorizer/LegalizeSundaMacro]: Running LegalizeSundaMacro +2024-06-01T06:00:02Z INFO 2399019 [LegalizeSundaMacro]: Finished (changed=True) +2024-06-01T06:00:02Z USER 2399014 [sg0000/Tensorizer/LICM]: LICM finished after 0.004 seconds +2024-06-01T06:00:02Z USER 2399014 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2024-06-01T06:00:02Z INFO 2399014 [Simplifier]: Finished (changed=False) +2024-06-01T06:00:02Z USER 2399019 [sg0003/Tensorizer/LegalizeSundaMacro]: LegalizeSundaMacro finished after 0.006 seconds +2024-06-01T06:00:02Z USER 2399019 [sg0003/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/LICM]: LICM finished after 0.003 seconds +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/ValueNumbering]: Running ValueNumbering +2024-06-01T06:00:02Z INFO 2399017 [ValueNumbering]: Finished (changed=False) +2024-06-01T06:00:02Z INFO 2399019 [NeuronSimplifier]: Finished (changed=False) 
+2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.003 seconds +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/TCTransform]: Running TCTransform +2024-06-01T06:00:02Z INFO 2399017 [TCTransform]: Finished (changed=False) +2024-06-01T06:00:02Z USER 2399014 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.004 seconds +2024-06-01T06:00:02Z USER 2399014 [sg0000/Tensorizer/ValueNumbering]: Running ValueNumbering +2024-06-01T06:00:02Z INFO 2399014 [ValueNumbering]: Finished (changed=True) +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/TCTransform]: TCTransform finished after 0.003 seconds +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/CommuteConcat]: Running CommuteConcat +2024-06-01T06:00:02Z INFO 2399017 [CommuteConcat]: Finished (changed=False) +2024-06-01T06:00:02Z USER 2399018 [sg0002/Tensorizer/LICM]: LICM finished after 0.004 seconds +2024-06-01T06:00:02Z USER 2399018 [sg0002/Tensorizer/Simplifier]: Running Simplifier +2024-06-01T06:00:02Z INFO 2399018 [Simplifier]: Finished (changed=False) +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.001 seconds +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom +2024-06-01T06:00:02Z INFO 2399017 [RecognizeOpIdiom]: Finished (changed=False) +2024-06-01T06:00:02Z USER 2399018 [sg0002/Tensorizer/Simplifier]: Simplifier finished after 0.005 seconds +2024-06-01T06:00:02Z USER 2399018 [sg0002/Tensorizer/ValueNumbering]: Running ValueNumbering +2024-06-01T06:00:02Z INFO 2399018 [ValueNumbering]: Finished (changed=True) +2024-06-01T06:00:02Z USER 2399014 [sg0000/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.005 seconds +2024-06-01T06:00:02Z USER 2399014 [sg0000/Tensorizer/LICM]: Running LICM +2024-06-01T06:00:02Z INFO 2399014 [LICM]: Finished (changed=False) +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/RecognizeOpIdiom]: 
RecognizeOpIdiom finished after 0.005 seconds +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/MaskPropagation]: Running MaskPropagation +2024-06-01T06:00:02Z INFO 2399017 [MaskPropagation]: Finished (changed=False) +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.003 seconds +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2024-06-01T06:00:02Z USER 2399019 [sg0003/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.009 seconds +2024-06-01T06:00:02Z USER 2399019 [sg0003/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2024-06-01T06:00:02Z INFO 2399019 [PerfectLoopNest]: Finished (changed=False) +2024-06-01T06:00:02Z USER 2399019 [sg0003/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.002 seconds +2024-06-01T06:00:02Z USER 2399019 [sg0003/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2024-06-01T06:00:02Z INFO 2399019 [FlattenMacroLoop]: Finished (changed=True) +2024-06-01T06:00:02Z USER 2399018 [sg0002/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.006 seconds +2024-06-01T06:00:02Z USER 2399018 [sg0002/Tensorizer/LICM]: Running LICM +2024-06-01T06:00:02Z INFO 2399018 [LICM]: Finished (changed=False) +2024-06-01T06:00:02Z USER 2399019 [sg0003/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.007 seconds +2024-06-01T06:00:02Z USER 2399019 [sg0003/Tensorizer/RewriteWeights]: Running RewriteWeights +2024-06-01T06:00:02Z INFO 2399019 [RewriteWeights]: Finished (changed=False) +2024-06-01T06:00:02Z USER 2399019 [sg0003/Tensorizer/RewriteWeights]: RewriteWeights finished after 0.002 seconds +2024-06-01T06:00:02Z USER 2399019 [sg0003/Tensorizer/ReshapeWeights]: Running ReshapeWeights +2024-06-01T06:00:02Z INFO 2399019 [ReshapeWeights]: Finished (changed=False) +2024-06-01T06:00:02Z USER 2399019 [sg0003/Tensorizer/ReshapeWeights]: ReshapeWeights finished after 0.001 seconds 
+2024-06-01T06:00:02Z USER 2399019 [sg0003/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2024-06-01T06:00:02Z INFO 2399019 [FlattenMacroLoop]: Finished (changed=False) +2024-06-01T06:00:02Z USER 2399019 [sg0003/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.003 seconds +2024-06-01T06:00:02Z USER 2399019 [sg0003/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2024-06-01T06:00:02Z USER 2399018 [sg0002/Tensorizer/LICM]: LICM finished after 0.004 seconds +2024-06-01T06:00:02Z USER 2399018 [sg0002/Tensorizer/PadElimination]: Running PadElimination +2024-06-01T06:00:02Z INFO 2399018 [PadElimination]: Finished (changed=False) +2024-06-01T06:00:02Z USER 2399018 [sg0002/Tensorizer/PadElimination]: PadElimination finished after 0.000 seconds +2024-06-01T06:00:02Z USER 2399018 [sg0002/Tensorizer/Delinearization]: Running Delinearization +2024-06-01T06:00:02Z INFO 2399018 [Delinearization]: Finished (changed=False) +2024-06-01T06:00:02Z USER 2399018 [sg0002/Tensorizer/Delinearization]: Delinearization finished after 0.005 seconds +2024-06-01T06:00:02Z USER 2399018 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion +2024-06-01T06:00:02Z USER 2399014 [sg0000/Tensorizer/LICM]: LICM finished after 0.002 seconds +2024-06-01T06:00:02Z USER 2399014 [sg0000/Tensorizer/PadElimination]: Running PadElimination +2024-06-01T06:00:02Z INFO 2399014 [PadElimination]: Finished (changed=False) +2024-06-01T06:00:02Z USER 2399014 [sg0000/Tensorizer/PadElimination]: PadElimination finished after 0.000 seconds +2024-06-01T06:00:02Z USER 2399014 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2024-06-01T06:00:02Z INFO 2399014 [Delinearization]: Finished (changed=False) +2024-06-01T06:00:02Z USER 2399014 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.004 seconds +2024-06-01T06:00:02Z USER 2399014 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2024-06-01T06:00:02Z INFO 2399017 [DeadStoreElimination]: Finished 
(changed=False) +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.126 seconds +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/Recompute]: Running Recompute +2024-06-01T06:00:02Z INFO 2399017 [Recompute]: Finished (changed=False) +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/Recompute]: Recompute finished after 0.000 seconds +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2024-06-01T06:00:02Z INFO 2399017 [DeadCodeElimination]: Finished (changed=False) +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.001 seconds +2024-06-01T06:00:02Z INFO 2399017 [Tensorizer]: After optimization: 25 statements +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/DoNothing]: Running DoNothing +2024-06-01T06:00:02Z INFO 2399017 [DoNothing]: Finished (changed=True) +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/MutateDataType]: Running MutateDataType +2024-06-01T06:00:02Z INFO 2399017 [MutateDataType]: Finished (changed=True) +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/MutateDataType]: MutateDataType finished after 0.001 seconds +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/AutoCastTCInputs]: Running AutoCastTCInputs +2024-06-01T06:00:02Z INFO 2399017 [AutoCastTCInputs]: Finished (changed=True) +2024-06-01T06:00:02Z INFO 2399018 [LoopFusion]: Finished (changed=False) +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/AutoCastTCInputs]: AutoCastTCInputs finished after 0.002 seconds +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2024-06-01T06:00:02Z INFO 2399017 [GenericAccessSimplifier]: Finished (changed=False) +2024-06-01T06:00:02Z USER 2399018 [sg0002/Tensorizer/LoopFusion]: LoopFusion 
finished after 0.079 seconds +2024-06-01T06:00:02Z USER 2399018 [sg0002/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2024-06-01T06:00:02Z INFO 2399018 [GenericAccessSimplifier]: Finished (changed=False) +2024-06-01T06:00:02Z INFO 2399014 [LoopFusion]: Finished (changed=False) +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/Simplifier]: Running Simplifier +2024-06-01T06:00:02Z INFO 2399017 [Simplifier]: Finished (changed=False) +2024-06-01T06:00:02Z USER 2399014 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 0.066 seconds +2024-06-01T06:00:02Z USER 2399014 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2024-06-01T06:00:02Z INFO 2399014 [GenericAccessSimplifier]: Finished (changed=False) +2024-06-01T06:00:02Z USER 2399014 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2024-06-01T06:00:02Z USER 2399014 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2024-06-01T06:00:02Z INFO 2399014 [Simplifier]: Finished (changed=False) +2024-06-01T06:00:02Z USER 2399014 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.003 seconds +2024-06-01T06:00:02Z USER 2399014 [sg0000/Tensorizer/LICM]: Running LICM +2024-06-01T06:00:02Z INFO 2399014 [LICM]: Finished (changed=False) +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/Simplifier]: Simplifier finished after 0.004 seconds +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2024-06-01T06:00:02Z INFO 2399017 [AliasDependencyElimination]: Finished (changed=False) +2024-06-01T06:00:02Z USER 2399018 [sg0002/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.003 seconds +2024-06-01T06:00:02Z USER 2399018 [sg0002/Tensorizer/Simplifier]: Running Simplifier +2024-06-01T06:00:02Z 
INFO 2399018 [Simplifier]: Finished (changed=False) +2024-06-01T06:00:02Z INFO 2399019 [SimplifyMacroPredicates]: Finished (changed=True) +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.002 seconds +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/TileCCOps]: Running TileCCOps +2024-06-01T06:00:02Z USER 2399018 [sg0002/Tensorizer/Simplifier]: Simplifier finished after 0.006 seconds +2024-06-01T06:00:02Z USER 2399018 [sg0002/Tensorizer/LICM]: Running LICM +2024-06-01T06:00:02Z INFO 2399017 [TileCCOps]: Finished (changed=True) +2024-06-01T06:00:02Z INFO 2399018 [LICM]: Finished (changed=False) +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/TileCCOps]: TileCCOps finished after 0.011 seconds +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/DelinearIndices]: Running DelinearIndices +2024-06-01T06:00:02Z USER 2399018 [sg0002/Tensorizer/LICM]: LICM finished after 0.003 seconds +2024-06-01T06:00:02Z USER 2399018 [sg0002/Tensorizer/ValueNumbering]: Running ValueNumbering +2024-06-01T06:00:02Z INFO 2399018 [ValueNumbering]: Finished (changed=False) +2024-06-01T06:00:02Z USER 2399018 [sg0002/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.004 seconds +2024-06-01T06:00:02Z USER 2399018 [sg0002/Tensorizer/TCTransform]: Running TCTransform +2024-06-01T06:00:02Z INFO 2399018 [TCTransform]: Finished (changed=False) +2024-06-01T06:00:02Z USER 2399018 [sg0002/Tensorizer/TCTransform]: TCTransform finished after 0.003 seconds +2024-06-01T06:00:02Z USER 2399018 [sg0002/Tensorizer/CommuteConcat]: Running CommuteConcat +2024-06-01T06:00:02Z INFO 2399018 [CommuteConcat]: Finished (changed=False) +2024-06-01T06:00:02Z USER 2399018 [sg0002/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.002 seconds +2024-06-01T06:00:02Z USER 2399018 [sg0002/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom +2024-06-01T06:00:02Z INFO 2399018 [RecognizeOpIdiom]: Finished (changed=False) 
+2024-06-01T06:00:02Z USER 2399014 [sg0000/Tensorizer/LICM]: LICM finished after 0.002 seconds +2024-06-01T06:00:02Z USER 2399014 [sg0000/Tensorizer/ValueNumbering]: Running ValueNumbering +2024-06-01T06:00:02Z INFO 2399014 [ValueNumbering]: Finished (changed=False) +2024-06-01T06:00:02Z USER 2399018 [sg0002/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom finished after 0.008 seconds +2024-06-01T06:00:02Z USER 2399018 [sg0002/Tensorizer/MaskPropagation]: Running MaskPropagation +2024-06-01T06:00:02Z INFO 2399017 [DelinearIndices]: Finished (changed=True) +2024-06-01T06:00:02Z INFO 2399018 [MaskPropagation]: Finished (changed=False) +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.057 seconds +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/Delinearization]: Running Delinearization +2024-06-01T06:00:02Z USER 2399018 [sg0002/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.004 seconds +2024-06-01T06:00:02Z USER 2399018 [sg0002/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2024-06-01T06:00:02Z INFO 2399017 [Delinearization]: Finished (changed=False) +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/Delinearization]: Delinearization finished after 0.005 seconds +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/DelinearIndices]: Running DelinearIndices +2024-06-01T06:00:02Z INFO 2399017 [DelinearIndices]: Finished (changed=False) +2024-06-01T06:00:02Z USER 2399014 [sg0000/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.004 seconds +2024-06-01T06:00:02Z USER 2399014 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2024-06-01T06:00:02Z INFO 2399014 [TCTransform]: Finished (changed=True) +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.016 seconds +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2024-06-01T06:00:02Z INFO 2399017 
[DeadCodeElimination]: Finished (changed=False) +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.003 seconds +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/InferIntrinsicOnCC]: Running InferIntrinsicOnCC +2024-06-01T06:00:02Z USER 2399014 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.003 seconds +2024-06-01T06:00:02Z USER 2399014 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat +2024-06-01T06:00:02Z INFO 2399014 [CommuteConcat]: Finished (changed=False) +2024-06-01T06:00:02Z USER 2399019 [sg0003/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.165 seconds +2024-06-01T06:00:02Z USER 2399019 [sg0003/Tensorizer/InferInitValue]: Running InferInitValue +2024-06-01T06:00:02Z INFO 2399017 [InferIntrinsicOnCC]: Finished (changed=True) +2024-06-01T06:00:02Z USER 2399014 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.002 seconds +2024-06-01T06:00:02Z USER 2399014 [sg0000/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom +2024-06-01T06:00:02Z INFO 2399014 [RecognizeOpIdiom]: Finished (changed=False) +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/InferIntrinsicOnCC]: InferIntrinsicOnCC finished after 0.025 seconds +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/ResolveAccessConflict]: Running ResolveAccessConflict +2024-06-01T06:00:02Z USER 2399014 [sg0000/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom finished after 0.006 seconds +2024-06-01T06:00:02Z USER 2399014 [sg0000/Tensorizer/MaskPropagation]: Running MaskPropagation +2024-06-01T06:00:02Z INFO 2399017 [ResolveAccessConflict]: Finished (changed=True) +2024-06-01T06:00:02Z INFO 2399014 [MaskPropagation]: Finished (changed=False) +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/ResolveAccessConflict]: ResolveAccessConflict finished after 0.008 seconds +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/LICM]: Running LICM +2024-06-01T06:00:02Z INFO 2399017 
[LICM]: Finished (changed=False) +2024-06-01T06:00:02Z USER 2399014 [sg0000/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.004 seconds +2024-06-01T06:00:02Z USER 2399014 [sg0000/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/LICM]: LICM finished after 0.002 seconds +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/LocalLayoutOpt]: Running LocalLayoutOpt +2024-06-01T06:00:02Z INFO 2399017 [LocalLayoutOpt]: Finished (changed=True) +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/LocalLayoutOpt]: LocalLayoutOpt finished after 0.019 seconds +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/DelinearIndices]: Running DelinearIndices +2024-06-01T06:00:02Z INFO 2399017 [DelinearIndices]: Finished (changed=False) +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.012 seconds +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/PGLayoutTilingPipeline]: Running PGLayoutTilingPipeline +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/LayoutPreprocessingAndAnalysis]: Running LayoutPreprocessingAndAnalysis +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/LayoutPreprocessing]: Running LayoutPreprocessing +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/Delinearization]: Running Delinearization +2024-06-01T06:00:02Z INFO 2399017 [Delinearization]: Finished (changed=False) +2024-06-01T06:00:02Z INFO 2399018 [DeadStoreElimination]: Finished (changed=False) +2024-06-01T06:00:02Z INFO 2399019 [InferInitValue]: Finished (changed=True) +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/Delinearization]: Delinearization finished after 0.004 seconds +2024-06-01T06:00:02Z INFO 2399017 [LayoutPreprocessing]: Finished (changed=True) +2024-06-01T06:00:02Z USER 2399019 [sg0003/Tensorizer/InferInitValue]: InferInitValue finished after 0.100 seconds +2024-06-01T06:00:02Z USER 2399019 [sg0003/Tensorizer/NeuronSimplifier]: Running 
NeuronSimplifier +2024-06-01T06:00:02Z USER 2399018 [sg0002/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.153 seconds +2024-06-01T06:00:02Z USER 2399018 [sg0002/Tensorizer/Recompute]: Running Recompute +2024-06-01T06:00:02Z INFO 2399018 [Recompute]: Finished (changed=False) +2024-06-01T06:00:02Z INFO 2399019 [NeuronSimplifier]: Finished (changed=False) +2024-06-01T06:00:02Z USER 2399018 [sg0002/Tensorizer/Recompute]: Recompute finished after 0.000 seconds +2024-06-01T06:00:02Z USER 2399018 [sg0002/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2024-06-01T06:00:02Z INFO 2399018 [DeadCodeElimination]: Finished (changed=False) +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/LayoutPreprocessing]: LayoutPreprocessing finished after 0.014 seconds +2024-06-01T06:00:02Z USER 2399017 [sg0001/Tensorizer/LayoutRequirementAnalysis]: Running LayoutRequirementAnalysis +2024-06-01T06:00:03Z USER 2399018 [sg0002/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.002 seconds +2024-06-01T06:00:03Z INFO 2399018 [Tensorizer]: After optimization: 25 statements +2024-06-01T06:00:03Z USER 2399018 [sg0002/Tensorizer/DoNothing]: Running DoNothing +2024-06-01T06:00:03Z INFO 2399018 [DoNothing]: Finished (changed=True) +2024-06-01T06:00:03Z USER 2399018 [sg0002/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2024-06-01T06:00:03Z USER 2399018 [sg0002/Tensorizer/MutateDataType]: Running MutateDataType +2024-06-01T06:00:03Z INFO 2399018 [MutateDataType]: Finished (changed=True) +2024-06-01T06:00:03Z USER 2399017 [sg0001/Tensorizer/LayoutRequirementAnalysis]: LayoutRequirementAnalysis finished after 0.014 seconds +2024-06-01T06:00:03Z USER 2399017 [sg0001/Tensorizer/LayoutPreprocessingAndAnalysis]: LayoutPreprocessingAndAnalysis finished after 0.062 seconds +2024-06-01T06:00:03Z USER 2399017 [sg0001/Tensorizer/PAGLayoutOpt]: Running PAGLayoutOpt +2024-06-01T06:00:03Z INFO 2399014 [DeadStoreElimination]: Finished 
(changed=False) +2024-06-01T06:00:03Z USER 2399014 [sg0000/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.123 seconds +2024-06-01T06:00:03Z USER 2399014 [sg0000/Tensorizer/Recompute]: Running Recompute +2024-06-01T06:00:03Z INFO 2399014 [Recompute]: Finished (changed=False) +2024-06-01T06:00:03Z USER 2399014 [sg0000/Tensorizer/Recompute]: Recompute finished after 0.001 seconds +2024-06-01T06:00:03Z USER 2399014 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2024-06-01T06:00:03Z INFO 2399014 [DeadCodeElimination]: Finished (changed=False) +2024-06-01T06:00:03Z USER 2399014 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.001 seconds +2024-06-01T06:00:03Z INFO 2399014 [Tensorizer]: After optimization: 20 statements +2024-06-01T06:00:03Z USER 2399014 [sg0000/Tensorizer/DoNothing]: Running DoNothing +2024-06-01T06:00:03Z INFO 2399014 [DoNothing]: Finished (changed=True) +2024-06-01T06:00:03Z INFO 2399017 [PAGLayoutOpt]: Finished (changed=True) +2024-06-01T06:00:03Z USER 2399014 [sg0000/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2024-06-01T06:00:03Z USER 2399014 [sg0000/Tensorizer/MutateDataType]: Running MutateDataType +2024-06-01T06:00:03Z INFO 2399014 [MutateDataType]: Finished (changed=True) +2024-06-01T06:00:03Z USER 2399017 [sg0001/Tensorizer/PAGLayoutOpt]: PAGLayoutOpt finished after 0.023 seconds +2024-06-01T06:00:03Z USER 2399017 [sg0001/Tensorizer/MaskPropagation]: Running MaskPropagation +2024-06-01T06:00:03Z INFO 2399017 [MaskPropagation]: Finished (changed=False) +2024-06-01T06:00:03Z USER 2399017 [sg0001/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.003 seconds +2024-06-01T06:00:03Z USER 2399017 [sg0001/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2024-06-01T06:00:03Z INFO 2399017 [LowerCCOpBlockAxis]: Finished (changed=True) +2024-06-01T06:00:03Z USER 2399018 [sg0002/Tensorizer/MutateDataType]: MutateDataType finished after 0.003 
seconds +2024-06-01T06:00:03Z USER 2399018 [sg0002/Tensorizer/AutoCastTCInputs]: Running AutoCastTCInputs +2024-06-01T06:00:03Z INFO 2399018 [AutoCastTCInputs]: Finished (changed=True) +2024-06-01T06:00:03Z USER 2399017 [sg0001/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.008 seconds +2024-06-01T06:00:03Z USER 2399017 [sg0001/Tensorizer/CanonicalizeDAGForPGTiling]: Running CanonicalizeDAGForPGTiling +2024-06-01T06:00:03Z INFO 2399017 [CanonicalizeDAGForPGTiling]: Finished (changed=True) +2024-06-01T06:00:03Z USER 2399018 [sg0002/Tensorizer/AutoCastTCInputs]: AutoCastTCInputs finished after 0.003 seconds +2024-06-01T06:00:03Z USER 2399018 [sg0002/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2024-06-01T06:00:03Z INFO 2399018 [GenericAccessSimplifier]: Finished (changed=False) +2024-06-01T06:00:03Z USER 2399019 [sg0003/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.009 seconds +2024-06-01T06:00:03Z USER 2399019 [sg0003/Tensorizer/SimplifyTensor]: Running SimplifyTensor +2024-06-01T06:00:03Z INFO 2399019 [SimplifyTensor]: Finished (changed=True) +2024-06-01T06:00:03Z USER 2399018 [sg0002/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.003 seconds +2024-06-01T06:00:03Z USER 2399018 [sg0002/Tensorizer/Simplifier]: Running Simplifier +2024-06-01T06:00:03Z INFO 2399018 [Simplifier]: Finished (changed=False) +2024-06-01T06:00:03Z USER 2399019 [sg0003/Tensorizer/SimplifyTensor]: SimplifyTensor finished after 0.008 seconds +2024-06-01T06:00:03Z USER 2399019 [sg0003/Tensorizer/LICM]: Running LICM +2024-06-01T06:00:03Z INFO 2399019 [LICM]: Finished (changed=False) +2024-06-01T06:00:03Z USER 2399018 [sg0002/Tensorizer/Simplifier]: Simplifier finished after 0.005 seconds +2024-06-01T06:00:03Z USER 2399018 [sg0002/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2024-06-01T06:00:03Z INFO 2399018 [AliasDependencyElimination]: Finished (changed=False) 
+2024-06-01T06:00:03Z USER 2399019 [sg0003/Tensorizer/LICM]: LICM finished after 0.003 seconds +2024-06-01T06:00:03Z USER 2399019 [sg0003/Tensorizer/SundaISel]: Running SundaISel +2024-06-01T06:00:03Z USER 2399018 [sg0002/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.003 seconds +2024-06-01T06:00:03Z USER 2399018 [sg0002/Tensorizer/TileCCOps]: Running TileCCOps +2024-06-01T06:00:03Z USER 2399017 [sg0001/Tensorizer/CanonicalizeDAGForPGTiling]: CanonicalizeDAGForPGTiling finished after 0.005 seconds +2024-06-01T06:00:03Z USER 2399017 [sg0001/Tensorizer/PGTiling]: Running PGTiling +2024-06-01T06:00:03Z USER 2399017 [sg0001/Tensorizer/AGOrderingAnalysisPass]: Running AGOrderingAnalysisPass +2024-06-01T06:00:03Z INFO 2399018 [TileCCOps]: Finished (changed=True) +2024-06-01T06:00:03Z USER 2399018 [sg0002/Tensorizer/TileCCOps]: TileCCOps finished after 0.011 seconds +2024-06-01T06:00:03Z USER 2399018 [sg0002/Tensorizer/DelinearIndices]: Running DelinearIndices +2024-06-01T06:00:03Z INFO 2399019 [SundaISel]: Finished (changed=True) +2024-06-01T06:00:03Z USER 2399014 [sg0000/Tensorizer/MutateDataType]: MutateDataType finished after 0.001 seconds +2024-06-01T06:00:03Z USER 2399014 [sg0000/Tensorizer/AutoCastTCInputs]: Running AutoCastTCInputs +2024-06-01T06:00:03Z INFO 2399014 [AutoCastTCInputs]: Finished (changed=True) +2024-06-01T06:00:03Z USER 2399014 [sg0000/Tensorizer/AutoCastTCInputs]: AutoCastTCInputs finished after 0.002 seconds +2024-06-01T06:00:03Z USER 2399014 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2024-06-01T06:00:03Z INFO 2399014 [GenericAccessSimplifier]: Finished (changed=False) +2024-06-01T06:00:03Z USER 2399014 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2024-06-01T06:00:03Z USER 2399014 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2024-06-01T06:00:03Z INFO 2399014 [Simplifier]: Finished (changed=False) 
+2024-06-01T06:00:03Z USER 2399014 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.003 seconds +2024-06-01T06:00:03Z USER 2399014 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2024-06-01T06:00:03Z INFO 2399014 [AliasDependencyElimination]: Finished (changed=False) +2024-06-01T06:00:03Z USER 2399014 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.001 seconds +2024-06-01T06:00:03Z USER 2399014 [sg0000/Tensorizer/TileCCOps]: Running TileCCOps +2024-06-01T06:00:03Z INFO 2399017 [AGOrderingAnalysisPass]: WARNING: none P dims of loadstore 521 of IO tensor |V2 %output4|NHWC|[2289, 2, 8, 2, 128] is not sorted, index list: [8, 0, 1] +2024-06-01T06:00:03Z INFO 2399017 [AGOrderingAnalysisPass]: WARNING: none P dims of loadstore 513 of IO tensor |V2 %output3|NHWC|[2289, 2, 8, 2, 2, 64] is not sorted, index list: [8, 0, 1, 2, 6] +2024-06-01T06:00:03Z INFO 2399014 [TileCCOps]: Finished (changed=True) +2024-06-01T06:00:03Z USER 2399017 [sg0001/Tensorizer/AGOrderingAnalysisPass]: AGOrderingAnalysisPass finished after 0.054 seconds +2024-06-01T06:00:03Z USER 2399017 [sg0001/Tensorizer/CuttingAndMacroGeneration]: Running CuttingAndMacroGeneration +2024-06-01T06:00:03Z INFO 2399018 [DelinearIndices]: Finished (changed=True) +2024-06-01T06:00:03Z USER 2399014 [sg0000/Tensorizer/TileCCOps]: TileCCOps finished after 0.006 seconds +2024-06-01T06:00:03Z USER 2399014 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2024-06-01T06:00:03Z USER 2399018 [sg0002/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.056 seconds +2024-06-01T06:00:03Z USER 2399018 [sg0002/Tensorizer/Delinearization]: Running Delinearization +2024-06-01T06:00:03Z INFO 2399018 [Delinearization]: Finished (changed=False) +2024-06-01T06:00:03Z USER 2399019 [sg0003/Tensorizer/SundaISel]: SundaISel finished after 0.032 seconds +2024-06-01T06:00:03Z USER 2399019 [sg0003/Tensorizer/PreprocessNkiKernels]: 
Running PreprocessNkiKernels +2024-06-01T06:00:03Z INFO 2399019 [PreprocessNkiKernels]: Finished (changed=False) +2024-06-01T06:00:03Z USER 2399019 [sg0003/Tensorizer/PreprocessNkiKernels]: PreprocessNkiKernels finished after 0.002 seconds +2024-06-01T06:00:03Z USER 2399019 [sg0003/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2024-06-01T06:00:03Z INFO 2399019 [NeuronLoopInterchange]: Finished (changed=True) +2024-06-01T06:00:03Z USER 2399019 [sg0003/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.002 seconds +2024-06-01T06:00:03Z USER 2399019 [sg0003/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2024-06-01T06:00:03Z INFO 2399014 [DelinearIndices]: Finished (changed=True) +2024-06-01T06:00:03Z USER 2399018 [sg0002/Tensorizer/Delinearization]: Delinearization finished after 0.005 seconds +2024-06-01T06:00:03Z USER 2399018 [sg0002/Tensorizer/DelinearIndices]: Running DelinearIndices +2024-06-01T06:00:03Z USER 2399014 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.039 seconds +2024-06-01T06:00:03Z USER 2399014 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2024-06-01T06:00:03Z INFO 2399014 [Delinearization]: Finished (changed=False) +2024-06-01T06:00:03Z USER 2399014 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.003 seconds +2024-06-01T06:00:03Z USER 2399014 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2024-06-01T06:00:03Z INFO 2399018 [DelinearIndices]: Finished (changed=False) +2024-06-01T06:00:03Z USER 2399018 [sg0002/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.016 seconds +2024-06-01T06:00:03Z USER 2399018 [sg0002/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2024-06-01T06:00:03Z INFO 2399018 [DeadCodeElimination]: Finished (changed=False) +2024-06-01T06:00:03Z INFO 2399014 [DelinearIndices]: Finished (changed=False) +2024-06-01T06:00:03Z USER 2399018 
[sg0002/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.002 seconds +2024-06-01T06:00:03Z USER 2399018 [sg0002/Tensorizer/InferIntrinsicOnCC]: Running InferIntrinsicOnCC +2024-06-01T06:00:03Z USER 2399014 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.013 seconds +2024-06-01T06:00:03Z USER 2399014 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2024-06-01T06:00:03Z INFO 2399014 [DeadCodeElimination]: Finished (changed=False) +2024-06-01T06:00:03Z USER 2399014 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.002 seconds +2024-06-01T06:00:03Z USER 2399014 [sg0000/Tensorizer/InferIntrinsicOnCC]: Running InferIntrinsicOnCC +2024-06-01T06:00:03Z INFO 2399018 [InferIntrinsicOnCC]: Finished (changed=True) +2024-06-01T06:00:03Z USER 2399018 [sg0002/Tensorizer/InferIntrinsicOnCC]: InferIntrinsicOnCC finished after 0.024 seconds +2024-06-01T06:00:03Z USER 2399018 [sg0002/Tensorizer/ResolveAccessConflict]: Running ResolveAccessConflict +2024-06-01T06:00:03Z INFO 2399014 [InferIntrinsicOnCC]: Finished (changed=False) +2024-06-01T06:00:03Z INFO 2399018 [ResolveAccessConflict]: Finished (changed=True) +2024-06-01T06:00:03Z INFO 2399019 [NeuronSimplifyPredicates]: Finished (changed=False) +2024-06-01T06:00:03Z USER 2399014 [sg0000/Tensorizer/InferIntrinsicOnCC]: InferIntrinsicOnCC finished after 0.018 seconds +2024-06-01T06:00:03Z USER 2399014 [sg0000/Tensorizer/ResolveAccessConflict]: Running ResolveAccessConflict +2024-06-01T06:00:03Z USER 2399019 [sg0003/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.084 seconds +2024-06-01T06:00:03Z USER 2399019 [sg0003/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion +2024-06-01T06:00:03Z INFO 2399014 [ResolveAccessConflict]: Finished (changed=True) +2024-06-01T06:00:03Z USER 2399018 [sg0002/Tensorizer/ResolveAccessConflict]: ResolveAccessConflict finished after 0.007 seconds +2024-06-01T06:00:03Z USER 
2399018 [sg0002/Tensorizer/LICM]: Running LICM +2024-06-01T06:00:03Z INFO 2399017 [CuttingAndMacroGeneration]: Finished (changed=True) +2024-06-01T06:00:03Z INFO 2399018 [LICM]: Finished (changed=False) +2024-06-01T06:00:03Z USER 2399017 [sg0001/Tensorizer/CuttingAndMacroGeneration]: CuttingAndMacroGeneration finished after 0.135 seconds +2024-06-01T06:00:03Z USER 2399017 [sg0001/Tensorizer/PGTiling]: PGTiling finished after 0.203 seconds +2024-06-01T06:00:03Z USER 2399017 [sg0001/Tensorizer/InsertIOTransposes]: Running InsertIOTransposes +2024-06-01T06:00:03Z INFO 2399019 [NeuronLoopFusion]: Finished (changed=True) +2024-06-01T06:00:03Z USER 2399019 [sg0003/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion finished after 0.022 seconds +2024-06-01T06:00:03Z USER 2399019 [sg0003/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2024-06-01T06:00:03Z INFO 2399019 [NeuronLoopInterchange]: Finished (changed=True) +2024-06-01T06:00:03Z USER 2399019 [sg0003/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.002 seconds +2024-06-01T06:00:03Z USER 2399019 [sg0003/Tensorizer/NeuronLICM]: Running NeuronLICM +2024-06-01T06:00:03Z INFO 2399019 [NeuronLICM]: Finished (changed=True) +2024-06-01T06:00:03Z USER 2399018 [sg0002/Tensorizer/LICM]: LICM finished after 0.003 seconds +2024-06-01T06:00:03Z USER 2399018 [sg0002/Tensorizer/LocalLayoutOpt]: Running LocalLayoutOpt +2024-06-01T06:00:03Z INFO 2399017 [InsertIOTransposes]: Finished (changed=True) +2024-06-01T06:00:03Z USER 2399014 [sg0000/Tensorizer/ResolveAccessConflict]: ResolveAccessConflict finished after 0.007 seconds +2024-06-01T06:00:03Z USER 2399014 [sg0000/Tensorizer/LICM]: Running LICM +2024-06-01T06:00:03Z INFO 2399014 [LICM]: Finished (changed=True) +2024-06-01T06:00:03Z USER 2399017 [sg0001/Tensorizer/InsertIOTransposes]: InsertIOTransposes finished after 0.040 seconds +2024-06-01T06:00:03Z USER 2399017 [sg0001/Tensorizer/PGLayoutTilingPipeline]: PGLayoutTilingPipeline finished 
after 0.443 seconds +2024-06-01T06:00:03Z USER 2399017 [sg0001/Tensorizer/TilingProfiler]: Running TilingProfiler +2024-06-01T06:00:03Z INFO 2399018 [LocalLayoutOpt]: Finished (changed=True) +2024-06-01T06:00:03Z USER 2399019 [sg0003/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.010 seconds +2024-06-01T06:00:03Z USER 2399019 [sg0003/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2024-06-01T06:00:03Z INFO 2399017 [TilingProfiler]: Finished (changed=False) +2024-06-01T06:00:03Z USER 2399018 [sg0002/Tensorizer/LocalLayoutOpt]: LocalLayoutOpt finished after 0.027 seconds +2024-06-01T06:00:03Z USER 2399018 [sg0002/Tensorizer/DelinearIndices]: Running DelinearIndices +2024-06-01T06:00:03Z INFO 2399019 [FactorizeBlkDims]: Finished (changed=True) +2024-06-01T06:00:03Z USER 2399017 [sg0001/Tensorizer/TilingProfiler]: TilingProfiler finished after 0.018 seconds +2024-06-01T06:00:03Z USER 2399017 [sg0001/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2024-06-01T06:00:03Z USER 2399019 [sg0003/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.015 seconds +2024-06-01T06:00:03Z USER 2399019 [sg0003/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2024-06-01T06:00:03Z INFO 2399018 [DelinearIndices]: Finished (changed=False) +2024-06-01T06:00:03Z USER 2399018 [sg0002/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.016 seconds +2024-06-01T06:00:03Z USER 2399018 [sg0002/Tensorizer/PGLayoutTilingPipeline]: Running PGLayoutTilingPipeline +2024-06-01T06:00:03Z USER 2399018 [sg0002/Tensorizer/LayoutPreprocessingAndAnalysis]: Running LayoutPreprocessingAndAnalysis +2024-06-01T06:00:03Z USER 2399018 [sg0002/Tensorizer/LayoutPreprocessing]: Running LayoutPreprocessing +2024-06-01T06:00:03Z USER 2399018 [sg0002/Tensorizer/Delinearization]: Running Delinearization +2024-06-01T06:00:03Z INFO 2399018 [Delinearization]: Finished (changed=False) +2024-06-01T06:00:03Z USER 2399014 [sg0000/Tensorizer/LICM]: LICM finished after 0.003 seconds 
+2024-06-01T06:00:03Z USER 2399014 [sg0000/Tensorizer/LocalLayoutOpt]: Running LocalLayoutOpt +2024-06-01T06:00:03Z USER 2399018 [sg0002/Tensorizer/Delinearization]: Delinearization finished after 0.005 seconds +2024-06-01T06:00:03Z INFO 2399018 [LayoutPreprocessing]: Finished (changed=True) +2024-06-01T06:00:03Z INFO 2399017 [FlattenMacroLoop]: Finished (changed=True) +2024-06-01T06:00:03Z INFO 2399019 [NeuronInstComb]: Finished (changed=True) +2024-06-01T06:00:03Z USER 2399018 [sg0002/Tensorizer/LayoutPreprocessing]: LayoutPreprocessing finished after 0.017 seconds +2024-06-01T06:00:03Z USER 2399018 [sg0002/Tensorizer/LayoutRequirementAnalysis]: Running LayoutRequirementAnalysis +2024-06-01T06:00:03Z INFO 2399014 [LocalLayoutOpt]: Finished (changed=True) +2024-06-01T06:00:03Z USER 2399017 [sg0001/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.033 seconds +2024-06-01T06:00:03Z USER 2399017 [sg0001/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor +2024-06-01T06:00:03Z USER 2399019 [sg0003/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.026 seconds +2024-06-01T06:00:03Z USER 2399019 [sg0003/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2024-06-01T06:00:03Z INFO 2399019 [NeuronValueNumbering]: Finished (changed=False) +2024-06-01T06:00:03Z USER 2399018 [sg0002/Tensorizer/LayoutRequirementAnalysis]: LayoutRequirementAnalysis finished after 0.020 seconds +2024-06-01T06:00:03Z USER 2399018 [sg0002/Tensorizer/LayoutPreprocessingAndAnalysis]: LayoutPreprocessingAndAnalysis finished after 0.053 seconds +2024-06-01T06:00:03Z USER 2399018 [sg0002/Tensorizer/PAGLayoutOpt]: Running PAGLayoutOpt +2024-06-01T06:00:03Z USER 2399014 [sg0000/Tensorizer/LocalLayoutOpt]: LocalLayoutOpt finished after 0.023 seconds +2024-06-01T06:00:03Z USER 2399014 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2024-06-01T06:00:03Z INFO 2399014 [DelinearIndices]: Finished (changed=False) +2024-06-01T06:00:03Z USER 2399019 
[sg0003/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.005 seconds +2024-06-01T06:00:03Z USER 2399019 [sg0003/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2024-06-01T06:00:03Z INFO 2399018 [PAGLayoutOpt]: Finished (changed=True) +2024-06-01T06:00:03Z INFO 2399019 [NeuronInstComb]: Finished (changed=False) +2024-06-01T06:00:03Z USER 2399014 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.014 seconds +2024-06-01T06:00:03Z USER 2399014 [sg0000/Tensorizer/PGLayoutTilingPipeline]: Running PGLayoutTilingPipeline +2024-06-01T06:00:03Z USER 2399014 [sg0000/Tensorizer/LayoutPreprocessingAndAnalysis]: Running LayoutPreprocessingAndAnalysis +2024-06-01T06:00:03Z USER 2399014 [sg0000/Tensorizer/LayoutPreprocessing]: Running LayoutPreprocessing +2024-06-01T06:00:03Z USER 2399014 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2024-06-01T06:00:03Z USER 2399018 [sg0002/Tensorizer/PAGLayoutOpt]: PAGLayoutOpt finished after 0.031 seconds +2024-06-01T06:00:03Z USER 2399018 [sg0002/Tensorizer/MaskPropagation]: Running MaskPropagation +2024-06-01T06:00:03Z INFO 2399014 [Delinearization]: Finished (changed=False) +2024-06-01T06:00:03Z INFO 2399018 [MaskPropagation]: Finished (changed=False) +2024-06-01T06:00:03Z USER 2399014 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.007 seconds +2024-06-01T06:00:03Z INFO 2399014 [LayoutPreprocessing]: Finished (changed=True) +2024-06-01T06:00:03Z USER 2399018 [sg0002/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.004 seconds +2024-06-01T06:00:03Z USER 2399018 [sg0002/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2024-06-01T06:00:03Z INFO 2399017 [InferNeuronTensor]: Finished (changed=True) +2024-06-01T06:00:03Z USER 2399017 [sg0001/Tensorizer/InferNeuronTensor]: InferNeuronTensor finished after 0.088 seconds +2024-06-01T06:00:03Z USER 2399017 [sg0001/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2024-06-01T06:00:03Z 
INFO 2399018 [LowerCCOpBlockAxis]: Finished (changed=True) +2024-06-01T06:00:03Z USER 2399018 [sg0002/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.011 seconds +2024-06-01T06:00:03Z USER 2399018 [sg0002/Tensorizer/CanonicalizeDAGForPGTiling]: Running CanonicalizeDAGForPGTiling +2024-06-01T06:00:03Z INFO 2399018 [CanonicalizeDAGForPGTiling]: Finished (changed=True) +2024-06-01T06:00:03Z USER 2399019 [sg0003/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.003 seconds +2024-06-01T06:00:03Z USER 2399019 [sg0003/Tensorizer/VectorizeDMA]: Running VectorizeDMA +2024-06-01T06:00:03Z INFO 2399017 [NeuronSimplifier]: Finished (changed=False) +2024-06-01T06:00:03Z INFO 2399019 [VectorizeDMA]: Finished (changed=True) +2024-06-01T06:00:03Z USER 2399014 [sg0000/Tensorizer/LayoutPreprocessing]: LayoutPreprocessing finished after 0.020 seconds +2024-06-01T06:00:03Z USER 2399014 [sg0000/Tensorizer/LayoutRequirementAnalysis]: Running LayoutRequirementAnalysis +2024-06-01T06:00:03Z USER 2399019 [sg0003/Tensorizer/VectorizeDMA]: VectorizeDMA finished after 0.007 seconds +2024-06-01T06:00:03Z USER 2399019 [sg0003/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2024-06-01T06:00:03Z USER 2399017 [sg0001/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.024 seconds +2024-06-01T06:00:03Z USER 2399017 [sg0001/Tensorizer/LICM]: Running LICM +2024-06-01T06:00:03Z USER 2399014 [sg0000/Tensorizer/LayoutRequirementAnalysis]: LayoutRequirementAnalysis finished after 0.019 seconds +2024-06-01T06:00:03Z INFO 2399017 [LICM]: Finished (changed=True) +2024-06-01T06:00:03Z USER 2399014 [sg0000/Tensorizer/LayoutPreprocessingAndAnalysis]: LayoutPreprocessingAndAnalysis finished after 0.088 seconds +2024-06-01T06:00:03Z USER 2399014 [sg0000/Tensorizer/PAGLayoutOpt]: Running PAGLayoutOpt +2024-06-01T06:00:03Z USER 2399017 [sg0001/Tensorizer/LICM]: LICM finished after 0.007 seconds +2024-06-01T06:00:03Z USER 2399017 
[sg0001/Tensorizer/RewriteReplicationMatmul]: Running RewriteReplicationMatmul +2024-06-01T06:00:03Z INFO 2399017 [RewriteReplicationMatmul]: Finished (changed=False) +2024-06-01T06:00:03Z USER 2399017 [sg0001/Tensorizer/RewriteReplicationMatmul]: RewriteReplicationMatmul finished after 0.004 seconds +2024-06-01T06:00:03Z USER 2399017 [sg0001/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2024-06-01T06:00:03Z USER 2399018 [sg0002/Tensorizer/CanonicalizeDAGForPGTiling]: CanonicalizeDAGForPGTiling finished after 0.007 seconds +2024-06-01T06:00:03Z USER 2399018 [sg0002/Tensorizer/PGTiling]: Running PGTiling +2024-06-01T06:00:03Z USER 2399018 [sg0002/Tensorizer/AGOrderingAnalysisPass]: Running AGOrderingAnalysisPass +2024-06-01T06:00:03Z INFO 2399014 [PAGLayoutOpt]: Finished (changed=True) +2024-06-01T06:00:03Z USER 2399014 [sg0000/Tensorizer/PAGLayoutOpt]: PAGLayoutOpt finished after 0.029 seconds +2024-06-01T06:00:03Z USER 2399014 [sg0000/Tensorizer/MaskPropagation]: Running MaskPropagation +2024-06-01T06:00:03Z INFO 2399014 [MaskPropagation]: Finished (changed=False) +2024-06-01T06:00:03Z INFO 2399019 [NeuronSimplifyPredicates]: Finished (changed=False) +2024-06-01T06:00:03Z INFO 2399017 [FlattenMacroLoop]: Finished (changed=True) +2024-06-01T06:00:03Z USER 2399014 [sg0000/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.004 seconds +2024-06-01T06:00:03Z USER 2399014 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2024-06-01T06:00:03Z USER 2399017 [sg0001/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.025 seconds +2024-06-01T06:00:03Z USER 2399017 [sg0001/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2024-06-01T06:00:03Z INFO 2399014 [LowerCCOpBlockAxis]: Finished (changed=True) +2024-06-01T06:00:03Z USER 2399014 [sg0000/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.010 seconds +2024-06-01T06:00:03Z USER 2399014 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: 
Running CanonicalizeDAGForPGTiling +2024-06-01T06:00:03Z INFO 2399018 [AGOrderingAnalysisPass]: WARNING: none P dims of loadstore 521 of IO tensor |V2 %output6|NHWC|[2289, 2, 8, 2, 128] is not sorted, index list: [8, 0, 1] +2024-06-01T06:00:03Z INFO 2399018 [AGOrderingAnalysisPass]: WARNING: none P dims of loadstore 513 of IO tensor |V2 %output5|NHWC|[2289, 2, 8, 2, 2, 64] is not sorted, index list: [8, 0, 1, 2, 6] +2024-06-01T06:00:03Z INFO 2399014 [CanonicalizeDAGForPGTiling]: Finished (changed=True) +2024-06-01T06:00:03Z USER 2399018 [sg0002/Tensorizer/AGOrderingAnalysisPass]: AGOrderingAnalysisPass finished after 0.055 seconds +2024-06-01T06:00:03Z USER 2399018 [sg0002/Tensorizer/CuttingAndMacroGeneration]: Running CuttingAndMacroGeneration +2024-06-01T06:00:03Z USER 2399014 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: CanonicalizeDAGForPGTiling finished after 0.006 seconds +2024-06-01T06:00:03Z USER 2399014 [sg0000/Tensorizer/PGTiling]: Running PGTiling +2024-06-01T06:00:03Z USER 2399014 [sg0000/Tensorizer/AGOrderingAnalysisPass]: Running AGOrderingAnalysisPass +2024-06-01T06:00:03Z USER 2399019 [sg0003/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.063 seconds +2024-06-01T06:00:03Z USER 2399019 [sg0003/Tensorizer/LegalizePartitionReduce]: Running LegalizePartitionReduce +2024-06-01T06:00:03Z INFO 2399019 [LegalizePartitionReduce]: Finished (changed=True) +2024-06-01T06:00:03Z USER 2399019 [sg0003/Tensorizer/LegalizePartitionReduce]: LegalizePartitionReduce finished after 0.005 seconds +2024-06-01T06:00:03Z USER 2399019 [sg0003/Tensorizer/DeConcat]: Running DeConcat +2024-06-01T06:00:03Z INFO 2399019 [DeConcat]: Finished (changed=False) +2024-06-01T06:00:03Z USER 2399019 [sg0003/Tensorizer/DeConcat]: DeConcat finished after 0.002 seconds +2024-06-01T06:00:03Z USER 2399019 [sg0003/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion +2024-06-01T06:00:03Z INFO 2399014 [AGOrderingAnalysisPass]: WARNING: none P dims of 
loadstore 478 of IO tensor |V2 %output2|NHWC|[2289, 2, 8, 2, 128] is not sorted, index list: [8, 0, 1] +2024-06-01T06:00:03Z INFO 2399014 [AGOrderingAnalysisPass]: WARNING: none P dims of loadstore 470 of IO tensor |V2 %output1|NHWC|[2289, 2, 8, 2, 2, 64] is not sorted, index list: [8, 0, 1, 2, 6] +2024-06-01T06:00:03Z USER 2399014 [sg0000/Tensorizer/AGOrderingAnalysisPass]: AGOrderingAnalysisPass finished after 0.049 seconds +2024-06-01T06:00:03Z USER 2399014 [sg0000/Tensorizer/CuttingAndMacroGeneration]: Running CuttingAndMacroGeneration +2024-06-01T06:00:03Z INFO 2399019 [PartialSimdFusion]: Finished (changed=True) +2024-06-01T06:00:03Z USER 2399019 [sg0003/Tensorizer/PartialSimdFusion]: PartialSimdFusion finished after 0.032 seconds +2024-06-01T06:00:03Z USER 2399019 [sg0003/Tensorizer/TritiumFusion]: Running TritiumFusion +2024-06-01T06:00:03Z INFO 2399018 [CuttingAndMacroGeneration]: Finished (changed=True) +2024-06-01T06:00:03Z USER 2399018 [sg0002/Tensorizer/CuttingAndMacroGeneration]: CuttingAndMacroGeneration finished after 0.143 seconds +2024-06-01T06:00:03Z USER 2399018 [sg0002/Tensorizer/PGTiling]: PGTiling finished after 0.211 seconds +2024-06-01T06:00:03Z USER 2399018 [sg0002/Tensorizer/InsertIOTransposes]: Running InsertIOTransposes +2024-06-01T06:00:03Z INFO 2399019 [TritiumFusion]: Finished (changed=True) +2024-06-01T06:00:03Z USER 2399019 [sg0003/Tensorizer/TritiumFusion]: TritiumFusion finished after 0.076 seconds +2024-06-01T06:00:03Z USER 2399019 [sg0003/Tensorizer/CCOpFusion]: Running CCOpFusion +2024-06-01T06:00:03Z INFO 2399014 [CuttingAndMacroGeneration]: Finished (changed=True) +2024-06-01T06:00:03Z USER 2399014 [sg0000/Tensorizer/CuttingAndMacroGeneration]: CuttingAndMacroGeneration finished after 0.112 seconds +2024-06-01T06:00:03Z INFO 2399018 [InsertIOTransposes]: Finished (changed=True) +2024-06-01T06:00:03Z USER 2399014 [sg0000/Tensorizer/PGTiling]: PGTiling finished after 0.173 seconds +2024-06-01T06:00:03Z USER 2399014 
[sg0000/Tensorizer/InsertIOTransposes]: Running InsertIOTransposes +2024-06-01T06:00:03Z INFO 2399019 [CCOpFusion]: Finished (changed=True) +2024-06-01T06:00:03Z USER 2399018 [sg0002/Tensorizer/InsertIOTransposes]: InsertIOTransposes finished after 0.032 seconds +2024-06-01T06:00:03Z USER 2399018 [sg0002/Tensorizer/PGLayoutTilingPipeline]: PGLayoutTilingPipeline finished after 0.477 seconds +2024-06-01T06:00:03Z USER 2399018 [sg0002/Tensorizer/TilingProfiler]: Running TilingProfiler +2024-06-01T06:00:03Z INFO 2399014 [InsertIOTransposes]: Finished (changed=True) +2024-06-01T06:00:03Z USER 2399014 [sg0000/Tensorizer/InsertIOTransposes]: InsertIOTransposes finished after 0.017 seconds +2024-06-01T06:00:03Z INFO 2399018 [TilingProfiler]: Finished (changed=False) +2024-06-01T06:00:03Z USER 2399014 [sg0000/Tensorizer/PGLayoutTilingPipeline]: PGLayoutTilingPipeline finished after 0.393 seconds +2024-06-01T06:00:03Z USER 2399014 [sg0000/Tensorizer/TilingProfiler]: Running TilingProfiler +2024-06-01T06:00:03Z USER 2399018 [sg0002/Tensorizer/TilingProfiler]: TilingProfiler finished after 0.013 seconds +2024-06-01T06:00:03Z USER 2399018 [sg0002/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2024-06-01T06:00:03Z INFO 2399014 [TilingProfiler]: Finished (changed=False) +2024-06-01T06:00:03Z USER 2399019 [sg0003/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.019 seconds +2024-06-01T06:00:03Z USER 2399019 [sg0003/Tensorizer/VectorizeMatMult]: Running VectorizeMatMult +2024-06-01T06:00:03Z USER 2399014 [sg0000/Tensorizer/TilingProfiler]: TilingProfiler finished after 0.011 seconds +2024-06-01T06:00:03Z USER 2399014 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2024-06-01T06:00:03Z INFO 2399019 [VectorizeMatMult]: Finished (changed=False) +2024-06-01T06:00:03Z USER 2399019 [sg0003/Tensorizer/VectorizeMatMult]: VectorizeMatMult finished after 0.013 seconds +2024-06-01T06:00:03Z USER 2399019 [sg0003/Tensorizer/PartialLoopFusion]: Running 
PartialLoopFusion +2024-06-01T06:00:03Z INFO 2399018 [FlattenMacroLoop]: Finished (changed=True) +2024-06-01T06:00:03Z USER 2399018 [sg0002/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.033 seconds +2024-06-01T06:00:03Z USER 2399018 [sg0002/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor +2024-06-01T06:00:03Z INFO 2399019 [PartialLoopFusion]: Finished (changed=True) +2024-06-01T06:00:03Z INFO 2399014 [FlattenMacroLoop]: Finished (changed=True) +2024-06-01T06:00:03Z USER 2399019 [sg0003/Tensorizer/PartialLoopFusion]: PartialLoopFusion finished after 0.013 seconds +2024-06-01T06:00:03Z USER 2399019 [sg0003/Tensorizer/NeuronLICM]: Running NeuronLICM +2024-06-01T06:00:04Z INFO 2399019 [NeuronLICM]: Finished (changed=True) +2024-06-01T06:00:04Z USER 2399019 [sg0003/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.003 seconds +2024-06-01T06:00:04Z USER 2399019 [sg0003/Tensorizer/LowerTranspose]: Running LowerTranspose +2024-06-01T06:00:04Z INFO 2399019 [LowerTranspose]: Finished (changed=True) +2024-06-01T06:00:04Z USER 2399014 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.026 seconds +2024-06-01T06:00:04Z USER 2399014 [sg0000/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor +2024-06-01T06:00:04Z USER 2399019 [sg0003/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.009 seconds +2024-06-01T06:00:04Z USER 2399019 [sg0003/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2024-06-01T06:00:04Z INFO 2399019 [LateNeuronInstComb]: Finished (changed=False) +2024-06-01T06:00:04Z USER 2399019 [sg0003/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.004 seconds +2024-06-01T06:00:04Z USER 2399019 [sg0003/Tensorizer/SplitAccGrp]: Running SplitAccGrp +2024-06-01T06:00:04Z INFO 2399019 [SplitAccGrp]: Finished (changed=True) +2024-06-01T06:00:04Z USER 2399019 [sg0003/Tensorizer/SplitAccGrp]: SplitAccGrp finished after 0.002 seconds +2024-06-01T06:00:04Z USER 2399019 
[sg0003/Tensorizer/SpillPSum]: Running SpillPSum +2024-06-01T06:00:04Z INFO 2399019 [SpillPSum]: Finished (changed=True) +2024-06-01T06:00:04Z USER 2399019 [sg0003/Tensorizer/SpillPSum]: SpillPSum finished after 0.021 seconds +2024-06-01T06:00:04Z USER 2399019 [sg0003/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2024-06-01T06:00:04Z INFO 2399019 [LowerIntrinsics]: Finished (changed=False) +2024-06-01T06:00:04Z USER 2399019 [sg0003/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.002 seconds +2024-06-01T06:00:04Z USER 2399019 [sg0003/Tensorizer/LegalizeType]: Running LegalizeType +2024-06-01T06:00:04Z INFO 2399018 [InferNeuronTensor]: Finished (changed=True) +2024-06-01T06:00:04Z INFO 2399019 [LegalizeType]: Finished (changed=True) +2024-06-01T06:00:04Z USER 2399018 [sg0002/Tensorizer/InferNeuronTensor]: InferNeuronTensor finished after 0.097 seconds +2024-06-01T06:00:04Z USER 2399018 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2024-06-01T06:00:04Z USER 2399019 [sg0003/Tensorizer/LegalizeType]: LegalizeType finished after 0.003 seconds +2024-06-01T06:00:04Z USER 2399019 [sg0003/Tensorizer/NeuronLICM]: Running NeuronLICM +2024-06-01T06:00:04Z INFO 2399014 [InferNeuronTensor]: Finished (changed=True) +2024-06-01T06:00:04Z INFO 2399019 [NeuronLICM]: Finished (changed=False) +2024-06-01T06:00:04Z USER 2399014 [sg0000/Tensorizer/InferNeuronTensor]: InferNeuronTensor finished after 0.082 seconds +2024-06-01T06:00:04Z USER 2399014 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2024-06-01T06:00:04Z INFO 2399018 [NeuronSimplifier]: Finished (changed=False) +2024-06-01T06:00:04Z USER 2399019 [sg0003/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.005 seconds +2024-06-01T06:00:04Z USER 2399019 [sg0003/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2024-06-01T06:00:04Z USER 2399018 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.025 seconds +2024-06-01T06:00:04Z USER 2399018 
[sg0002/Tensorizer/LICM]: Running LICM +2024-06-01T06:00:04Z INFO 2399014 [NeuronSimplifier]: Finished (changed=False) +2024-06-01T06:00:04Z USER 2399014 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.020 seconds +2024-06-01T06:00:04Z USER 2399014 [sg0000/Tensorizer/LICM]: Running LICM +2024-06-01T06:00:04Z INFO 2399018 [LICM]: Finished (changed=True) +2024-06-01T06:00:04Z INFO 2399019 [InferPSumTensor]: Finished (changed=False) +2024-06-01T06:00:04Z INFO 2399014 [LICM]: Finished (changed=True) +2024-06-01T06:00:04Z USER 2399018 [sg0002/Tensorizer/LICM]: LICM finished after 0.007 seconds +2024-06-01T06:00:04Z USER 2399018 [sg0002/Tensorizer/RewriteReplicationMatmul]: Running RewriteReplicationMatmul +2024-06-01T06:00:04Z INFO 2399018 [RewriteReplicationMatmul]: Finished (changed=False) +2024-06-01T06:00:04Z USER 2399014 [sg0000/Tensorizer/LICM]: LICM finished after 0.006 seconds +2024-06-01T06:00:04Z USER 2399014 [sg0000/Tensorizer/RewriteReplicationMatmul]: Running RewriteReplicationMatmul +2024-06-01T06:00:04Z INFO 2399014 [RewriteReplicationMatmul]: Finished (changed=False) +2024-06-01T06:00:04Z INFO 2399017 [SimplifyMacroPredicates]: Finished (changed=True) +2024-06-01T06:00:04Z USER 2399018 [sg0002/Tensorizer/RewriteReplicationMatmul]: RewriteReplicationMatmul finished after 0.004 seconds +2024-06-01T06:00:04Z USER 2399018 [sg0002/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2024-06-01T06:00:04Z USER 2399017 [sg0001/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.474 seconds +2024-06-01T06:00:04Z USER 2399017 [sg0001/Tensorizer/DataLocalityOpt]: Running DataLocalityOpt +2024-06-01T06:00:04Z USER 2399019 [sg0003/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.019 seconds +2024-06-01T06:00:04Z USER 2399019 [sg0003/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2024-06-01T06:00:04Z INFO 2399019 [WeightCoalescing]: Finished (changed=False) +2024-06-01T06:00:04Z USER 2399019 
[sg0003/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.001 seconds +2024-06-01T06:00:04Z USER 2399019 [sg0003/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2024-06-01T06:00:04Z INFO 2399018 [FlattenMacroLoop]: Finished (changed=True) +2024-06-01T06:00:04Z USER 2399014 [sg0000/Tensorizer/RewriteReplicationMatmul]: RewriteReplicationMatmul finished after 0.004 seconds +2024-06-01T06:00:04Z USER 2399014 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2024-06-01T06:00:04Z USER 2399018 [sg0002/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.031 seconds +2024-06-01T06:00:04Z USER 2399018 [sg0002/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2024-06-01T06:00:04Z INFO 2399019 [LegalizeSundaAccess]: Finished (changed=False) +2024-06-01T06:00:04Z USER 2399019 [sg0003/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.021 seconds +2024-06-01T06:00:04Z USER 2399019 [sg0003/Tensorizer/TernaryFission]: Running TernaryFission +2024-06-01T06:00:04Z INFO 2399014 [FlattenMacroLoop]: Finished (changed=True) +2024-06-01T06:00:04Z USER 2399014 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.022 seconds +2024-06-01T06:00:04Z USER 2399014 [sg0000/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2024-06-01T06:00:04Z INFO 2399019 [TernaryFission]: Finished (changed=True) +2024-06-01T06:00:04Z USER 2399019 [sg0003/Tensorizer/TernaryFission]: TernaryFission finished after 0.062 seconds +2024-06-01T06:00:04Z USER 2399019 [sg0003/Tensorizer/RelaxPredicates]: Running RelaxPredicates +2024-06-01T06:00:04Z INFO 2399019 [RelaxPredicates]: Finished (changed=False) +2024-06-01T06:00:04Z USER 2399019 [sg0003/Tensorizer/RelaxPredicates]: RelaxPredicates finished after 0.006 seconds +2024-06-01T06:00:04Z USER 2399019 [sg0003/Tensorizer/TensorInitialization]: Running TensorInitialization +2024-06-01T06:00:04Z INFO 2399019 [TensorInitialization]: 
Finished (changed=True) +2024-06-01T06:00:04Z USER 2399019 [sg0003/Tensorizer/TensorInitialization]: TensorInitialization finished after 0.012 seconds +2024-06-01T06:00:04Z USER 2399019 [sg0003/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2024-06-01T06:00:04Z INFO 2399017 [DataLocalityOpt]: Finished (changed=True) +2024-06-01T06:00:04Z USER 2399017 [sg0001/Tensorizer/DataLocalityOpt]: DataLocalityOpt finished after 0.184 seconds +2024-06-01T06:00:04Z USER 2399017 [sg0001/Tensorizer/DMATilingProfiler]: Running DMATilingProfiler +2024-06-01T06:00:04Z INFO 2399017 [DMATilingProfiler]: Finished (changed=False) +2024-06-01T06:00:04Z USER 2399017 [sg0001/Tensorizer/DMATilingProfiler]: DMATilingProfiler finished after 0.006 seconds +2024-06-01T06:00:04Z USER 2399017 [sg0001/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2024-06-01T06:00:04Z INFO 2399019 [NeuronSimplifyPredicates]: Finished (changed=True) +2024-06-01T06:00:04Z USER 2399019 [sg0003/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.078 seconds +2024-06-01T06:00:04Z USER 2399019 [sg0003/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2024-06-01T06:00:04Z INFO 2399017 [NeuronSimplifier]: Finished (changed=False) +2024-06-01T06:00:04Z INFO 2399019 [ExpandISAMacro]: Finished (changed=False) +2024-06-01T06:00:04Z USER 2399017 [sg0001/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.023 seconds +2024-06-01T06:00:04Z USER 2399017 [sg0001/Tensorizer/LegalizeSundaMacro]: Running LegalizeSundaMacro +2024-06-01T06:00:04Z USER 2399019 [sg0003/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.004 seconds +2024-06-01T06:00:04Z USER 2399019 [sg0003/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2024-06-01T06:00:04Z INFO 2399017 [LegalizeSundaMacro]: Finished (changed=True) +2024-06-01T06:00:04Z INFO 2399019 [SimplifyNeuronTensor]: Finished (changed=True) +2024-06-01T06:00:04Z USER 2399017 
[sg0001/Tensorizer/LegalizeSundaMacro]: LegalizeSundaMacro finished after 0.012 seconds +2024-06-01T06:00:04Z USER 2399017 [sg0001/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2024-06-01T06:00:04Z USER 2399019 [sg0003/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.008 seconds +2024-06-01T06:00:04Z USER 2399019 [sg0003/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2024-06-01T06:00:04Z INFO 2399019 [DMALocalityOpt]: Finished (changed=True) +2024-06-01T06:00:04Z USER 2399019 [sg0003/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.001 seconds +2024-06-01T06:00:04Z USER 2399019 [sg0003/Tensorizer/DataStreaming]: Running DataStreaming +2024-06-01T06:00:04Z INFO 2399017 [NeuronSimplifier]: Finished (changed=False) +2024-06-01T06:00:04Z INFO 2399019 [DataStreaming]: Finished (changed=True) +2024-06-01T06:00:04Z USER 2399017 [sg0001/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.020 seconds +2024-06-01T06:00:04Z USER 2399017 [sg0001/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2024-06-01T06:00:04Z INFO 2399017 [PerfectLoopNest]: Finished (changed=False) +2024-06-01T06:00:04Z USER 2399019 [sg0003/Tensorizer/DataStreaming]: DataStreaming finished after 0.005 seconds +2024-06-01T06:00:04Z USER 2399019 [sg0003/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2024-06-01T06:00:04Z USER 2399017 [sg0001/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.005 seconds +2024-06-01T06:00:04Z USER 2399017 [sg0001/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2024-06-01T06:00:04Z INFO 2399017 [FlattenMacroLoop]: Finished (changed=True) +2024-06-01T06:00:04Z USER 2399017 [sg0001/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.018 seconds +2024-06-01T06:00:04Z USER 2399017 [sg0001/Tensorizer/RewriteWeights]: Running RewriteWeights +2024-06-01T06:00:04Z INFO 2399017 [RewriteWeights]: Finished (changed=False) +2024-06-01T06:00:04Z USER 2399017 
[sg0001/Tensorizer/RewriteWeights]: RewriteWeights finished after 0.004 seconds +2024-06-01T06:00:04Z USER 2399017 [sg0001/Tensorizer/ReshapeWeights]: Running ReshapeWeights +2024-06-01T06:00:04Z INFO 2399017 [ReshapeWeights]: Finished (changed=False) +2024-06-01T06:00:04Z USER 2399017 [sg0001/Tensorizer/ReshapeWeights]: ReshapeWeights finished after 0.001 seconds +2024-06-01T06:00:04Z USER 2399017 [sg0001/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2024-06-01T06:00:04Z INFO 2399017 [FlattenMacroLoop]: Finished (changed=False) +2024-06-01T06:00:04Z USER 2399017 [sg0001/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.010 seconds +2024-06-01T06:00:04Z USER 2399017 [sg0001/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2024-06-01T06:00:04Z INFO 2399019 [SFKVectorizer]: Finished (changed=True) +2024-06-01T06:00:04Z USER 2399019 [sg0003/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.170 seconds +2024-06-01T06:00:04Z USER 2399019 [sg0003/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2024-06-01T06:00:04Z INFO 2399019 [LateLegalizeInst]: Finished (changed=True) +2024-06-01T06:00:04Z USER 2399019 [sg0003/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.005 seconds +2024-06-01T06:00:04Z USER 2399019 [sg0003/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2024-06-01T06:00:04Z INFO 2399019 [CoalesceCCOp]: Finished (changed=False) +2024-06-01T06:00:04Z INFO 2399014 [SimplifyMacroPredicates]: Finished (changed=True) +2024-06-01T06:00:04Z USER 2399019 [sg0003/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.002 seconds +2024-06-01T06:00:04Z USER 2399019 [sg0003/Tensorizer/StaticProfiler]: Running StaticProfiler +2024-06-01T06:00:04Z INFO 2399019 [StaticProfiler]: Finished (changed=False) +2024-06-01T06:00:04Z USER 2399014 [sg0000/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.425 seconds +2024-06-01T06:00:04Z USER 2399014 
[sg0000/Tensorizer/DataLocalityOpt]: Running DataLocalityOpt +2024-06-01T06:00:04Z USER 2399019 [sg0003/Tensorizer/StaticProfiler]: StaticProfiler finished after 0.004 seconds +2024-06-01T06:00:04Z USER 2399019 [sg0003/Tensorizer/SplitAPUnionSets]: Running SplitAPUnionSets +2024-06-01T06:00:04Z INFO 2399018 [SimplifyMacroPredicates]: Finished (changed=True) +2024-06-01T06:00:04Z USER 2399018 [sg0002/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.499 seconds +2024-06-01T06:00:04Z USER 2399018 [sg0002/Tensorizer/DataLocalityOpt]: Running DataLocalityOpt +2024-06-01T06:00:04Z INFO 2399019 [SplitAPUnionSets]: Finished (changed=True) +2024-06-01T06:00:04Z USER 2399019 [sg0003/Tensorizer/SplitAPUnionSets]: SplitAPUnionSets finished after 0.080 seconds +2024-06-01T06:00:04Z USER 2399019 [sg0003/Tensorizer/DumpGraphAndMetadata]: Running DumpGraphAndMetadata +2024-06-01T06:00:04Z INFO 2399019 [DumpGraphAndMetadata]: Finished (changed=False) +2024-06-01T06:00:04Z USER 2399019 [sg0003/Tensorizer/DumpGraphAndMetadata]: DumpGraphAndMetadata finished after 0.005 seconds +2024-06-01T06:00:04Z USER 2399019 [sg0003/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2024-06-01T06:00:04Z INFO 2399014 [DataLocalityOpt]: Finished (changed=True) +2024-06-01T06:00:04Z USER 2399014 [sg0000/Tensorizer/DataLocalityOpt]: DataLocalityOpt finished after 0.153 seconds +2024-06-01T06:00:04Z USER 2399014 [sg0000/Tensorizer/DMATilingProfiler]: Running DMATilingProfiler +2024-06-01T06:00:04Z INFO 2399014 [DMATilingProfiler]: Finished (changed=False) +2024-06-01T06:00:04Z USER 2399014 [sg0000/Tensorizer/DMATilingProfiler]: DMATilingProfiler finished after 0.004 seconds +2024-06-01T06:00:04Z USER 2399014 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2024-06-01T06:00:04Z INFO 2399014 [NeuronSimplifier]: Finished (changed=False) +2024-06-01T06:00:04Z USER 2399014 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.020 seconds 
+2024-06-01T06:00:04Z USER 2399014 [sg0000/Tensorizer/LegalizeSundaMacro]: Running LegalizeSundaMacro +2024-06-01T06:00:04Z INFO 2399014 [LegalizeSundaMacro]: Finished (changed=True) +2024-06-01T06:00:04Z USER 2399014 [sg0000/Tensorizer/LegalizeSundaMacro]: LegalizeSundaMacro finished after 0.010 seconds +2024-06-01T06:00:04Z USER 2399014 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2024-06-01T06:00:04Z INFO 2399014 [NeuronSimplifier]: Finished (changed=False) +2024-06-01T06:00:04Z INFO 2399018 [DataLocalityOpt]: Finished (changed=True) +2024-06-01T06:00:04Z USER 2399014 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.021 seconds +2024-06-01T06:00:04Z USER 2399014 [sg0000/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2024-06-01T06:00:04Z INFO 2399014 [PerfectLoopNest]: Finished (changed=False) +2024-06-01T06:00:04Z USER 2399018 [sg0002/Tensorizer/DataLocalityOpt]: DataLocalityOpt finished after 0.188 seconds +2024-06-01T06:00:04Z USER 2399018 [sg0002/Tensorizer/DMATilingProfiler]: Running DMATilingProfiler +2024-06-01T06:00:04Z INFO 2399018 [DMATilingProfiler]: Finished (changed=False) +2024-06-01T06:00:04Z USER 2399014 [sg0000/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.004 seconds +2024-06-01T06:00:04Z USER 2399014 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2024-06-01T06:00:04Z USER 2399018 [sg0002/Tensorizer/DMATilingProfiler]: DMATilingProfiler finished after 0.006 seconds +2024-06-01T06:00:04Z USER 2399018 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2024-06-01T06:00:04Z INFO 2399014 [FlattenMacroLoop]: Finished (changed=True) +2024-06-01T06:00:04Z USER 2399014 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.013 seconds +2024-06-01T06:00:04Z USER 2399014 [sg0000/Tensorizer/RewriteWeights]: Running RewriteWeights +2024-06-01T06:00:04Z INFO 2399014 [RewriteWeights]: Finished (changed=True) +2024-06-01T06:00:04Z INFO 2399018 
[NeuronSimplifier]: Finished (changed=False) +2024-06-01T06:00:04Z USER 2399014 [sg0000/Tensorizer/RewriteWeights]: RewriteWeights finished after 0.004 seconds +2024-06-01T06:00:04Z USER 2399014 [sg0000/Tensorizer/ReshapeWeights]: Running ReshapeWeights +2024-06-01T06:00:04Z INFO 2399014 [ReshapeWeights]: Finished (changed=True) +2024-06-01T06:00:04Z USER 2399014 [sg0000/Tensorizer/ReshapeWeights]: ReshapeWeights finished after 0.001 seconds +2024-06-01T06:00:04Z USER 2399014 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2024-06-01T06:00:04Z INFO 2399014 [FlattenMacroLoop]: Finished (changed=False) +2024-06-01T06:00:04Z USER 2399014 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.008 seconds +2024-06-01T06:00:04Z USER 2399014 [sg0000/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2024-06-01T06:00:04Z USER 2399018 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.018 seconds +2024-06-01T06:00:04Z USER 2399018 [sg0002/Tensorizer/LegalizeSundaMacro]: Running LegalizeSundaMacro +2024-06-01T06:00:04Z INFO 2399018 [LegalizeSundaMacro]: Finished (changed=True) +2024-06-01T06:00:05Z USER 2399018 [sg0002/Tensorizer/LegalizeSundaMacro]: LegalizeSundaMacro finished after 0.011 seconds +2024-06-01T06:00:05Z USER 2399018 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2024-06-01T06:00:05Z INFO 2399018 [NeuronSimplifier]: Finished (changed=False) +2024-06-01T06:00:05Z USER 2399018 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.019 seconds +2024-06-01T06:00:05Z USER 2399018 [sg0002/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2024-06-01T06:00:05Z INFO 2399018 [PerfectLoopNest]: Finished (changed=False) +2024-06-01T06:00:05Z USER 2399018 [sg0002/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.003 seconds +2024-06-01T06:00:05Z USER 2399018 [sg0002/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2024-06-01T06:00:05Z INFO 
2399018 [FlattenMacroLoop]: Finished (changed=True) +2024-06-01T06:00:05Z USER 2399018 [sg0002/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.015 seconds +2024-06-01T06:00:05Z USER 2399018 [sg0002/Tensorizer/RewriteWeights]: Running RewriteWeights +2024-06-01T06:00:05Z INFO 2399018 [RewriteWeights]: Finished (changed=False) +2024-06-01T06:00:05Z USER 2399018 [sg0002/Tensorizer/RewriteWeights]: RewriteWeights finished after 0.003 seconds +2024-06-01T06:00:05Z USER 2399018 [sg0002/Tensorizer/ReshapeWeights]: Running ReshapeWeights +2024-06-01T06:00:05Z INFO 2399018 [ReshapeWeights]: Finished (changed=False) +2024-06-01T06:00:05Z USER 2399018 [sg0002/Tensorizer/ReshapeWeights]: ReshapeWeights finished after 0.001 seconds +2024-06-01T06:00:05Z USER 2399018 [sg0002/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2024-06-01T06:00:05Z INFO 2399018 [FlattenMacroLoop]: Finished (changed=False) +2024-06-01T06:00:05Z USER 2399018 [sg0002/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.007 seconds +2024-06-01T06:00:05Z USER 2399018 [sg0002/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2024-06-01T06:00:05Z INFO 2399017 [SimplifyMacroPredicates]: Finished (changed=True) +2024-06-01T06:00:05Z USER 2399017 [sg0001/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.581 seconds +2024-06-01T06:00:05Z USER 2399017 [sg0001/Tensorizer/InferInitValue]: Running InferInitValue +2024-06-01T06:00:05Z INFO 2399019 [BirCodeGenLoop]: Finished (changed=False) +2024-06-01T06:00:05Z USER 2399019 [sg0003/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 0.561 seconds +2024-06-01T06:00:05Z INFO 2399017 [InferInitValue]: Finished (changed=True) +2024-06-01T06:00:05Z USER 2399017 [sg0001/Tensorizer/InferInitValue]: InferInitValue finished after 0.293 seconds +2024-06-01T06:00:05Z USER 2399017 [sg0001/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2024-06-01T06:00:05Z INFO 2399017 
[NeuronSimplifier]: Finished (changed=False) +2024-06-01T06:00:05Z USER 2399017 [sg0001/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.023 seconds +2024-06-01T06:00:05Z USER 2399017 [sg0001/Tensorizer/SimplifyTensor]: Running SimplifyTensor +2024-06-01T06:00:05Z INFO 2399017 [SimplifyTensor]: Finished (changed=True) +2024-06-01T06:00:05Z USER 2399017 [sg0001/Tensorizer/SimplifyTensor]: SimplifyTensor finished after 0.023 seconds +2024-06-01T06:00:05Z USER 2399017 [sg0001/Tensorizer/LICM]: Running LICM +2024-06-01T06:00:05Z INFO 2399014 [SimplifyMacroPredicates]: Finished (changed=True) +2024-06-01T06:00:05Z INFO 2399017 [LICM]: Finished (changed=False) +2024-06-01T06:00:05Z USER 2399014 [sg0000/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.497 seconds +2024-06-01T06:00:05Z USER 2399014 [sg0000/Tensorizer/InferInitValue]: Running InferInitValue +2024-06-01T06:00:05Z USER 2399017 [sg0001/Tensorizer/LICM]: LICM finished after 0.006 seconds +2024-06-01T06:00:05Z USER 2399017 [sg0001/Tensorizer/SundaISel]: Running SundaISel +2024-06-01T06:00:05Z INFO 2399017 [SundaISel]: Finished (changed=True) +2024-06-01T06:00:05Z USER 2399017 [sg0001/Tensorizer/SundaISel]: SundaISel finished after 0.060 seconds +2024-06-01T06:00:05Z USER 2399017 [sg0001/Tensorizer/PreprocessNkiKernels]: Running PreprocessNkiKernels +2024-06-01T06:00:05Z INFO 2399017 [PreprocessNkiKernels]: Finished (changed=False) +2024-06-01T06:00:05Z USER 2399017 [sg0001/Tensorizer/PreprocessNkiKernels]: PreprocessNkiKernels finished after 0.002 seconds +2024-06-01T06:00:05Z USER 2399017 [sg0001/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2024-06-01T06:00:05Z INFO 2399017 [NeuronLoopInterchange]: Finished (changed=True) +2024-06-01T06:00:05Z USER 2399017 [sg0001/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.003 seconds +2024-06-01T06:00:05Z USER 2399017 [sg0001/Tensorizer/NeuronSimplifyPredicates]: Running 
NeuronSimplifyPredicates +2024-06-01T06:00:05Z INFO 2399018 [SimplifyMacroPredicates]: Finished (changed=True) +2024-06-01T06:00:05Z USER 2399018 [sg0002/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.575 seconds +2024-06-01T06:00:05Z USER 2399018 [sg0002/Tensorizer/InferInitValue]: Running InferInitValue +2024-06-01T06:00:05Z INFO 2399019 [Tensorizer]: BirCodeGen estimate #instances=785630 in sg0003 +2024-06-01T06:00:05Z INFO 2399014 [InferInitValue]: Finished (changed=True) +2024-06-01T06:00:05Z USER 2399014 [sg0000/Tensorizer/InferInitValue]: InferInitValue finished after 0.242 seconds +2024-06-01T06:00:05Z USER 2399014 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2024-06-01T06:00:05Z INFO 2399019 [Tensorizer]: IR signature: 9123d3d332c29b924480b369ff3b7dd70f420870accc8d7728f7e514b7d821ee for sg0003/Tensorizer +2024-06-01T06:00:05Z INFO 2399019 [Tensorizer]: Weights total number of bytes: 65536 +2024-06-01T06:00:05Z INFO 2399014 [NeuronSimplifier]: Finished (changed=False) +2024-06-01T06:00:05Z USER 2399014 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.022 seconds +2024-06-01T06:00:05Z USER 2399014 [sg0000/Tensorizer/SimplifyTensor]: Running SimplifyTensor +2024-06-01T06:00:05Z INFO 2399014 [SimplifyTensor]: Finished (changed=True) +2024-06-01T06:00:05Z USER 2399014 [sg0000/Tensorizer/SimplifyTensor]: SimplifyTensor finished after 0.025 seconds +2024-06-01T06:00:05Z USER 2399014 [sg0000/Tensorizer/LICM]: Running LICM +2024-06-01T06:00:05Z INFO 2399014 [LICM]: Finished (changed=False) +2024-06-01T06:00:05Z USER 2399014 [sg0000/Tensorizer/LICM]: LICM finished after 0.007 seconds +2024-06-01T06:00:05Z USER 2399014 [sg0000/Tensorizer/SundaISel]: Running SundaISel +2024-06-01T06:00:05Z INFO 2399017 [NeuronSimplifyPredicates]: Finished (changed=False) +2024-06-01T06:00:05Z USER 2399017 [sg0001/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.254 seconds 
+2024-06-01T06:00:05Z USER 2399017 [sg0001/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion +2024-06-01T06:00:05Z INFO 2399014 [SundaISel]: Finished (changed=True) +2024-06-01T06:00:05Z USER 2399014 [sg0000/Tensorizer/SundaISel]: SundaISel finished after 0.067 seconds +2024-06-01T06:00:05Z USER 2399014 [sg0000/Tensorizer/PreprocessNkiKernels]: Running PreprocessNkiKernels +2024-06-01T06:00:05Z INFO 2399014 [PreprocessNkiKernels]: Finished (changed=False) +2024-06-01T06:00:05Z USER 2399014 [sg0000/Tensorizer/PreprocessNkiKernels]: PreprocessNkiKernels finished after 0.003 seconds +2024-06-01T06:00:05Z USER 2399014 [sg0000/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2024-06-01T06:00:05Z INFO 2399014 [NeuronLoopInterchange]: Finished (changed=True) +2024-06-01T06:00:05Z USER 2399014 [sg0000/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.003 seconds +2024-06-01T06:00:05Z USER 2399014 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2024-06-01T06:00:05Z INFO 2399017 [NeuronLoopFusion]: Finished (changed=True) +2024-06-01T06:00:05Z USER 2399017 [sg0001/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion finished after 0.085 seconds +2024-06-01T06:00:05Z USER 2399017 [sg0001/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2024-06-01T06:00:05Z INFO 2399017 [NeuronLoopInterchange]: Finished (changed=True) +2024-06-01T06:00:05Z USER 2399017 [sg0001/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.003 seconds +2024-06-01T06:00:05Z USER 2399017 [sg0001/Tensorizer/NeuronLICM]: Running NeuronLICM +2024-06-01T06:00:05Z INFO 2399017 [NeuronLICM]: Finished (changed=True) +2024-06-01T06:00:05Z USER 2399017 [sg0001/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.017 seconds +2024-06-01T06:00:05Z USER 2399017 [sg0001/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2024-06-01T06:00:05Z INFO 2399018 [InferInitValue]: Finished (changed=True) 
+2024-06-01T06:00:05Z USER 2399018 [sg0002/Tensorizer/InferInitValue]: InferInitValue finished after 0.287 seconds +2024-06-01T06:00:05Z USER 2399018 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2024-06-01T06:00:05Z INFO 2399017 [FactorizeBlkDims]: Finished (changed=True) +2024-06-01T06:00:05Z INFO 2399018 [NeuronSimplifier]: Finished (changed=False) +2024-06-01T06:00:05Z USER 2399017 [sg0001/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.032 seconds +2024-06-01T06:00:05Z USER 2399017 [sg0001/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2024-06-01T06:00:05Z USER 2399018 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.027 seconds +2024-06-01T06:00:05Z USER 2399018 [sg0002/Tensorizer/SimplifyTensor]: Running SimplifyTensor +2024-06-01T06:00:06Z INFO 2399018 [SimplifyTensor]: Finished (changed=True) +2024-06-01T06:00:06Z USER 2399018 [sg0002/Tensorizer/SimplifyTensor]: SimplifyTensor finished after 0.029 seconds +2024-06-01T06:00:06Z USER 2399018 [sg0002/Tensorizer/LICM]: Running LICM +2024-06-01T06:00:06Z INFO 2399018 [LICM]: Finished (changed=False) +2024-06-01T06:00:06Z USER 2399018 [sg0002/Tensorizer/LICM]: LICM finished after 0.008 seconds +2024-06-01T06:00:06Z USER 2399018 [sg0002/Tensorizer/SundaISel]: Running SundaISel +2024-06-01T06:00:06Z INFO 2399018 [SundaISel]: Finished (changed=True) +2024-06-01T06:00:06Z USER 2399018 [sg0002/Tensorizer/SundaISel]: SundaISel finished after 0.074 seconds +2024-06-01T06:00:06Z USER 2399018 [sg0002/Tensorizer/PreprocessNkiKernels]: Running PreprocessNkiKernels +2024-06-01T06:00:06Z INFO 2399018 [PreprocessNkiKernels]: Finished (changed=False) +2024-06-01T06:00:06Z USER 2399018 [sg0002/Tensorizer/PreprocessNkiKernels]: PreprocessNkiKernels finished after 0.003 seconds +2024-06-01T06:00:06Z USER 2399018 [sg0002/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2024-06-01T06:00:06Z INFO 2399018 [NeuronLoopInterchange]: Finished (changed=True) 
+2024-06-01T06:00:06Z INFO 2399017 [NeuronInstComb]: Finished (changed=True) +2024-06-01T06:00:06Z USER 2399018 [sg0002/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.004 seconds +2024-06-01T06:00:06Z USER 2399018 [sg0002/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2024-06-01T06:00:06Z USER 2399017 [sg0001/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.148 seconds +2024-06-01T06:00:06Z USER 2399017 [sg0001/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2024-06-01T06:00:06Z INFO 2399014 [NeuronSimplifyPredicates]: Finished (changed=False) +2024-06-01T06:00:06Z INFO 2399017 [NeuronValueNumbering]: Finished (changed=True) +2024-06-01T06:00:06Z USER 2399014 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.273 seconds +2024-06-01T06:00:06Z USER 2399014 [sg0000/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion +2024-06-01T06:00:06Z USER 2399017 [sg0001/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.011 seconds +2024-06-01T06:00:06Z USER 2399017 [sg0001/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2024-06-01T06:00:06Z INFO 2399017 [NeuronInstComb]: Finished (changed=False) +2024-06-01T06:00:06Z USER 2399017 [sg0001/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.021 seconds +2024-06-01T06:00:06Z USER 2399017 [sg0001/Tensorizer/VectorizeDMA]: Running VectorizeDMA +2024-06-01T06:00:06Z INFO 2399014 [NeuronLoopFusion]: Finished (changed=True) +2024-06-01T06:00:06Z INFO 2399017 [VectorizeDMA]: Finished (changed=True) +2024-06-01T06:00:06Z USER 2399014 [sg0000/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion finished after 0.061 seconds +2024-06-01T06:00:06Z USER 2399014 [sg0000/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2024-06-01T06:00:06Z INFO 2399014 [NeuronLoopInterchange]: Finished (changed=True) +2024-06-01T06:00:06Z USER 2399014 [sg0000/Tensorizer/NeuronLoopInterchange]: 
NeuronLoopInterchange finished after 0.003 seconds +2024-06-01T06:00:06Z USER 2399014 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2024-06-01T06:00:06Z USER 2399017 [sg0001/Tensorizer/VectorizeDMA]: VectorizeDMA finished after 0.026 seconds +2024-06-01T06:00:06Z USER 2399017 [sg0001/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2024-06-01T06:00:06Z INFO 2399014 [NeuronLICM]: Finished (changed=True) +2024-06-01T06:00:06Z USER 2399014 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.015 seconds +2024-06-01T06:00:06Z USER 2399014 [sg0000/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2024-06-01T06:00:06Z INFO 2399014 [FactorizeBlkDims]: Finished (changed=False) +2024-06-01T06:00:06Z USER 2399014 [sg0000/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.021 seconds +2024-06-01T06:00:06Z USER 2399014 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2024-06-01T06:00:06Z INFO 2399018 [NeuronSimplifyPredicates]: Finished (changed=False) +2024-06-01T06:00:06Z USER 2399018 [sg0002/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.272 seconds +2024-06-01T06:00:06Z USER 2399018 [sg0002/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion +2024-06-01T06:00:06Z INFO 2399014 [NeuronInstComb]: Finished (changed=True) +2024-06-01T06:00:06Z USER 2399014 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.158 seconds +2024-06-01T06:00:06Z USER 2399014 [sg0000/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2024-06-01T06:00:06Z INFO 2399017 [NeuronSimplifyPredicates]: Finished (changed=False) +2024-06-01T06:00:06Z INFO 2399014 [NeuronValueNumbering]: Finished (changed=False) +2024-06-01T06:00:06Z USER 2399017 [sg0001/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.205 seconds +2024-06-01T06:00:06Z USER 2399017 [sg0001/Tensorizer/LegalizePartitionReduce]: Running LegalizePartitionReduce +2024-06-01T06:00:06Z USER 
2399014 [sg0000/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.009 seconds +2024-06-01T06:00:06Z USER 2399014 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2024-06-01T06:00:06Z INFO 2399017 [LegalizePartitionReduce]: Finished (changed=True) +2024-06-01T06:00:06Z USER 2399017 [sg0001/Tensorizer/LegalizePartitionReduce]: LegalizePartitionReduce finished after 0.007 seconds +2024-06-01T06:00:06Z USER 2399017 [sg0001/Tensorizer/DeConcat]: Running DeConcat +2024-06-01T06:00:06Z INFO 2399017 [DeConcat]: Finished (changed=False) +2024-06-01T06:00:06Z USER 2399017 [sg0001/Tensorizer/DeConcat]: DeConcat finished after 0.004 seconds +2024-06-01T06:00:06Z USER 2399017 [sg0001/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion +2024-06-01T06:00:06Z INFO 2399014 [NeuronInstComb]: Finished (changed=False) +2024-06-01T06:00:06Z USER 2399014 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.020 seconds +2024-06-01T06:00:06Z USER 2399014 [sg0000/Tensorizer/VectorizeDMA]: Running VectorizeDMA +2024-06-01T06:00:06Z INFO 2399018 [NeuronLoopFusion]: Finished (changed=True) +2024-06-01T06:00:06Z USER 2399018 [sg0002/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion finished after 0.087 seconds +2024-06-01T06:00:06Z USER 2399018 [sg0002/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2024-06-01T06:00:06Z INFO 2399018 [NeuronLoopInterchange]: Finished (changed=True) +2024-06-01T06:00:06Z INFO 2399014 [VectorizeDMA]: Finished (changed=True) +2024-06-01T06:00:06Z USER 2399018 [sg0002/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.003 seconds +2024-06-01T06:00:06Z USER 2399018 [sg0002/Tensorizer/NeuronLICM]: Running NeuronLICM +2024-06-01T06:00:06Z USER 2399014 [sg0000/Tensorizer/VectorizeDMA]: VectorizeDMA finished after 0.020 seconds +2024-06-01T06:00:06Z USER 2399014 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2024-06-01T06:00:06Z INFO 2399018 
[NeuronLICM]: Finished (changed=True) +2024-06-01T06:00:06Z USER 2399018 [sg0002/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.017 seconds +2024-06-01T06:00:06Z USER 2399018 [sg0002/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2024-06-01T06:00:06Z INFO 2399018 [FactorizeBlkDims]: Finished (changed=True) +2024-06-01T06:00:06Z USER 2399018 [sg0002/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.031 seconds +2024-06-01T06:00:06Z USER 2399018 [sg0002/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2024-06-01T06:00:06Z INFO 2399017 [PartialSimdFusion]: Finished (changed=True) +2024-06-01T06:00:06Z USER 2399017 [sg0001/Tensorizer/PartialSimdFusion]: PartialSimdFusion finished after 0.138 seconds +2024-06-01T06:00:06Z USER 2399017 [sg0001/Tensorizer/TritiumFusion]: Running TritiumFusion +2024-06-01T06:00:06Z INFO 2399018 [NeuronInstComb]: Finished (changed=True) +2024-06-01T06:00:06Z USER 2399018 [sg0002/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.148 seconds +2024-06-01T06:00:06Z USER 2399018 [sg0002/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2024-06-01T06:00:06Z INFO 2399018 [NeuronValueNumbering]: Finished (changed=True) +2024-06-01T06:00:06Z USER 2399018 [sg0002/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.010 seconds +2024-06-01T06:00:06Z USER 2399018 [sg0002/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2024-06-01T06:00:06Z INFO 2399014 [NeuronSimplifyPredicates]: Finished (changed=False) +2024-06-01T06:00:06Z USER 2399014 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.225 seconds +2024-06-01T06:00:06Z USER 2399014 [sg0000/Tensorizer/LegalizePartitionReduce]: Running LegalizePartitionReduce +2024-06-01T06:00:06Z INFO 2399014 [LegalizePartitionReduce]: Finished (changed=True) +2024-06-01T06:00:06Z USER 2399014 [sg0000/Tensorizer/LegalizePartitionReduce]: LegalizePartitionReduce finished after 0.005 seconds 
+2024-06-01T06:00:06Z USER 2399014 [sg0000/Tensorizer/DeConcat]: Running DeConcat +2024-06-01T06:00:06Z INFO 2399018 [NeuronInstComb]: Finished (changed=False) +2024-06-01T06:00:06Z INFO 2399014 [DeConcat]: Finished (changed=False) +2024-06-01T06:00:06Z USER 2399018 [sg0002/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.021 seconds +2024-06-01T06:00:06Z USER 2399018 [sg0002/Tensorizer/VectorizeDMA]: Running VectorizeDMA +2024-06-01T06:00:06Z USER 2399014 [sg0000/Tensorizer/DeConcat]: DeConcat finished after 0.004 seconds +2024-06-01T06:00:06Z USER 2399014 [sg0000/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion +2024-06-01T06:00:06Z INFO 2399018 [VectorizeDMA]: Finished (changed=True) +2024-06-01T06:00:06Z USER 2399018 [sg0002/Tensorizer/VectorizeDMA]: VectorizeDMA finished after 0.027 seconds +2024-06-01T06:00:06Z USER 2399018 [sg0002/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2024-06-01T06:00:06Z INFO 2399017 [TritiumFusion]: Finished (changed=True) +2024-06-01T06:00:06Z USER 2399017 [sg0001/Tensorizer/TritiumFusion]: TritiumFusion finished after 0.182 seconds +2024-06-01T06:00:06Z USER 2399017 [sg0001/Tensorizer/CCOpFusion]: Running CCOpFusion +2024-06-01T06:00:06Z INFO 2399014 [PartialSimdFusion]: Finished (changed=True) +2024-06-01T06:00:06Z USER 2399014 [sg0000/Tensorizer/PartialSimdFusion]: PartialSimdFusion finished after 0.110 seconds +2024-06-01T06:00:06Z USER 2399014 [sg0000/Tensorizer/TritiumFusion]: Running TritiumFusion +2024-06-01T06:00:06Z INFO 2399017 [CCOpFusion]: Finished (changed=True) +2024-06-01T06:00:06Z USER 2399017 [sg0001/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.086 seconds +2024-06-01T06:00:06Z USER 2399017 [sg0001/Tensorizer/VectorizeMatMult]: Running VectorizeMatMult +2024-06-01T06:00:07Z INFO 2399014 [TritiumFusion]: Finished (changed=True) +2024-06-01T06:00:07Z USER 2399014 [sg0000/Tensorizer/TritiumFusion]: TritiumFusion finished after 0.097 seconds +2024-06-01T06:00:07Z 
USER 2399014 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion +2024-06-01T06:00:07Z INFO 2399017 [VectorizeMatMult]: Finished (changed=False) +2024-06-01T06:00:07Z USER 2399017 [sg0001/Tensorizer/VectorizeMatMult]: VectorizeMatMult finished after 0.103 seconds +2024-06-01T06:00:07Z USER 2399017 [sg0001/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion +2024-06-01T06:00:07Z INFO 2399018 [NeuronSimplifyPredicates]: Finished (changed=False) +2024-06-01T06:00:07Z USER 2399018 [sg0002/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.209 seconds +2024-06-01T06:00:07Z USER 2399018 [sg0002/Tensorizer/LegalizePartitionReduce]: Running LegalizePartitionReduce +2024-06-01T06:00:07Z INFO 2399018 [LegalizePartitionReduce]: Finished (changed=True) +2024-06-01T06:00:07Z USER 2399018 [sg0002/Tensorizer/LegalizePartitionReduce]: LegalizePartitionReduce finished after 0.008 seconds +2024-06-01T06:00:07Z USER 2399018 [sg0002/Tensorizer/DeConcat]: Running DeConcat +2024-06-01T06:00:07Z INFO 2399018 [DeConcat]: Finished (changed=False) +2024-06-01T06:00:07Z USER 2399018 [sg0002/Tensorizer/DeConcat]: DeConcat finished after 0.004 seconds +2024-06-01T06:00:07Z USER 2399018 [sg0002/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion +2024-06-01T06:00:07Z INFO 2399014 [CCOpFusion]: Finished (changed=True) +2024-06-01T06:00:07Z USER 2399014 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.054 seconds +2024-06-01T06:00:07Z USER 2399014 [sg0000/Tensorizer/VectorizeMatMult]: Running VectorizeMatMult +2024-06-01T06:00:07Z INFO 2399017 [PartialLoopFusion]: Finished (changed=True) +2024-06-01T06:00:07Z USER 2399017 [sg0001/Tensorizer/PartialLoopFusion]: PartialLoopFusion finished after 0.050 seconds +2024-06-01T06:00:07Z USER 2399017 [sg0001/Tensorizer/NeuronLICM]: Running NeuronLICM +2024-06-01T06:00:07Z INFO 2399017 [NeuronLICM]: Finished (changed=False) +2024-06-01T06:00:07Z USER 2399017 [sg0001/Tensorizer/NeuronLICM]: NeuronLICM finished 
after 0.011 seconds +2024-06-01T06:00:07Z USER 2399017 [sg0001/Tensorizer/LowerTranspose]: Running LowerTranspose +2024-06-01T06:00:07Z INFO 2399017 [LowerTranspose]: Finished (changed=True) +2024-06-01T06:00:07Z USER 2399017 [sg0001/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.034 seconds +2024-06-01T06:00:07Z USER 2399017 [sg0001/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2024-06-01T06:00:07Z INFO 2399014 [VectorizeMatMult]: Finished (changed=False) +2024-06-01T06:00:07Z USER 2399014 [sg0000/Tensorizer/VectorizeMatMult]: VectorizeMatMult finished after 0.076 seconds +2024-06-01T06:00:07Z USER 2399014 [sg0000/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion +2024-06-01T06:00:07Z INFO 2399018 [PartialSimdFusion]: Finished (changed=True) +2024-06-01T06:00:07Z INFO 2399014 [PartialLoopFusion]: Finished (changed=True) +2024-06-01T06:00:07Z USER 2399018 [sg0002/Tensorizer/PartialSimdFusion]: PartialSimdFusion finished after 0.137 seconds +2024-06-01T06:00:07Z USER 2399018 [sg0002/Tensorizer/TritiumFusion]: Running TritiumFusion +2024-06-01T06:00:07Z INFO 2399017 [LateNeuronInstComb]: Finished (changed=True) +2024-06-01T06:00:07Z USER 2399017 [sg0001/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.066 seconds +2024-06-01T06:00:07Z USER 2399017 [sg0001/Tensorizer/SplitAccGrp]: Running SplitAccGrp +2024-06-01T06:00:07Z INFO 2399017 [SplitAccGrp]: Finished (changed=False) +2024-06-01T06:00:07Z USER 2399017 [sg0001/Tensorizer/SplitAccGrp]: SplitAccGrp finished after 0.003 seconds +2024-06-01T06:00:07Z USER 2399017 [sg0001/Tensorizer/SpillPSum]: Running SpillPSum +2024-06-01T06:00:07Z USER 2399014 [sg0000/Tensorizer/PartialLoopFusion]: PartialLoopFusion finished after 0.040 seconds +2024-06-01T06:00:07Z USER 2399014 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2024-06-01T06:00:07Z INFO 2399014 [NeuronLICM]: Finished (changed=False) +2024-06-01T06:00:07Z USER 2399014 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM 
finished after 0.007 seconds +2024-06-01T06:00:07Z USER 2399014 [sg0000/Tensorizer/LowerTranspose]: Running LowerTranspose +2024-06-01T06:00:07Z INFO 2399014 [LowerTranspose]: Finished (changed=True) +2024-06-01T06:00:07Z USER 2399014 [sg0000/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.019 seconds +2024-06-01T06:00:07Z USER 2399014 [sg0000/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2024-06-01T06:00:07Z INFO 2399017 [SpillPSum]: Finished (changed=True) +2024-06-01T06:00:07Z USER 2399017 [sg0001/Tensorizer/SpillPSum]: SpillPSum finished after 0.075 seconds +2024-06-01T06:00:07Z USER 2399017 [sg0001/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2024-06-01T06:00:07Z INFO 2399017 [LowerIntrinsics]: Finished (changed=True) +2024-06-01T06:00:07Z USER 2399017 [sg0001/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.032 seconds +2024-06-01T06:00:07Z USER 2399017 [sg0001/Tensorizer/LegalizeType]: Running LegalizeType +2024-06-01T06:00:07Z INFO 2399017 [LegalizeType]: Finished (changed=True) +2024-06-01T06:00:07Z USER 2399017 [sg0001/Tensorizer/LegalizeType]: LegalizeType finished after 0.007 seconds +2024-06-01T06:00:07Z USER 2399017 [sg0001/Tensorizer/NeuronLICM]: Running NeuronLICM +2024-06-01T06:00:07Z INFO 2399014 [LateNeuronInstComb]: Finished (changed=True) +2024-06-01T06:00:07Z USER 2399014 [sg0000/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.076 seconds +2024-06-01T06:00:07Z USER 2399014 [sg0000/Tensorizer/SplitAccGrp]: Running SplitAccGrp +2024-06-01T06:00:07Z INFO 2399014 [SplitAccGrp]: Finished (changed=False) +2024-06-01T06:00:07Z INFO 2399017 [NeuronLICM]: Finished (changed=True) +2024-06-01T06:00:07Z USER 2399014 [sg0000/Tensorizer/SplitAccGrp]: SplitAccGrp finished after 0.002 seconds +2024-06-01T06:00:07Z USER 2399014 [sg0000/Tensorizer/SpillPSum]: Running SpillPSum +2024-06-01T06:00:07Z USER 2399017 [sg0001/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.013 seconds 
+2024-06-01T06:00:07Z USER 2399017 [sg0001/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2024-06-01T06:00:07Z INFO 2399018 [TritiumFusion]: Finished (changed=True) +2024-06-01T06:00:07Z USER 2399018 [sg0002/Tensorizer/TritiumFusion]: TritiumFusion finished after 0.178 seconds +2024-06-01T06:00:07Z USER 2399018 [sg0002/Tensorizer/CCOpFusion]: Running CCOpFusion +2024-06-01T06:00:07Z INFO 2399014 [SpillPSum]: Finished (changed=True) +2024-06-01T06:00:07Z USER 2399014 [sg0000/Tensorizer/SpillPSum]: SpillPSum finished after 0.054 seconds +2024-06-01T06:00:07Z USER 2399014 [sg0000/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2024-06-01T06:00:07Z INFO 2399017 [InferPSumTensor]: Finished (changed=False) +2024-06-01T06:00:07Z USER 2399017 [sg0001/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.062 seconds +2024-06-01T06:00:07Z USER 2399017 [sg0001/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2024-06-01T06:00:07Z INFO 2399017 [WeightCoalescing]: Finished (changed=False) +2024-06-01T06:00:07Z USER 2399017 [sg0001/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.002 seconds +2024-06-01T06:00:07Z USER 2399017 [sg0001/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2024-06-01T06:00:07Z INFO 2399014 [LowerIntrinsics]: Finished (changed=True) +2024-06-01T06:00:07Z USER 2399014 [sg0000/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.031 seconds +2024-06-01T06:00:07Z USER 2399014 [sg0000/Tensorizer/LegalizeType]: Running LegalizeType +2024-06-01T06:00:07Z INFO 2399014 [LegalizeType]: Finished (changed=True) +2024-06-01T06:00:07Z INFO 2399018 [CCOpFusion]: Finished (changed=True) +2024-06-01T06:00:07Z USER 2399014 [sg0000/Tensorizer/LegalizeType]: LegalizeType finished after 0.005 seconds +2024-06-01T06:00:07Z USER 2399014 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2024-06-01T06:00:07Z USER 2399018 [sg0002/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.084 seconds +2024-06-01T06:00:07Z 
USER 2399018 [sg0002/Tensorizer/VectorizeMatMult]: Running VectorizeMatMult +2024-06-01T06:00:07Z INFO 2399014 [NeuronLICM]: Finished (changed=True) +2024-06-01T06:00:07Z USER 2399014 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.012 seconds +2024-06-01T06:00:07Z USER 2399014 [sg0000/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2024-06-01T06:00:07Z INFO 2399014 [InferPSumTensor]: Finished (changed=False) +2024-06-01T06:00:07Z USER 2399014 [sg0000/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.050 seconds +2024-06-01T06:00:07Z USER 2399014 [sg0000/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2024-06-01T06:00:07Z INFO 2399014 [WeightCoalescing]: Finished (changed=False) +2024-06-01T06:00:07Z USER 2399014 [sg0000/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.003 seconds +2024-06-01T06:00:07Z USER 2399014 [sg0000/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2024-06-01T06:00:07Z INFO 2399018 [VectorizeMatMult]: Finished (changed=False) +2024-06-01T06:00:07Z USER 2399018 [sg0002/Tensorizer/VectorizeMatMult]: VectorizeMatMult finished after 0.117 seconds +2024-06-01T06:00:07Z USER 2399018 [sg0002/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion +2024-06-01T06:00:07Z INFO 2399017 [LegalizeSundaAccess]: Finished (changed=True) +2024-06-01T06:00:07Z USER 2399017 [sg0001/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.200 seconds +2024-06-01T06:00:07Z USER 2399017 [sg0001/Tensorizer/TernaryFission]: Running TernaryFission +2024-06-01T06:00:07Z INFO 2399018 [PartialLoopFusion]: Finished (changed=True) +2024-06-01T06:00:07Z USER 2399018 [sg0002/Tensorizer/PartialLoopFusion]: PartialLoopFusion finished after 0.054 seconds +2024-06-01T06:00:07Z USER 2399018 [sg0002/Tensorizer/NeuronLICM]: Running NeuronLICM +2024-06-01T06:00:07Z INFO 2399018 [NeuronLICM]: Finished (changed=False) +2024-06-01T06:00:07Z USER 2399018 [sg0002/Tensorizer/NeuronLICM]: NeuronLICM finished 
after 0.012 seconds +2024-06-01T06:00:07Z USER 2399018 [sg0002/Tensorizer/LowerTranspose]: Running LowerTranspose +2024-06-01T06:00:07Z INFO 2399018 [LowerTranspose]: Finished (changed=True) +2024-06-01T06:00:07Z USER 2399018 [sg0002/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.038 seconds +2024-06-01T06:00:07Z USER 2399018 [sg0002/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2024-06-01T06:00:07Z INFO 2399014 [LegalizeSundaAccess]: Finished (changed=True) +2024-06-01T06:00:07Z USER 2399014 [sg0000/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.179 seconds +2024-06-01T06:00:07Z USER 2399014 [sg0000/Tensorizer/TernaryFission]: Running TernaryFission +2024-06-01T06:00:07Z INFO 2399018 [LateNeuronInstComb]: Finished (changed=True) +2024-06-01T06:00:07Z USER 2399018 [sg0002/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.066 seconds +2024-06-01T06:00:07Z USER 2399018 [sg0002/Tensorizer/SplitAccGrp]: Running SplitAccGrp +2024-06-01T06:00:07Z INFO 2399018 [SplitAccGrp]: Finished (changed=False) +2024-06-01T06:00:07Z USER 2399018 [sg0002/Tensorizer/SplitAccGrp]: SplitAccGrp finished after 0.003 seconds +2024-06-01T06:00:07Z USER 2399018 [sg0002/Tensorizer/SpillPSum]: Running SpillPSum +2024-06-01T06:00:07Z INFO 2399018 [SpillPSum]: Finished (changed=True) +2024-06-01T06:00:07Z USER 2399018 [sg0002/Tensorizer/SpillPSum]: SpillPSum finished after 0.076 seconds +2024-06-01T06:00:07Z USER 2399018 [sg0002/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2024-06-01T06:00:07Z INFO 2399018 [LowerIntrinsics]: Finished (changed=True) +2024-06-01T06:00:07Z USER 2399018 [sg0002/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.037 seconds +2024-06-01T06:00:07Z USER 2399018 [sg0002/Tensorizer/LegalizeType]: Running LegalizeType +2024-06-01T06:00:07Z INFO 2399017 [TernaryFission]: Finished (changed=True) +2024-06-01T06:00:07Z INFO 2399018 [LegalizeType]: Finished (changed=True) 
+2024-06-01T06:00:07Z USER 2399017 [sg0001/Tensorizer/TernaryFission]: TernaryFission finished after 0.279 seconds +2024-06-01T06:00:07Z USER 2399017 [sg0001/Tensorizer/RelaxPredicates]: Running RelaxPredicates +2024-06-01T06:00:07Z USER 2399018 [sg0002/Tensorizer/LegalizeType]: LegalizeType finished after 0.007 seconds +2024-06-01T06:00:07Z USER 2399018 [sg0002/Tensorizer/NeuronLICM]: Running NeuronLICM +2024-06-01T06:00:07Z INFO 2399014 [TernaryFission]: Finished (changed=True) +2024-06-01T06:00:07Z INFO 2399018 [NeuronLICM]: Finished (changed=True) +2024-06-01T06:00:07Z USER 2399014 [sg0000/Tensorizer/TernaryFission]: TernaryFission finished after 0.223 seconds +2024-06-01T06:00:07Z USER 2399014 [sg0000/Tensorizer/RelaxPredicates]: Running RelaxPredicates +2024-06-01T06:00:07Z USER 2399018 [sg0002/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.014 seconds +2024-06-01T06:00:07Z USER 2399018 [sg0002/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2024-06-01T06:00:07Z INFO 2399017 [RelaxPredicates]: Finished (changed=False) +2024-06-01T06:00:07Z USER 2399017 [sg0001/Tensorizer/RelaxPredicates]: RelaxPredicates finished after 0.040 seconds +2024-06-01T06:00:07Z USER 2399017 [sg0001/Tensorizer/TensorInitialization]: Running TensorInitialization +2024-06-01T06:00:08Z INFO 2399014 [RelaxPredicates]: Finished (changed=False) +2024-06-01T06:00:08Z USER 2399014 [sg0000/Tensorizer/RelaxPredicates]: RelaxPredicates finished after 0.038 seconds +2024-06-01T06:00:08Z USER 2399014 [sg0000/Tensorizer/TensorInitialization]: Running TensorInitialization +2024-06-01T06:00:08Z INFO 2399018 [InferPSumTensor]: Finished (changed=False) +2024-06-01T06:00:08Z USER 2399018 [sg0002/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.065 seconds +2024-06-01T06:00:08Z USER 2399018 [sg0002/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2024-06-01T06:00:08Z INFO 2399018 [WeightCoalescing]: Finished (changed=False) +2024-06-01T06:00:08Z USER 2399018 
[sg0002/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.003 seconds +2024-06-01T06:00:08Z USER 2399018 [sg0002/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2024-06-01T06:00:08Z INFO 2399018 [LegalizeSundaAccess]: Finished (changed=True) +2024-06-01T06:00:08Z USER 2399018 [sg0002/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.215 seconds +2024-06-01T06:00:08Z USER 2399018 [sg0002/Tensorizer/TernaryFission]: Running TernaryFission +2024-06-01T06:00:08Z INFO 2399018 [TernaryFission]: Finished (changed=True) +2024-06-01T06:00:08Z USER 2399018 [sg0002/Tensorizer/TernaryFission]: TernaryFission finished after 0.278 seconds +2024-06-01T06:00:08Z USER 2399018 [sg0002/Tensorizer/RelaxPredicates]: Running RelaxPredicates +2024-06-01T06:00:08Z INFO 2399018 [RelaxPredicates]: Finished (changed=False) +2024-06-01T06:00:08Z USER 2399018 [sg0002/Tensorizer/RelaxPredicates]: RelaxPredicates finished after 0.040 seconds +2024-06-01T06:00:08Z USER 2399018 [sg0002/Tensorizer/TensorInitialization]: Running TensorInitialization +2024-06-01T06:00:09Z INFO 2399017 [TensorInitialization]: Finished (changed=True) +2024-06-01T06:00:09Z USER 2399017 [sg0001/Tensorizer/TensorInitialization]: TensorInitialization finished after 1.095 seconds +2024-06-01T06:00:09Z USER 2399017 [sg0001/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2024-06-01T06:00:09Z INFO 2399014 [TensorInitialization]: Finished (changed=True) +2024-06-01T06:00:09Z USER 2399014 [sg0000/Tensorizer/TensorInitialization]: TensorInitialization finished after 1.225 seconds +2024-06-01T06:00:09Z USER 2399014 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2024-06-01T06:00:09Z INFO 2399017 [NeuronSimplifyPredicates]: Finished (changed=True) +2024-06-01T06:00:09Z USER 2399017 [sg0001/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.427 seconds +2024-06-01T06:00:09Z USER 2399017 
[sg0001/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2024-06-01T06:00:09Z INFO 2399017 [ExpandISAMacro]: Finished (changed=False) +2024-06-01T06:00:09Z USER 2399017 [sg0001/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.011 seconds +2024-06-01T06:00:09Z USER 2399017 [sg0001/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2024-06-01T06:00:09Z INFO 2399017 [SimplifyNeuronTensor]: Finished (changed=True) +2024-06-01T06:00:09Z USER 2399017 [sg0001/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.039 seconds +2024-06-01T06:00:09Z USER 2399017 [sg0001/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2024-06-01T06:00:09Z INFO 2399017 [DMALocalityOpt]: Finished (changed=True) +2024-06-01T06:00:09Z USER 2399017 [sg0001/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.002 seconds +2024-06-01T06:00:09Z USER 2399017 [sg0001/Tensorizer/DataStreaming]: Running DataStreaming +2024-06-01T06:00:09Z INFO 2399017 [DataStreaming]: Finished (changed=True) +2024-06-01T06:00:09Z USER 2399017 [sg0001/Tensorizer/DataStreaming]: DataStreaming finished after 0.014 seconds +2024-06-01T06:00:09Z USER 2399017 [sg0001/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2024-06-01T06:00:09Z INFO 2399014 [NeuronSimplifyPredicates]: Finished (changed=True) +2024-06-01T06:00:09Z USER 2399014 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.416 seconds +2024-06-01T06:00:09Z USER 2399014 [sg0000/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2024-06-01T06:00:09Z INFO 2399014 [ExpandISAMacro]: Finished (changed=False) +2024-06-01T06:00:09Z USER 2399014 [sg0000/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.009 seconds +2024-06-01T06:00:09Z USER 2399014 [sg0000/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2024-06-01T06:00:09Z INFO 2399014 [SimplifyNeuronTensor]: Finished (changed=True) +2024-06-01T06:00:09Z USER 2399014 
[sg0000/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.035 seconds +2024-06-01T06:00:09Z USER 2399014 [sg0000/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2024-06-01T06:00:09Z INFO 2399014 [DMALocalityOpt]: Finished (changed=True) +2024-06-01T06:00:09Z USER 2399014 [sg0000/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.002 seconds +2024-06-01T06:00:09Z USER 2399014 [sg0000/Tensorizer/DataStreaming]: Running DataStreaming +2024-06-01T06:00:09Z INFO 2399014 [DataStreaming]: Finished (changed=True) +2024-06-01T06:00:09Z USER 2399014 [sg0000/Tensorizer/DataStreaming]: DataStreaming finished after 0.012 seconds +2024-06-01T06:00:09Z USER 2399014 [sg0000/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2024-06-01T06:00:09Z INFO 2399018 [TensorInitialization]: Finished (changed=True) +2024-06-01T06:00:09Z USER 2399018 [sg0002/Tensorizer/TensorInitialization]: TensorInitialization finished after 1.220 seconds +2024-06-01T06:00:09Z USER 2399018 [sg0002/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2024-06-01T06:00:10Z INFO 2399017 [SFKVectorizer]: Finished (changed=True) +2024-06-01T06:00:10Z USER 2399017 [sg0001/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.626 seconds +2024-06-01T06:00:10Z USER 2399017 [sg0001/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2024-06-01T06:00:10Z INFO 2399017 [LateLegalizeInst]: Finished (changed=False) +2024-06-01T06:00:10Z USER 2399017 [sg0001/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.004 seconds +2024-06-01T06:00:10Z USER 2399017 [sg0001/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2024-06-01T06:00:10Z INFO 2399017 [CoalesceCCOp]: Finished (changed=False) +2024-06-01T06:00:10Z USER 2399017 [sg0001/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.004 seconds +2024-06-01T06:00:10Z USER 2399017 [sg0001/Tensorizer/StaticProfiler]: Running StaticProfiler +2024-06-01T06:00:10Z INFO 2399018 [NeuronSimplifyPredicates]: 
Finished (changed=True) +2024-06-01T06:00:10Z INFO 2399017 [StaticProfiler]: Finished (changed=False) +2024-06-01T06:00:10Z USER 2399018 [sg0002/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.431 seconds +2024-06-01T06:00:10Z USER 2399018 [sg0002/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2024-06-01T06:00:10Z INFO 2399014 [SFKVectorizer]: Finished (changed=True) +2024-06-01T06:00:10Z USER 2399014 [sg0000/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.531 seconds +2024-06-01T06:00:10Z USER 2399014 [sg0000/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2024-06-01T06:00:10Z INFO 2399018 [ExpandISAMacro]: Finished (changed=False) +2024-06-01T06:00:10Z INFO 2399014 [LateLegalizeInst]: Finished (changed=True) +2024-06-01T06:00:10Z USER 2399018 [sg0002/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.011 seconds +2024-06-01T06:00:10Z USER 2399018 [sg0002/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2024-06-01T06:00:10Z USER 2399014 [sg0000/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.008 seconds +2024-06-01T06:00:10Z USER 2399014 [sg0000/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2024-06-01T06:00:10Z INFO 2399014 [CoalesceCCOp]: Finished (changed=False) +2024-06-01T06:00:10Z USER 2399014 [sg0000/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.003 seconds +2024-06-01T06:00:10Z USER 2399014 [sg0000/Tensorizer/StaticProfiler]: Running StaticProfiler +2024-06-01T06:00:10Z USER 2399017 [sg0001/Tensorizer/StaticProfiler]: StaticProfiler finished after 0.010 seconds +2024-06-01T06:00:10Z USER 2399017 [sg0001/Tensorizer/SplitAPUnionSets]: Running SplitAPUnionSets +2024-06-01T06:00:10Z INFO 2399014 [StaticProfiler]: Finished (changed=False) +2024-06-01T06:00:10Z USER 2399014 [sg0000/Tensorizer/StaticProfiler]: StaticProfiler finished after 0.008 seconds +2024-06-01T06:00:10Z USER 2399014 [sg0000/Tensorizer/SplitAPUnionSets]: Running SplitAPUnionSets 
+2024-06-01T06:00:10Z INFO 2399018 [SimplifyNeuronTensor]: Finished (changed=True) +2024-06-01T06:00:10Z USER 2399018 [sg0002/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.038 seconds +2024-06-01T06:00:10Z USER 2399018 [sg0002/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2024-06-01T06:00:10Z INFO 2399018 [DMALocalityOpt]: Finished (changed=True) +2024-06-01T06:00:10Z USER 2399018 [sg0002/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.002 seconds +2024-06-01T06:00:10Z USER 2399018 [sg0002/Tensorizer/DataStreaming]: Running DataStreaming +2024-06-01T06:00:10Z INFO 2399018 [DataStreaming]: Finished (changed=True) +2024-06-01T06:00:10Z USER 2399018 [sg0002/Tensorizer/DataStreaming]: DataStreaming finished after 0.014 seconds +2024-06-01T06:00:10Z USER 2399018 [sg0002/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2024-06-01T06:00:10Z INFO 2399017 [SplitAPUnionSets]: Finished (changed=True) +2024-06-01T06:00:10Z USER 2399017 [sg0001/Tensorizer/SplitAPUnionSets]: SplitAPUnionSets finished after 0.389 seconds +2024-06-01T06:00:10Z USER 2399017 [sg0001/Tensorizer/DumpGraphAndMetadata]: Running DumpGraphAndMetadata +2024-06-01T06:00:10Z INFO 2399017 [DumpGraphAndMetadata]: Finished (changed=False) +2024-06-01T06:00:10Z USER 2399017 [sg0001/Tensorizer/DumpGraphAndMetadata]: DumpGraphAndMetadata finished after 0.013 seconds +2024-06-01T06:00:10Z USER 2399017 [sg0001/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2024-06-01T06:00:10Z INFO 2399014 [SplitAPUnionSets]: Finished (changed=True) +2024-06-01T06:00:10Z USER 2399014 [sg0000/Tensorizer/SplitAPUnionSets]: SplitAPUnionSets finished after 0.506 seconds +2024-06-01T06:00:10Z USER 2399014 [sg0000/Tensorizer/DumpGraphAndMetadata]: Running DumpGraphAndMetadata +2024-06-01T06:00:10Z INFO 2399014 [DumpGraphAndMetadata]: Finished (changed=False) +2024-06-01T06:00:10Z USER 2399014 [sg0000/Tensorizer/DumpGraphAndMetadata]: DumpGraphAndMetadata finished after 0.018 seconds 
+2024-06-01T06:00:10Z USER 2399014 [sg0000/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2024-06-01T06:00:10Z INFO 2399018 [SFKVectorizer]: Finished (changed=True) +2024-06-01T06:00:10Z USER 2399018 [sg0002/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.612 seconds +2024-06-01T06:00:10Z USER 2399018 [sg0002/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2024-06-01T06:00:10Z INFO 2399018 [LateLegalizeInst]: Finished (changed=False) +2024-06-01T06:00:10Z USER 2399018 [sg0002/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.004 seconds +2024-06-01T06:00:10Z USER 2399018 [sg0002/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2024-06-01T06:00:10Z INFO 2399018 [CoalesceCCOp]: Finished (changed=False) +2024-06-01T06:00:10Z USER 2399018 [sg0002/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.004 seconds +2024-06-01T06:00:10Z USER 2399018 [sg0002/Tensorizer/StaticProfiler]: Running StaticProfiler +2024-06-01T06:00:11Z INFO 2399018 [StaticProfiler]: Finished (changed=False) +2024-06-01T06:00:11Z USER 2399018 [sg0002/Tensorizer/StaticProfiler]: StaticProfiler finished after 0.009 seconds +2024-06-01T06:00:11Z USER 2399018 [sg0002/Tensorizer/SplitAPUnionSets]: Running SplitAPUnionSets +2024-06-01T06:00:11Z INFO 2399018 [SplitAPUnionSets]: Finished (changed=True) +2024-06-01T06:00:11Z USER 2399018 [sg0002/Tensorizer/SplitAPUnionSets]: SplitAPUnionSets finished after 0.467 seconds +2024-06-01T06:00:11Z USER 2399018 [sg0002/Tensorizer/DumpGraphAndMetadata]: Running DumpGraphAndMetadata +2024-06-01T06:00:11Z INFO 2399018 [DumpGraphAndMetadata]: Finished (changed=False) +2024-06-01T06:00:11Z USER 2399018 [sg0002/Tensorizer/DumpGraphAndMetadata]: DumpGraphAndMetadata finished after 0.018 seconds +2024-06-01T06:00:11Z USER 2399018 [sg0002/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2024-06-01T06:00:12Z INFO 2399014 [BirCodeGenLoop]: Finished (changed=False) +2024-06-01T06:00:12Z USER 2399014 [sg0000/Tensorizer/BirCodeGenLoop]: 
BirCodeGenLoop finished after 1.528 seconds +2024-06-01T06:00:12Z INFO 2399017 [BirCodeGenLoop]: Finished (changed=False) +2024-06-01T06:00:12Z USER 2399017 [sg0001/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 1.786 seconds +2024-06-01T06:00:13Z INFO 2399014 [Tensorizer]: BirCodeGen estimate #instances=2041695 in sg0000 +2024-06-01T06:00:13Z INFO 2399014 [Tensorizer]: IR signature: 412a6c9a1008d89196e6db07da4da89894f32c5cfa3df3bd990acd05973ed975 for sg0000/Tensorizer +2024-06-01T06:00:13Z INFO 2399014 [Tensorizer]: Weights total number of bytes: 98560 +2024-06-01T06:00:13Z INFO 2399018 [BirCodeGenLoop]: Finished (changed=False) +2024-06-01T06:00:13Z USER 2399018 [sg0002/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 2.218 seconds +2024-06-01T06:00:13Z INFO 2399017 [Tensorizer]: BirCodeGen estimate #instances=2834332 in sg0001 +2024-06-01T06:00:14Z INFO 2399017 [Tensorizer]: IR signature: aed3545016ebcc5e528ba107e413171e21026ff478e7ad171a7b3e7e3c420e38 for sg0001/Tensorizer +2024-06-01T06:00:14Z INFO 2399017 [Tensorizer]: Weights total number of bytes: 98304 +2024-06-01T06:00:15Z INFO 2399018 [Tensorizer]: BirCodeGen estimate #instances=2834332 in sg0002 +2024-06-01T06:00:15Z INFO 2399018 [Tensorizer]: IR signature: 3a62b97c3a38848a9481f51101ed6cf9a46be2c0b7cb39e4ad364496846bea60 for sg0002/Tensorizer +2024-06-01T06:00:15Z INFO 2399018 [Tensorizer]: Weights total number of bytes: 98304 +2024-06-01T06:00:20Z INFO 2398881 [root/Tensorizer/All]: Exit time region: delta=19.941s +2024-06-01T06:00:20Z INFO 2398881 [job.Frontend.0]: End tensorization +2024-06-01T06:00:20Z INFO 2398881 [job.Frontend.0]: Network input: input1 +2024-06-01T06:00:20Z INFO 2398881 [job.Frontend.0]: Network input: input2 +2024-06-01T06:00:20Z INFO 2398881 [job.Frontend.0]: Network input: input0 +2024-06-01T06:00:20Z INFO 2398881 [job.Frontend.0]: Network input: input69 +2024-06-01T06:00:20Z INFO 2398881 [job.Frontend.0]: Network input: input70 +2024-06-01T06:00:20Z INFO 
2398881 [job.Frontend.0]: Network input: input71 +2024-06-01T06:00:20Z INFO 2398881 [job.Frontend.0]: Network input: input72 +2024-06-01T06:00:20Z INFO 2398881 [job.Frontend.0]: Network input: input73 +2024-06-01T06:00:20Z INFO 2398881 [job.Frontend.0]: Network input: input0 +2024-06-01T06:00:20Z INFO 2398881 [job.Frontend.0]: Network input: input74 +2024-06-01T06:00:20Z INFO 2398881 [job.Frontend.0]: Network input: input75 +2024-06-01T06:00:20Z INFO 2398881 [job.Frontend.0]: Network input: input76 +2024-06-01T06:00:20Z INFO 2398881 [job.Frontend.0]: Network input: input77 +2024-06-01T06:00:20Z INFO 2398881 [job.Frontend.0]: Network input: input78 +2024-06-01T06:00:20Z INFO 2398881 [job.Frontend.0]: Network input: input79 +2024-06-01T06:00:20Z INFO 2398881 [job.Frontend.0]: Network input: input80 +2024-06-01T06:00:20Z INFO 2398881 [job.Frontend.0]: Network input: input81 +2024-06-01T06:00:20Z INFO 2398881 [job.Frontend.0]: Network input: input82 +2024-06-01T06:00:20Z INFO 2398881 [job.Frontend.0]: Network input: input83 +2024-06-01T06:00:20Z INFO 2398881 [job.Frontend.0]: Network input: input84 +2024-06-01T06:00:20Z INFO 2398881 [job.Frontend.0]: Network input: input85 +2024-06-01T06:00:20Z INFO 2398881 [job.Frontend.0]: Network input: input86 +2024-06-01T06:00:20Z INFO 2398881 [job.Frontend.0]: Network input: input87 +2024-06-01T06:00:20Z INFO 2398881 [job.Frontend.0]: Network input: input88 +2024-06-01T06:00:20Z INFO 2398881 [job.Frontend.0]: Network input: input89 +2024-06-01T06:00:20Z INFO 2398881 [job.Frontend.0]: Network input: input90 +2024-06-01T06:00:20Z INFO 2398881 [job.Frontend.0]: Network input: input91 +2024-06-01T06:00:20Z INFO 2398881 [job.Frontend.0]: Network input: input353 +2024-06-01T06:00:20Z INFO 2398881 [job.Frontend.0]: Network input: input354 +2024-06-01T06:00:20Z INFO 2398881 [job.Frontend.0]: Network input: input355 +2024-06-01T06:00:20Z INFO 2398881 [job.Frontend.0]: Network input: input356 +2024-06-01T06:00:20Z INFO 2398881 
[job.Frontend.0]: Network input: input3 +2024-06-01T06:00:20Z INFO 2398881 [job.Frontend.0]: Network input: input357 +2024-06-01T06:00:20Z INFO 2398881 [job.Frontend.0]: Network input: input358 +2024-06-01T06:00:24Z INFO 2398881 [job.Frontend.0]: wrote bir.json +2024-06-01T06:00:24Z INFO 2398881 [job.Frontend.0]: wrote tensor_map.json +2024-06-01T06:00:31Z INFO 2398881 [job.Frontend.0]: wrote bir.json +2024-06-01T06:00:31Z INFO 2398881 [job.Frontend.0]: wrote tensor_map.json +2024-06-01T06:00:37Z INFO 2398881 [job.Frontend.0]: wrote bir.json +2024-06-01T06:00:37Z INFO 2398881 [job.Frontend.0]: wrote tensor_map.json +2024-06-01T06:00:38Z INFO 2398881 [job.Frontend.0]: wrote bir.json +2024-06-01T06:00:38Z INFO 2398881 [job.Frontend.0]: wrote tensor_map.json +2024-06-01T06:00:39Z INFO 2398881 [job.Frontend.0]: Job #0 finished +2024-06-01T06:00:39Z INFO 2398881 [pipeline.Pipeline.0]: Finished job job.Frontend.0 +2024-06-01T06:00:39Z INFO 2398881 [pipeline.Pipeline.0]: Starting job job.HHChecker.0 +2024-06-01T06:00:39Z INFO 2398881 [job.HHChecker.0]: Job HHChecker len(in_states) 4 +2024-06-01T06:00:39Z INFO 2403750 [job.HHChecker.0]: Processing input #0 +2024-06-01T06:00:39Z INFO 2403758 [job.HHChecker.0]: Processing input #1 +2024-06-01T06:00:39Z INFO 2403760 [job.HHChecker.0]: Processing input #3 +2024-06-01T06:00:39Z INFO 2403759 [job.HHChecker.0]: Processing input #2 +2024-06-01T06:00:39Z INFO 2403750 [job.HHChecker.0]: Job #0 finished +2024-06-01T06:00:39Z INFO 2403760 [job.HHChecker.0]: Job #3 finished +2024-06-01T06:00:39Z INFO 2403759 [job.HHChecker.0]: Job #2 finished +2024-06-01T06:00:39Z INFO 2403758 [job.HHChecker.0]: Job #1 finished +2024-06-01T06:00:39Z INFO 2398881 [pipeline.Pipeline.0]: Finished job job.HHChecker.0 +2024-06-01T06:00:39Z INFO 2398881 [pipeline.Pipeline.0]: Starting job job.WalrusDriver.0 +2024-06-01T06:00:39Z INFO 2398881 [job.WalrusDriver.0]: BackendDriver has 4 states +2024-06-01T06:00:39Z INFO 2398881 [job.WalrusDriver.0]: 
BackendDriver MT cwd: /root/llava_mistral_0531/app/neuronxcc-42xre48h +2024-06-01T06:00:39Z INFO 2398881 [job.WalrusDriver.0]: Linking neff json file "neff.json" +2024-06-01T06:00:39Z INFO 2398881 [job.WalrusDriver.0]: Writing linked neff.json as sgLnk/bir_linker_neff.json +2024-06-01T06:00:39Z INFO 2398881 [job.WalrusDriver.0]: Writing linked kelf files +2024-06-01T06:00:39Z INFO 2398881 [job.WalrusDriver.0]: Finding clusts +2024-06-01T06:00:39Z INFO 2398881 [job.WalrusDriver.0]: Linking incoming subgraphs sg00,sg01,sg02,sg03 to kelf-0.json +2024-06-01T06:00:39Z INFO 2398881 [job.WalrusDriver.0]: Writing update "info.json" +2024-06-01T06:00:39Z INFO 2398881 [job.WalrusDriver.0]: Linker successfully updated metadata files. +2024-06-01T06:00:39Z INFO 2398881 [job.WalrusDriver.0]: StateId sg00 Dir /root/llava_mistral_0531/app/neuronxcc-42xre48h/sg00 +2024-06-01T06:00:39Z INFO 2398881 [job.WalrusDriver.0]: StateId sg01 Dir /root/llava_mistral_0531/app/neuronxcc-42xre48h/sg01 +2024-06-01T06:00:39Z INFO 2398881 [job.WalrusDriver.0]: StateId sg02 Dir /root/llava_mistral_0531/app/neuronxcc-42xre48h/sg02 +2024-06-01T06:00:39Z INFO 2398881 [job.WalrusDriver.0]: StateId sg03 Dir /root/llava_mistral_0531/app/neuronxcc-42xre48h/sg03 +2024-06-01T06:00:39Z INFO 2398881 [job.WalrusDriver.0]: Number of subgraphs to link: 4 +2024-06-01T06:00:39Z INFO 2398881 [job.BIRLinker.1]: Creating directory sgLnk/sg00 +2024-06-01T06:00:39Z INFO 2398881 [job.WalrusDriver.0]: lnkState: {"model": ["/tmp/root/neuroncc_compile_workdir/6dc0ff72-9752-4e1f-8880-eae65c0e6f3a/model.MODULE_3143bb21695f957f3b75+2c9e451d.hlo.pb"], "tensormap": "tensor_map.json", "bir": "bir.json", "lorean_sg_key": null, "input_name_map": null, "output_name_map": null, "constant_tensors": null, "state_dir": "/root/llava_mistral_0531/app/neuronxcc-42xre48h/sgLnk/sg00", "state_id": "sg00"} +2024-06-01T06:00:39Z INFO 2398881 [job.WalrusDriver.0]: BackendDriver in_state.num_states 4 +2024-06-01T06:00:39Z INFO 2398881 
[job.WalrusDriver.0]: Executing /root/anaconda3/envs/masp_fastapi/lib/python3.9/site-packages/neuronxcc/starfish/bin/walrus_driver --optlevel 2 --allocator coloring --verbose 35 --logfile-verbose 20 --logfile /root/llava_mistral_0531/app/log-neuron-cc.txt --sync-pool-dve -o walrus_bir.out.json --enable-call-graph --enable-mt-backend --link-subgraphs sg00,sg01,sg02,sg03 --link-dir sgLnk/sg00 -i bir.json --min_split_size 10240 --skip_split_vns '' --no_split_dram --split_huge_dram_tensor 1.0 --enable-SPMD-opt=true --preprocessing_only --max_tensorizer_distance 64 --pack_same_shape_only --instruction_fetch_latency 511 --max-partitions 1 --policy 3 --auxflag 0 --interleave none --schedule-delayed-latency 1 --postsched-mm-accum-reorder=false --max-load-lower-bound 0.14 --allreduce-buffer-size 500 --dram-page-size 512 --dram-rotation-size -1 --allreduce-rotation-dis 8 --repeat-load-thres 4 --save-len-thres 512 --save-dma-cnt-thres 32 --relaxed-order=true --enable-anti-dependence-reduction=false --num-semaphores-per-queue 16 --numcores 1 --act-root-json /root/anaconda3/envs/masp_fastapi/lib/python3.9/site-packages/neuronxcc/pwp/pwp_bin_trainium/act_info.json --dve-root-json /root/anaconda3/envs/masp_fastapi/lib/python3.9/site-packages/neuronxcc/dve/dve_bin/dve_info.json --enable-verifier=true --enable-birsim=false --enable-birsim-sync-only=false --enable-data-race-checker=true --enable-new-backend=true --inject-error=NONE --enable-internal-partitioner --neff-output-filename /tmp/root/neuroncc_compile_workdir/6dc0ff72-9752-4e1f-8880-eae65c0e6f3a/model.MODULE_3143bb21695f957f3b75+2c9e451d.neff +2024-06-01T06:00:39Z INFO 2398881 [job.WalrusDriver.0]: Working directory is /root/llava_mistral_0531/app/neuronxcc-42xre48h +2024-06-01T06:00:39Z INFO 2398881 [job.WalrusDriver.0]: use_logger=False +2024-06-01T06:00:39Z INFO 2403803 [BackendDriver]: max_allowed_parallelism=192 +2024-06-01T06:00:39Z INFO 2403803 [BackendDriver]: Loading module from sg00/bir.json +2024-06-01T06:00:39Z 
INFO 2403803 [BackendDriver]: Loading module from sg02/bir.json +2024-06-01T06:00:39Z INFO 2403803 [BackendDriver]: Loading module from sg03/bir.json +2024-06-01T06:00:39Z INFO 2403803 [BackendDriver]: Loading module from sg01/bir.json +2024-06-01T06:00:41Z INFO 2403803 [BackendDriver]: Backend driver mtBackend: true numModules: 4 Cwd: "/root/llava_mistral_0531/app/neuronxcc-42xre48h" +2024-06-01T06:00:41Z INFO 2403803 [BackendDriver]: Modular flow call graph is enabled +2024-06-01T06:00:41Z INFO 2403803 [BackendDriver]: Internal partitioner is enabled +2024-06-01T06:00:41Z USER 2403803 [BackendDriver]: Running mod_parallel_pass +2024-06-01T06:00:41Z INFO 2403803 [BackendDriver]: Inputs to mod_parallel_pass: modules=4 functions=4 allocs=458 blocks=4 instructions=110 Max writers: 11 Max Readers: 43 +2024-06-01T06:00:41Z USER 2403803 (sg03) [ModuleForkPass]: Running rewrite_matmult_sparse +2024-06-01T06:00:41Z USER 2403803 (sg00) [ModuleForkPass]: Running rewrite_matmult_sparse +2024-06-01T06:00:41Z USER 2403803 (sg02) [ModuleForkPass]: Running rewrite_matmult_sparse +2024-06-01T06:00:41Z USER 2403803 (sg01) [ModuleForkPass]: Running rewrite_matmult_sparse +2024-06-01T06:00:41Z INFO 2403803 (sg03) [ModuleForkPass]: Inputs to rewrite_matmult_sparse: modules=1 functions=1 allocs=61 blocks=1 instructions=30 Max writers: 4 Max Readers: 5 +2024-06-01T06:00:41Z USER 2403803 (sg03) [ModuleForkPass]: rewrite_matmult_sparse finished after 0.004 seconds +2024-06-01T06:00:41Z INFO 2403803 (sg03) [ModuleForkPass]: curr_vmrss: 1001mb, ru_maxrss: 1001mb (delta=0mb) +2024-06-01T06:00:41Z INFO 2403803 (sg01) [ModuleForkPass]: Inputs to rewrite_matmult_sparse: modules=1 functions=1 allocs=131 blocks=1 instructions=27 Max writers: 11 Max Readers: 43 +2024-06-01T06:00:41Z USER 2403803 (sg01) [ModuleForkPass]: rewrite_matmult_sparse finished after 0.009 seconds +2024-06-01T06:00:41Z INFO 2403803 (sg02) [ModuleForkPass]: Inputs to rewrite_matmult_sparse: modules=1 functions=1 allocs=131 
blocks=1 instructions=27 Max writers: 11 Max Readers: 43 +2024-06-01T06:00:41Z USER 2403803 (sg02) [ModuleForkPass]: rewrite_matmult_sparse finished after 0.012 seconds +2024-06-01T06:00:41Z INFO 2403803 (sg00) [ModuleForkPass]: Inputs to rewrite_matmult_sparse: modules=1 functions=1 allocs=135 blocks=1 instructions=26 Max writers: 11 Max Readers: 36 +2024-06-01T06:00:41Z USER 2403803 (sg00) [ModuleForkPass]: rewrite_matmult_sparse finished after 0.042 seconds +2024-06-01T06:00:41Z INFO 2403803 (sg00) [ModuleForkPass]: curr_vmrss: 917mb, ru_maxrss: 1001mb (delta=0mb) +2024-06-01T06:00:42Z INFO 2403803 (sg02) [ModuleForkPass]: curr_vmrss: 483mb, ru_maxrss: 1001mb (delta=0mb) +2024-06-01T06:00:42Z INFO 2403803 (sg01) [ModuleForkPass]: curr_vmrss: 483mb, ru_maxrss: 1001mb (delta=0mb) +2024-06-01T06:00:42Z INFO 2403803 (sg03) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 61 memory location(s), 1 block(s), and 30 instruction(s). Max writers: 4 Max Readers: 5 +2024-06-01T06:00:42Z USER 2403803 (sg03) [ModuleForkPass]: Running birverifier +2024-06-01T06:00:42Z INFO 2403803 (sg03) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=61 blocks=1 instructions=30 Max writers: 4 Max Readers: 5 +2024-06-01T06:00:42Z INFO 2403803 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 131 memory location(s), 1 block(s), and 27 instruction(s). Max writers: 11 Max Readers: 43 +2024-06-01T06:00:42Z USER 2403803 (sg01) [ModuleForkPass]: Running birverifier +2024-06-01T06:00:42Z INFO 2403803 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 131 memory location(s), 1 block(s), and 27 instruction(s). Max writers: 11 Max Readers: 43 +2024-06-01T06:00:42Z USER 2403803 (sg02) [ModuleForkPass]: Running birverifier +2024-06-01T06:00:42Z INFO 2403803 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 135 memory location(s), 1 block(s), and 26 instruction(s). 
Max writers: 11 Max Readers: 36 +2024-06-01T06:00:42Z USER 2403803 (sg00) [ModuleForkPass]: Running birverifier +2024-06-01T06:00:42Z INFO 2403803 (sg01) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=131 blocks=1 instructions=27 Max writers: 11 Max Readers: 43 +2024-06-01T06:00:42Z INFO 2403803 (sg02) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=131 blocks=1 instructions=27 Max writers: 11 Max Readers: 43 +2024-06-01T06:00:42Z INFO 2403803 (sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=135 blocks=1 instructions=26 Max writers: 11 Max Readers: 36 +2024-06-01T06:00:47Z USER 2403803 (sg00) [ModuleForkPass]: birverifier finished after 5.322 seconds +2024-06-01T06:00:47Z INFO 2403803 (sg00) [ModuleForkPass]: curr_vmrss: 2215mb, ru_maxrss: 2215mb (delta=1214mb) +2024-06-01T06:00:47Z INFO 2403803 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 135 memory location(s), 1 block(s), and 26 instruction(s). Max writers: 11 Max Readers: 36 +2024-06-01T06:00:47Z USER 2403803 (sg00) [ModuleForkPass]: Running expand_replication +2024-06-01T06:00:47Z INFO 2403803 (sg00) [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=135 blocks=1 instructions=26 Max writers: 11 Max Readers: 36 +2024-06-01T06:00:47Z INFO 2403803 (sg00) [ExpandReplication]: Found 0 replicated matmults +2024-06-01T06:00:47Z USER 2403803 (sg00) [ModuleForkPass]: expand_replication finished after 0.014 seconds +2024-06-01T06:00:47Z INFO 2403803 (sg00) [ModuleForkPass]: curr_vmrss: 2226mb, ru_maxrss: 2226mb (delta=3mb) +2024-06-01T06:00:47Z INFO 2403803 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 135 memory location(s), 1 block(s), and 26 instruction(s). 
Max writers: 11 Max Readers: 36 +2024-06-01T06:00:47Z USER 2403803 (sg00) [ModuleForkPass]: Running unroll +2024-06-01T06:00:47Z INFO 2403803 (sg00) [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=135 blocks=1 instructions=26 Max writers: 11 Max Readers: 36 +2024-06-01T06:00:47Z INFO 2403803 (sg00) [Unroll]: INFO (Unroll) Start unrolling at Sat Jun 1 06:00:47 2024 +2024-06-01T06:00:47Z INFO 2403803 (sg00) [Unroll]: INFO (Unroll) adjusting parallelfor +2024-06-01T06:00:47Z INFO 2403803 (sg00) [Unroll]: remove parallefor in axis i2_92_0_1995 +2024-06-01T06:00:47Z INFO 2403803 (sg00) [Unroll]: remove parallefor in axis i2_92_1_1995 +2024-06-01T06:00:47Z INFO 2403803 (sg00) [Unroll]: remove parallefor in axis i1_92_1_1~1 +2024-06-01T06:00:47Z INFO 2403803 (sg00) [Unroll]: remove parallefor in axis i2_92_0~1 +2024-06-01T06:00:47Z INFO 2403803 (sg00) [Unroll]: remove parallefor in axis i2_92_1~1 +2024-06-01T06:00:48Z USER 2403803 (sg03) [ModuleForkPass]: birverifier finished after 6.756 seconds +2024-06-01T06:00:48Z INFO 2403803 (sg03) [ModuleForkPass]: curr_vmrss: 3296mb, ru_maxrss: 3296mb (delta=2295mb) +2024-06-01T06:00:48Z INFO 2403803 (sg03) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 61 memory location(s), 1 block(s), and 30 instruction(s). 
Max writers: 4 Max Readers: 5 +2024-06-01T06:00:48Z USER 2403803 (sg03) [ModuleForkPass]: Running expand_replication +2024-06-01T06:00:48Z INFO 2403803 (sg03) [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=61 blocks=1 instructions=30 Max writers: 4 Max Readers: 5 +2024-06-01T06:00:48Z INFO 2403803 (sg03) [ExpandReplication]: Found 0 replicated matmults +2024-06-01T06:00:48Z USER 2403803 (sg03) [ModuleForkPass]: expand_replication finished after 0.003 seconds +2024-06-01T06:00:48Z INFO 2403803 (sg03) [ModuleForkPass]: curr_vmrss: 3306mb, ru_maxrss: 3306mb (delta=3mb) +2024-06-01T06:00:48Z INFO 2403803 (sg03) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 61 memory location(s), 1 block(s), and 30 instruction(s). Max writers: 4 Max Readers: 5 +2024-06-01T06:00:48Z USER 2403803 (sg03) [ModuleForkPass]: Running unroll +2024-06-01T06:00:48Z INFO 2403803 (sg03) [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=61 blocks=1 instructions=30 Max writers: 4 Max Readers: 5 +2024-06-01T06:00:48Z INFO 2403803 (sg03) [Unroll]: INFO (Unroll) Start unrolling at Sat Jun 1 06:00:48 2024 +2024-06-01T06:00:48Z INFO 2403803 (sg03) [Unroll]: INFO (Unroll) adjusting parallelfor +2024-06-01T06:00:50Z USER 2403803 (sg02) [ModuleForkPass]: birverifier finished after 8.663 seconds +2024-06-01T06:00:50Z INFO 2403803 (sg02) [ModuleForkPass]: curr_vmrss: 5977mb, ru_maxrss: 5977mb (delta=4976mb) +2024-06-01T06:00:50Z INFO 2403803 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 131 memory location(s), 1 block(s), and 27 instruction(s). 
Max writers: 11 Max Readers: 43 +2024-06-01T06:00:50Z USER 2403803 (sg02) [ModuleForkPass]: Running expand_replication +2024-06-01T06:00:50Z INFO 2403803 (sg02) [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=131 blocks=1 instructions=27 Max writers: 11 Max Readers: 43 +2024-06-01T06:00:50Z INFO 2403803 (sg02) [ExpandReplication]: Found 0 replicated matmults +2024-06-01T06:00:50Z USER 2403803 (sg02) [ModuleForkPass]: expand_replication finished after 0.033 seconds +2024-06-01T06:00:50Z INFO 2403803 (sg02) [ModuleForkPass]: curr_vmrss: 6065mb, ru_maxrss: 6065mb (delta=42mb) +2024-06-01T06:00:50Z INFO 2403803 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 131 memory location(s), 1 block(s), and 27 instruction(s). Max writers: 11 Max Readers: 43 +2024-06-01T06:00:50Z USER 2403803 (sg02) [ModuleForkPass]: Running unroll +2024-06-01T06:00:50Z INFO 2403803 (sg02) [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=131 blocks=1 instructions=27 Max writers: 11 Max Readers: 43 +2024-06-01T06:00:50Z INFO 2403803 (sg02) [Unroll]: INFO (Unroll) Start unrolling at Sat Jun 1 06:00:50 2024 +2024-06-01T06:00:51Z INFO 2403803 (sg02) [Unroll]: INFO (Unroll) adjusting parallelfor +2024-06-01T06:00:51Z INFO 2403803 (sg02) [Unroll]: remove parallefor in axis i2_86_586_0_0 +2024-06-01T06:00:51Z INFO 2403803 (sg02) [Unroll]: remove parallefor in axis i2_86_586_0_1_0 +2024-06-01T06:00:51Z INFO 2403803 (sg02) [Unroll]: remove parallefor in axis i2_86_586_0_1_1 +2024-06-01T06:00:51Z INFO 2403803 (sg02) [Unroll]: remove parallefor in axis i0_88_0_1_1_2302 +2024-06-01T06:00:51Z INFO 2403803 (sg02) [Unroll]: remove parallefor in axis i2_92_0_2305 +2024-06-01T06:00:51Z INFO 2403803 (sg02) [Unroll]: remove parallefor in axis i2_92_1_2305 +2024-06-01T06:00:51Z INFO 2403803 (sg02) [Unroll]: remove parallefor in axis i1_92_1_1~1 +2024-06-01T06:00:51Z INFO 2403803 (sg02) [Unroll]: remove parallefor in axis i2_92_0~1 +2024-06-01T06:00:51Z 
INFO 2403803 (sg02) [Unroll]: remove parallefor in axis i2_92_1~1 +2024-06-01T06:00:51Z USER 2403803 (sg01) [ModuleForkPass]: birverifier finished after 9.648 seconds +2024-06-01T06:00:51Z INFO 2403803 (sg01) [ModuleForkPass]: curr_vmrss: 6992mb, ru_maxrss: 6992mb (delta=5991mb) +2024-06-01T06:00:51Z INFO 2403803 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 131 memory location(s), 1 block(s), and 27 instruction(s). Max writers: 11 Max Readers: 43 +2024-06-01T06:00:51Z USER 2403803 (sg01) [ModuleForkPass]: Running expand_replication +2024-06-01T06:00:51Z INFO 2403803 (sg01) [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=131 blocks=1 instructions=27 Max writers: 11 Max Readers: 43 +2024-06-01T06:00:51Z INFO 2403803 (sg01) [ExpandReplication]: Found 0 replicated matmults +2024-06-01T06:00:51Z USER 2403803 (sg01) [ModuleForkPass]: expand_replication finished after 0.064 seconds +2024-06-01T06:00:51Z INFO 2403803 (sg01) [ModuleForkPass]: curr_vmrss: 7041mb, ru_maxrss: 7041mb (delta=16mb) +2024-06-01T06:00:51Z INFO 2403803 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 131 memory location(s), 1 block(s), and 27 instruction(s). 
Max writers: 11 Max Readers: 43 +2024-06-01T06:00:51Z USER 2403803 (sg01) [ModuleForkPass]: Running unroll +2024-06-01T06:00:52Z INFO 2403803 (sg01) [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=131 blocks=1 instructions=27 Max writers: 11 Max Readers: 43 +2024-06-01T06:00:52Z INFO 2403803 (sg01) [Unroll]: INFO (Unroll) Start unrolling at Sat Jun 1 06:00:52 2024 +2024-06-01T06:00:52Z INFO 2403803 (sg01) [Unroll]: INFO (Unroll) adjusting parallelfor +2024-06-01T06:00:52Z INFO 2403803 (sg01) [Unroll]: remove parallefor in axis i2_86_586_0_0 +2024-06-01T06:00:52Z INFO 2403803 (sg01) [Unroll]: remove parallefor in axis i2_86_586_0_1_0 +2024-06-01T06:00:52Z INFO 2403803 (sg01) [Unroll]: remove parallefor in axis i2_86_586_0_1_1 +2024-06-01T06:00:52Z INFO 2403803 (sg01) [Unroll]: remove parallefor in axis i0_88_0_1_1_2302 +2024-06-01T06:00:52Z INFO 2403803 (sg01) [Unroll]: remove parallefor in axis i2_92_0_2305 +2024-06-01T06:00:52Z INFO 2403803 (sg01) [Unroll]: remove parallefor in axis i2_92_1_2305 +2024-06-01T06:00:52Z INFO 2403803 (sg01) [Unroll]: remove parallefor in axis i1_92_1_1~1 +2024-06-01T06:00:52Z INFO 2403803 (sg01) [Unroll]: remove parallefor in axis i2_92_0~1 +2024-06-01T06:00:52Z INFO 2403803 (sg01) [Unroll]: remove parallefor in axis i2_92_1~1 +2024-06-01T06:00:54Z INFO 2403803 (sg00) [Unroll]: INFO (Unroll) DONE unrolling Sat Jun 1 06:00:47 2024 + +2024-06-01T06:00:55Z INFO 2403803 (sg00) [Unroll]: sg0000 Instruction count after Unroll: +2024-06-01T06:00:55Z INFO 2403803 (sg00) [Unroll]: Total count: 341145 +2024-06-01T06:00:55Z INFO 2403803 (sg00) [Unroll]: Matmult: 173945 +2024-06-01T06:00:55Z INFO 2403803 (sg00) [Unroll]: GenericCopy: 48652 +2024-06-01T06:00:55Z INFO 2403803 (sg00) [Unroll]: TensorReduce: 27648 +2024-06-01T06:00:55Z INFO 2403803 (sg00) [Unroll]: Memset: 24581 +2024-06-01T06:00:55Z INFO 2403803 (sg00) [Unroll]: TensorScalarPtr: 16327 +2024-06-01T06:00:55Z INFO 2403803 (sg00) [Unroll]: Activation: 14170 
+2024-06-01T06:00:55Z INFO 2403803 (sg00) [Unroll]: Select: 11520 +2024-06-01T06:00:55Z INFO 2403803 (sg00) [Unroll]: TensorTensor: 7410 +2024-06-01T06:00:55Z INFO 2403803 (sg00) [Unroll]: Load: 7279 +2024-06-01T06:00:55Z INFO 2403803 (sg00) [Unroll]: Save: 7184 +2024-06-01T06:00:55Z INFO 2403803 (sg00) [Unroll]: Reciprocal: 2304 +2024-06-01T06:00:55Z INFO 2403803 (sg00) [Unroll]: StreamShuffle: 84 +2024-06-01T06:00:55Z INFO 2403803 (sg00) [Unroll]: Iota: 36 +2024-06-01T06:00:55Z INFO 2403803 (sg00) [Unroll]: DMACopy: 3 +2024-06-01T06:00:55Z INFO 2403803 (sg00) [Unroll]: CollectiveCompute: 2 +2024-06-01T06:00:55Z USER 2403803 (sg00) [ModuleForkPass]: unroll finished after 7.585 seconds +2024-06-01T06:00:55Z INFO 2403803 (sg00) [ModuleForkPass]: curr_vmrss: 9294mb, ru_maxrss: 9294mb (delta=7068mb) +2024-06-01T06:00:55Z INFO 2403803 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 141957 memory location(s), 1 block(s), and 332441 instruction(s). Max writers: 5184 Max Readers: 93924 +2024-06-01T06:00:55Z USER 2403803 (sg00) [ModuleForkPass]: Running psum_legalization +2024-06-01T06:00:55Z INFO 2403803 (sg00) [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=141957 blocks=1 instructions=332441 Max writers: 5184 Max Readers: 93924 +2024-06-01T06:00:55Z USER 2403803 (sg00) [ModuleForkPass]: psum_legalization finished after 0.526 seconds +2024-06-01T06:00:55Z INFO 2403803 (sg00) [ModuleForkPass]: curr_vmrss: 9892mb, ru_maxrss: 9892mb (delta=508mb) +2024-06-01T06:00:55Z INFO 2403803 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 141957 memory location(s), 1 block(s), and 332441 instruction(s). 
Max writers: 5184 Max Readers: 93924 +2024-06-01T06:00:55Z USER 2403803 (sg00) [ModuleForkPass]: Running pre_opts +2024-06-01T06:00:55Z INFO 2403803 (sg00) [ModuleForkPass]: Inputs to pre_opts: modules=1 functions=1 allocs=141957 blocks=1 instructions=332441 Max writers: 5184 Max Readers: 93924 +2024-06-01T06:00:55Z INFO 2403803 (sg00) [PreOpts]: ModuleDef 0 #intermediates 66 +2024-06-01T06:00:55Z INFO 2403803 (sg00) [PreOpts]: NetlistInfo #InstanceInputs 8 InstanceOutputs 2 +2024-06-01T06:00:55Z USER 2403803 (sg00) [ModuleForkPass]: pre_opts finished after 0.097 seconds +2024-06-01T06:00:55Z INFO 2403803 (sg00) [ModuleForkPass]: curr_vmrss: 10097mb, ru_maxrss: 10097mb (delta=111mb) +2024-06-01T06:00:55Z INFO 2403803 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 141957 memory location(s), 1 block(s), and 332441 instruction(s). Max writers: 5184 Max Readers: 93924 +2024-06-01T06:00:55Z USER 2403803 (sg00) [ModuleForkPass]: Running error_injector +2024-06-01T06:00:56Z INFO 2403803 (sg00) [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=141957 blocks=1 instructions=332441 Max writers: 5184 Max Readers: 93924 +2024-06-01T06:00:56Z WARNING 2403803 (sg00) [ErrorInjector]: Unrecognized injected error value "0" +2024-06-01T06:00:56Z USER 2403803 (sg00) [ModuleForkPass]: error_injector finished after 0.154 seconds +2024-06-01T06:00:56Z INFO 2403803 (sg00) [ModuleForkPass]: curr_vmrss: 10375mb, ru_maxrss: 10375mb (delta=130mb) +2024-06-01T06:00:56Z INFO 2403803 (sg03) [Unroll]: INFO (Unroll) DONE unrolling Sat Jun 1 06:00:48 2024 + +2024-06-01T06:00:56Z INFO 2403803 (sg03) [Unroll]: sg0003 Instruction count after Unroll: +2024-06-01T06:00:56Z INFO 2403803 (sg03) [Unroll]: Total count: 353020 +2024-06-01T06:00:56Z INFO 2403803 (sg03) [Unroll]: Matmult: 293456 +2024-06-01T06:00:56Z INFO 2403803 (sg03) [Unroll]: GenericCopy: 27328 +2024-06-01T06:00:56Z INFO 2403803 (sg03) [Unroll]: Load: 16163 +2024-06-01T06:00:56Z INFO 2403803 (sg03) 
[Unroll]: TensorTensor: 7325 +2024-06-01T06:00:56Z INFO 2403803 (sg03) [Unroll]: Activation: 4882 +2024-06-01T06:00:56Z INFO 2403803 (sg03) [Unroll]: TensorScalarPtr: 2656 +2024-06-01T06:00:56Z INFO 2403803 (sg03) [Unroll]: Save: 639 +2024-06-01T06:00:56Z INFO 2403803 (sg03) [Unroll]: GenericIndirectLoad: 512 +2024-06-01T06:00:56Z INFO 2403803 (sg03) [Unroll]: Iota: 33 +2024-06-01T06:00:56Z INFO 2403803 (sg03) [Unroll]: Memset: 12 +2024-06-01T06:00:56Z INFO 2403803 (sg03) [Unroll]: StreamShuffle: 8 +2024-06-01T06:00:56Z INFO 2403803 (sg03) [Unroll]: DMACopy: 4 +2024-06-01T06:00:56Z INFO 2403803 (sg03) [Unroll]: CollectiveCompute: 2 +2024-06-01T06:00:56Z USER 2403803 (sg03) [ModuleForkPass]: unroll finished after 7.369 seconds +2024-06-01T06:00:56Z INFO 2403803 (sg03) [ModuleForkPass]: curr_vmrss: 10463mb, ru_maxrss: 10463mb (delta=7155mb) +2024-06-01T06:00:56Z INFO 2403803 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 141957 memory location(s), 1 block(s), and 332441 instruction(s). Max writers: 5184 Max Readers: 93924 +2024-06-01T06:00:56Z USER 2403803 (sg00) [ModuleForkPass]: Running constant_propagate +2024-06-01T06:00:56Z INFO 2403803 (sg03) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 66596 memory location(s), 1 block(s), and 353547 instruction(s). 
Max writers: 512 Max Readers: 80896 +2024-06-01T06:00:56Z USER 2403803 (sg03) [ModuleForkPass]: Running psum_legalization +2024-06-01T06:00:56Z INFO 2403803 (sg03) [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=66596 blocks=1 instructions=353547 Max writers: 512 Max Readers: 80896 +2024-06-01T06:00:56Z INFO 2403803 (sg00) [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=141957 blocks=1 instructions=332441 Max writers: 5184 Max Readers: 93924 +2024-06-01T06:00:56Z USER 2403803 (sg03) [ModuleForkPass]: psum_legalization finished after 0.360 seconds +2024-06-01T06:00:56Z INFO 2403803 (sg03) [ModuleForkPass]: curr_vmrss: 10818mb, ru_maxrss: 10818mb (delta=323mb) +2024-06-01T06:00:56Z INFO 2403803 (sg03) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 66596 memory location(s), 1 block(s), and 353547 instruction(s). Max writers: 512 Max Readers: 80896 +2024-06-01T06:00:56Z USER 2403803 (sg03) [ModuleForkPass]: Running pre_opts +2024-06-01T06:00:56Z INFO 2403803 (sg03) [ModuleForkPass]: Inputs to pre_opts: modules=1 functions=1 allocs=66596 blocks=1 instructions=353547 Max writers: 512 Max Readers: 80896 +2024-06-01T06:00:56Z INFO 2403803 (sg03) [PreOpts]: ModuleDef 3 #intermediates 66 +2024-06-01T06:00:56Z INFO 2403803 (sg03) [PreOpts]: NetlistInfo #InstanceInputs 7 InstanceOutputs 1 +2024-06-01T06:00:56Z USER 2403803 (sg03) [ModuleForkPass]: pre_opts finished after 0.012 seconds +2024-06-01T06:00:56Z INFO 2403803 (sg03) [ModuleForkPass]: curr_vmrss: 10855mb, ru_maxrss: 10855mb (delta=12mb) +2024-06-01T06:00:56Z INFO 2403803 (sg03) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 66596 memory location(s), 1 block(s), and 353547 instruction(s). 
Max writers: 512 Max Readers: 80896 +2024-06-01T06:00:56Z USER 2403803 (sg03) [ModuleForkPass]: Running error_injector +2024-06-01T06:00:56Z INFO 2403803 (sg03) [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=66596 blocks=1 instructions=353547 Max writers: 512 Max Readers: 80896 +2024-06-01T06:00:56Z WARNING 2403803 (sg03) [ErrorInjector]: Unrecognized injected error value "0" +2024-06-01T06:00:56Z USER 2403803 (sg03) [ModuleForkPass]: error_injector finished after 0.010 seconds +2024-06-01T06:00:56Z INFO 2403803 (sg03) [ModuleForkPass]: curr_vmrss: 10888mb, ru_maxrss: 10888mb (delta=12mb) +2024-06-01T06:00:56Z INFO 2403803 (sg03) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 66596 memory location(s), 1 block(s), and 353547 instruction(s). Max writers: 512 Max Readers: 80896 +2024-06-01T06:00:56Z USER 2403803 (sg03) [ModuleForkPass]: Running constant_propagate +2024-06-01T06:00:56Z INFO 2403803 (sg03) [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=66596 blocks=1 instructions=353547 Max writers: 512 Max Readers: 80896 +2024-06-01T06:00:56Z INFO 2403803 (sg03) [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2024-06-01T06:00:56Z INFO 2403803 (sg00) [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2024-06-01T06:00:57Z INFO 2403803 (sg03) [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2024-06-01T06:00:57Z USER 2403803 (sg03) [ModuleForkPass]: constant_propagate finished after 0.630 seconds +2024-06-01T06:00:57Z INFO 2403803 (sg03) [ModuleForkPass]: curr_vmrss: 11583mb, ru_maxrss: 11583mb (delta=674mb) +2024-06-01T06:00:57Z INFO 2403803 (sg03) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 66596 memory location(s), 1 block(s), and 353547 instruction(s). 
Max writers: 512 Max Readers: 80896 +2024-06-01T06:00:57Z USER 2403803 (sg03) [ModuleForkPass]: Running vn_splitter +2024-06-01T06:00:57Z INFO 2403803 (sg03) [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=66596 blocks=1 instructions=353547 Max writers: 512 Max Readers: 80896 +2024-06-01T06:00:57Z INFO 2403803 (sg03) [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 2 +2024-06-01T06:00:57Z INFO 2403803 (sg03) [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2024-06-01T06:00:57Z INFO 2403803 (sg00) [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2024-06-01T06:00:57Z INFO 2403803 (sg03) [ShrinkDN]: INFO (ShrinkDN): Shrunk 32 nodes. Total savings 4032 bytes/partition +2024-06-01T06:00:57Z INFO 2403803 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2024-06-01T06:00:57Z INFO 2403803 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2024-06-01T06:00:57Z INFO 2403803 (sg03) [VNSplitterPass]: INFO (VNSplitter) Time: 0.02 seconds +2024-06-01T06:00:57Z INFO 2403803 (sg03) [VNSplitterPass]: INFO (VerticalFusion) Time: 0.422 seconds +2024-06-01T06:00:57Z INFO 2403803 (sg03) [VNSplitterPass]: INFO (ShrinkDN) Time: 0.111 seconds +2024-06-01T06:00:57Z USER 2403803 (sg03) [ModuleForkPass]: vn_splitter finished after 0.636 seconds +2024-06-01T06:00:57Z INFO 2403803 (sg03) [ModuleForkPass]: curr_vmrss: 12280mb, ru_maxrss: 12280mb (delta=685mb) +2024-06-01T06:00:57Z INFO 2403803 (sg03) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 66596 memory location(s), 1 block(s), and 353547 instruction(s). 
Max writers: 512 Max Readers: 80896 +2024-06-01T06:00:57Z USER 2403803 (sg03) [ModuleForkPass]: Running lower_ac +2024-06-01T06:00:57Z INFO 2403803 (sg03) [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=66596 blocks=1 instructions=353547 Max writers: 512 Max Readers: 80896 +2024-06-01T06:00:58Z INFO 2403803 (sg03) [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2024-06-01T06:00:58Z USER 2403803 (sg03) [ModuleForkPass]: lower_ac finished after 0.052 seconds +2024-06-01T06:00:58Z INFO 2403803 (sg03) [ModuleForkPass]: curr_vmrss: 12363mb, ru_maxrss: 12363mb (delta=68mb) +2024-06-01T06:00:58Z INFO 2403803 (sg03) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 66596 memory location(s), 1 block(s), and 353547 instruction(s). Max writers: 512 Max Readers: 80896 +2024-06-01T06:00:58Z USER 2403803 (sg03) [ModuleForkPass]: Running input_dma_coalescing +2024-06-01T06:00:58Z INFO 2403803 (sg03) [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=66596 blocks=1 instructions=353547 Max writers: 512 Max Readers: 80896 +2024-06-01T06:00:58Z INFO 2403803 (sg03) [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2024-06-01T06:00:58Z USER 2403803 (sg03) [ModuleForkPass]: input_dma_coalescing finished after 0.175 seconds +2024-06-01T06:00:58Z INFO 2403803 (sg03) [ModuleForkPass]: curr_vmrss: 12571mb, ru_maxrss: 12571mb (delta=195mb) +2024-06-01T06:00:58Z INFO 2403803 (sg03) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 66596 memory location(s), 1 block(s), and 353547 instruction(s). Max writers: 512 Max Readers: 80896 +2024-06-01T06:00:58Z USER 2403803 (sg03) [ModuleForkPass]: Running early_peephole_opts +2024-06-01T06:00:58Z INFO 2403803 (sg03) [ModuleForkPass]: Inputs to early_peephole_opts: modules=1 functions=1 allocs=66596 blocks=1 instructions=353547 Max writers: 512 Max Readers: 80896 +2024-06-01T06:00:58Z INFO 2403803 (sg03) [EarlyPeepholeOpts]: PeepholeOpts enabled? 
ActivationAccumulate: true +2024-06-01T06:00:58Z INFO 2403803 (sg03) [EarlyPeepholeOpts]: Activation Accumulate: 0 +2024-06-01T06:00:58Z USER 2403803 (sg03) [ModuleForkPass]: early_peephole_opts finished after 0.050 seconds +2024-06-01T06:00:58Z INFO 2403803 (sg03) [ModuleForkPass]: curr_vmrss: 12638mb, ru_maxrss: 12637mb (delta=53mb) +2024-06-01T06:00:58Z INFO 2403803 (sg03) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 66596 memory location(s), 1 block(s), and 353547 instruction(s). Max writers: 512 Max Readers: 80896 +2024-06-01T06:00:58Z USER 2403803 (sg03) [ModuleForkPass]: Running pre_sched +2024-06-01T06:00:58Z INFO 2403803 (sg03) [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=66596 blocks=1 instructions=353547 Max writers: 512 Max Readers: 80896 +2024-06-01T06:00:58Z INFO 2403803 (sg03) [PreSched]: Start PRE scheduling 2 cores: 1 at: Sat Jun 1 06:00:58 2024 +2024-06-01T06:00:58Z INFO 2403803 [LayerSpiller]: LayerSpill: Start... +2024-06-01T06:00:58Z INFO 2403803 [LayerSpiller]: LayerSpill: Found 2 Splits CCs +2024-06-01T06:00:58Z INFO 2403803 [LayerSpiller]: Grouped CCs to 1 clusters. +2024-06-01T06:00:58Z INFO 2403803 [LayerSpiller]: LayerSpill: To Spill 0 multi-layer tensors +2024-06-01T06:00:58Z INFO 2403803 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2024-06-01T06:00:58Z INFO 2403803 [LayerSpiller]: LayerSpill: Done. 
+2024-06-01T06:00:58Z INFO 2403803 (sg03) [PreSched]: Start split live ranges Sat Jun 1 06:00:58 2024 +2024-06-01T06:00:58Z INFO 2403803 (sg03) [PreSched]: Num_Splits: 0 +2024-06-01T06:00:58Z INFO 2403803 (sg03) [PreSched]: End split live ranges Sat Jun 1 06:00:58 2024 +2024-06-01T06:00:58Z INFO 2403803 (sg03) [PreSched]: Strt remove redundncies Sat Jun 1 06:00:58 2024 +2024-06-01T06:00:58Z INFO 2403803 (sg03) [PreSched]: remove_redundant_memsets +2024-06-01T06:00:58Z USER 2403803 (sg00) [ModuleForkPass]: constant_propagate finished after 2.403 seconds +2024-06-01T06:00:58Z INFO 2403803 (sg00) [ModuleForkPass]: curr_vmrss: 12829mb, ru_maxrss: 12829mb (delta=2343mb) +2024-06-01T06:00:58Z INFO 2403803 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 141957 memory location(s), 1 block(s), and 332441 instruction(s). Max writers: 5184 Max Readers: 93924 +2024-06-01T06:00:58Z USER 2403803 (sg00) [ModuleForkPass]: Running vn_splitter +2024-06-01T06:00:58Z INFO 2403803 (sg03) [PreSched]: remove_redundant_memsets: 0 +2024-06-01T06:00:58Z INFO 2403803 (sg00) [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=141957 blocks=1 instructions=332441 Max writers: 5184 Max Readers: 93924 +2024-06-01T06:00:58Z INFO 2403803 (sg03) [PreSched]: remove_redundant_loads +2024-06-01T06:00:58Z INFO 2403803 (sg03) [PreSched]: remove_redundant_loads: 0 +2024-06-01T06:00:58Z INFO 2403803 (sg03) [PreSched]: End remove redundncies Sat Jun 1 06:00:58 2024 +2024-06-01T06:00:58Z INFO 2403803 (sg03) [PreSched]: Start DCE Sat Jun 1 06:00:58 2024 +2024-06-01T06:00:58Z INFO 2403803 (sg00) [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 800 +2024-06-01T06:00:58Z INFO 2403803 (sg00) [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2024-06-01T06:00:59Z INFO 2403803 (sg03) [PreSched]: End DCE Sat Jun 1 06:00:59 2024 +2024-06-01T06:00:59Z INFO 2403803 (sg03) [PreSched]: Start build flow dependencies Sat Jun 1 
06:00:59 2024 +2024-06-01T06:00:59Z INFO 2403803 (sg03) [build_flow_deps]: Start build fdeps. Invocation: 1Sat Jun 1 06:00:59 2024 +2024-06-01T06:00:59Z INFO 2403803 (sg03) [build_flow_deps]: Allocs: 66596 instructions: 353547 +2024-06-01T06:00:59Z INFO 2403803 (sg00) [ShrinkDN]: INFO (ShrinkDN): Shrunk 1058 nodes. Total savings 574520 bytes/partition +2024-06-01T06:01:00Z INFO 2403803 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2024-06-01T06:01:00Z INFO 2403803 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2024-06-01T06:01:00Z INFO 2403803 (sg00) [VNSplitterPass]: INFO (VNSplitter) Time: 0.152 seconds +2024-06-01T06:01:00Z INFO 2403803 (sg00) [VNSplitterPass]: INFO (VerticalFusion) Time: 0.583 seconds +2024-06-01T06:01:00Z INFO 2403803 (sg00) [VNSplitterPass]: INFO (ShrinkDN) Time: 0.46 seconds +2024-06-01T06:01:00Z USER 2403803 (sg00) [ModuleForkPass]: vn_splitter finished after 1.994 seconds +2024-06-01T06:01:00Z INFO 2403803 (sg00) [ModuleForkPass]: curr_vmrss: 12920mb, ru_maxrss: 12920mb (delta=73mb) +2024-06-01T06:01:01Z INFO 2403803 (sg03) [build_flow_deps]: Build fdeps inserted 1121669 edges +2024-06-01T06:01:01Z INFO 2403803 (sg03) [build_flow_deps]: Done build fdeps 1121669 Sat Jun 1 06:01:01 2024 +2024-06-01T06:01:01Z INFO 2403803 (sg03) [PreSched]: End build flow dependencies Sat Jun 1 06:01:01 2024 +2024-06-01T06:01:01Z INFO 2403803 (sg03) [PreSched]: Start remove useless insts Sat Jun 1 06:01:01 2024 +2024-06-01T06:01:01Z INFO 2403803 (sg03) [PreSched]: remove_useless_insts +2024-06-01T06:01:01Z INFO 2403803 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 141957 memory location(s), 1 block(s), and 332441 instruction(s). 
Max writers: 5184 Max Readers: 93924 +2024-06-01T06:01:01Z USER 2403803 (sg00) [ModuleForkPass]: Running lower_ac +2024-06-01T06:01:02Z INFO 2403803 (sg00) [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=141957 blocks=1 instructions=332441 Max writers: 5184 Max Readers: 93924 +2024-06-01T06:01:02Z INFO 2403803 (sg00) [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2024-06-01T06:01:02Z USER 2403803 (sg00) [ModuleForkPass]: lower_ac finished after 0.636 seconds +2024-06-01T06:01:02Z INFO 2403803 (sg00) [ModuleForkPass]: curr_vmrss: 11742mb, ru_maxrss: 12920mb (delta=0mb) +2024-06-01T06:01:02Z INFO 2403803 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 141957 memory location(s), 1 block(s), and 332441 instruction(s). Max writers: 5184 Max Readers: 93924 +2024-06-01T06:01:02Z USER 2403803 (sg00) [ModuleForkPass]: Running input_dma_coalescing +2024-06-01T06:01:03Z INFO 2403803 (sg00) [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=141957 blocks=1 instructions=332441 Max writers: 5184 Max Readers: 93924 +2024-06-01T06:01:04Z INFO 2403803 (sg00) [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2024-06-01T06:01:04Z USER 2403803 (sg00) [ModuleForkPass]: input_dma_coalescing finished after 1.379 seconds +2024-06-01T06:01:04Z INFO 2403803 (sg00) [ModuleForkPass]: curr_vmrss: 11503mb, ru_maxrss: 12920mb (delta=0mb) +2024-06-01T06:01:04Z INFO 2403803 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 141957 memory location(s), 1 block(s), and 332441 instruction(s). Max writers: 5184 Max Readers: 93924 +2024-06-01T06:01:04Z USER 2403803 (sg00) [ModuleForkPass]: Running early_peephole_opts +2024-06-01T06:01:04Z INFO 2403803 (sg00) [ModuleForkPass]: Inputs to early_peephole_opts: modules=1 functions=1 allocs=141957 blocks=1 instructions=332441 Max writers: 5184 Max Readers: 93924 +2024-06-01T06:01:04Z INFO 2403803 (sg00) [EarlyPeepholeOpts]: PeepholeOpts enabled? 
ActivationAccumulate: true +2024-06-01T06:01:05Z INFO 2403803 (sg02) [Unroll]: INFO (Unroll) DONE unrolling Sat Jun 1 06:00:50 2024 + +2024-06-01T06:01:05Z INFO 2403803 (sg02) [Unroll]: sg0002 Instruction count after Unroll: +2024-06-01T06:01:05Z INFO 2403803 (sg02) [Unroll]: Total count: 701966 +2024-06-01T06:01:05Z INFO 2403803 (sg02) [Unroll]: Matmult: 474564 +2024-06-01T06:01:05Z INFO 2403803 (sg02) [Unroll]: GenericCopy: 76522 +2024-06-01T06:01:05Z INFO 2403803 (sg02) [Unroll]: Load: 28422 +2024-06-01T06:01:05Z INFO 2403803 (sg02) [Unroll]: TensorReduce: 27648 +2024-06-01T06:01:05Z INFO 2403803 (sg02) [Unroll]: Memset: 24581 +2024-06-01T06:01:05Z INFO 2403803 (sg02) [Unroll]: Activation: 19040 +2024-06-01T06:01:05Z INFO 2403803 (sg02) [Unroll]: TensorScalarPtr: 17280 +2024-06-01T06:01:05Z INFO 2403803 (sg02) [Unroll]: TensorTensor: 16960 +2024-06-01T06:01:05Z INFO 2403803 (sg02) [Unroll]: Select: 11520 +2024-06-01T06:01:05Z INFO 2403803 (sg02) [Unroll]: Save: 3116 +2024-06-01T06:01:05Z INFO 2403803 (sg02) [Unroll]: Reciprocal: 2304 +2024-06-01T06:01:05Z INFO 2403803 (sg02) [Unroll]: DMACopy: 5 +2024-06-01T06:01:05Z INFO 2403803 (sg02) [Unroll]: CollectiveCompute: 4 +2024-06-01T06:01:05Z USER 2403803 (sg02) [ModuleForkPass]: unroll finished after 15.092 seconds +2024-06-01T06:01:05Z INFO 2403803 (sg02) [ModuleForkPass]: curr_vmrss: 11503mb, ru_maxrss: 12920mb (delta=6807mb) +2024-06-01T06:01:06Z INFO 2403803 (sg00) [EarlyPeepholeOpts]: Activation Accumulate: 13824 +2024-06-01T06:01:06Z USER 2403803 (sg00) [ModuleForkPass]: early_peephole_opts finished after 1.801 seconds +2024-06-01T06:01:06Z INFO 2403803 (sg00) [ModuleForkPass]: curr_vmrss: 11503mb, ru_maxrss: 12920mb (delta=0mb) +2024-06-01T06:01:06Z INFO 2403803 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 214329 memory location(s), 1 block(s), and 693262 instruction(s). 
Max writers: 576 Max Readers: 184036 +2024-06-01T06:01:06Z USER 2403803 (sg02) [ModuleForkPass]: Running psum_legalization +2024-06-01T06:01:06Z INFO 2403803 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 139781 memory location(s), 1 block(s), and 320921 instruction(s). Max writers: 5184 Max Readers: 93924 +2024-06-01T06:01:06Z USER 2403803 (sg00) [ModuleForkPass]: Running pre_sched +2024-06-01T06:01:06Z INFO 2403803 (sg03) [PreSched]: remove Useless Instructions: 0 +2024-06-01T06:01:06Z INFO 2403803 (sg03) [PreSched]: End remove useless insts Sat Jun 1 06:01:06 2024 +2024-06-01T06:01:06Z INFO 2403803 (sg00) [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=139781 blocks=1 instructions=320921 Max writers: 5184 Max Readers: 93924 +2024-06-01T06:01:06Z INFO 2403803 (sg00) [PreSched]: Start PRE scheduling 2 cores: 1 at: Sat Jun 1 06:01:06 2024 +2024-06-01T06:01:06Z INFO 2403803 [LayerSpiller]: LayerSpill: Start... +2024-06-01T06:01:06Z INFO 2403803 (sg02) [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=214329 blocks=1 instructions=693262 Max writers: 576 Max Readers: 184036 +2024-06-01T06:01:06Z USER 2403803 (sg02) [ModuleForkPass]: psum_legalization finished after 0.487 seconds +2024-06-01T06:01:06Z INFO 2403803 (sg02) [ModuleForkPass]: curr_vmrss: 11503mb, ru_maxrss: 12920mb (delta=0mb) +2024-06-01T06:01:06Z INFO 2403803 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 214329 memory location(s), 1 block(s), and 693262 instruction(s). 
Max writers: 576 Max Readers: 184036 +2024-06-01T06:01:06Z USER 2403803 (sg02) [ModuleForkPass]: Running pre_opts +2024-06-01T06:01:06Z INFO 2403803 (sg02) [ModuleForkPass]: Inputs to pre_opts: modules=1 functions=1 allocs=214329 blocks=1 instructions=693262 Max writers: 576 Max Readers: 184036 +2024-06-01T06:01:06Z INFO 2403803 (sg02) [PreOpts]: ModuleDef 2 #intermediates 66 +2024-06-01T06:01:06Z INFO 2403803 (sg02) [PreOpts]: NetlistInfo #InstanceInputs 9 InstanceOutputs 2 +2024-06-01T06:01:06Z USER 2403803 (sg02) [ModuleForkPass]: pre_opts finished after 0.017 seconds +2024-06-01T06:01:06Z INFO 2403803 (sg02) [ModuleForkPass]: curr_vmrss: 11503mb, ru_maxrss: 12920mb (delta=0mb) +2024-06-01T06:01:06Z INFO 2403803 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 214329 memory location(s), 1 block(s), and 693262 instruction(s). Max writers: 576 Max Readers: 184036 +2024-06-01T06:01:06Z USER 2403803 (sg02) [ModuleForkPass]: Running error_injector +2024-06-01T06:01:06Z INFO 2403803 (sg02) [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=214329 blocks=1 instructions=693262 Max writers: 576 Max Readers: 184036 +2024-06-01T06:01:06Z WARNING 2403803 (sg02) [ErrorInjector]: Unrecognized injected error value "0" +2024-06-01T06:01:06Z USER 2403803 (sg02) [ModuleForkPass]: error_injector finished after 0.016 seconds +2024-06-01T06:01:06Z INFO 2403803 (sg02) [ModuleForkPass]: curr_vmrss: 11503mb, ru_maxrss: 12920mb (delta=0mb) +2024-06-01T06:01:06Z INFO 2403803 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 214329 memory location(s), 1 block(s), and 693262 instruction(s). 
Max writers: 576 Max Readers: 184036 +2024-06-01T06:01:06Z USER 2403803 (sg02) [ModuleForkPass]: Running constant_propagate +2024-06-01T06:01:06Z INFO 2403803 (sg02) [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=214329 blocks=1 instructions=693262 Max writers: 576 Max Readers: 184036 +2024-06-01T06:01:07Z INFO 2403803 [LayerSpiller]: LayerSpill: Found 1 Splits CCs +2024-06-01T06:01:07Z INFO 2403803 [LayerSpiller]: Grouped CCs to 1 clusters. +2024-06-01T06:01:07Z INFO 2403803 (sg01) [Unroll]: INFO (Unroll) DONE unrolling Sat Jun 1 06:00:52 2024 + +2024-06-01T06:01:07Z INFO 2403803 (sg01) [Unroll]: sg0001 Instruction count after Unroll: +2024-06-01T06:01:07Z INFO 2403803 (sg01) [Unroll]: Total count: 701966 +2024-06-01T06:01:07Z INFO 2403803 (sg01) [Unroll]: Matmult: 474564 +2024-06-01T06:01:07Z INFO 2403803 (sg01) [Unroll]: GenericCopy: 76522 +2024-06-01T06:01:07Z INFO 2403803 (sg01) [Unroll]: Load: 28422 +2024-06-01T06:01:07Z INFO 2403803 (sg01) [Unroll]: TensorReduce: 27648 +2024-06-01T06:01:07Z INFO 2403803 (sg01) [Unroll]: Memset: 24581 +2024-06-01T06:01:07Z INFO 2403803 (sg01) [Unroll]: Activation: 19040 +2024-06-01T06:01:07Z INFO 2403803 (sg01) [Unroll]: TensorScalarPtr: 17280 +2024-06-01T06:01:07Z INFO 2403803 (sg01) [Unroll]: TensorTensor: 16960 +2024-06-01T06:01:07Z INFO 2403803 (sg01) [Unroll]: Select: 11520 +2024-06-01T06:01:07Z INFO 2403803 (sg01) [Unroll]: Save: 3116 +2024-06-01T06:01:07Z INFO 2403803 (sg01) [Unroll]: Reciprocal: 2304 +2024-06-01T06:01:07Z INFO 2403803 (sg01) [Unroll]: DMACopy: 5 +2024-06-01T06:01:07Z INFO 2403803 (sg01) [Unroll]: CollectiveCompute: 4 +2024-06-01T06:01:07Z USER 2403803 (sg01) [ModuleForkPass]: unroll finished after 15.287 seconds +2024-06-01T06:01:07Z INFO 2403803 (sg01) [ModuleForkPass]: curr_vmrss: 11503mb, ru_maxrss: 12920mb (delta=5829mb) +2024-06-01T06:01:07Z INFO 2403803 (sg02) [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 
+2024-06-01T06:01:07Z INFO 2403803 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 214329 memory location(s), 1 block(s), and 693262 instruction(s). Max writers: 576 Max Readers: 184036 +2024-06-01T06:01:07Z USER 2403803 (sg01) [ModuleForkPass]: Running psum_legalization +2024-06-01T06:01:07Z INFO 2403803 [LayerSpiller]: LayerSpill: To Spill 0 multi-layer tensors +2024-06-01T06:01:07Z INFO 2403803 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2024-06-01T06:01:07Z INFO 2403803 [LayerSpiller]: LayerSpill: Done. +2024-06-01T06:01:07Z INFO 2403803 (sg00) [PreSched]: Start split live ranges Sat Jun 1 06:01:06 2024 +2024-06-01T06:01:07Z INFO 2403803 (sg01) [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=214329 blocks=1 instructions=693262 Max writers: 576 Max Readers: 184036 +2024-06-01T06:01:08Z INFO 2403803 (sg03) [PreSched]: DONE PRE scheduling Sat Jun 1 06:01:08 2024 +2024-06-01T06:01:08Z USER 2403803 (sg03) [ModuleForkPass]: pre_sched finished after 9.739 seconds +2024-06-01T06:01:08Z INFO 2403803 (sg03) [ModuleForkPass]: curr_vmrss: 11503mb, ru_maxrss: 12920mb (delta=262mb) +2024-06-01T06:01:08Z USER 2403803 (sg01) [ModuleForkPass]: psum_legalization finished after 0.662 seconds +2024-06-01T06:01:08Z INFO 2403803 (sg01) [ModuleForkPass]: curr_vmrss: 11503mb, ru_maxrss: 12920mb (delta=0mb) +2024-06-01T06:01:08Z INFO 2403803 (sg00) [PreSched]: Num_Splits: 30 +2024-06-01T06:01:08Z INFO 2403803 (sg00) [PreSched]: End split live ranges Sat Jun 1 06:01:08 2024 +2024-06-01T06:01:08Z INFO 2403803 (sg00) [PreSched]: Strt remove redundncies Sat Jun 1 06:01:08 2024 +2024-06-01T06:01:08Z INFO 2403803 (sg00) [PreSched]: remove_redundant_memsets +2024-06-01T06:01:08Z INFO 2403803 (sg03) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 66596 memory location(s), 1 block(s), and 353547 instruction(s). 
Max writers: 512 Max Readers: 80896 +2024-06-01T06:01:08Z USER 2403803 (sg03) [ModuleForkPass]: Running tensor_copy_elim +2024-06-01T06:01:08Z INFO 2403803 (sg03) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=66596 blocks=1 instructions=353547 Max writers: 512 Max Readers: 80896 +2024-06-01T06:01:08Z INFO 2403803 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 214329 memory location(s), 1 block(s), and 693262 instruction(s). Max writers: 576 Max Readers: 184036 +2024-06-01T06:01:08Z USER 2403803 (sg01) [ModuleForkPass]: Running pre_opts +2024-06-01T06:01:08Z INFO 2403803 (sg01) [ModuleForkPass]: Inputs to pre_opts: modules=1 functions=1 allocs=214329 blocks=1 instructions=693262 Max writers: 576 Max Readers: 184036 +2024-06-01T06:01:08Z INFO 2403803 (sg01) [PreOpts]: ModuleDef 1 #intermediates 66 +2024-06-01T06:01:08Z INFO 2403803 (sg01) [PreOpts]: NetlistInfo #InstanceInputs 10 InstanceOutputs 2 +2024-06-01T06:01:08Z USER 2403803 (sg01) [ModuleForkPass]: pre_opts finished after 0.151 seconds +2024-06-01T06:01:08Z INFO 2403803 (sg01) [ModuleForkPass]: curr_vmrss: 11503mb, ru_maxrss: 12920mb (delta=0mb) +2024-06-01T06:01:08Z INFO 2403803 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 214329 memory location(s), 1 block(s), and 693262 instruction(s). 
Max writers: 576 Max Readers: 184036 +2024-06-01T06:01:08Z USER 2403803 (sg01) [ModuleForkPass]: Running error_injector +2024-06-01T06:01:08Z INFO 2403803 (sg01) [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=214329 blocks=1 instructions=693262 Max writers: 576 Max Readers: 184036 +2024-06-01T06:01:08Z WARNING 2403803 (sg01) [ErrorInjector]: Unrecognized injected error value "0" +2024-06-01T06:01:08Z USER 2403803 (sg01) [ModuleForkPass]: error_injector finished after 0.267 seconds +2024-06-01T06:01:08Z INFO 2403803 (sg01) [ModuleForkPass]: curr_vmrss: 11503mb, ru_maxrss: 12920mb (delta=0mb) +2024-06-01T06:01:08Z INFO 2403803 (sg00) [PreSched]: remove_redundant_memsets: 0 +2024-06-01T06:01:08Z INFO 2403803 (sg00) [PreSched]: remove_redundant_loads +2024-06-01T06:01:08Z INFO 2403803 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 214329 memory location(s), 1 block(s), and 693262 instruction(s). Max writers: 576 Max Readers: 184036 +2024-06-01T06:01:08Z USER 2403803 (sg01) [ModuleForkPass]: Running constant_propagate +2024-06-01T06:01:09Z INFO 2403803 (sg01) [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=214329 blocks=1 instructions=693262 Max writers: 576 Max Readers: 184036 +2024-06-01T06:01:09Z INFO 2403803 (sg00) [PreSched]: remove_redundant_loads: 0 +2024-06-01T06:01:09Z INFO 2403803 (sg00) [PreSched]: End remove redundncies Sat Jun 1 06:01:09 2024 +2024-06-01T06:01:09Z INFO 2403803 (sg00) [PreSched]: Start DCE Sat Jun 1 06:01:09 2024 +2024-06-01T06:01:09Z INFO 2403803 (sg01) [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2024-06-01T06:01:09Z INFO 2403803 (sg02) [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2024-06-01T06:01:10Z INFO 2403803 (sg00) [PreSched]: End DCE Sat Jun 1 06:01:10 2024 +2024-06-01T06:01:10Z INFO 2403803 (sg03) [TensorCopyElim]: Tensor CP elimination: 0 +2024-06-01T06:01:10Z 
INFO 2403803 (sg01) [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2024-06-01T06:01:11Z INFO 2403803 (sg00) [PreSched]: Start build flow dependencies Sat Jun 1 06:01:11 2024 +2024-06-01T06:01:11Z INFO 2403803 (sg00) [build_flow_deps]: Start build fdeps. Invocation: 2Sat Jun 1 06:01:11 2024 +2024-06-01T06:01:11Z INFO 2403803 (sg00) [build_flow_deps]: Allocs: 139713 instructions: 320853 +2024-06-01T06:01:12Z USER 2403803 (sg02) [ModuleForkPass]: constant_propagate finished after 5.804 seconds +2024-06-01T06:01:12Z INFO 2403803 (sg02) [ModuleForkPass]: curr_vmrss: 11504mb, ru_maxrss: 12920mb (delta=0mb) +2024-06-01T06:01:12Z INFO 2403803 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 214329 memory location(s), 1 block(s), and 693262 instruction(s). Max writers: 576 Max Readers: 184036 +2024-06-01T06:01:12Z USER 2403803 (sg02) [ModuleForkPass]: Running vn_splitter +2024-06-01T06:01:12Z INFO 2403803 (sg02) [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=214329 blocks=1 instructions=693262 Max writers: 576 Max Readers: 184036 +2024-06-01T06:01:12Z INFO 2403803 (sg02) [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 801 +2024-06-01T06:01:12Z INFO 2403803 (sg02) [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2024-06-01T06:01:12Z USER 2403803 (sg01) [ModuleForkPass]: constant_propagate finished after 4.113 seconds +2024-06-01T06:01:12Z INFO 2403803 (sg01) [ModuleForkPass]: curr_vmrss: 11518mb, ru_maxrss: 12920mb (delta=0mb) +2024-06-01T06:01:12Z INFO 2403803 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 214329 memory location(s), 1 block(s), and 693262 instruction(s). 
Max writers: 576 Max Readers: 184036 +2024-06-01T06:01:12Z USER 2403803 (sg01) [ModuleForkPass]: Running vn_splitter +2024-06-01T06:01:13Z INFO 2403803 (sg01) [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=214329 blocks=1 instructions=693262 Max writers: 576 Max Readers: 184036 +2024-06-01T06:01:13Z INFO 2403803 (sg01) [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 801 +2024-06-01T06:01:13Z INFO 2403803 (sg01) [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2024-06-01T06:01:14Z INFO 2403803 (sg02) [ShrinkDN]: INFO (ShrinkDN): Shrunk 1344 nodes. Total savings 576672 bytes/partition +2024-06-01T06:01:14Z INFO 2403803 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2024-06-01T06:01:14Z INFO 2403803 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2024-06-01T06:01:14Z INFO 2403803 (sg02) [VNSplitterPass]: INFO (VNSplitter) Time: 0.031 seconds +2024-06-01T06:01:14Z INFO 2403803 (sg02) [VNSplitterPass]: INFO (VerticalFusion) Time: 0.916 seconds +2024-06-01T06:01:14Z INFO 2403803 (sg02) [VNSplitterPass]: INFO (ShrinkDN) Time: 0.37 seconds +2024-06-01T06:01:14Z USER 2403803 (sg02) [ModuleForkPass]: vn_splitter finished after 1.669 seconds +2024-06-01T06:01:14Z INFO 2403803 (sg02) [ModuleForkPass]: curr_vmrss: 11568mb, ru_maxrss: 12920mb (delta=0mb) +2024-06-01T06:01:14Z INFO 2403803 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 214329 memory location(s), 1 block(s), and 693262 instruction(s). Max writers: 576 Max Readers: 184036 +2024-06-01T06:01:14Z USER 2403803 (sg02) [ModuleForkPass]: Running lower_ac +2024-06-01T06:01:14Z INFO 2403803 (sg02) [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=214329 blocks=1 instructions=693262 Max writers: 576 Max Readers: 184036 +2024-06-01T06:01:14Z INFO 2403803 (sg02) [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. 
+2024-06-01T06:01:14Z USER 2403803 (sg02) [ModuleForkPass]: lower_ac finished after 0.110 seconds +2024-06-01T06:01:14Z INFO 2403803 (sg02) [ModuleForkPass]: curr_vmrss: 11580mb, ru_maxrss: 12920mb (delta=0mb) +2024-06-01T06:01:14Z INFO 2403803 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 214329 memory location(s), 1 block(s), and 693262 instruction(s). Max writers: 576 Max Readers: 184036 +2024-06-01T06:01:14Z USER 2403803 (sg02) [ModuleForkPass]: Running input_dma_coalescing +2024-06-01T06:01:14Z INFO 2403803 (sg02) [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=214329 blocks=1 instructions=693262 Max writers: 576 Max Readers: 184036 +2024-06-01T06:01:15Z INFO 2403803 (sg00) [build_flow_deps]: Build fdeps inserted 784997 edges +2024-06-01T06:01:15Z INFO 2403803 (sg00) [build_flow_deps]: Done build fdeps 784997 Sat Jun 1 06:01:15 2024 +2024-06-01T06:01:15Z INFO 2403803 (sg00) [PreSched]: End build flow dependencies Sat Jun 1 06:01:15 2024 +2024-06-01T06:01:15Z INFO 2403803 (sg00) [PreSched]: Start remove useless insts Sat Jun 1 06:01:15 2024 +2024-06-01T06:01:15Z INFO 2403803 (sg00) [PreSched]: remove_useless_insts +2024-06-01T06:01:15Z INFO 2403803 (sg00) [PreSched]: remove Useless Instructions: 0 +2024-06-01T06:01:15Z INFO 2403803 (sg00) [PreSched]: End remove useless insts Sat Jun 1 06:01:15 2024 +2024-06-01T06:01:16Z INFO 2403803 (sg01) [ShrinkDN]: INFO (ShrinkDN): Shrunk 1344 nodes. Total savings 576672 bytes/partition +2024-06-01T06:01:16Z INFO 2403803 (sg00) [PreSched]: DONE PRE scheduling Sat Jun 1 06:01:16 2024 +2024-06-01T06:01:16Z USER 2403803 (sg00) [ModuleForkPass]: pre_sched finished after 10.364 seconds +2024-06-01T06:01:16Z INFO 2403803 (sg00) [ModuleForkPass]: curr_vmrss: 11594mb, ru_maxrss: 12920mb (delta=0mb) +2024-06-01T06:01:16Z INFO 2403803 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 139713 memory location(s), 1 block(s), and 320853 instruction(s). 
Max writers: 5184 Max Readers: 93924 +2024-06-01T06:01:16Z USER 2403803 (sg00) [ModuleForkPass]: Running tensor_copy_elim +2024-06-01T06:01:16Z INFO 2403803 (sg00) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=139713 blocks=1 instructions=320853 Max writers: 5184 Max Readers: 93924 +2024-06-01T06:01:17Z INFO 2403803 (sg00) [TensorCopyElim]: Tensor CP elimination: 288 +2024-06-01T06:01:18Z USER 2403803 (sg00) [ModuleForkPass]: tensor_copy_elim finished after 1.782 seconds +2024-06-01T06:01:18Z INFO 2403803 (sg00) [ModuleForkPass]: curr_vmrss: 11595mb, ru_maxrss: 12920mb (delta=0mb) +2024-06-01T06:01:18Z INFO 2403803 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 139713 memory location(s), 1 block(s), and 320565 instruction(s). Max writers: 5184 Max Readers: 93924 +2024-06-01T06:01:18Z USER 2403803 (sg00) [ModuleForkPass]: Running mm_packing +2024-06-01T06:01:18Z INFO 2403803 (sg00) [ModuleForkPass]: Inputs to mm_packing: modules=1 functions=1 allocs=139713 blocks=1 instructions=320565 Max writers: 5184 Max Readers: 93924 +2024-06-01T06:01:18Z INFO 2403803 (sg00) [MMPacking]: INFO (MMPack) Running the preprocessing step. 
+2024-06-01T06:01:19Z INFO 2403803 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2024-06-01T06:01:19Z INFO 2403803 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2024-06-01T06:01:19Z INFO 2403803 (sg01) [VNSplitterPass]: INFO (VNSplitter) Time: 0.033 seconds +2024-06-01T06:01:19Z INFO 2403803 (sg01) [VNSplitterPass]: INFO (VerticalFusion) Time: 2.103 seconds +2024-06-01T06:01:19Z INFO 2403803 (sg01) [VNSplitterPass]: INFO (ShrinkDN) Time: 0.926 seconds +2024-06-01T06:01:19Z USER 2403803 (sg01) [ModuleForkPass]: vn_splitter finished after 6.265 seconds +2024-06-01T06:01:19Z INFO 2403803 (sg01) [ModuleForkPass]: curr_vmrss: 11595mb, ru_maxrss: 12920mb (delta=0mb) +2024-06-01T06:01:19Z INFO 2403803 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 214329 memory location(s), 1 block(s), and 693262 instruction(s). Max writers: 576 Max Readers: 184036 +2024-06-01T06:01:19Z USER 2403803 (sg01) [ModuleForkPass]: Running lower_ac +2024-06-01T06:01:19Z INFO 2403803 (sg01) [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=214329 blocks=1 instructions=693262 Max writers: 576 Max Readers: 184036 +2024-06-01T06:01:20Z USER 2403803 (sg03) [ModuleForkPass]: tensor_copy_elim finished after 12.239 seconds +2024-06-01T06:01:20Z INFO 2403803 (sg03) [ModuleForkPass]: curr_vmrss: 11596mb, ru_maxrss: 12920mb (delta=0mb) +2024-06-01T06:01:20Z INFO 2403803 (sg03) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 66596 memory location(s), 1 block(s), and 353547 instruction(s). Max writers: 512 Max Readers: 80896 +2024-06-01T06:01:20Z USER 2403803 (sg03) [ModuleForkPass]: Running mm_packing +2024-06-01T06:01:20Z INFO 2403803 (sg03) [ModuleForkPass]: Inputs to mm_packing: modules=1 functions=1 allocs=66596 blocks=1 instructions=353547 Max writers: 512 Max Readers: 80896 +2024-06-01T06:01:20Z INFO 2403803 (sg03) [MMPacking]: INFO (MMPack) Running the preprocessing step. 
+2024-06-01T06:01:21Z INFO 2403803 (sg01) [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2024-06-01T06:01:21Z USER 2403803 (sg01) [ModuleForkPass]: lower_ac finished after 1.471 seconds +2024-06-01T06:01:21Z INFO 2403803 (sg01) [ModuleForkPass]: curr_vmrss: 11596mb, ru_maxrss: 12920mb (delta=0mb) +2024-06-01T06:01:21Z INFO 2403803 (sg00) [MMPacking]: INFO (MMPack) mlBPCG size 119672, CCS = 2368 +2024-06-01T06:01:21Z INFO 2403803 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 214329 memory location(s), 1 block(s), and 693262 instruction(s). Max writers: 576 Max Readers: 184036 +2024-06-01T06:01:21Z USER 2403803 (sg01) [ModuleForkPass]: Running input_dma_coalescing +2024-06-01T06:01:21Z INFO 2403803 (sg01) [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=214329 blocks=1 instructions=693262 Max writers: 576 Max Readers: 184036 +2024-06-01T06:01:22Z INFO 2403803 (sg00) [MMPacking]: INFO (MMPack) agRGCG size 791342 +2024-06-01T06:01:22Z INFO 2403803 (sg00) [MMPackingPass]: INFO (MMPacking) Time: 3.949 seconds +2024-06-01T06:01:23Z USER 2403803 (sg00) [ModuleForkPass]: mm_packing finished after 4.367 seconds +2024-06-01T06:01:23Z INFO 2403803 (sg00) [ModuleForkPass]: curr_vmrss: 11664mb, ru_maxrss: 12920mb (delta=0mb) +2024-06-01T06:01:24Z INFO 2403803 (sg03) [MMPacking]: INFO (MMPack) mlBPCG size 77926, CCS = 4996 +2024-06-01T06:01:24Z INFO 2403803 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 139713 memory location(s), 1 block(s), and 320565 instruction(s). 
Max writers: 5184 Max Readers: 93924 +2024-06-01T06:01:24Z USER 2403803 (sg00) [ModuleForkPass]: Running coloring_allocator_psum +2024-06-01T06:01:24Z INFO 2403803 (sg00) [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=139713 blocks=1 instructions=320565 Max writers: 5184 Max Readers: 93924 +2024-06-01T06:01:27Z INFO 2403803 (sg02) [DMAOptimizationBase]: DMA input Coalescing combined 9114 input loads +2024-06-01T06:01:27Z USER 2403803 (sg02) [ModuleForkPass]: input_dma_coalescing finished after 12.649 seconds +2024-06-01T06:01:27Z INFO 2403803 (sg02) [ModuleForkPass]: curr_vmrss: 11341mb, ru_maxrss: 12920mb (delta=0mb) +2024-06-01T06:01:27Z INFO 2403803 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 209772 memory location(s), 1 block(s), and 688705 instruction(s). Max writers: 576 Max Readers: 184036 +2024-06-01T06:01:27Z USER 2403803 (sg02) [ModuleForkPass]: Running early_peephole_opts +2024-06-01T06:01:27Z INFO 2403803 (sg02) [ModuleForkPass]: Inputs to early_peephole_opts: modules=1 functions=1 allocs=209772 blocks=1 instructions=688705 Max writers: 576 Max Readers: 184036 +2024-06-01T06:01:27Z INFO 2403803 (sg02) [EarlyPeepholeOpts]: PeepholeOpts enabled? ActivationAccumulate: true +2024-06-01T06:01:28Z INFO 2403803 (sg02) [EarlyPeepholeOpts]: Activation Accumulate: 13824 +2024-06-01T06:01:28Z USER 2403803 (sg02) [ModuleForkPass]: early_peephole_opts finished after 1.050 seconds +2024-06-01T06:01:28Z INFO 2403803 (sg02) [ModuleForkPass]: curr_vmrss: 11344mb, ru_maxrss: 12920mb (delta=0mb) +2024-06-01T06:01:28Z INFO 2403803 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 207596 memory location(s), 1 block(s), and 677185 instruction(s). 
Max writers: 576 Max Readers: 184036 +2024-06-01T06:01:28Z USER 2403803 (sg02) [ModuleForkPass]: Running pre_sched +2024-06-01T06:01:28Z INFO 2403803 (sg02) [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=207596 blocks=1 instructions=677185 Max writers: 576 Max Readers: 184036 +2024-06-01T06:01:28Z INFO 2403803 (sg02) [PreSched]: Start PRE scheduling 2 cores: 1 at: Sat Jun 1 06:01:28 2024 +2024-06-01T06:01:28Z INFO 2403803 [LayerSpiller]: LayerSpill: Start... +2024-06-01T06:01:28Z INFO 2403803 [LayerSpiller]: LayerSpill: Found 3 Splits CCs +2024-06-01T06:01:28Z INFO 2403803 [LayerSpiller]: Grouped CCs to 2 clusters. +2024-06-01T06:01:29Z INFO 2403803 [LayerSpiller]: LayerSpill: To Spill 0 multi-layer tensors +2024-06-01T06:01:29Z INFO 2403803 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2024-06-01T06:01:29Z INFO 2403803 [LayerSpiller]: LayerSpill: Done. +2024-06-01T06:01:29Z INFO 2403803 (sg02) [PreSched]: Start split live ranges Sat Jun 1 06:01:28 2024 +2024-06-01T06:01:29Z INFO 2403803 (sg02) [PreSched]: Num_Splits: 0 +2024-06-01T06:01:29Z INFO 2403803 (sg02) [PreSched]: End split live ranges Sat Jun 1 06:01:29 2024 +2024-06-01T06:01:29Z INFO 2403803 (sg02) [PreSched]: Strt remove redundncies Sat Jun 1 06:01:29 2024 +2024-06-01T06:01:29Z INFO 2403803 (sg02) [PreSched]: remove_redundant_memsets +2024-06-01T06:01:30Z INFO 2403803 (sg02) [PreSched]: remove_redundant_memsets: 0 +2024-06-01T06:01:30Z INFO 2403803 (sg02) [PreSched]: remove_redundant_loads +2024-06-01T06:01:31Z INFO 2403803 (sg02) [PreSched]: remove_redundant_loads: 0 +2024-06-01T06:01:31Z INFO 2403803 (sg02) [PreSched]: End remove redundncies Sat Jun 1 06:01:31 2024 +2024-06-01T06:01:31Z INFO 2403803 (sg02) [PreSched]: Start DCE Sat Jun 1 06:01:31 2024 +2024-06-01T06:01:31Z INFO 2403803 (sg03) [MMPacking]: INFO (MMPack) agRGCG size 501728 +2024-06-01T06:01:31Z INFO 2403803 (sg03) [MMPackingPass]: INFO (MMPacking) Time: 11.142 seconds +2024-06-01T06:01:31Z USER 2403803 
(sg03) [ModuleForkPass]: mm_packing finished after 11.337 seconds +2024-06-01T06:01:31Z INFO 2403803 (sg03) [ModuleForkPass]: curr_vmrss: 11422mb, ru_maxrss: 12920mb (delta=0mb) +2024-06-01T06:01:31Z INFO 2403803 (sg03) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 66596 memory location(s), 1 block(s), and 353547 instruction(s). Max writers: 512 Max Readers: 80896 +2024-06-01T06:01:31Z USER 2403803 (sg03) [ModuleForkPass]: Running coloring_allocator_psum +2024-06-01T06:01:31Z INFO 2403803 (sg02) [PreSched]: End DCE Sat Jun 1 06:01:31 2024 +2024-06-01T06:01:31Z INFO 2403803 (sg03) [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=66596 blocks=1 instructions=353547 Max writers: 512 Max Readers: 80896 +2024-06-01T06:01:32Z INFO 2403803 (sg02) [PreSched]: Start build flow dependencies Sat Jun 1 06:01:32 2024 +2024-06-01T06:01:32Z INFO 2403803 (sg02) [build_flow_deps]: Start build fdeps. Invocation: 3Sat Jun 1 06:01:32 2024 +2024-06-01T06:01:32Z INFO 2403803 (sg03) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 7412047876 +2024-06-01T06:01:32Z INFO 2403803 (sg03) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 3583 bytes +2024-06-01T06:01:32Z INFO 2403803 (sg02) [build_flow_deps]: Allocs: 207468 instructions: 677057 +2024-06-01T06:01:32Z INFO 2403803 (sg03) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 300279840 +2024-06-01T06:01:32Z INFO 2403803 (sg03) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 3695 bytes +2024-06-01T06:01:32Z INFO 2403803 (sg03) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 16384 +2024-06-01T06:01:32Z INFO 2403803 (sg03) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 128 bytes +2024-06-01T06:01:32Z INFO 2403803 (sg03) [ColoringAllocator::Rep]: Allocating functions +2024-06-01T06:01:32Z INFO 2403803 (sg03) [ColoringAllocator::Rep]: linearize and check +2024-06-01T06:01:32Z INFO 2403803 (sg00) 
[ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 1662748872 +2024-06-01T06:01:32Z INFO 2403803 (sg00) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 1781 bytes +2024-06-01T06:01:32Z INFO 2403803 (sg00) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 800907024 +2024-06-01T06:01:32Z INFO 2403803 (sg00) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 871 bytes +2024-06-01T06:01:32Z INFO 2403803 (sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 16384 +2024-06-01T06:01:32Z INFO 2403803 (sg00) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 128 bytes +2024-06-01T06:01:32Z INFO 2403803 (sg00) [ColoringAllocator::Rep]: Allocating functions +2024-06-01T06:01:32Z INFO 2403803 (sg00) [ColoringAllocator::Rep]: linearize and check +2024-06-01T06:01:33Z INFO 2403803 (sg03) [PSUM_Allocator]: allocating PSUM +2024-06-01T06:01:33Z INFO 2403803 (sg03) [PSUM_Allocator]: main loop +2024-06-01T06:01:33Z INFO 2403803 (sg03) [PSUM_Allocator]: renumber locations +2024-06-01T06:01:33Z INFO 2403803 (sg03) [PSUM_Allocator]: size = 29981 +2024-06-01T06:01:33Z INFO 2403803 (sg03) [PSUM_Allocator]: build_no_bitmap start +2024-06-01T06:01:33Z INFO 2403803 (sg03) [PSUM_Allocator]: 100% PSUM demand before spilling +2024-06-01T06:01:33Z INFO 2403803 (sg03) [PSUM_Allocator]: PSUM high-water mark = 8 tensors +2024-06-01T06:01:33Z INFO 2403803 (sg03) [PSUM_Allocator]: found 97488 edges +2024-06-01T06:01:33Z INFO 2403803 (sg03) [PSUM_Allocator]: mean: 6.50332 +2024-06-01T06:01:33Z INFO 2403803 (sg03) [PSUM_Allocator]: median: 7 +2024-06-01T06:01:33Z INFO 2403803 (sg03) [PSUM_Allocator]: adjacency vectors require 779904 bytes +2024-06-01T06:01:33Z INFO 2403803 (sg03) [PSUM_Allocator]: build_no_bitmap done +2024-06-01T06:01:33Z INFO 2403803 (sg03) [PSUM_Allocator]: find costs +2024-06-01T06:01:34Z INFO 2403803 (sg03) [PSUM_Allocator]: best-of-n loop, heuristic = 0 +2024-06-01T06:01:34Z INFO 2403803 (sg03) 
[PSUM_Allocator]: simplify interference graph +2024-06-01T06:01:34Z INFO 2403803 (sg03) [PSUM_Allocator]: initialize low and high +2024-06-01T06:01:34Z INFO 2403803 (sg03) [PSUM_Allocator]: lo = 29981 +2024-06-01T06:01:34Z INFO 2403803 (sg03) [PSUM_Allocator]: hi = 0 +2024-06-01T06:01:34Z INFO 2403803 (sg03) [PSUM_Allocator]: inf = 0 +2024-06-01T06:01:34Z INFO 2403803 (sg03) [PSUM_Allocator]: total = 29981 +2024-06-01T06:01:34Z INFO 2403803 (sg03) [PSUM_Allocator]: simplify +2024-06-01T06:01:34Z INFO 2403803 (sg03) [PSUM_Allocator]: new candidates = 0 +2024-06-01T06:01:34Z INFO 2403803 (sg03) [PSUM_Allocator]: select ranges +2024-06-01T06:01:34Z INFO 2403803 (sg03) [PSUM_Allocator]: no more spills +2024-06-01T06:01:34Z INFO 2403803 (sg03) [PSUM_Allocator]: PSUM score = 0 (lower is better) +2024-06-01T06:01:34Z INFO 2403803 (sg03) [PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2024-06-01T06:01:34Z INFO 2403803 (sg03) [PSUM_Allocator]: 100% PSUM utilization after allocation +2024-06-01T06:01:35Z INFO 2403803 (sg03) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 7412047876 +2024-06-01T06:01:35Z INFO 2403803 (sg03) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 3583 bytes +2024-06-01T06:01:35Z INFO 2403803 (sg03) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 300279840 +2024-06-01T06:01:35Z INFO 2403803 (sg03) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 3695 bytes +2024-06-01T06:01:35Z INFO 2403803 (sg03) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 16384 +2024-06-01T06:01:35Z INFO 2403803 (sg03) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 128 bytes +2024-06-01T06:01:35Z USER 2403803 (sg03) [ModuleForkPass]: coloring_allocator_psum finished after 3.622 seconds +2024-06-01T06:01:35Z INFO 2403803 (sg03) [ModuleForkPass]: curr_vmrss: 11724mb, ru_maxrss: 12920mb (delta=0mb) +2024-06-01T06:01:35Z INFO 2403803 (sg03) [ModuleForkPass]: Output has 1 module(s), 1 
function(s), 66596 memory location(s), 1 block(s), and 353547 instruction(s). Max writers: 512 Max Readers: 80896 +2024-06-01T06:01:35Z USER 2403803 (sg03) [ModuleForkPass]: Running dma_optimization_psum +2024-06-01T06:01:35Z INFO 2403803 (sg03) [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=66596 blocks=1 instructions=353547 Max writers: 512 Max Readers: 80896 +2024-06-01T06:01:36Z INFO 2403803 (sg03) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2024-06-01T06:01:36Z INFO 2403803 (sg03) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2024-06-01T06:01:36Z USER 2403803 (sg03) [ModuleForkPass]: dma_optimization_psum finished after 1.424 seconds +2024-06-01T06:01:36Z INFO 2403803 (sg03) [ModuleForkPass]: curr_vmrss: 11647mb, ru_maxrss: 12920mb (delta=0mb) +2024-06-01T06:01:37Z INFO 2403803 (sg03) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 66596 memory location(s), 1 block(s), and 353547 instruction(s). 
Max writers: 512 Max Readers: 80896 +2024-06-01T06:01:37Z USER 2403803 (sg03) [ModuleForkPass]: Running address_rotation_psum +2024-06-01T06:01:37Z INFO 2403803 (sg03) [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=66596 blocks=1 instructions=353547 Max writers: 512 Max Readers: 80896 +2024-06-01T06:01:37Z INFO 2403803 (sg00) [PSUM_Allocator]: allocating PSUM +2024-06-01T06:01:37Z INFO 2403803 (sg00) [PSUM_Allocator]: main loop +2024-06-01T06:01:37Z INFO 2403803 (sg02) [build_flow_deps]: Build fdeps inserted 1949358 edges +2024-06-01T06:01:37Z INFO 2403803 (sg02) [build_flow_deps]: Done build fdeps 1949358 Sat Jun 1 06:01:37 2024 +2024-06-01T06:01:37Z INFO 2403803 (sg02) [PreSched]: End build flow dependencies Sat Jun 1 06:01:37 2024 +2024-06-01T06:01:37Z INFO 2403803 (sg02) [PreSched]: Start remove useless insts Sat Jun 1 06:01:37 2024 +2024-06-01T06:01:37Z INFO 2403803 (sg02) [PreSched]: remove_useless_insts +2024-06-01T06:01:37Z INFO 2403803 (sg00) [PSUM_Allocator]: renumber locations +2024-06-01T06:01:37Z INFO 2403803 (sg00) [PSUM_Allocator]: size = 42827 +2024-06-01T06:01:38Z INFO 2403803 (sg02) [PreSched]: remove Useless Instructions: 0 +2024-06-01T06:01:38Z INFO 2403803 (sg02) [PreSched]: End remove useless insts Sat Jun 1 06:01:38 2024 +2024-06-01T06:01:38Z INFO 2403803 (sg00) [PSUM_Allocator]: build_no_bitmap start +2024-06-01T06:01:38Z INFO 2403803 (sg03) [DMAOptimizationBase]: PSUM Rotation rotated 80 PSUM Banks +2024-06-01T06:01:38Z INFO 2403803 (sg00) [PSUM_Allocator]: 600% PSUM demand before spilling +2024-06-01T06:01:38Z INFO 2403803 (sg00) [PSUM_Allocator]: PSUM high-water mark = 48 tensors +2024-06-01T06:01:38Z INFO 2403803 (sg00) [PSUM_Allocator]: found 1299010 edges +2024-06-01T06:01:38Z INFO 2403803 (sg00) [PSUM_Allocator]: mean: 60.6631 +2024-06-01T06:01:38Z INFO 2403803 (sg00) [PSUM_Allocator]: median: 40.7687 +2024-06-01T06:01:38Z INFO 2403803 (sg00) [PSUM_Allocator]: adjacency vectors require 10392080 bytes 
+2024-06-01T06:01:38Z INFO 2403803 (sg00) [PSUM_Allocator]: build_no_bitmap done +2024-06-01T06:01:38Z INFO 2403803 (sg00) [PSUM_Allocator]: find costs +2024-06-01T06:01:39Z INFO 2403803 (sg00) [PSUM_Allocator]: best-of-n loop, heuristic = 0 +2024-06-01T06:01:39Z INFO 2403803 (sg00) [PSUM_Allocator]: simplify interference graph +2024-06-01T06:01:39Z INFO 2403803 (sg00) [PSUM_Allocator]: initialize low and high +2024-06-01T06:01:39Z INFO 2403803 (sg00) [PSUM_Allocator]: lo = 7483 +2024-06-01T06:01:39Z INFO 2403803 (sg00) [PSUM_Allocator]: hi = 22112 +2024-06-01T06:01:39Z INFO 2403803 (sg00) [PSUM_Allocator]: inf = 13232 +2024-06-01T06:01:39Z INFO 2403803 (sg00) [PSUM_Allocator]: total = 42827 +2024-06-01T06:01:39Z INFO 2403803 (sg00) [PSUM_Allocator]: simplify +2024-06-01T06:01:39Z INFO 2403803 (sg00) [PSUM_Allocator]: new candidates = 640 +2024-06-01T06:01:39Z INFO 2403803 (sg00) [PSUM_Allocator]: select ranges +2024-06-01T06:01:40Z INFO 2403803 (sg03) [DMAOptimizationBase]: PSUM Rotation rotated 60 PSUM Banks +2024-06-01T06:01:40Z INFO 2403803 (sg02) [PreSched]: DONE PRE scheduling Sat Jun 1 06:01:40 2024 +2024-06-01T06:01:40Z INFO 2403803 (sg00) [PSUM_Allocator]: PSUM spills = 640 tensors +2024-06-01T06:01:40Z INFO 2403803 (sg00) [PSUM_Allocator]: PSUM score = 2.68493e+06 (lower is better) +2024-06-01T06:01:40Z INFO 2403803 (sg00) [PSUM_Allocator]: best PSUM heuristic = 0 +2024-06-01T06:01:40Z INFO 2403803 (sg00) [PSUM_Allocator]: collect spills +2024-06-01T06:01:40Z USER 2403803 (sg02) [ModuleForkPass]: pre_sched finished after 11.901 seconds +2024-06-01T06:01:40Z INFO 2403803 (sg02) [ModuleForkPass]: curr_vmrss: 11694mb, ru_maxrss: 12920mb (delta=0mb) +2024-06-01T06:01:40Z INFO 2403803 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 207468 memory location(s), 1 block(s), and 677057 instruction(s). 
Max writers: 576 Max Readers: 184036 +2024-06-01T06:01:40Z USER 2403803 (sg02) [ModuleForkPass]: Running tensor_copy_elim +2024-06-01T06:01:40Z INFO 2403803 (sg02) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=207468 blocks=1 instructions=677057 Max writers: 576 Max Readers: 184036 +2024-06-01T06:01:41Z INFO 2403803 (sg03) [DMAOptimizationBase]: PSUM Rotation rotated 79 PSUM Banks +2024-06-01T06:01:41Z USER 2403803 (sg03) [ModuleForkPass]: address_rotation_psum finished after 4.394 seconds +2024-06-01T06:01:41Z INFO 2403803 (sg03) [ModuleForkPass]: curr_vmrss: 11592mb, ru_maxrss: 12920mb (delta=0mb) +2024-06-01T06:01:41Z INFO 2403803 (sg03) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 66596 memory location(s), 1 block(s), and 353547 instruction(s). Max writers: 512 Max Readers: 80896 +2024-06-01T06:01:41Z USER 2403803 (sg03) [ModuleForkPass]: Running coloring_allocator_sb +2024-06-01T06:01:41Z INFO 2403803 (sg03) [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=66596 blocks=1 instructions=353547 Max writers: 512 Max Readers: 80896 +2024-06-01T06:01:41Z INFO 2403803 (sg03) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 7412047876 +2024-06-01T06:01:41Z INFO 2403803 (sg03) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 3583 bytes +2024-06-01T06:01:41Z INFO 2403803 (sg00) [PSUM_Allocator]: insert spills +2024-06-01T06:01:41Z INFO 2403803 (sg03) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 300279840 +2024-06-01T06:01:41Z INFO 2403803 (sg03) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 3695 bytes +2024-06-01T06:01:41Z INFO 2403803 (sg03) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 16384 +2024-06-01T06:01:41Z INFO 2403803 (sg03) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 128 bytes +2024-06-01T06:01:41Z INFO 2403803 (sg03) [ColoringAllocator::Rep]: Allocating functions +2024-06-01T06:01:41Z INFO 2403803 
(sg03) [ColoringAllocator::Rep]: linearize and check +2024-06-01T06:01:42Z INFO 2403803 (sg02) [TensorCopyElim]: Tensor CP elimination: 0 +2024-06-01T06:01:42Z INFO 2403803 (sg00) [PSUM_Allocator]: main loop +2024-06-01T06:01:43Z INFO 2403803 (sg03) [SB_Allocator]: allocating SB +2024-06-01T06:01:43Z INFO 2403803 (sg03) [SB_Allocator]: main loop +2024-06-01T06:01:43Z INFO 2403803 (sg00) [PSUM_Allocator]: renumber locations +2024-06-01T06:01:43Z INFO 2403803 (sg00) [PSUM_Allocator]: size = 45899 +2024-06-01T06:01:43Z INFO 2403803 (sg03) [SB_Allocator]: renumber locations +2024-06-01T06:01:43Z INFO 2403803 (sg03) [SB_Allocator]: size = 36598 +2024-06-01T06:01:43Z INFO 2403803 (sg03) [SB_Allocator]: find partners +2024-06-01T06:01:43Z INFO 2403803 (sg00) [PSUM_Allocator]: build_no_bitmap start +2024-06-01T06:01:43Z INFO 2403803 (sg03) [SB_Allocator]: found 29981 accumulation groups +2024-06-01T06:01:43Z INFO 2403803 (sg00) [PSUM_Allocator]: PSUM high-water mark = 8 tensors +2024-06-01T06:01:43Z INFO 2403803 (sg00) [PSUM_Allocator]: found 93250 edges +2024-06-01T06:01:43Z INFO 2403803 (sg00) [PSUM_Allocator]: mean: 4.06327 +2024-06-01T06:01:43Z INFO 2403803 (sg00) [PSUM_Allocator]: median: 4.27405 +2024-06-01T06:01:43Z INFO 2403803 (sg00) [PSUM_Allocator]: adjacency vectors require 746000 bytes +2024-06-01T06:01:43Z INFO 2403803 (sg00) [PSUM_Allocator]: build_no_bitmap done +2024-06-01T06:01:43Z INFO 2403803 (sg03) [SB_Allocator]: largest = _dot.4881-t397_i2239 +2024-06-01T06:01:43Z INFO 2403803 (sg03) [SB_Allocator]: tensors = 64 +2024-06-01T06:01:43Z INFO 2403803 (sg03) [SB_Allocator]: requires 147456 bytes/partition +2024-06-01T06:01:43Z INFO 2403803 (sg03) [SB_Allocator]: expanding partners +2024-06-01T06:01:43Z INFO 2403803 (sg00) [PSUM_Allocator]: find costs +2024-06-01T06:01:44Z USER 2403803 (sg02) [ModuleForkPass]: tensor_copy_elim finished after 3.230 seconds +2024-06-01T06:01:44Z INFO 2403803 (sg02) [ModuleForkPass]: curr_vmrss: 11651mb, ru_maxrss: 12920mb 
(delta=0mb) +2024-06-01T06:01:44Z INFO 2403803 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 207468 memory location(s), 1 block(s), and 677057 instruction(s). Max writers: 576 Max Readers: 184036 +2024-06-01T06:01:44Z USER 2403803 (sg02) [ModuleForkPass]: Running mm_packing +2024-06-01T06:01:44Z INFO 2403803 (sg02) [ModuleForkPass]: Inputs to mm_packing: modules=1 functions=1 allocs=207468 blocks=1 instructions=677057 Max writers: 576 Max Readers: 184036 +2024-06-01T06:01:44Z INFO 2403803 (sg02) [MMPacking]: INFO (MMPack) Running the preprocessing step. +2024-06-01T06:01:44Z INFO 2403803 (sg00) [PSUM_Allocator]: best-of-n loop, heuristic = 0 +2024-06-01T06:01:44Z INFO 2403803 (sg00) [PSUM_Allocator]: simplify interference graph +2024-06-01T06:01:44Z INFO 2403803 (sg00) [PSUM_Allocator]: initialize low and high +2024-06-01T06:01:44Z INFO 2403803 (sg00) [PSUM_Allocator]: lo = 45387 +2024-06-01T06:01:44Z INFO 2403803 (sg00) [PSUM_Allocator]: hi = 512 +2024-06-01T06:01:44Z INFO 2403803 (sg00) [PSUM_Allocator]: inf = 0 +2024-06-01T06:01:44Z INFO 2403803 (sg00) [PSUM_Allocator]: total = 45899 +2024-06-01T06:01:44Z INFO 2403803 (sg00) [PSUM_Allocator]: simplify +2024-06-01T06:01:44Z INFO 2403803 (sg00) [PSUM_Allocator]: new candidates = 0 +2024-06-01T06:01:44Z INFO 2403803 (sg00) [PSUM_Allocator]: select ranges +2024-06-01T06:01:44Z INFO 2403803 (sg00) [PSUM_Allocator]: no more spills +2024-06-01T06:01:44Z INFO 2403803 (sg00) [PSUM_Allocator]: PSUM score = 0 (lower is better) +2024-06-01T06:01:44Z INFO 2403803 (sg00) [PSUM_Allocator]: spilling from PSUM cost about 2.68493e+06 cycles +2024-06-01T06:01:44Z INFO 2403803 (sg00) [PSUM_Allocator]: number of tensors spilled from PSUM = 640 +2024-06-01T06:01:44Z INFO 2403803 (sg00) [PSUM_Allocator]: 100% PSUM utilization after allocation +2024-06-01T06:01:44Z INFO 2403803 (sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 1662748872 +2024-06-01T06:01:44Z INFO 2403803 (sg00) 
[ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 1781 bytes +2024-06-01T06:01:45Z INFO 2403803 (sg03) [SB_Allocator]: find first defs +2024-06-01T06:01:45Z INFO 2403803 (sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 800907024 +2024-06-01T06:01:45Z INFO 2403803 (sg00) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 871 bytes +2024-06-01T06:01:45Z INFO 2403803 (sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 16384 +2024-06-01T06:01:45Z INFO 2403803 (sg00) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 128 bytes +2024-06-01T06:01:45Z USER 2403803 (sg00) [ModuleForkPass]: coloring_allocator_psum finished after 20.275 seconds +2024-06-01T06:01:45Z INFO 2403803 (sg00) [ModuleForkPass]: curr_vmrss: 11627mb, ru_maxrss: 12920mb (delta=0mb) +2024-06-01T06:01:45Z INFO 2403803 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 145857 memory location(s), 1 block(s), and 326709 instruction(s). Max writers: 5184 Max Readers: 93924 +2024-06-01T06:01:45Z USER 2403803 (sg00) [ModuleForkPass]: Running dma_optimization_psum +2024-06-01T06:01:45Z INFO 2403803 (sg00) [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=145857 blocks=1 instructions=326709 Max writers: 5184 Max Readers: 93924 +2024-06-01T06:01:45Z INFO 2403803 (sg03) [SB_Allocator]: find loads +2024-06-01T06:01:46Z INFO 2403803 (sg03) [SB_Allocator]: 0 pin count +2024-06-01T06:01:46Z INFO 2403803 (sg03) [SB_Allocator]: 16131 remat count +2024-06-01T06:01:46Z INFO 2403803 (sg03) [SB_Allocator]: build interference graph +2024-06-01T06:01:46Z INFO 2403803 (sg03) [SB_Allocator]: pass 1 int-tree +2024-06-01T06:01:46Z INFO 2403803 (sg03) [SB_Allocator]: Num intervals 36598 Num locations 36598 +2024-06-01T06:01:46Z INFO 2403803 (sg03) [SB_Allocator]: IntervalTree Build Done +2024-06-01T06:01:46Z INFO 2403803 (sg03) [SB_Allocator]: info.neighbors init Done +2024-06-01T06:01:47Z INFO 2403803 (sg03) 
[SB_Allocator]: info.neighbors partners Done +2024-06-01T06:01:47Z INFO 2403803 (sg03) [SB_Allocator]: IntervalTree readback Done +2024-06-01T06:01:47Z INFO 2403803 (sg03) [SB_Allocator]: edge: 1976529 +2024-06-01T06:01:47Z INFO 2403803 (sg03) [SB_Allocator]: mean: 108.013 +2024-06-01T06:01:47Z INFO 2403803 (sg03) [SB_Allocator]: median: 97.4214 +2024-06-01T06:01:47Z INFO 2403803 (sg03) [SB_Allocator]: find costs +2024-06-01T06:01:47Z INFO 2403803 (sg03) [SB_Allocator]: best-of-n loop, heuristic = 0 +2024-06-01T06:01:47Z INFO 2403803 (sg03) [SB_Allocator]: simplify interference graph +2024-06-01T06:01:47Z INFO 2403803 (sg03) [SB_Allocator]: initialize safe & unsafe +2024-06-01T06:01:47Z INFO 2403803 (sg03) [SB_Allocator]: safe = 5095 +2024-06-01T06:01:47Z INFO 2403803 (sg03) [SB_Allocator]: unsafe = 25551 +2024-06-01T06:01:47Z INFO 2403803 (sg03) [SB_Allocator]: inf = 5952 +2024-06-01T06:01:47Z INFO 2403803 (sg03) [SB_Allocator]: total = 36598 +2024-06-01T06:01:47Z INFO 2403803 (sg03) [SB_Allocator]: simplify +2024-06-01T06:01:47Z INFO 2403803 (sg03) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 21103 #Pinned 0 #Safe 0 minCost 0.00123727 maxCost 0.493205 locations 36598 +2024-06-01T06:01:48Z INFO 2403803 (sg03) [SB_Allocator]: new candidates = 13561 +2024-06-01T06:01:48Z INFO 2403803 (sg03) [SB_Allocator]: (including 5120 infinite cost tensors) +2024-06-01T06:01:48Z INFO 2403803 (sg03) [SB_Allocator]: select ranges +2024-06-01T06:01:48Z INFO 2403803 (sg03) [SB_Allocator]: Total: 36598 +2024-06-01T06:01:48Z INFO 2403803 (sg03) [SB_Allocator]: Spilled: 0.000 (0) +2024-06-01T06:01:48Z INFO 2403803 (sg03) [SB_Allocator]: Allocated: 1.000 (36598) +2024-06-01T06:01:48Z INFO 2403803 (sg03) [SB_Allocator]: Rover zone: 0.476 (17409) +2024-06-01T06:01:48Z INFO 2403803 (sg03) [SB_Allocator]: Pre-rover zone: 0.010 (356) +2024-06-01T06:01:48Z INFO 2403803 (sg03) [SB_Allocator]: Post-rover zone: 0.515 (18833) +2024-06-01T06:01:48Z INFO 2403803 (sg03) [SB_Allocator]: Slice zone: 
0.000 (0) +2024-06-01T06:01:48Z INFO 2403803 (sg03) [SB_Allocator]: Blocks nothing: 0.000 (1) +2024-06-01T06:01:48Z INFO 2403803 (sg03) [SB_Allocator]: Blocks medium: 0.000 (0) +2024-06-01T06:01:48Z INFO 2403803 (sg03) [SB_Allocator]: Blocks tall: 1.000 (36597) +2024-06-01T06:01:48Z INFO 2403803 (sg03) [SB_Allocator]: Visited until tall blocking (mean): 0.998 +2024-06-01T06:01:48Z INFO 2403803 (sg03) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2024-06-01T06:01:48Z INFO 2403803 (sg03) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2024-06-01T06:01:48Z INFO 2403803 (sg03) [SB_Allocator]: Success +2024-06-01T06:01:48Z INFO 2403803 (sg01) [DMAOptimizationBase]: DMA input Coalescing combined 9114 input loads +2024-06-01T06:01:48Z USER 2403803 (sg01) [ModuleForkPass]: input_dma_coalescing finished after 27.073 seconds +2024-06-01T06:01:48Z INFO 2403803 (sg01) [ModuleForkPass]: curr_vmrss: 11846mb, ru_maxrss: 12920mb (delta=0mb) +2024-06-01T06:01:48Z INFO 2403803 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 209772 memory location(s), 1 block(s), and 688705 instruction(s). Max writers: 576 Max Readers: 184036 +2024-06-01T06:01:48Z USER 2403803 (sg01) [ModuleForkPass]: Running early_peephole_opts +2024-06-01T06:01:48Z INFO 2403803 (sg01) [ModuleForkPass]: Inputs to early_peephole_opts: modules=1 functions=1 allocs=209772 blocks=1 instructions=688705 Max writers: 576 Max Readers: 184036 +2024-06-01T06:01:48Z INFO 2403803 (sg01) [EarlyPeepholeOpts]: PeepholeOpts enabled? 
ActivationAccumulate: true +2024-06-01T06:01:48Z INFO 2403803 (sg03) [SB_Allocator]: SB spills = 0 tensors +2024-06-01T06:01:48Z INFO 2403803 (sg03) [SB_Allocator]: size = 0 bytes/partition +2024-06-01T06:01:48Z INFO 2403803 (sg03) [SB_Allocator]: remats = 0 tensors +2024-06-01T06:01:48Z INFO 2403803 (sg03) [SB_Allocator]: unpinned = 0 tensors +2024-06-01T06:01:48Z INFO 2403803 (sg03) [SB_Allocator]: size = 0 bytes/partition +2024-06-01T06:01:48Z INFO 2403803 (sg03) [SB_Allocator]: SB score = 0 +2024-06-01T06:01:48Z INFO 2403803 (sg00) [DMAOptimizationBase]: [psum spill optimization]: removed 1024 spill/reload instructions +2024-06-01T06:01:48Z INFO 2403803 (sg00) [DMAOptimizationBase]: [psum spill optimization]: removed 1024 spill/reload memory locations +2024-06-01T06:01:48Z USER 2403803 (sg00) [ModuleForkPass]: dma_optimization_psum finished after 3.440 seconds +2024-06-01T06:01:48Z INFO 2403803 (sg00) [ModuleForkPass]: curr_vmrss: 11814mb, ru_maxrss: 12920mb (delta=0mb) +2024-06-01T06:01:48Z INFO 2403803 (sg03) [SB_Allocator]: spilling from SB cost about 0 cycles +2024-06-01T06:01:48Z INFO 2403803 (sg03) [SB_Allocator]: 0 bytes/partition (0%) successfully pinned +2024-06-01T06:01:48Z INFO 2403803 (sg03) [SB_Allocator]: pinning saved approximately 0 cycles +2024-06-01T06:01:48Z INFO 2403803 (sg03) [SB_Allocator]: 0% SB utilization after allocation +2024-06-01T06:01:48Z INFO 2403803 (sg03) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 7412047876 +2024-06-01T06:01:48Z INFO 2403803 (sg03) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 3583 bytes +2024-06-01T06:01:48Z INFO 2403803 (sg03) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 300279840 +2024-06-01T06:01:48Z INFO 2403803 (sg03) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 3695 bytes +2024-06-01T06:01:49Z INFO 2403803 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 144833 memory location(s), 1 block(s), and 325685 instruction(s). 
Max writers: 5184 Max Readers: 93924 +2024-06-01T06:01:49Z USER 2403803 (sg00) [ModuleForkPass]: Running address_rotation_psum +2024-06-01T06:01:49Z INFO 2403803 (sg03) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 16384 +2024-06-01T06:01:49Z INFO 2403803 (sg03) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 128 bytes +2024-06-01T06:01:49Z USER 2403803 (sg03) [ModuleForkPass]: coloring_allocator_sb finished after 7.592 seconds +2024-06-01T06:01:49Z INFO 2403803 (sg03) [ModuleForkPass]: curr_vmrss: 11733mb, ru_maxrss: 12920mb (delta=0mb) +2024-06-01T06:01:49Z INFO 2403803 (sg00) [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=144833 blocks=1 instructions=325685 Max writers: 5184 Max Readers: 93924 +2024-06-01T06:01:49Z INFO 2403803 (sg03) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 66596 memory location(s), 1 block(s), and 353547 instruction(s). Max writers: 512 Max Readers: 80896 +2024-06-01T06:01:49Z USER 2403803 (sg03) [ModuleForkPass]: Running address_rotation_sb +2024-06-01T06:01:49Z INFO 2403803 (sg03) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=66596 blocks=1 instructions=353547 Max writers: 512 Max Readers: 80896 +2024-06-01T06:01:49Z INFO 2403803 (sg01) [EarlyPeepholeOpts]: Activation Accumulate: 13824 +2024-06-01T06:01:49Z USER 2403803 (sg01) [ModuleForkPass]: early_peephole_opts finished after 0.896 seconds +2024-06-01T06:01:49Z INFO 2403803 (sg01) [ModuleForkPass]: curr_vmrss: 11741mb, ru_maxrss: 12920mb (delta=0mb) +2024-06-01T06:01:49Z INFO 2403803 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 207596 memory location(s), 1 block(s), and 677185 instruction(s). 
Max writers: 576 Max Readers: 184036 +2024-06-01T06:01:49Z USER 2403803 (sg01) [ModuleForkPass]: Running pre_sched +2024-06-01T06:01:49Z INFO 2403803 (sg01) [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=207596 blocks=1 instructions=677185 Max writers: 576 Max Readers: 184036 +2024-06-01T06:01:49Z INFO 2403803 (sg01) [PreSched]: Start PRE scheduling 2 cores: 1 at: Sat Jun 1 06:01:49 2024 +2024-06-01T06:01:49Z INFO 2403803 [LayerSpiller]: LayerSpill: Start... +2024-06-01T06:01:49Z INFO 2403803 (sg02) [MMPacking]: INFO (MMPack) mlBPCG size 201710, CCS = 8001 +2024-06-01T06:01:50Z INFO 2403803 (sg03) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2024-06-01T06:01:50Z USER 2403803 (sg03) [ModuleForkPass]: address_rotation_sb finished after 1.084 seconds +2024-06-01T06:01:50Z INFO 2403803 (sg03) [ModuleForkPass]: curr_vmrss: 11770mb, ru_maxrss: 12920mb (delta=0mb) +2024-06-01T06:01:50Z INFO 2403803 (sg03) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 66596 memory location(s), 1 block(s), and 353547 instruction(s). Max writers: 512 Max Readers: 80896 +2024-06-01T06:01:50Z USER 2403803 (sg03) [ModuleForkPass]: Running dma_optimization_sb +2024-06-01T06:01:50Z INFO 2403803 (sg03) [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=66596 blocks=1 instructions=353547 Max writers: 512 Max Readers: 80896 +2024-06-01T06:01:50Z INFO 2403803 (sg03) [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 7712344100, 92.2163% input load, 0.00331977% output write, 7.78035% spill/reload [sg0003] +2024-06-01T06:01:50Z INFO 2403803 [LayerSpiller]: LayerSpill: Found 3 Splits CCs +2024-06-01T06:01:50Z INFO 2403803 [LayerSpiller]: Grouped CCs to 2 clusters. 
+2024-06-01T06:01:50Z INFO 2403803 (sg00) [DMAOptimizationBase]: PSUM Rotation rotated 13654 PSUM Banks +2024-06-01T06:01:51Z INFO 2403803 (sg03) [DMAOptimizationBase]: removed 0 identical load +2024-06-01T06:01:51Z INFO 2403803 (sg00) [DMAOptimizationBase]: PSUM Rotation rotated 1934 PSUM Banks +2024-06-01T06:01:52Z INFO 2403803 [LayerSpiller]: LayerSpill: To Spill 0 multi-layer tensors +2024-06-01T06:01:52Z INFO 2403803 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2024-06-01T06:01:52Z INFO 2403803 [LayerSpiller]: LayerSpill: Done. +2024-06-01T06:01:52Z INFO 2403803 (sg01) [PreSched]: Start split live ranges Sat Jun 1 06:01:49 2024 +2024-06-01T06:01:53Z INFO 2403803 (sg00) [DMAOptimizationBase]: PSUM Rotation rotated 4033 PSUM Banks +2024-06-01T06:01:53Z USER 2403803 (sg00) [ModuleForkPass]: address_rotation_psum finished after 4.334 seconds +2024-06-01T06:01:53Z INFO 2403803 (sg00) [ModuleForkPass]: curr_vmrss: 11823mb, ru_maxrss: 12920mb (delta=0mb) +2024-06-01T06:01:53Z INFO 2403803 (sg02) [MMPacking]: INFO (MMPack) agRGCG size 1376416 +2024-06-01T06:01:53Z INFO 2403803 (sg02) [MMPackingPass]: INFO (MMPacking) Time: 9.209 seconds +2024-06-01T06:01:53Z INFO 2403803 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 144833 memory location(s), 1 block(s), and 325685 instruction(s). 
Max writers: 5184 Max Readers: 93924 +2024-06-01T06:01:53Z USER 2403803 (sg00) [ModuleForkPass]: Running coloring_allocator_sb +2024-06-01T06:01:53Z INFO 2403803 (sg00) [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=144833 blocks=1 instructions=325685 Max writers: 5184 Max Readers: 93924 +2024-06-01T06:01:53Z INFO 2403803 (sg00) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 1662748872 +2024-06-01T06:01:53Z INFO 2403803 (sg00) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 1781 bytes +2024-06-01T06:01:53Z USER 2403803 (sg02) [ModuleForkPass]: mm_packing finished after 9.671 seconds +2024-06-01T06:01:53Z INFO 2403803 (sg02) [ModuleForkPass]: curr_vmrss: 11791mb, ru_maxrss: 12920mb (delta=0mb) +2024-06-01T06:01:53Z INFO 2403803 (sg00) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 800907024 +2024-06-01T06:01:53Z INFO 2403803 (sg00) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 871 bytes +2024-06-01T06:01:54Z INFO 2403803 (sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 16384 +2024-06-01T06:01:54Z INFO 2403803 (sg00) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 128 bytes +2024-06-01T06:01:54Z INFO 2403803 (sg00) [ColoringAllocator::Rep]: Allocating functions +2024-06-01T06:01:54Z INFO 2403803 (sg00) [ColoringAllocator::Rep]: linearize and check +2024-06-01T06:01:54Z INFO 2403803 (sg01) [PreSched]: Num_Splits: 0 +2024-06-01T06:01:54Z INFO 2403803 (sg01) [PreSched]: End split live ranges Sat Jun 1 06:01:54 2024 +2024-06-01T06:01:54Z INFO 2403803 (sg01) [PreSched]: Strt remove redundncies Sat Jun 1 06:01:54 2024 +2024-06-01T06:01:54Z INFO 2403803 (sg01) [PreSched]: remove_redundant_memsets +2024-06-01T06:01:54Z INFO 2403803 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 207468 memory location(s), 1 block(s), and 677057 instruction(s). 
Max writers: 576 Max Readers: 184036 +2024-06-01T06:01:54Z USER 2403803 (sg02) [ModuleForkPass]: Running coloring_allocator_psum +2024-06-01T06:01:54Z INFO 2403803 (sg02) [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=207468 blocks=1 instructions=677057 Max writers: 576 Max Readers: 184036 +2024-06-01T06:01:54Z INFO 2403803 (sg02) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 9089670928 +2024-06-01T06:01:54Z INFO 2403803 (sg02) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 2980 bytes +2024-06-01T06:01:54Z INFO 2403803 (sg02) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 1312604160 +2024-06-01T06:01:54Z INFO 2403803 (sg02) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 3303 bytes +2024-06-01T06:01:54Z INFO 2403803 (sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 32768 +2024-06-01T06:01:54Z INFO 2403803 (sg02) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 128 bytes +2024-06-01T06:01:54Z INFO 2403803 (sg02) [ColoringAllocator::Rep]: Allocating functions +2024-06-01T06:01:54Z INFO 2403803 (sg02) [ColoringAllocator::Rep]: linearize and check +2024-06-01T06:01:54Z INFO 2403803 (sg00) [SB_Allocator]: allocating SB +2024-06-01T06:01:54Z INFO 2403803 (sg00) [SB_Allocator]: main loop +2024-06-01T06:01:54Z INFO 2403803 (sg01) [PreSched]: remove_redundant_memsets: 0 +2024-06-01T06:01:54Z INFO 2403803 (sg01) [PreSched]: remove_redundant_loads +2024-06-01T06:01:55Z INFO 2403803 (sg00) [SB_Allocator]: renumber locations +2024-06-01T06:01:55Z INFO 2403803 (sg00) [SB_Allocator]: size = 98596 +2024-06-01T06:01:55Z INFO 2403803 (sg00) [SB_Allocator]: find partners +2024-06-01T06:01:56Z INFO 2403803 (sg00) [SB_Allocator]: found 42827 accumulation groups +2024-06-01T06:01:56Z INFO 2403803 (sg00) [SB_Allocator]: largest = _dot.414-t975_i83 +2024-06-01T06:01:56Z INFO 2403803 (sg00) [SB_Allocator]: tensors = 64 +2024-06-01T06:01:56Z INFO 2403803 (sg00) 
[SB_Allocator]: requires 98304 bytes/partition +2024-06-01T06:01:56Z INFO 2403803 (sg00) [SB_Allocator]: expanding partners +2024-06-01T06:01:56Z INFO 2403803 (sg01) [PreSched]: remove_redundant_loads: 0 +2024-06-01T06:01:56Z INFO 2403803 (sg01) [PreSched]: End remove redundncies Sat Jun 1 06:01:56 2024 +2024-06-01T06:01:56Z INFO 2403803 (sg01) [PreSched]: Start DCE Sat Jun 1 06:01:56 2024 +2024-06-01T06:01:56Z INFO 2403803 (sg00) [SB_Allocator]: find first defs +2024-06-01T06:01:56Z INFO 2403803 (sg02) [PSUM_Allocator]: allocating PSUM +2024-06-01T06:01:56Z INFO 2403803 (sg02) [PSUM_Allocator]: main loop +2024-06-01T06:01:57Z INFO 2403803 (sg03) [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2024-06-01T06:01:57Z INFO 2403803 (sg00) [SB_Allocator]: find loads +2024-06-01T06:01:57Z INFO 2403803 (sg00) [SB_Allocator]: 0 pin count +2024-06-01T06:01:57Z INFO 2403803 (sg00) [SB_Allocator]: 7211 remat count +2024-06-01T06:01:57Z INFO 2403803 (sg00) [SB_Allocator]: build interference graph +2024-06-01T06:01:57Z INFO 2403803 (sg00) [SB_Allocator]: pass 1 int-tree +2024-06-01T06:01:57Z INFO 2403803 (sg02) [PSUM_Allocator]: renumber locations +2024-06-01T06:01:57Z INFO 2403803 (sg02) [PSUM_Allocator]: size = 74858 +2024-06-01T06:01:57Z INFO 2403803 (sg03) [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2024-06-01T06:01:57Z INFO 2403803 (sg00) [SB_Allocator]: Num intervals 98596 Num locations 98596 +2024-06-01T06:01:57Z INFO 2403803 (sg00) [SB_Allocator]: IntervalTree Build Done +2024-06-01T06:01:57Z INFO 2403803 (sg00) [SB_Allocator]: info.neighbors init Done +2024-06-01T06:01:57Z INFO 2403803 (sg03) [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 0, 0% out of total dma traffic(7.11204e+09) +2024-06-01T06:01:57Z INFO 2403803 (sg00) [SB_Allocator]: info.neighbors partners Done +2024-06-01T06:01:58Z INFO 2403803 (sg03) [DMAOptimizationBase]: [DMA 
optimization]Reload_just_for_save Optimization removed 0 memlocs +2024-06-01T06:01:58Z INFO 2403803 (sg02) [PSUM_Allocator]: build_no_bitmap start +2024-06-01T06:01:58Z INFO 2403803 (sg00) [SB_Allocator]: IntervalTree readback Done +2024-06-01T06:01:58Z INFO 2403803 (sg00) [SB_Allocator]: edge: 58030361 +2024-06-01T06:01:58Z INFO 2403803 (sg00) [SB_Allocator]: mean: 1177.13 +2024-06-01T06:01:58Z INFO 2403803 (sg00) [SB_Allocator]: median: 674.385 +2024-06-01T06:01:58Z INFO 2403803 (sg00) [SB_Allocator]: find costs +2024-06-01T06:01:58Z INFO 2403803 (sg01) [PreSched]: End DCE Sat Jun 1 06:01:58 2024 +2024-06-01T06:01:58Z INFO 2403803 (sg02) [PSUM_Allocator]: 600% PSUM demand before spilling +2024-06-01T06:01:58Z INFO 2403803 (sg02) [PSUM_Allocator]: PSUM high-water mark = 48 tensors +2024-06-01T06:01:58Z INFO 2403803 (sg02) [PSUM_Allocator]: found 1403796 edges +2024-06-01T06:01:58Z INFO 2403803 (sg02) [PSUM_Allocator]: mean: 37.5056 +2024-06-01T06:01:58Z INFO 2403803 (sg02) [PSUM_Allocator]: median: 25.4724 +2024-06-01T06:01:58Z INFO 2403803 (sg02) [PSUM_Allocator]: adjacency vectors require 11230368 bytes +2024-06-01T06:01:58Z INFO 2403803 (sg02) [PSUM_Allocator]: build_no_bitmap done +2024-06-01T06:01:58Z INFO 2403803 (sg02) [PSUM_Allocator]: find costs +2024-06-01T06:01:58Z INFO 2403803 (sg00) [SB_Allocator]: best-of-n loop, heuristic = 0 +2024-06-01T06:01:58Z INFO 2403803 (sg00) [SB_Allocator]: simplify interference graph +2024-06-01T06:01:58Z INFO 2403803 (sg00) [SB_Allocator]: initialize safe & unsafe +2024-06-01T06:01:58Z INFO 2403803 (sg03) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2024-06-01T06:01:58Z INFO 2403803 (sg03) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2024-06-01T06:01:58Z INFO 2403803 (sg03) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2024-06-01T06:01:59Z INFO 2403803 (sg01) 
[PreSched]: Start build flow dependencies Sat Jun 1 06:01:59 2024 +2024-06-01T06:01:59Z INFO 2403803 (sg01) [build_flow_deps]: Start build fdeps. Invocation: 4Sat Jun 1 06:01:59 2024 +2024-06-01T06:01:59Z INFO 2403803 (sg00) [SB_Allocator]: safe = 3063 +2024-06-01T06:01:59Z INFO 2403803 (sg00) [SB_Allocator]: unsafe = 74356 +2024-06-01T06:01:59Z INFO 2403803 (sg00) [SB_Allocator]: inf = 21177 +2024-06-01T06:01:59Z INFO 2403803 (sg00) [SB_Allocator]: total = 98596 +2024-06-01T06:01:59Z INFO 2403803 (sg00) [SB_Allocator]: simplify +2024-06-01T06:01:59Z INFO 2403803 (sg00) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 74305 #Pinned 0 #Safe 0 minCost 4.10363e-05 maxCost 0.0540829 locations 98596 +2024-06-01T06:01:59Z INFO 2403803 (sg01) [build_flow_deps]: Allocs: 207468 instructions: 677057 +2024-06-01T06:02:00Z INFO 2403803 (sg03) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2024-06-01T06:02:00Z INFO 2403803 (sg03) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2024-06-01T06:02:00Z INFO 2403803 (sg03) [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2024-06-01T06:02:00Z INFO 2403803 (sg03) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2024-06-01T06:02:00Z INFO 2403803 (sg02) [PSUM_Allocator]: best-of-n loop, heuristic = 0 +2024-06-01T06:02:00Z INFO 2403803 (sg02) [PSUM_Allocator]: simplify interference graph +2024-06-01T06:02:00Z INFO 2403803 (sg02) [PSUM_Allocator]: initialize low and high +2024-06-01T06:02:00Z INFO 2403803 (sg02) [PSUM_Allocator]: lo = 39514 +2024-06-01T06:02:00Z INFO 2403803 (sg02) [PSUM_Allocator]: hi = 22112 +2024-06-01T06:02:00Z INFO 2403803 (sg02) [PSUM_Allocator]: inf = 13232 +2024-06-01T06:02:00Z INFO 2403803 (sg02) [PSUM_Allocator]: total = 74858 +2024-06-01T06:02:00Z INFO 2403803 (sg02) [PSUM_Allocator]: simplify +2024-06-01T06:02:00Z INFO 
2403803 (sg03) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2024-06-01T06:02:00Z INFO 2403803 (sg03) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2024-06-01T06:02:00Z INFO 2403803 (sg03) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2024-06-01T06:02:01Z INFO 2403803 (sg02) [PSUM_Allocator]: new candidates = 640 +2024-06-01T06:02:01Z INFO 2403803 (sg02) [PSUM_Allocator]: select ranges +2024-06-01T06:02:01Z INFO 2403803 (sg03) [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2024-06-01T06:02:01Z INFO 2403803 (sg03) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload instructions +2024-06-01T06:02:01Z INFO 2403803 (sg03) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload memory locations +2024-06-01T06:02:02Z INFO 2403803 (sg02) [PSUM_Allocator]: PSUM spills = 640 tensors +2024-06-01T06:02:02Z INFO 2403803 (sg02) [PSUM_Allocator]: PSUM score = 2.68493e+06 (lower is better) +2024-06-01T06:02:02Z INFO 2403803 (sg02) [PSUM_Allocator]: best PSUM heuristic = 0 +2024-06-01T06:02:02Z INFO 2403803 (sg02) [PSUM_Allocator]: collect spills +2024-06-01T06:02:02Z INFO 2403803 (sg03) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 0 SpillSaves and Reloads +2024-06-01T06:02:02Z INFO 2403803 (sg03) [DMAOptimizationBase]: average loaded DMA size 3583 bytes +2024-06-01T06:02:02Z INFO 2403803 (sg03) [DMAOptimizationBase]: average saved DMA size 3695 bytes +2024-06-01T06:02:03Z INFO 2403803 (sg03) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 7412047876 +2024-06-01T06:02:03Z INFO 2403803 (sg03) [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 3583 bytes +2024-06-01T06:02:03Z INFO 2403803 (sg01) [build_flow_deps]: Build fdeps inserted 1949358 edges +2024-06-01T06:02:03Z INFO 2403803 (sg01) [build_flow_deps]: 
Done build fdeps 1949358 Sat Jun 1 06:02:03 2024 +2024-06-01T06:02:03Z INFO 2403803 (sg01) [PreSched]: End build flow dependencies Sat Jun 1 06:02:03 2024 +2024-06-01T06:02:03Z INFO 2403803 (sg01) [PreSched]: Start remove useless insts Sat Jun 1 06:02:03 2024 +2024-06-01T06:02:03Z INFO 2403803 (sg01) [PreSched]: remove_useless_insts +2024-06-01T06:02:03Z INFO 2403803 (sg03) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 300279840 +2024-06-01T06:02:03Z INFO 2403803 (sg03) [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 3695 bytes +2024-06-01T06:02:04Z INFO 2403803 (sg01) [PreSched]: remove Useless Instructions: 0 +2024-06-01T06:02:04Z INFO 2403803 (sg01) [PreSched]: End remove useless insts Sat Jun 1 06:02:04 2024 +2024-06-01T06:02:04Z INFO 2403803 (sg03) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2024-06-01T06:02:05Z INFO 2403803 (sg03) [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2024-06-01T06:02:05Z INFO 2403803 (sg03) [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 0, 0% out of total dma traffic +2024-06-01T06:02:05Z INFO 2403803 (sg03) [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 7712344100, 92.2163% input load, 0.00331977% output write, 7.78035% spill/reload [sg0003] +2024-06-01T06:02:05Z INFO 2403803 (sg01) [PreSched]: DONE PRE scheduling Sat Jun 1 06:02:05 2024 +2024-06-01T06:02:05Z USER 2403803 (sg01) [ModuleForkPass]: pre_sched finished after 15.415 seconds +2024-06-01T06:02:05Z INFO 2403803 (sg01) [ModuleForkPass]: curr_vmrss: 12688mb, ru_maxrss: 12920mb (delta=0mb) +2024-06-01T06:02:05Z INFO 2403803 (sg03) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 7412047876 +2024-06-01T06:02:05Z INFO 2403803 (sg03) [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 3583 bytes +2024-06-01T06:02:05Z INFO 2403803 (sg02) 
[PSUM_Allocator]: insert spills +2024-06-01T06:02:05Z INFO 2403803 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 207468 memory location(s), 1 block(s), and 677057 instruction(s). Max writers: 576 Max Readers: 184036 +2024-06-01T06:02:05Z USER 2403803 (sg01) [ModuleForkPass]: Running tensor_copy_elim +2024-06-01T06:02:05Z INFO 2403803 (sg01) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=207468 blocks=1 instructions=677057 Max writers: 576 Max Readers: 184036 +2024-06-01T06:02:05Z INFO 2403803 (sg03) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 300279840 +2024-06-01T06:02:05Z INFO 2403803 (sg03) [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 3695 bytes +2024-06-01T06:02:05Z INFO 2403803 (sg01) [TensorCopyElim]: Tensor CP elimination: 0 +2024-06-01T06:02:05Z INFO 2403803 (sg03) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 16384 +2024-06-01T06:02:05Z INFO 2403803 (sg03) [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 128 bytes +2024-06-01T06:02:05Z INFO 2403803 (sg03) [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 3587 bytes +2024-06-01T06:02:05Z INFO 2403803 (sg03) [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2024-06-01T06:02:06Z INFO 2403803 (sg03) [DMAOptimizationBase]: DMA optimization re-enable optimization +2024-06-01T06:02:06Z USER 2403803 (sg03) [ModuleForkPass]: dma_optimization_sb finished after 15.852 seconds +2024-06-01T06:02:06Z INFO 2403803 (sg03) [ModuleForkPass]: curr_vmrss: 12584mb, ru_maxrss: 12920mb (delta=0mb) +2024-06-01T06:02:06Z USER 2403803 (sg01) [ModuleForkPass]: tensor_copy_elim finished after 0.776 seconds +2024-06-01T06:02:06Z INFO 2403803 (sg01) [ModuleForkPass]: curr_vmrss: 12555mb, ru_maxrss: 12920mb (delta=0mb) +2024-06-01T06:02:06Z INFO 2403803 (sg03) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 66596 memory location(s), 1 
block(s), and 353547 instruction(s). Max writers: 512 Max Readers: 80896 +2024-06-01T06:02:06Z USER 2403803 (sg03) [ModuleForkPass]: Running address_rotation_sb +2024-06-01T06:02:06Z INFO 2403803 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 207468 memory location(s), 1 block(s), and 677057 instruction(s). Max writers: 576 Max Readers: 184036 +2024-06-01T06:02:06Z USER 2403803 (sg01) [ModuleForkPass]: Running mm_packing +2024-06-01T06:02:06Z INFO 2403803 (sg01) [ModuleForkPass]: Inputs to mm_packing: modules=1 functions=1 allocs=207468 blocks=1 instructions=677057 Max writers: 576 Max Readers: 184036 +2024-06-01T06:02:06Z INFO 2403803 (sg01) [MMPacking]: INFO (MMPack) Running the preprocessing step. +2024-06-01T06:02:06Z INFO 2403803 (sg03) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=66596 blocks=1 instructions=353547 Max writers: 512 Max Readers: 80896 +2024-06-01T06:02:06Z INFO 2403803 (sg02) [PSUM_Allocator]: main loop +2024-06-01T06:02:07Z INFO 2403803 (sg02) [PSUM_Allocator]: renumber locations +2024-06-01T06:02:07Z INFO 2403803 (sg02) [PSUM_Allocator]: size = 77930 +2024-06-01T06:02:08Z INFO 2403803 (sg02) [PSUM_Allocator]: build_no_bitmap start +2024-06-01T06:02:08Z INFO 2403803 (sg00) [SB_Allocator]: new candidates = 4313 +2024-06-01T06:02:08Z INFO 2403803 (sg00) [SB_Allocator]: select ranges +2024-06-01T06:02:08Z INFO 2403803 (sg02) [PSUM_Allocator]: PSUM high-water mark = 8 tensors +2024-06-01T06:02:08Z INFO 2403803 (sg02) [PSUM_Allocator]: found 198036 edges +2024-06-01T06:02:08Z INFO 2403803 (sg02) [PSUM_Allocator]: mean: 5.08241 +2024-06-01T06:02:08Z INFO 2403803 (sg02) [PSUM_Allocator]: median: 6.33887 +2024-06-01T06:02:08Z INFO 2403803 (sg02) [PSUM_Allocator]: adjacency vectors require 1584288 bytes +2024-06-01T06:02:08Z INFO 2403803 (sg02) [PSUM_Allocator]: build_no_bitmap done +2024-06-01T06:02:08Z INFO 2403803 (sg02) [PSUM_Allocator]: find costs +2024-06-01T06:02:09Z INFO 2403803 (sg03) 
[DMAOptimizationBase]: SB Rotation rotated 317 Sb address +2024-06-01T06:02:09Z INFO 2403803 (sg01) [MMPacking]: INFO (MMPack) mlBPCG size 201710, CCS = 8001 +2024-06-01T06:02:10Z INFO 2403803 (sg02) [PSUM_Allocator]: best-of-n loop, heuristic = 0 +2024-06-01T06:02:10Z INFO 2403803 (sg02) [PSUM_Allocator]: simplify interference graph +2024-06-01T06:02:10Z INFO 2403803 (sg02) [PSUM_Allocator]: initialize low and high +2024-06-01T06:02:10Z INFO 2403803 (sg02) [PSUM_Allocator]: lo = 77418 +2024-06-01T06:02:10Z INFO 2403803 (sg02) [PSUM_Allocator]: hi = 512 +2024-06-01T06:02:10Z INFO 2403803 (sg02) [PSUM_Allocator]: inf = 0 +2024-06-01T06:02:10Z INFO 2403803 (sg02) [PSUM_Allocator]: total = 77930 +2024-06-01T06:02:10Z INFO 2403803 (sg02) [PSUM_Allocator]: simplify +2024-06-01T06:02:10Z INFO 2403803 (sg02) [PSUM_Allocator]: new candidates = 0 +2024-06-01T06:02:10Z INFO 2403803 (sg02) [PSUM_Allocator]: select ranges +2024-06-01T06:02:11Z INFO 2403803 (sg02) [PSUM_Allocator]: no more spills +2024-06-01T06:02:11Z INFO 2403803 (sg02) [PSUM_Allocator]: PSUM score = 0 (lower is better) +2024-06-01T06:02:11Z INFO 2403803 (sg02) [PSUM_Allocator]: spilling from PSUM cost about 2.68493e+06 cycles +2024-06-01T06:02:11Z INFO 2403803 (sg02) [PSUM_Allocator]: number of tensors spilled from PSUM = 640 +2024-06-01T06:02:11Z INFO 2403803 (sg02) [PSUM_Allocator]: 100% PSUM utilization after allocation +2024-06-01T06:02:11Z INFO 2403803 (sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 9089670928 +2024-06-01T06:02:11Z INFO 2403803 (sg02) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 2980 bytes +2024-06-01T06:02:11Z INFO 2403803 (sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 1312604160 +2024-06-01T06:02:11Z INFO 2403803 (sg02) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 3303 bytes +2024-06-01T06:02:11Z INFO 2403803 (sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 32768 +2024-06-01T06:02:11Z INFO 
2403803 (sg02) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 128 bytes +2024-06-01T06:02:11Z USER 2403803 (sg02) [ModuleForkPass]: coloring_allocator_psum finished after 17.507 seconds +2024-06-01T06:02:11Z INFO 2403803 (sg02) [ModuleForkPass]: curr_vmrss: 12783mb, ru_maxrss: 12920mb (delta=0mb) +2024-06-01T06:02:11Z INFO 2403803 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 213612 memory location(s), 1 block(s), and 683201 instruction(s). Max writers: 576 Max Readers: 184036 +2024-06-01T06:02:11Z USER 2403803 (sg02) [ModuleForkPass]: Running dma_optimization_psum +2024-06-01T06:02:11Z INFO 2403803 (sg02) [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=213612 blocks=1 instructions=683201 Max writers: 576 Max Readers: 184036 +2024-06-01T06:02:12Z INFO 2403803 (sg03) [DMAOptimizationBase]: SB Rotation rotated 3700 Sb address +2024-06-01T06:02:12Z INFO 2403803 (sg01) [MMPacking]: INFO (MMPack) agRGCG size 1376416 +2024-06-01T06:02:12Z INFO 2403803 (sg01) [MMPackingPass]: INFO (MMPacking) Time: 6.365 seconds +2024-06-01T06:02:13Z USER 2403803 (sg01) [ModuleForkPass]: mm_packing finished after 6.720 seconds +2024-06-01T06:02:13Z INFO 2403803 (sg01) [ModuleForkPass]: curr_vmrss: 12744mb, ru_maxrss: 12920mb (delta=0mb) +2024-06-01T06:02:13Z INFO 2403803 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 207468 memory location(s), 1 block(s), and 677057 instruction(s). 
Max writers: 576 Max Readers: 184036 +2024-06-01T06:02:13Z USER 2403803 (sg01) [ModuleForkPass]: Running coloring_allocator_psum +2024-06-01T06:02:13Z INFO 2403803 (sg01) [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=207468 blocks=1 instructions=677057 Max writers: 576 Max Readers: 184036 +2024-06-01T06:02:13Z INFO 2403803 (sg01) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 9089670928 +2024-06-01T06:02:13Z INFO 2403803 (sg01) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 2980 bytes +2024-06-01T06:02:13Z INFO 2403803 (sg01) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 1312604160 +2024-06-01T06:02:13Z INFO 2403803 (sg01) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 3303 bytes +2024-06-01T06:02:13Z INFO 2403803 (sg03) [DMAOptimizationBase]: SB Rotation rotated 629 Sb address +2024-06-01T06:02:13Z INFO 2403803 (sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 32768 +2024-06-01T06:02:13Z INFO 2403803 (sg01) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 128 bytes +2024-06-01T06:02:13Z INFO 2403803 (sg01) [ColoringAllocator::Rep]: Allocating functions +2024-06-01T06:02:13Z INFO 2403803 (sg01) [ColoringAllocator::Rep]: linearize and check +2024-06-01T06:02:14Z INFO 2403803 (sg03) [DMAOptimizationBase]: SB Rotation rotated 1 Sb address +2024-06-01T06:02:14Z INFO 2403803 (sg02) [DMAOptimizationBase]: [psum spill optimization]: removed 1024 spill/reload instructions +2024-06-01T06:02:14Z INFO 2403803 (sg02) [DMAOptimizationBase]: [psum spill optimization]: removed 1024 spill/reload memory locations +2024-06-01T06:02:14Z USER 2403803 (sg02) [ModuleForkPass]: dma_optimization_psum finished after 2.763 seconds +2024-06-01T06:02:14Z INFO 2403803 (sg02) [ModuleForkPass]: curr_vmrss: 12498mb, ru_maxrss: 12920mb (delta=0mb) +2024-06-01T06:02:14Z INFO 2403803 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 212588 memory location(s), 
1 block(s), and 682177 instruction(s). Max writers: 576 Max Readers: 184036 +2024-06-01T06:02:14Z USER 2403803 (sg02) [ModuleForkPass]: Running address_rotation_psum +2024-06-01T06:02:14Z INFO 2403803 (sg02) [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=212588 blocks=1 instructions=682177 Max writers: 576 Max Readers: 184036 +2024-06-01T06:02:14Z INFO 2403803 (sg01) [PSUM_Allocator]: allocating PSUM +2024-06-01T06:02:14Z INFO 2403803 (sg01) [PSUM_Allocator]: main loop +2024-06-01T06:02:15Z INFO 2403803 (sg01) [PSUM_Allocator]: renumber locations +2024-06-01T06:02:15Z INFO 2403803 (sg01) [PSUM_Allocator]: size = 74858 +2024-06-01T06:02:16Z INFO 2403803 (sg01) [PSUM_Allocator]: build_no_bitmap start +2024-06-01T06:02:16Z INFO 2403803 (sg03) [DMAOptimizationBase]: SB Rotation rotated 784 Sb address +2024-06-01T06:02:16Z INFO 2403803 (sg01) [PSUM_Allocator]: 600% PSUM demand before spilling +2024-06-01T06:02:16Z INFO 2403803 (sg01) [PSUM_Allocator]: PSUM high-water mark = 48 tensors +2024-06-01T06:02:16Z INFO 2403803 (sg01) [PSUM_Allocator]: found 1403796 edges +2024-06-01T06:02:16Z INFO 2403803 (sg01) [PSUM_Allocator]: mean: 37.5056 +2024-06-01T06:02:16Z INFO 2403803 (sg01) [PSUM_Allocator]: median: 25.4724 +2024-06-01T06:02:16Z INFO 2403803 (sg01) [PSUM_Allocator]: adjacency vectors require 11230368 bytes +2024-06-01T06:02:16Z INFO 2403803 (sg01) [PSUM_Allocator]: build_no_bitmap done +2024-06-01T06:02:16Z INFO 2403803 (sg01) [PSUM_Allocator]: find costs +2024-06-01T06:02:17Z INFO 2403803 (sg02) [DMAOptimizationBase]: PSUM Rotation rotated 13726 PSUM Banks +2024-06-01T06:02:17Z INFO 2403803 (sg03) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2024-06-01T06:02:17Z USER 2403803 (sg03) [ModuleForkPass]: address_rotation_sb finished after 11.255 seconds +2024-06-01T06:02:17Z INFO 2403803 (sg03) [ModuleForkPass]: curr_vmrss: 12588mb, ru_maxrss: 12920mb (delta=0mb) +2024-06-01T06:02:17Z INFO 2403803 (sg03) [ModuleForkPass]: 
Output has 1 module(s), 1 function(s), 66596 memory location(s), 1 block(s), and 353547 instruction(s). Max writers: 512 Max Readers: 80896 +2024-06-01T06:02:17Z USER 2403803 (sg03) [ModuleForkPass]: Running coloring_allocator_dram +2024-06-01T06:02:17Z INFO 2403803 (sg03) [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=66596 blocks=1 instructions=353547 Max writers: 512 Max Readers: 80896 +2024-06-01T06:02:17Z INFO 2403803 (sg03) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 7412047876 +2024-06-01T06:02:17Z INFO 2403803 (sg03) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 3583 bytes +2024-06-01T06:02:17Z INFO 2403803 (sg03) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 300279840 +2024-06-01T06:02:17Z INFO 2403803 (sg03) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 3695 bytes +2024-06-01T06:02:18Z INFO 2403803 (sg03) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 16384 +2024-06-01T06:02:18Z INFO 2403803 (sg03) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 128 bytes +2024-06-01T06:02:18Z INFO 2403803 (sg03) [ColoringAllocator::Rep]: Allocating functions +2024-06-01T06:02:18Z INFO 2403803 (sg03) [ColoringAllocator::Rep]: linearize and check +2024-06-01T06:02:18Z INFO 2403803 (sg01) [PSUM_Allocator]: best-of-n loop, heuristic = 0 +2024-06-01T06:02:18Z INFO 2403803 (sg01) [PSUM_Allocator]: simplify interference graph +2024-06-01T06:02:18Z INFO 2403803 (sg01) [PSUM_Allocator]: initialize low and high +2024-06-01T06:02:18Z INFO 2403803 (sg01) [PSUM_Allocator]: lo = 39514 +2024-06-01T06:02:18Z INFO 2403803 (sg01) [PSUM_Allocator]: hi = 22112 +2024-06-01T06:02:18Z INFO 2403803 (sg01) [PSUM_Allocator]: inf = 13232 +2024-06-01T06:02:18Z INFO 2403803 (sg01) [PSUM_Allocator]: total = 74858 +2024-06-01T06:02:18Z INFO 2403803 (sg01) [PSUM_Allocator]: simplify +2024-06-01T06:02:18Z INFO 2403803 (sg01) [PSUM_Allocator]: new candidates = 640 
+2024-06-01T06:02:18Z INFO 2403803 (sg01) [PSUM_Allocator]: select ranges +2024-06-01T06:02:18Z INFO 2403803 (sg03) [DRAM_Allocator]: allocating spills in DRAM pre_link mode +2024-06-01T06:02:18Z INFO 2403803 (sg03) [DRAM_Allocator]: reserved space = 753993764 bytes +2024-06-01T06:02:18Z INFO 2403803 (sg03) [DRAM_Allocator]: spill space = 1200095232 bytes +2024-06-01T06:02:18Z INFO 2403803 (sg03) [DRAM_Allocator]: aligned spill space = 1200095232 bytes +2024-06-01T06:02:18Z INFO 2403803 (sg03) [DRAM_Allocator]: dram space = 107374182400 bytes +2024-06-01T06:02:18Z INFO 2403803 (sg03) [DRAM_Allocator]: renumber locations +2024-06-01T06:02:18Z INFO 2403803 (sg03) [DRAM_Allocator]: size = 6 +2024-06-01T06:02:18Z INFO 2403803 (sg03) [DRAM_Allocator]: find first defs +2024-06-01T06:02:19Z INFO 2403803 (sg02) [DMAOptimizationBase]: PSUM Rotation rotated 1994 PSUM Banks +2024-06-01T06:02:19Z INFO 2403803 (sg03) [DRAM_Allocator]: Num intervals 6 Num locations 6 +2024-06-01T06:02:19Z INFO 2403803 (sg03) [DRAM_Allocator]: IntervalTree Build Done +2024-06-01T06:02:19Z INFO 2403803 (sg03) [DRAM_Allocator]: info.neighbors init Done +2024-06-01T06:02:19Z INFO 2403803 (sg03) [DRAM_Allocator]: IntervalTree readback Done +2024-06-01T06:02:19Z INFO 2403803 (sg03) [DRAM_Allocator]: simplify interference graph +2024-06-01T06:02:19Z INFO 2403803 (sg03) [DRAM_Allocator]: initialize low and high +2024-06-01T06:02:19Z INFO 2403803 (sg03) [DRAM_Allocator]: lo = 6 +2024-06-01T06:02:19Z INFO 2403803 (sg03) [DRAM_Allocator]: hi = 0 +2024-06-01T06:02:19Z INFO 2403803 (sg03) [DRAM_Allocator]: total = 6 +2024-06-01T06:02:19Z INFO 2403803 (sg03) [DRAM_Allocator]: simplify +2024-06-01T06:02:19Z INFO 2403803 (sg03) [DRAM_Allocator]: new candidates = 0 +2024-06-01T06:02:19Z INFO 2403803 (sg03) [DRAM_Allocator]: select ranges +2024-06-01T06:02:19Z INFO 2403803 (sg03) [DRAM_Allocator]: CC buffer size limit 524288000 +2024-06-01T06:02:19Z INFO 2403803 (sg01) [PSUM_Allocator]: PSUM spills = 640 tensors 
+2024-06-01T06:02:19Z INFO 2403803 (sg01) [PSUM_Allocator]: PSUM score = 2.68493e+06 (lower is better) +2024-06-01T06:02:19Z INFO 2403803 (sg01) [PSUM_Allocator]: best PSUM heuristic = 0 +2024-06-01T06:02:19Z INFO 2403803 (sg01) [PSUM_Allocator]: collect spills +2024-06-01T06:02:19Z INFO 2403803 (sg03) [DRAM_Allocator]: allreduce_dram_hwm 450035712 +2024-06-01T06:02:19Z INFO 2403803 (sg03) [DRAM_Allocator]: Real CC buffer size 450035712 +2024-06-01T06:02:19Z INFO 2403803 (sg03) [DRAM_Allocator]: DRAM hwm after allocation: 1373765632 +2024-06-01T06:02:19Z INFO 2403803 (sg03) [DRAM_Allocator]: DRAM allocation successful +2024-06-01T06:02:19Z INFO 2403803 (sg03) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 7412047876 +2024-06-01T06:02:19Z INFO 2403803 (sg03) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 3583 bytes +2024-06-01T06:02:19Z INFO 2403803 (sg03) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 300279840 +2024-06-01T06:02:19Z INFO 2403803 (sg03) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 3695 bytes +2024-06-01T06:02:19Z INFO 2403803 (sg03) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 16384 +2024-06-01T06:02:19Z INFO 2403803 (sg03) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 128 bytes +2024-06-01T06:02:19Z USER 2403803 (sg03) [ModuleForkPass]: coloring_allocator_dram finished after 1.788 seconds +2024-06-01T06:02:19Z INFO 2403803 (sg03) [ModuleForkPass]: curr_vmrss: 12601mb, ru_maxrss: 12920mb (delta=0mb) +2024-06-01T06:02:19Z INFO 2403803 (sg03) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 66596 memory location(s), 1 block(s), and 353547 instruction(s). 
Max writers: 512 Max Readers: 80896 +2024-06-01T06:02:19Z USER 2403803 (sg03) [ModuleForkPass]: Running address_rotation_dram +2024-06-01T06:02:19Z INFO 2403803 (sg03) [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=66596 blocks=1 instructions=353547 Max writers: 512 Max Readers: 80896 +2024-06-01T06:02:19Z INFO 2403803 (sg03) [DMAOptimizationBase]: Runtime page size at 512MB +2024-06-01T06:02:20Z INFO 2403803 (sg03) [DMAOptimizationBase]: DRAM hwm before rotation 1373765632 +2024-06-01T06:02:20Z INFO 2403803 (sg03) [DMAOptimizationBase]: allreduce buffer size 524288000 +2024-06-01T06:02:20Z INFO 2403803 (sg03) [DMAOptimizationBase]: allreduce hwm 450035712 +2024-06-01T06:02:20Z INFO 2403803 (sg03) [DMAOptimizationBase]: Real CC buffer size 450035712 +2024-06-01T06:02:20Z INFO 2403803 (sg03) [DMAOptimizationBase]: DRAM hwm after rotation 1373765632 +2024-06-01T06:02:20Z INFO 2403803 (sg03) [DMAOptimizationBase]: DRAM Rotation rotated 0 Dram address +2024-06-01T06:02:20Z USER 2403803 (sg03) [ModuleForkPass]: address_rotation_dram finished after 0.539 seconds +2024-06-01T06:02:20Z INFO 2403803 (sg03) [ModuleForkPass]: curr_vmrss: 12560mb, ru_maxrss: 12920mb (delta=0mb) +2024-06-01T06:02:20Z INFO 2403803 (sg03) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 66596 memory location(s), 1 block(s), and 353547 instruction(s). 
Max writers: 512 Max Readers: 80896 +2024-06-01T06:02:20Z USER 2403803 (sg03) [ModuleForkPass]: Running tensorcopy_accel +2024-06-01T06:02:20Z INFO 2403803 (sg03) [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=66596 blocks=1 instructions=353547 Max writers: 512 Max Readers: 80896 +2024-06-01T06:02:20Z INFO 2403803 (sg03) [TensorCopyAccel::Impl]: Running peephole optimization pass +2024-06-01T06:02:20Z INFO 2403803 (sg03) [TensorCopyAccel::Impl]: Accelerated 0 out of 27354 tensorcopy in Function: sg0003 average acceleration factor: -nan +2024-06-01T06:02:20Z USER 2403803 (sg03) [ModuleForkPass]: tensorcopy_accel finished after 0.039 seconds +2024-06-01T06:02:20Z INFO 2403803 (sg03) [ModuleForkPass]: curr_vmrss: 12566mb, ru_maxrss: 12920mb (delta=0mb) +2024-06-01T06:02:20Z INFO 2403803 (sg03) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 66596 memory location(s), 1 block(s), and 353547 instruction(s). Max writers: 512 Max Readers: 80896 +2024-06-01T06:02:20Z USER 2403803 (sg03) [ModuleForkPass]: Running peephole_opts +2024-06-01T06:02:20Z INFO 2403803 (sg03) [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=66596 blocks=1 instructions=353547 Max writers: 512 Max Readers: 80896 +2024-06-01T06:02:20Z INFO 2403803 (sg03) [PeepholeOpts]: PeepholeOpts enabled? 
Recip: true Tsp: true Tc: true SplitSelect: true +2024-06-01T06:02:20Z INFO 2403803 (sg03) [PeepholeOpts]: Split Select: 0 +2024-06-01T06:02:20Z INFO 2403803 (sg03) [PeepholeOpts]: TSP -> ACT: 2592 +2024-06-01T06:02:20Z INFO 2403803 (sg03) [PeepholeOpts]: COPY -> ACT: 0 +2024-06-01T06:02:20Z INFO 2403803 (sg03) [PeepholeOpts]: RECIPROCAL -> ACT: 0 +2024-06-01T06:02:20Z USER 2403803 (sg03) [ModuleForkPass]: peephole_opts finished after 0.473 seconds +2024-06-01T06:02:20Z INFO 2403803 (sg03) [ModuleForkPass]: curr_vmrss: 12600mb, ru_maxrss: 12920mb (delta=0mb) +2024-06-01T06:02:20Z INFO 2403803 (sg03) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 66596 memory location(s), 1 block(s), and 353547 instruction(s). Max writers: 512 Max Readers: 80896 +2024-06-01T06:02:20Z USER 2403803 (sg03) [ModuleForkPass]: Running lower_kernel +2024-06-01T06:02:20Z INFO 2403803 (sg03) [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=66596 blocks=1 instructions=353547 Max writers: 512 Max Readers: 80896 +2024-06-01T06:02:20Z INFO 2403803 (sg03) [LowerKernel]: Started running LowerKernel +2024-06-01T06:02:20Z INFO 2403803 (sg03) [LowerKernel]: Start of kernel lowering pass, number of insts: 353547, number of allocs: 66596 +2024-06-01T06:02:20Z INFO 2403803 (sg03) [LowerKernel]: Scan BKs time (s): 2.40967 +2024-06-01T06:02:20Z INFO 2403803 (sg03) [LowerKernel]: Lower BKs time (s): 0.001842 +2024-06-01T06:02:20Z USER 2403803 (sg03) [ModuleForkPass]: lower_kernel finished after 0.065 seconds +2024-06-01T06:02:20Z INFO 2403803 (sg03) [ModuleForkPass]: curr_vmrss: 12593mb, ru_maxrss: 12920mb (delta=0mb) +2024-06-01T06:02:20Z INFO 2403803 (sg03) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 66596 memory location(s), 1 block(s), and 353547 instruction(s). 
Max writers: 512 Max Readers: 80896 +2024-06-01T06:02:20Z USER 2403803 (sg03) [ModuleForkPass]: Running build_fdeps +2024-06-01T06:02:20Z INFO 2403803 (sg03) [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=66596 blocks=1 instructions=353547 Max writers: 512 Max Readers: 80896 +2024-06-01T06:02:20Z INFO 2403803 (sg03) [build_flow_deps]: Start build fdeps. Invocation: 5Sat Jun 1 06:02:20 2024 +2024-06-01T06:02:21Z INFO 2403803 (sg02) [DMAOptimizationBase]: PSUM Rotation rotated 4114 PSUM Banks +2024-06-01T06:02:21Z USER 2403803 (sg02) [ModuleForkPass]: address_rotation_psum finished after 6.213 seconds +2024-06-01T06:02:21Z INFO 2403803 (sg02) [ModuleForkPass]: curr_vmrss: 12574mb, ru_maxrss: 12920mb (delta=0mb) +2024-06-01T06:02:21Z INFO 2403803 (sg03) [build_flow_deps]: Allocs: 66596 instructions: 353547 +2024-06-01T06:02:21Z INFO 2403803 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 212588 memory location(s), 1 block(s), and 682177 instruction(s). 
Max writers: 576 Max Readers: 184036 +2024-06-01T06:02:21Z USER 2403803 (sg02) [ModuleForkPass]: Running coloring_allocator_sb +2024-06-01T06:02:21Z INFO 2403803 (sg02) [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=212588 blocks=1 instructions=682177 Max writers: 576 Max Readers: 184036 +2024-06-01T06:02:21Z INFO 2403803 (sg02) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 9089670928 +2024-06-01T06:02:21Z INFO 2403803 (sg02) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 2980 bytes +2024-06-01T06:02:21Z INFO 2403803 (sg02) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 1312604160 +2024-06-01T06:02:21Z INFO 2403803 (sg02) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 3303 bytes +2024-06-01T06:02:21Z INFO 2403803 (sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 32768 +2024-06-01T06:02:21Z INFO 2403803 (sg02) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 128 bytes +2024-06-01T06:02:21Z INFO 2403803 (sg02) [ColoringAllocator::Rep]: Allocating functions +2024-06-01T06:02:21Z INFO 2403803 (sg02) [ColoringAllocator::Rep]: linearize and check +2024-06-01T06:02:22Z INFO 2403803 (sg00) [SB_Allocator]: Total: 98596 +2024-06-01T06:02:22Z INFO 2403803 (sg00) [SB_Allocator]: Spilled: 0.017 (1701) +2024-06-01T06:02:22Z INFO 2403803 (sg00) [SB_Allocator]: Allocated: 0.983 (96895) +2024-06-01T06:02:22Z INFO 2403803 (sg00) [SB_Allocator]: Rover zone: 0.214 (20761) +2024-06-01T06:02:22Z INFO 2403803 (sg00) [SB_Allocator]: Pre-rover zone: 0.002 (158) +2024-06-01T06:02:22Z INFO 2403803 (sg00) [SB_Allocator]: Post-rover zone: 0.784 (75976) +2024-06-01T06:02:22Z INFO 2403803 (sg00) [SB_Allocator]: Slice zone: 0.000 (0) +2024-06-01T06:02:22Z INFO 2403803 (sg00) [SB_Allocator]: Blocks nothing: 0.000 (27) +2024-06-01T06:02:22Z INFO 2403803 (sg00) [SB_Allocator]: Blocks medium: 0.000 (2) +2024-06-01T06:02:22Z INFO 2403803 (sg00) [SB_Allocator]: Visited until 
medium blocking (mean): 0.363 +2024-06-01T06:02:22Z INFO 2403803 (sg00) [SB_Allocator]: Visited until medium blocking (median): 0.000 +2024-06-01T06:02:22Z INFO 2403803 (sg00) [SB_Allocator]: Visited until medium blocking (p95): 0.000 +2024-06-01T06:02:22Z INFO 2403803 (sg00) [SB_Allocator]: Blocks tall: 1.000 (96866) +2024-06-01T06:02:22Z INFO 2403803 (sg00) [SB_Allocator]: Visited until tall blocking (mean): 0.959 +2024-06-01T06:02:22Z INFO 2403803 (sg00) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2024-06-01T06:02:22Z INFO 2403803 (sg00) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2024-06-01T06:02:22Z INFO 2403803 (sg00) [SB_Allocator]: Success +2024-06-01T06:02:23Z INFO 2403803 (sg03) [build_flow_deps]: Build fdeps inserted 1121669 edges +2024-06-01T06:02:23Z INFO 2403803 (sg03) [build_flow_deps]: Done build fdeps 1121669 Sat Jun 1 06:02:23 2024 +2024-06-01T06:02:23Z INFO 2403803 (sg02) [SB_Allocator]: allocating SB +2024-06-01T06:02:23Z USER 2403803 (sg03) [ModuleForkPass]: build_fdeps finished after 2.300 seconds +2024-06-01T06:02:23Z INFO 2403803 (sg03) [ModuleForkPass]: curr_vmrss: 12639mb, ru_maxrss: 12920mb (delta=0mb) +2024-06-01T06:02:23Z INFO 2403803 (sg03) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 66596 memory location(s), 1 block(s), and 353547 instruction(s). 
Max writers: 512 Max Readers: 80896 +2024-06-01T06:02:23Z USER 2403803 (sg03) [ModuleForkPass]: Running remove_redundancies +2024-06-01T06:02:23Z INFO 2403803 (sg02) [SB_Allocator]: main loop +2024-06-01T06:02:23Z INFO 2403803 (sg03) [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=66596 blocks=1 instructions=353547 Max writers: 512 Max Readers: 80896 +2024-06-01T06:02:23Z INFO 2403803 (sg03) [RemoveRedundancies]: remove_clobbered_writes +2024-06-01T06:02:23Z INFO 2403803 (sg03) [RemoveRedundancies]: remove_clobbered_writes: 0 +2024-06-01T06:02:23Z INFO 2403803 (sg03) [RemoveRedundancies]: remove_useless_insts +2024-06-01T06:02:23Z INFO 2403803 (sg00) [SB_Allocator]: SB spills = 1701 tensors +2024-06-01T06:02:23Z INFO 2403803 (sg00) [SB_Allocator]: size = 1676424 bytes/partition +2024-06-01T06:02:23Z INFO 2403803 (sg00) [SB_Allocator]: remats = 34 tensors +2024-06-01T06:02:23Z INFO 2403803 (sg00) [SB_Allocator]: unpinned = 0 tensors +2024-06-01T06:02:23Z INFO 2403803 (sg00) [SB_Allocator]: size = 0 bytes/partition +2024-06-01T06:02:23Z INFO 2403803 (sg00) [SB_Allocator]: SB score = 2.07393e+07 +2024-06-01T06:02:23Z INFO 2403803 (sg00) [SB_Allocator]: best SB heuristic = 0 +2024-06-01T06:02:23Z INFO 2403803 (sg00) [SB_Allocator]: collect spills +2024-06-01T06:02:23Z INFO 2403803 (sg01) [PSUM_Allocator]: insert spills +2024-06-01T06:02:23Z INFO 2403803 (sg03) [RemoveRedundancies]: remove Useless Instructions: 0 +2024-06-01T06:02:23Z USER 2403803 (sg03) [ModuleForkPass]: remove_redundancies finished after 0.483 seconds +2024-06-01T06:02:23Z INFO 2403803 (sg03) [ModuleForkPass]: curr_vmrss: 12635mb, ru_maxrss: 12920mb (delta=0mb) +2024-06-01T06:02:23Z INFO 2403803 (sg03) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 66596 memory location(s), 1 block(s), and 353547 instruction(s). 
Max writers: 512 Max Readers: 80896 +2024-06-01T06:02:23Z USER 2403803 (sg03) [ModuleForkPass]: Running anti_dependency_analyzer +2024-06-01T06:02:23Z INFO 2403803 (sg03) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=66596 blocks=1 instructions=353547 Max writers: 512 Max Readers: 80896 +2024-06-01T06:02:23Z INFO 2403803 (sg03) [AntiDependencyAnalyzer]: Batch size: 1000 +2024-06-01T06:02:23Z INFO 2403803 (sg03) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2024-06-01T06:02:23Z INFO 2403803 (sg02) [SB_Allocator]: renumber locations +2024-06-01T06:02:23Z INFO 2403803 (sg02) [SB_Allocator]: size = 134342 +2024-06-01T06:02:23Z INFO 2403803 (sg02) [SB_Allocator]: find partners +2024-06-01T06:02:24Z INFO 2403803 (sg02) [SB_Allocator]: found 74858 accumulation groups +2024-06-01T06:02:24Z INFO 2403803 (sg02) [SB_Allocator]: largest = _dot.651-t806_i1590 +2024-06-01T06:02:24Z INFO 2403803 (sg02) [SB_Allocator]: tensors = 64 +2024-06-01T06:02:24Z INFO 2403803 (sg02) [SB_Allocator]: requires 147456 bytes/partition +2024-06-01T06:02:24Z INFO 2403803 (sg02) [SB_Allocator]: expanding partners +2024-06-01T06:02:25Z INFO 2403803 (sg01) [PSUM_Allocator]: main loop +2024-06-01T06:02:25Z INFO 2403803 [AntiDependencyAnalyzer-PSUM3-0-32]: Finished analyzing 42350 access patterns a mean/median 1.00113/1 intervals per access pattern and mean/median 5.51322/6.70056 intersections per interval. +2024-06-01T06:02:25Z INFO 2403803 [AntiDependencyAnalyzer-PSUM5-0-32]: Finished analyzing 42341 access patterns a mean/median 1.00113/1 intervals per access pattern and mean/median 5.51417/6.71611 intersections per interval. +2024-06-01T06:02:25Z INFO 2403803 [AntiDependencyAnalyzer-PSUM7-0-32]: Finished analyzing 42341 access patterns a mean/median 1.00113/1 intervals per access pattern and mean/median 5.37387/6.21342 intersections per interval. 
+2024-06-01T06:02:25Z INFO 2403803 [AntiDependencyAnalyzer-PSUM1-0-32]: Finished analyzing 42383 access patterns a mean/median 1.00113/1 intervals per access pattern and mean/median 5.2476/5.94214 intersections per interval. +2024-06-01T06:02:25Z INFO 2403803 [AntiDependencyAnalyzer-PSUM0-32-128]: Finished analyzing 38800 access patterns a mean/median 1.00124/1 intervals per access pattern and mean/median 5.62215/7.00848 intersections per interval. +2024-06-01T06:02:25Z INFO 2403803 [AntiDependencyAnalyzer-PSUM4-0-32]: Finished analyzing 39121 access patterns a mean/median 1.00123/1 intervals per access pattern and mean/median 5.77131/7.5025 intersections per interval. +2024-06-01T06:02:25Z INFO 2403803 [AntiDependencyAnalyzer-PSUM0-0-32]: Finished analyzing 39130 access patterns a mean/median 1.00123/1 intervals per access pattern and mean/median 5.63099/6.9767 intersections per interval. +2024-06-01T06:02:25Z INFO 2403803 [AntiDependencyAnalyzer-PSUM2-0-32]: Finished analyzing 39130 access patterns a mean/median 1.00123/1 intervals per access pattern and mean/median 5.49271/6.79907 intersections per interval. +2024-06-01T06:02:25Z INFO 2403803 [AntiDependencyAnalyzer-PSUM1-32-128]: Finished analyzing 42053 access patterns a mean/median 1.00114/1 intervals per access pattern and mean/median 5.23695/5.91865 intersections per interval. +2024-06-01T06:02:25Z INFO 2403803 [AntiDependencyAnalyzer-PSUM4-32-128]: Finished analyzing 38791 access patterns a mean/median 1.00124/1 intervals per access pattern and mean/median 5.7601/7.46437 intersections per interval. +2024-06-01T06:02:25Z INFO 2403803 [AntiDependencyAnalyzer-PSUM6-0-32]: Finished analyzing 39121 access patterns a mean/median 1.00123/1 intervals per access pattern and mean/median 5.77131/7.53045 intersections per interval. 
+2024-06-01T06:02:25Z INFO 2403803 [AntiDependencyAnalyzer-PSUM6-32-128]: Finished analyzing 38791 access patterns a mean/median 1.00124/1 intervals per access pattern and mean/median 5.7601/7.48028 intersections per interval. +2024-06-01T06:02:25Z INFO 2403803 [AntiDependencyAnalyzer-PSUM2-32-128]: Finished analyzing 38800 access patterns a mean/median 1.00124/1 intervals per access pattern and mean/median 5.48534/6.80078 intersections per interval. +2024-06-01T06:02:25Z INFO 2403803 [AntiDependencyAnalyzer-PSUM7-32-128]: Finished analyzing 41978 access patterns a mean/median 1.00114/1 intervals per access pattern and mean/median 5.36971/6.18039 intersections per interval. +2024-06-01T06:02:25Z INFO 2403803 [AntiDependencyAnalyzer-PSUM3-32-128]: Finished analyzing 42020 access patterns a mean/median 1.00114/1 intervals per access pattern and mean/median 5.50058/6.66415 intersections per interval. +2024-06-01T06:02:25Z INFO 2403803 [AntiDependencyAnalyzer-PSUM5-32-128]: Finished analyzing 42011 access patterns a mean/median 1.00114/1 intervals per access pattern and mean/median 5.50154/6.65933 intersections per interval. +2024-06-01T06:02:25Z INFO 2403803 [AntiDependencyAnalyzer-DRAM]: Finished analyzing 1676 access patterns a mean/median 87.7208/114.854 intervals per access pattern and mean/median 0.499966/2.72978e-05 intersections per interval. 
+2024-06-01T06:02:25Z INFO 2403803 (sg01) [PSUM_Allocator]: renumber locations +2024-06-01T06:02:25Z INFO 2403803 (sg01) [PSUM_Allocator]: size = 77930 +2024-06-01T06:02:25Z INFO 2403803 (sg02) [SB_Allocator]: find first defs +2024-06-01T06:02:25Z INFO 2403803 (sg02) [SB_Allocator]: find loads +2024-06-01T06:02:26Z INFO 2416028 [BackendDriver]: max_allowed_parallelism=192 +2024-06-01T06:02:26Z INFO 2403803 (sg02) [SB_Allocator]: 0 pin count +2024-06-01T06:02:26Z INFO 2403803 (sg02) [SB_Allocator]: 23447 remat count +2024-06-01T06:02:26Z INFO 2403803 (sg02) [SB_Allocator]: build interference graph +2024-06-01T06:02:26Z INFO 2403803 (sg02) [SB_Allocator]: pass 1 int-tree +2024-06-01T06:02:26Z INFO 2403803 (sg02) [SB_Allocator]: Num intervals 134342 Num locations 134342 +2024-06-01T06:02:26Z INFO 2403803 (sg02) [SB_Allocator]: IntervalTree Build Done +2024-06-01T06:02:26Z INFO 2403803 (sg02) [SB_Allocator]: info.neighbors init Done +2024-06-01T06:02:26Z INFO 2403803 (sg02) [SB_Allocator]: info.neighbors partners Done +2024-06-01T06:02:26Z INFO 2403803 (sg02) [SB_Allocator]: IntervalTree readback Done +2024-06-01T06:02:26Z INFO 2403803 (sg02) [SB_Allocator]: edge: 68199142 +2024-06-01T06:02:26Z INFO 2403803 (sg02) [SB_Allocator]: mean: 1015.31 +2024-06-01T06:02:26Z INFO 2403803 (sg02) [SB_Allocator]: median: 625.034 +2024-06-01T06:02:26Z INFO 2403803 (sg02) [SB_Allocator]: find costs +2024-06-01T06:02:27Z INFO 2416028 [BackendDriver]: Backend driver mtBackend: false numModules: 1 Cwd: "/root/llava_mistral_0531/app/neuronxcc-0wrwbxjq/sg00" +2024-06-01T06:02:27Z USER 2416028 [BackendDriver]: Running mod_parallel_pass +2024-06-01T06:02:27Z INFO 2416028 [BackendDriver]: Inputs to mod_parallel_pass: modules=1 functions=1 allocs=4753 blocks=1 instructions=3625 Max writers: 5 Max Readers: 513 +2024-06-01T06:02:27Z USER 2416028 [ModuleForkPass]: Running rewrite_matmult_sparse +2024-06-01T06:02:27Z INFO 2416028 [ModuleForkPass]: Inputs to rewrite_matmult_sparse: modules=1 
functions=1 allocs=4753 blocks=1 instructions=3625 Max writers: 5 Max Readers: 513 +2024-06-01T06:02:27Z USER 2416028 [ModuleForkPass]: rewrite_matmult_sparse finished after 0.007 seconds +2024-06-01T06:02:27Z INFO 2416028 [ModuleForkPass]: curr_vmrss: 240mb, ru_maxrss: 616mb (delta=0mb) +2024-06-01T06:02:27Z INFO 2416028 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4753 memory location(s), 1 block(s), and 3625 instruction(s). Max writers: 5 Max Readers: 513 +2024-06-01T06:02:27Z USER 2416028 [ModuleForkPass]: Running birverifier +2024-06-01T06:02:27Z INFO 2416028 [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=4753 blocks=1 instructions=3625 Max writers: 5 Max Readers: 513 +2024-06-01T06:02:27Z INFO 2403803 [AntiDependencyAnalyzer-SB-113-128]: Finished analyzing 672596 access patterns a mean/median 1.0864/1 intervals per access pattern and mean/median 2.94747/1 intersections per interval. +2024-06-01T06:02:27Z INFO 2403803 [AntiDependencyAnalyzer-SB-64-65]: Finished analyzing 673821 access patterns a mean/median 1.08624/1 intervals per access pattern and mean/median 2.94887/1 intersections per interval. +2024-06-01T06:02:27Z INFO 2403803 [AntiDependencyAnalyzer-SB-0-1]: Finished analyzing 674148 access patterns a mean/median 1.0862/1 intervals per access pattern and mean/median 2.94752/1 intersections per interval. +2024-06-01T06:02:27Z INFO 2403803 [AntiDependencyAnalyzer-SB-65-96]: Finished analyzing 673812 access patterns a mean/median 1.08624/1 intervals per access pattern and mean/median 2.94894/1 intersections per interval. +2024-06-01T06:02:27Z INFO 2403803 [AntiDependencyAnalyzer-SB-33-64]: Finished analyzing 673821 access patterns a mean/median 1.08624/1 intervals per access pattern and mean/median 2.94887/1 intersections per interval. 
+2024-06-01T06:02:27Z INFO 2403803 [AntiDependencyAnalyzer-SB-1-32]: Finished analyzing 673821 access patterns a mean/median 1.08624/1 intervals per access pattern and mean/median 2.94887/1 intersections per interval. +2024-06-01T06:02:27Z INFO 2403803 [AntiDependencyAnalyzer-SB-32-33]: Finished analyzing 673826 access patterns a mean/median 1.08624/1 intervals per access pattern and mean/median 2.94886/0.999999 intersections per interval. +2024-06-01T06:02:27Z INFO 2403803 [AntiDependencyAnalyzer-SB-96-113]: Finished analyzing 673812 access patterns a mean/median 1.08624/1 intervals per access pattern and mean/median 2.94894/1 intersections per interval. +2024-06-01T06:02:27Z USER 2403803 (sg03) [ModuleForkPass]: anti_dependency_analyzer finished after 3.499 seconds +2024-06-01T06:02:27Z INFO 2403803 (sg03) [ModuleForkPass]: curr_vmrss: 13493mb, ru_maxrss: 13493mb (delta=573mb) +2024-06-01T06:02:27Z INFO 2403803 (sg03) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 66596 memory location(s), 1 block(s), and 353547 instruction(s). Max writers: 512 Max Readers: 80896 +2024-06-01T06:02:27Z USER 2403803 (sg03) [ModuleForkPass]: Running tensor_copy_elim +2024-06-01T06:02:27Z INFO 2403803 (sg03) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=66596 blocks=1 instructions=353547 Max writers: 512 Max Readers: 80896 +2024-06-01T06:02:27Z USER 2416028 [ModuleForkPass]: birverifier finished after 0.601 seconds +2024-06-01T06:02:27Z INFO 2416028 [ModuleForkPass]: curr_vmrss: 381mb, ru_maxrss: 616mb (delta=0mb) +2024-06-01T06:02:27Z INFO 2416028 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4753 memory location(s), 1 block(s), and 3625 instruction(s). 
Max writers: 5 Max Readers: 513 +2024-06-01T06:02:27Z USER 2416028 [ModuleForkPass]: Running expand_replication +2024-06-01T06:02:27Z INFO 2416028 [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=4753 blocks=1 instructions=3625 Max writers: 5 Max Readers: 513 +2024-06-01T06:02:27Z INFO 2416028 [ExpandReplication]: Found 0 replicated matmults +2024-06-01T06:02:27Z USER 2416028 [ModuleForkPass]: expand_replication finished after 0.012 seconds +2024-06-01T06:02:27Z INFO 2416028 [ModuleForkPass]: curr_vmrss: 381mb, ru_maxrss: 616mb (delta=0mb) +2024-06-01T06:02:27Z INFO 2416028 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4753 memory location(s), 1 block(s), and 3625 instruction(s). Max writers: 5 Max Readers: 513 +2024-06-01T06:02:27Z USER 2416028 [ModuleForkPass]: Running unroll +2024-06-01T06:02:27Z INFO 2416028 [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=4753 blocks=1 instructions=3625 Max writers: 5 Max Readers: 513 +2024-06-01T06:02:27Z INFO 2416028 [Unroll]: INFO (Unroll) Start unrolling at Sat Jun 1 06:02:27 2024 +2024-06-01T06:02:27Z INFO 2416028 [Unroll]: INFO (Unroll) adjusting parallelfor +2024-06-01T06:02:27Z INFO 2403803 (sg02) [SB_Allocator]: best-of-n loop, heuristic = 0 +2024-06-01T06:02:27Z INFO 2403803 (sg02) [SB_Allocator]: simplify interference graph +2024-06-01T06:02:27Z INFO 2403803 (sg02) [SB_Allocator]: initialize safe & unsafe +2024-06-01T06:02:28Z INFO 2403803 (sg02) [SB_Allocator]: safe = 3343 +2024-06-01T06:02:28Z INFO 2403803 (sg02) [SB_Allocator]: unsafe = 97022 +2024-06-01T06:02:28Z INFO 2403803 (sg02) [SB_Allocator]: inf = 33977 +2024-06-01T06:02:28Z INFO 2403803 (sg02) [SB_Allocator]: total = 134342 +2024-06-01T06:02:28Z INFO 2403803 (sg02) [SB_Allocator]: simplify +2024-06-01T06:02:28Z INFO 2403803 (sg03) [TensorCopyElim]: Tensor CP elimination: 0 +2024-06-01T06:02:28Z INFO 2403803 (sg02) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 96936 #Pinned 0 #Safe 0 minCost 
4.24566e-05 maxCost 0.0553019 locations 134342 +2024-06-01T06:02:29Z USER 2403803 (sg03) [ModuleForkPass]: tensor_copy_elim finished after 2.282 seconds +2024-06-01T06:02:29Z INFO 2403803 (sg03) [ModuleForkPass]: curr_vmrss: 13475mb, ru_maxrss: 13493mb (delta=0mb) +2024-06-01T06:02:29Z INFO 2403803 (sg03) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 66596 memory location(s), 1 block(s), and 353547 instruction(s). Max writers: 512 Max Readers: 80896 +2024-06-01T06:02:29Z USER 2403803 (sg03) [ModuleForkPass]: Running post_sched +2024-06-01T06:02:29Z INFO 2403803 (sg03) [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=66596 blocks=1 instructions=353547 Max writers: 512 Max Readers: 80896 +2024-06-01T06:02:29Z INFO 2403803 [post_scheduler]: Start PosT ScheD 3 sunda Sat Jun 1 06:02:29 2024 +2024-06-01T06:02:30Z INFO 2416028 [Unroll]: INFO (Unroll) DONE unrolling Sat Jun 1 06:02:27 2024 + +2024-06-01T06:02:30Z INFO 2416028 [Unroll]: sg0000 Instruction count after Unroll: +2024-06-01T06:02:30Z INFO 2416028 [Unroll]: Total count: 276336 +2024-06-01T06:02:30Z INFO 2416028 [Unroll]: Matmult: 223233 +2024-06-01T06:02:30Z INFO 2416028 [Unroll]: GenericCopy: 25952 +2024-06-01T06:02:30Z INFO 2416028 [Unroll]: TensorTensor: 8901 +2024-06-01T06:02:30Z INFO 2416028 [Unroll]: Load: 8106 +2024-06-01T06:02:30Z INFO 2416028 [Unroll]: TensorScalarPtr: 3021 +2024-06-01T06:02:30Z INFO 2416028 [Unroll]: Activation: 1956 +2024-06-01T06:02:30Z INFO 2416028 [Unroll]: TensorReduce: 1792 +2024-06-01T06:02:30Z INFO 2416028 [Unroll]: StreamTranspose: 1024 +2024-06-01T06:02:30Z INFO 2416028 [Unroll]: Select: 960 +2024-06-01T06:02:30Z INFO 2416028 [Unroll]: Memset: 426 +2024-06-01T06:02:30Z INFO 2416028 [Unroll]: StreamShuffle: 335 +2024-06-01T06:02:30Z INFO 2416028 [Unroll]: Save: 319 +2024-06-01T06:02:30Z INFO 2416028 [Unroll]: Iota: 86 +2024-06-01T06:02:30Z INFO 2416028 [Unroll]: DMACopy: 65 +2024-06-01T06:02:30Z INFO 2416028 [Unroll]: CollectiveCompute: 64 
+2024-06-01T06:02:30Z INFO 2416028 [Unroll]: GenericIndirectSave: 64 +2024-06-01T06:02:30Z INFO 2416028 [Unroll]: Reciprocal: 32 +2024-06-01T06:02:30Z USER 2416028 [ModuleForkPass]: unroll finished after 3.034 seconds +2024-06-01T06:02:30Z INFO 2416028 [ModuleForkPass]: curr_vmrss: 1657mb, ru_maxrss: 1657mb (delta=1041mb) +2024-06-01T06:02:30Z INFO 2416028 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 62196 memory location(s), 1 block(s), and 276400 instruction(s). Max writers: 256 Max Readers: 66720 +2024-06-01T06:02:30Z USER 2416028 [ModuleForkPass]: Running psum_legalization +2024-06-01T06:02:30Z INFO 2416028 [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=62196 blocks=1 instructions=276400 Max writers: 256 Max Readers: 66720 +2024-06-01T06:02:30Z USER 2416028 [ModuleForkPass]: psum_legalization finished after 0.048 seconds +2024-06-01T06:02:30Z INFO 2416028 [ModuleForkPass]: curr_vmrss: 1350mb, ru_maxrss: 1657mb (delta=0mb) +2024-06-01T06:02:30Z INFO 2416028 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 62196 memory location(s), 1 block(s), and 276400 instruction(s). Max writers: 256 Max Readers: 66720 +2024-06-01T06:02:30Z USER 2416028 [ModuleForkPass]: Running error_injector +2024-06-01T06:02:30Z INFO 2416028 [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=62196 blocks=1 instructions=276400 Max writers: 256 Max Readers: 66720 +2024-06-01T06:02:30Z WARNING 2416028 [ErrorInjector]: Unrecognized injected error value "0" +2024-06-01T06:02:30Z USER 2416028 [ModuleForkPass]: error_injector finished after 0.004 seconds +2024-06-01T06:02:30Z INFO 2416028 [ModuleForkPass]: curr_vmrss: 1350mb, ru_maxrss: 1657mb (delta=0mb) +2024-06-01T06:02:30Z INFO 2416028 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 62196 memory location(s), 1 block(s), and 276400 instruction(s). 
Max writers: 256 Max Readers: 66720 +2024-06-01T06:02:30Z USER 2416028 [ModuleForkPass]: Running constant_propagate +2024-06-01T06:02:30Z INFO 2416028 [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=62196 blocks=1 instructions=276400 Max writers: 256 Max Readers: 66720 +2024-06-01T06:02:30Z INFO 2416028 [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2024-06-01T06:02:30Z INFO 2416028 [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2024-06-01T06:02:31Z USER 2416028 [ModuleForkPass]: constant_propagate finished after 0.294 seconds +2024-06-01T06:02:31Z INFO 2416028 [ModuleForkPass]: curr_vmrss: 1361mb, ru_maxrss: 1657mb (delta=0mb) +2024-06-01T06:02:31Z INFO 2416028 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 62196 memory location(s), 1 block(s), and 276400 instruction(s). Max writers: 256 Max Readers: 66720 +2024-06-01T06:02:31Z USER 2416028 [ModuleForkPass]: Running vn_splitter +2024-06-01T06:02:31Z INFO 2416028 [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=62196 blocks=1 instructions=276400 Max writers: 256 Max Readers: 66720 +2024-06-01T06:02:31Z INFO 2416028 [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 1 +2024-06-01T06:02:31Z INFO 2416028 [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2024-06-01T06:02:31Z INFO 2416028 [ShrinkDN]: INFO (ShrinkDN): Shrunk 33 nodes. 
Total savings 4416 bytes/partition +2024-06-01T06:02:31Z INFO 2416028 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2024-06-01T06:02:31Z INFO 2416028 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2024-06-01T06:02:31Z INFO 2416028 [VNSplitterPass]: INFO (VNSplitter) Time: 0.005 seconds +2024-06-01T06:02:31Z INFO 2416028 [VNSplitterPass]: INFO (VerticalFusion) Time: 0.097 seconds +2024-06-01T06:02:31Z INFO 2416028 [VNSplitterPass]: INFO (ShrinkDN) Time: 0.069 seconds +2024-06-01T06:02:31Z USER 2416028 [ModuleForkPass]: vn_splitter finished after 0.214 seconds +2024-06-01T06:02:31Z INFO 2416028 [ModuleForkPass]: curr_vmrss: 1370mb, ru_maxrss: 1657mb (delta=0mb) +2024-06-01T06:02:31Z INFO 2416028 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 62196 memory location(s), 1 block(s), and 276400 instruction(s). Max writers: 256 Max Readers: 66720 +2024-06-01T06:02:31Z USER 2416028 [ModuleForkPass]: Running lower_ac +2024-06-01T06:02:31Z INFO 2416028 [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=62196 blocks=1 instructions=276400 Max writers: 256 Max Readers: 66720 +2024-06-01T06:02:31Z INFO 2416028 [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2024-06-01T06:02:31Z USER 2416028 [ModuleForkPass]: lower_ac finished after 0.032 seconds +2024-06-01T06:02:31Z INFO 2416028 [ModuleForkPass]: curr_vmrss: 1370mb, ru_maxrss: 1657mb (delta=0mb) +2024-06-01T06:02:31Z INFO 2416028 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 62196 memory location(s), 1 block(s), and 276400 instruction(s). 
Max writers: 256 Max Readers: 66720 +2024-06-01T06:02:31Z USER 2416028 [ModuleForkPass]: Running input_dma_coalescing +2024-06-01T06:02:31Z INFO 2416028 [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=62196 blocks=1 instructions=276400 Max writers: 256 Max Readers: 66720 +2024-06-01T06:02:31Z INFO 2416028 [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2024-06-01T06:02:31Z USER 2416028 [ModuleForkPass]: input_dma_coalescing finished after 0.401 seconds +2024-06-01T06:02:31Z INFO 2416028 [ModuleForkPass]: curr_vmrss: 1371mb, ru_maxrss: 1657mb (delta=0mb) +2024-06-01T06:02:31Z INFO 2416028 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 62196 memory location(s), 1 block(s), and 276400 instruction(s). Max writers: 256 Max Readers: 66720 +2024-06-01T06:02:31Z USER 2416028 [ModuleForkPass]: Running early_peephole_opts +2024-06-01T06:02:31Z INFO 2416028 [ModuleForkPass]: Inputs to early_peephole_opts: modules=1 functions=1 allocs=62196 blocks=1 instructions=276400 Max writers: 256 Max Readers: 66720 +2024-06-01T06:02:31Z INFO 2416028 [EarlyPeepholeOpts]: PeepholeOpts enabled? ActivationAccumulate: true +2024-06-01T06:02:31Z INFO 2416028 [EarlyPeepholeOpts]: Activation Accumulate: 0 +2024-06-01T06:02:31Z USER 2416028 [ModuleForkPass]: early_peephole_opts finished after 0.036 seconds +2024-06-01T06:02:31Z INFO 2416028 [ModuleForkPass]: curr_vmrss: 1371mb, ru_maxrss: 1657mb (delta=0mb) +2024-06-01T06:02:31Z INFO 2416028 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 62196 memory location(s), 1 block(s), and 276400 instruction(s). 
Max writers: 256 Max Readers: 66720 +2024-06-01T06:02:31Z USER 2416028 [ModuleForkPass]: Running pre_sched +2024-06-01T06:02:31Z INFO 2416028 [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=62196 blocks=1 instructions=276400 Max writers: 256 Max Readers: 66720 +2024-06-01T06:02:31Z INFO 2416028 [PreSched]: Start PRE scheduling 2 cores: 1 at: Sat Jun 1 06:02:31 2024 +2024-06-01T06:02:31Z INFO 2416028 [LayerSpiller]: LayerSpill: Start... +2024-06-01T06:02:31Z INFO 2416028 [LayerSpiller]: LayerSpill: Found 64 Splits CCs +2024-06-01T06:02:31Z INFO 2416028 [LayerSpiller]: Grouped CCs to 1 clusters. +2024-06-01T06:02:32Z INFO 2416028 [LayerSpiller]: LayerSpill: To Spill 0 multi-layer tensors +2024-06-01T06:02:32Z INFO 2416028 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2024-06-01T06:02:32Z INFO 2416028 [LayerSpiller]: LayerSpill: Done. +2024-06-01T06:02:32Z INFO 2416028 [PreSched]: Start split live ranges Sat Jun 1 06:02:31 2024 +2024-06-01T06:02:32Z INFO 2403803 (sg00) [SB_Allocator]: insert spills +2024-06-01T06:02:32Z INFO 2416028 [PreSched]: Num_Splits: 0 +2024-06-01T06:02:32Z INFO 2416028 [PreSched]: End split live ranges Sat Jun 1 06:02:32 2024 +2024-06-01T06:02:32Z INFO 2416028 [PreSched]: Strt remove redundncies Sat Jun 1 06:02:32 2024 +2024-06-01T06:02:32Z INFO 2416028 [PreSched]: remove_redundant_memsets +2024-06-01T06:02:32Z INFO 2416028 [PreSched]: remove_redundant_memsets: 256 +2024-06-01T06:02:32Z INFO 2416028 [PreSched]: remove_redundant_loads +2024-06-01T06:02:32Z INFO 2416028 [PreSched]: remove_redundant_loads: 0 +2024-06-01T06:02:32Z INFO 2416028 [PreSched]: End remove redundncies Sat Jun 1 06:02:32 2024 +2024-06-01T06:02:32Z INFO 2416028 [PreSched]: Start DCE Sat Jun 1 06:02:32 2024 +2024-06-01T06:02:32Z INFO 2416028 [PreSched]: End DCE Sat Jun 1 06:02:32 2024 +2024-06-01T06:02:32Z INFO 2403803 (sg00) [SB_Allocator]: deleting loads #loadsToDelete: 34 +2024-06-01T06:02:32Z INFO 2403803 (sg00) [SB_Allocator]: deleting locs 
#locationsToDelete: 34 +2024-06-01T06:02:32Z INFO 2403803 (sg00) [SB_Allocator]: locationsToDelete done +2024-06-01T06:02:33Z INFO 2416028 [PreSched]: Start build flow dependencies Sat Jun 1 06:02:33 2024 +2024-06-01T06:02:33Z INFO 2416028 [build_flow_deps]: Start build fdeps. Invocation: 1Sat Jun 1 06:02:33 2024 +2024-06-01T06:02:33Z INFO 2416028 [build_flow_deps]: Allocs: 62196 instructions: 276144 +2024-06-01T06:02:33Z INFO 2403803 (sg00) [SB_Allocator]: main loop +2024-06-01T06:02:33Z INFO 2403803 (sg00) [SB_Allocator]: renumber locations +2024-06-01T06:02:33Z INFO 2403803 (sg00) [SB_Allocator]: size = 108058 +2024-06-01T06:02:33Z INFO 2403803 (sg00) [SB_Allocator]: find partners +2024-06-01T06:02:33Z INFO 2403803 (sg01) [PSUM_Allocator]: build_no_bitmap start +2024-06-01T06:02:33Z INFO 2403803 (sg00) [SB_Allocator]: found 42827 accumulation groups +2024-06-01T06:02:33Z INFO 2403803 (sg00) [SB_Allocator]: largest = _dot.414-t975_i83 +2024-06-01T06:02:33Z INFO 2403803 (sg00) [SB_Allocator]: tensors = 64 +2024-06-01T06:02:33Z INFO 2403803 (sg00) [SB_Allocator]: requires 98304 bytes/partition +2024-06-01T06:02:33Z INFO 2403803 (sg00) [SB_Allocator]: expanding partners +2024-06-01T06:02:34Z INFO 2403803 (sg01) [PSUM_Allocator]: PSUM high-water mark = 8 tensors +2024-06-01T06:02:34Z INFO 2403803 (sg01) [PSUM_Allocator]: found 198036 edges +2024-06-01T06:02:34Z INFO 2403803 (sg01) [PSUM_Allocator]: mean: 5.08241 +2024-06-01T06:02:34Z INFO 2403803 (sg01) [PSUM_Allocator]: median: 6.33887 +2024-06-01T06:02:34Z INFO 2403803 (sg01) [PSUM_Allocator]: adjacency vectors require 1584288 bytes +2024-06-01T06:02:34Z INFO 2403803 (sg01) [PSUM_Allocator]: build_no_bitmap done +2024-06-01T06:02:34Z INFO 2403803 (sg01) [PSUM_Allocator]: find costs +2024-06-01T06:02:34Z INFO 2403803 (sg00) [SB_Allocator]: find first defs +2024-06-01T06:02:34Z INFO 2416028 [build_flow_deps]: Build fdeps inserted 815457 edges +2024-06-01T06:02:34Z INFO 2416028 [build_flow_deps]: Done build fdeps 
815457 Sat Jun 1 06:02:34 2024 +2024-06-01T06:02:34Z INFO 2416028 [PreSched]: End build flow dependencies Sat Jun 1 06:02:34 2024 +2024-06-01T06:02:34Z INFO 2416028 [PreSched]: Start remove useless insts Sat Jun 1 06:02:34 2024 +2024-06-01T06:02:34Z INFO 2416028 [PreSched]: remove_useless_insts +2024-06-01T06:02:34Z INFO 2416028 [PreSched]: remove Useless Instructions: 0 +2024-06-01T06:02:34Z INFO 2416028 [PreSched]: End remove useless insts Sat Jun 1 06:02:34 2024 +2024-06-01T06:02:34Z INFO 2403803 (sg00) [SB_Allocator]: find loads +2024-06-01T06:02:34Z INFO 2403803 (sg00) [SB_Allocator]: 0 pin count +2024-06-01T06:02:34Z INFO 2403803 (sg00) [SB_Allocator]: 15889 remat count +2024-06-01T06:02:34Z INFO 2403803 (sg00) [SB_Allocator]: build interference graph +2024-06-01T06:02:34Z INFO 2403803 (sg00) [SB_Allocator]: pass 1 int-tree +2024-06-01T06:02:35Z INFO 2403803 (sg00) [SB_Allocator]: Num intervals 108058 Num locations 108058 +2024-06-01T06:02:35Z INFO 2403803 (sg00) [SB_Allocator]: IntervalTree Build Done +2024-06-01T06:02:35Z INFO 2403803 (sg00) [SB_Allocator]: info.neighbors init Done +2024-06-01T06:02:35Z INFO 2416028 [PreSched]: DONE PRE scheduling Sat Jun 1 06:02:35 2024 +2024-06-01T06:02:35Z USER 2416028 [ModuleForkPass]: pre_sched finished after 3.346 seconds +2024-06-01T06:02:35Z INFO 2416028 [ModuleForkPass]: curr_vmrss: 1606mb, ru_maxrss: 1657mb (delta=0mb) +2024-06-01T06:02:35Z INFO 2403803 (sg00) [SB_Allocator]: info.neighbors partners Done +2024-06-01T06:02:35Z INFO 2403803 (sg00) [SB_Allocator]: IntervalTree readback Done +2024-06-01T06:02:35Z INFO 2403803 (sg00) [SB_Allocator]: edge: 7302916 +2024-06-01T06:02:35Z INFO 2403803 (sg00) [SB_Allocator]: mean: 135.167 +2024-06-01T06:02:35Z INFO 2403803 (sg00) [SB_Allocator]: median: 92.2207 +2024-06-01T06:02:35Z INFO 2403803 (sg00) [SB_Allocator]: find costs +2024-06-01T06:02:35Z INFO 2416028 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 62196 memory location(s), 1 block(s), and 276144 
instruction(s). Max writers: 256 Max Readers: 66720 +2024-06-01T06:02:35Z USER 2416028 [ModuleForkPass]: Running tensor_copy_elim +2024-06-01T06:02:35Z INFO 2416028 [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=62196 blocks=1 instructions=276144 Max writers: 256 Max Readers: 66720 +2024-06-01T06:02:35Z INFO 2403803 (sg02) [SB_Allocator]: new candidates = 19392 +2024-06-01T06:02:35Z INFO 2403803 (sg02) [SB_Allocator]: (including 5120 infinite cost tensors) +2024-06-01T06:02:35Z INFO 2403803 (sg02) [SB_Allocator]: select ranges +2024-06-01T06:02:35Z INFO 2416028 [TensorCopyElim]: Tensor CP elimination: 0 +2024-06-01T06:02:35Z INFO 2403803 (sg00) [SB_Allocator]: best-of-n loop, heuristic = 0 +2024-06-01T06:02:35Z INFO 2403803 (sg00) [SB_Allocator]: simplify interference graph +2024-06-01T06:02:35Z INFO 2403803 (sg00) [SB_Allocator]: initialize safe & unsafe +2024-06-01T06:02:35Z INFO 2403803 (sg00) [SB_Allocator]: safe = 8634 +2024-06-01T06:02:35Z INFO 2403803 (sg00) [SB_Allocator]: unsafe = 320 +2024-06-01T06:02:35Z INFO 2403803 (sg00) [SB_Allocator]: inf = 2209 +2024-06-01T06:02:35Z INFO 2403803 (sg00) [SB_Allocator]: total = 11163 +2024-06-01T06:02:35Z INFO 2403803 (sg00) [SB_Allocator]: simplify +2024-06-01T06:02:35Z INFO 2403803 (sg00) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 320 #Pinned 0 #Safe 0 minCost 0.0161891 maxCost 0.0201703 locations 108058 +2024-06-01T06:02:35Z INFO 2403803 (sg00) [SB_Allocator]: new candidates = 320 +2024-06-01T06:02:35Z INFO 2403803 (sg00) [SB_Allocator]: (including 2209 infinite cost tensors) +2024-06-01T06:02:35Z INFO 2403803 (sg00) [SB_Allocator]: select ranges +2024-06-01T06:02:35Z INFO 2403803 (sg00) [SB_Allocator]: Total: 11163 +2024-06-01T06:02:35Z INFO 2403803 (sg00) [SB_Allocator]: Spilled: 0.000 (0) +2024-06-01T06:02:35Z INFO 2403803 (sg00) [SB_Allocator]: Allocated: 1.000 (11163) +2024-06-01T06:02:35Z INFO 2403803 (sg00) [SB_Allocator]: Rover zone: 0.936 (10450) +2024-06-01T06:02:35Z INFO 
2403803 (sg00) [SB_Allocator]: Pre-rover zone: 0.008 (93) +2024-06-01T06:02:35Z INFO 2403803 (sg00) [SB_Allocator]: Post-rover zone: 0.056 (620) +2024-06-01T06:02:35Z INFO 2403803 (sg00) [SB_Allocator]: Slice zone: 0.000 (0) +2024-06-01T06:02:35Z INFO 2403803 (sg00) [SB_Allocator]: Blocks nothing: 0.000 (0) +2024-06-01T06:02:35Z INFO 2403803 (sg00) [SB_Allocator]: Blocks medium: 0.000 (0) +2024-06-01T06:02:35Z INFO 2403803 (sg00) [SB_Allocator]: Blocks tall: 1.000 (11163) +2024-06-01T06:02:35Z INFO 2403803 (sg00) [SB_Allocator]: Visited until tall blocking (mean): 0.999 +2024-06-01T06:02:35Z INFO 2403803 (sg00) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2024-06-01T06:02:35Z INFO 2403803 (sg00) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2024-06-01T06:02:35Z INFO 2403803 (sg00) [SB_Allocator]: Success +2024-06-01T06:02:35Z INFO 2403803 (sg01) [PSUM_Allocator]: best-of-n loop, heuristic = 0 +2024-06-01T06:02:35Z INFO 2403803 (sg01) [PSUM_Allocator]: simplify interference graph +2024-06-01T06:02:35Z INFO 2403803 (sg01) [PSUM_Allocator]: initialize low and high +2024-06-01T06:02:35Z INFO 2403803 (sg01) [PSUM_Allocator]: lo = 77418 +2024-06-01T06:02:35Z INFO 2403803 (sg01) [PSUM_Allocator]: hi = 512 +2024-06-01T06:02:35Z INFO 2403803 (sg01) [PSUM_Allocator]: inf = 0 +2024-06-01T06:02:35Z INFO 2403803 (sg01) [PSUM_Allocator]: total = 77930 +2024-06-01T06:02:35Z INFO 2403803 (sg01) [PSUM_Allocator]: simplify +2024-06-01T06:02:35Z INFO 2403803 (sg01) [PSUM_Allocator]: new candidates = 0 +2024-06-01T06:02:35Z INFO 2403803 (sg01) [PSUM_Allocator]: select ranges +2024-06-01T06:02:35Z USER 2416028 [ModuleForkPass]: tensor_copy_elim finished after 0.751 seconds +2024-06-01T06:02:35Z INFO 2416028 [ModuleForkPass]: curr_vmrss: 1519mb, ru_maxrss: 1657mb (delta=0mb) +2024-06-01T06:02:35Z INFO 2416028 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 62196 memory location(s), 1 block(s), and 276144 instruction(s). 
Max writers: 256 Max Readers: 66720 +2024-06-01T06:02:35Z USER 2416028 [ModuleForkPass]: Running mm_packing +2024-06-01T06:02:36Z INFO 2416028 [ModuleForkPass]: Inputs to mm_packing: modules=1 functions=1 allocs=62196 blocks=1 instructions=276144 Max writers: 256 Max Readers: 66720 +2024-06-01T06:02:36Z INFO 2416028 [MMPacking]: INFO (MMPack) Running the preprocessing step. +2024-06-01T06:02:36Z INFO 2403803 (sg00) [SB_Allocator]: SB spills = 0 tensors +2024-06-01T06:02:36Z INFO 2403803 (sg00) [SB_Allocator]: size = 0 bytes/partition +2024-06-01T06:02:36Z INFO 2403803 (sg00) [SB_Allocator]: remats = 0 tensors +2024-06-01T06:02:36Z INFO 2403803 (sg00) [SB_Allocator]: unpinned = 0 tensors +2024-06-01T06:02:36Z INFO 2403803 (sg00) [SB_Allocator]: size = 0 bytes/partition +2024-06-01T06:02:36Z INFO 2403803 (sg00) [SB_Allocator]: SB score = 0 +2024-06-01T06:02:36Z INFO 2403803 (sg00) [SB_Allocator]: spilling from SB cost about 2.07393e+07 cycles +2024-06-01T06:02:36Z INFO 2403803 (sg00) [SB_Allocator]: number of tensors spilled from SB = 1701 +2024-06-01T06:02:36Z INFO 2403803 (sg00) [SB_Allocator]: total size of spilled tensors = 1676424 bytes/partition +2024-06-01T06:02:36Z INFO 2403803 (sg00) [SB_Allocator]: 0 bytes/partition (0%) successfully pinned +2024-06-01T06:02:36Z INFO 2403803 (sg00) [SB_Allocator]: pinning saved approximately 0 cycles +2024-06-01T06:02:36Z INFO 2403803 (sg00) [SB_Allocator]: 0% SB utilization after allocation +2024-06-01T06:02:36Z INFO 2403803 (sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 2764950152 +2024-06-01T06:02:36Z INFO 2403803 (sg00) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 1290 bytes +2024-06-01T06:02:36Z INFO 2403803 (sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 1116144400 +2024-06-01T06:02:36Z INFO 2403803 (sg00) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 905 bytes +2024-06-01T06:02:37Z INFO 2403803 (sg01) [PSUM_Allocator]: no more spills 
+2024-06-01T06:02:37Z INFO 2403803 (sg01) [PSUM_Allocator]: PSUM score = 0 (lower is better) +2024-06-01T06:02:37Z INFO 2403803 (sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 16384 +2024-06-01T06:02:37Z INFO 2403803 (sg00) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 128 bytes +2024-06-01T06:02:37Z INFO 2403803 (sg01) [PSUM_Allocator]: spilling from PSUM cost about 2.68493e+06 cycles +2024-06-01T06:02:37Z INFO 2403803 (sg01) [PSUM_Allocator]: number of tensors spilled from PSUM = 640 +2024-06-01T06:02:37Z INFO 2403803 (sg01) [PSUM_Allocator]: 100% PSUM utilization after allocation +2024-06-01T06:02:37Z USER 2403803 (sg00) [ModuleForkPass]: coloring_allocator_sb finished after 43.486 seconds +2024-06-01T06:02:37Z INFO 2403803 (sg00) [ModuleForkPass]: curr_vmrss: 13538mb, ru_maxrss: 13538mb (delta=618mb) +2024-06-01T06:02:37Z INFO 2416028 [MMPacking]: INFO (MMPack) mlBPCG size 42832, CCS = 2816 +2024-06-01T06:02:38Z INFO 2416028 [MMPacking]: INFO (MMPack) agRGCG size 336752 +2024-06-01T06:02:38Z INFO 2416028 [MMPackingPass]: INFO (MMPacking) Time: 2.417 seconds +2024-06-01T06:02:38Z USER 2416028 [ModuleForkPass]: mm_packing finished after 2.494 seconds +2024-06-01T06:02:38Z INFO 2416028 [ModuleForkPass]: curr_vmrss: 1622mb, ru_maxrss: 1657mb (delta=0mb) +2024-06-01T06:02:38Z INFO 2416028 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 62196 memory location(s), 1 block(s), and 276144 instruction(s). 
Max writers: 256 Max Readers: 66720 +2024-06-01T06:02:38Z USER 2416028 [ModuleForkPass]: Running coloring_allocator_psum +2024-06-01T06:02:38Z INFO 2416028 [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=62196 blocks=1 instructions=276144 Max writers: 256 Max Readers: 66720 +2024-06-01T06:02:38Z INFO 2416028 [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 5033370252 +2024-06-01T06:02:38Z INFO 2416028 [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 4962 bytes +2024-06-01T06:02:38Z INFO 2403803 (sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 9089670928 +2024-06-01T06:02:38Z INFO 2403803 (sg01) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 2980 bytes +2024-06-01T06:02:38Z INFO 2403803 (sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 1312604160 +2024-06-01T06:02:38Z INFO 2403803 (sg01) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 3303 bytes +2024-06-01T06:02:38Z INFO 2403803 (sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 32768 +2024-06-01T06:02:38Z INFO 2403803 (sg01) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 128 bytes +2024-06-01T06:02:38Z INFO 2416028 [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 8644640 +2024-06-01T06:02:38Z INFO 2416028 [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 714 bytes +2024-06-01T06:02:38Z USER 2403803 (sg01) [ModuleForkPass]: coloring_allocator_psum finished after 25.370 seconds +2024-06-01T06:02:38Z INFO 2403803 (sg01) [ModuleForkPass]: curr_vmrss: 12903mb, ru_maxrss: 13538mb (delta=618mb) +2024-06-01T06:02:38Z INFO 2416028 [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 0 +2024-06-01T06:02:38Z INFO 2416028 [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 0 bytes +2024-06-01T06:02:38Z INFO 2416028 [ColoringAllocator::Rep]: Allocating functions +2024-06-01T06:02:38Z INFO 2416028 [ColoringAllocator::Rep]: 
linearize and check +2024-06-01T06:02:38Z INFO 2403803 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 155962 memory location(s), 1 block(s), and 337598 instruction(s). Max writers: 5184 Max Readers: 93924 +2024-06-01T06:02:38Z USER 2403803 (sg00) [ModuleForkPass]: Running address_rotation_sb +2024-06-01T06:02:38Z INFO 2403803 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 213612 memory location(s), 1 block(s), and 683201 instruction(s). Max writers: 576 Max Readers: 184036 +2024-06-01T06:02:38Z USER 2403803 (sg01) [ModuleForkPass]: Running dma_optimization_psum +2024-06-01T06:02:38Z INFO 2403803 (sg01) [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=213612 blocks=1 instructions=683201 Max writers: 576 Max Readers: 184036 +2024-06-01T06:02:38Z INFO 2403803 (sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=155962 blocks=1 instructions=337598 Max writers: 5184 Max Readers: 93924 +2024-06-01T06:02:39Z INFO 2416028 [PSUM_Allocator]: allocating PSUM +2024-06-01T06:02:39Z INFO 2416028 [PSUM_Allocator]: main loop +2024-06-01T06:02:39Z INFO 2416028 [PSUM_Allocator]: renumber locations +2024-06-01T06:02:39Z INFO 2416028 [PSUM_Allocator]: size = 30725 +2024-06-01T06:02:39Z INFO 2416028 [PSUM_Allocator]: build_no_bitmap start +2024-06-01T06:02:39Z INFO 2416028 [PSUM_Allocator]: 100% PSUM demand before spilling +2024-06-01T06:02:39Z INFO 2416028 [PSUM_Allocator]: PSUM high-water mark = 8 tensors +2024-06-01T06:02:39Z INFO 2416028 [PSUM_Allocator]: found 55864 edges +2024-06-01T06:02:39Z INFO 2416028 [PSUM_Allocator]: mean: 3.63639 +2024-06-01T06:02:39Z INFO 2416028 [PSUM_Allocator]: median: 3.12848 +2024-06-01T06:02:39Z INFO 2416028 [PSUM_Allocator]: adjacency vectors require 446912 bytes +2024-06-01T06:02:39Z INFO 2416028 [PSUM_Allocator]: build_no_bitmap done +2024-06-01T06:02:39Z INFO 2416028 [PSUM_Allocator]: find costs +2024-06-01T06:02:40Z INFO 2403803 [post_scheduler]: 
Time-aware hwm post-sched +2024-06-01T06:02:41Z INFO 2403803 (sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2024-06-01T06:02:41Z USER 2403803 (sg00) [ModuleForkPass]: address_rotation_sb finished after 2.630 seconds +2024-06-01T06:02:41Z INFO 2403803 (sg00) [ModuleForkPass]: curr_vmrss: 13024mb, ru_maxrss: 13538mb (delta=0mb) +2024-06-01T06:02:41Z INFO 2403803 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 155962 memory location(s), 1 block(s), and 337598 instruction(s). Max writers: 5184 Max Readers: 93924 +2024-06-01T06:02:41Z USER 2403803 (sg00) [ModuleForkPass]: Running dma_optimization_sb +2024-06-01T06:02:41Z INFO 2403803 (sg00) [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=155962 blocks=1 instructions=337598 Max writers: 5184 Max Readers: 93924 +2024-06-01T06:02:41Z INFO 2403803 (sg00) [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 3881110936, 26.3746% input load, 3.1414% output write, 70.484% spill/reload [sg0000] +2024-06-01T06:02:41Z INFO 2416028 [PSUM_Allocator]: best-of-n loop, heuristic = 0 +2024-06-01T06:02:41Z INFO 2416028 [PSUM_Allocator]: simplify interference graph +2024-06-01T06:02:41Z INFO 2416028 [PSUM_Allocator]: initialize low and high +2024-06-01T06:02:41Z INFO 2416028 [PSUM_Allocator]: lo = 30725 +2024-06-01T06:02:41Z INFO 2416028 [PSUM_Allocator]: hi = 0 +2024-06-01T06:02:41Z INFO 2416028 [PSUM_Allocator]: inf = 0 +2024-06-01T06:02:41Z INFO 2416028 [PSUM_Allocator]: total = 30725 +2024-06-01T06:02:41Z INFO 2416028 [PSUM_Allocator]: simplify +2024-06-01T06:02:41Z INFO 2416028 [PSUM_Allocator]: new candidates = 0 +2024-06-01T06:02:41Z INFO 2416028 [PSUM_Allocator]: select ranges +2024-06-01T06:02:41Z INFO 2403803 (sg00) [DMAOptimizationBase]: removed 0 identical load +2024-06-01T06:02:41Z INFO 2416028 [PSUM_Allocator]: no more spills +2024-06-01T06:02:41Z INFO 2416028 [PSUM_Allocator]: PSUM score = 0 (lower is better) +2024-06-01T06:02:41Z INFO 2416028 
[PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2024-06-01T06:02:41Z INFO 2416028 [PSUM_Allocator]: 100% PSUM utilization after allocation +2024-06-01T06:02:41Z INFO 2416028 [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 5033370252 +2024-06-01T06:02:41Z INFO 2416028 [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 4962 bytes +2024-06-01T06:02:41Z INFO 2416028 [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 8644640 +2024-06-01T06:02:41Z INFO 2416028 [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 714 bytes +2024-06-01T06:02:41Z INFO 2416028 [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 0 +2024-06-01T06:02:41Z INFO 2416028 [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 0 bytes +2024-06-01T06:02:42Z USER 2416028 [ModuleForkPass]: coloring_allocator_psum finished after 3.488 seconds +2024-06-01T06:02:42Z INFO 2416028 [ModuleForkPass]: curr_vmrss: 1519mb, ru_maxrss: 1657mb (delta=0mb) +2024-06-01T06:02:42Z INFO 2416028 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 62196 memory location(s), 1 block(s), and 276144 instruction(s). 
Max writers: 256 Max Readers: 66720 +2024-06-01T06:02:42Z USER 2416028 [ModuleForkPass]: Running dma_optimization_psum +2024-06-01T06:02:42Z INFO 2416028 [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=62196 blocks=1 instructions=276144 Max writers: 256 Max Readers: 66720 +2024-06-01T06:02:42Z INFO 2403803 (sg00) [DMAOptimizationBase]: [Load Merging]: removed 711 remat/cloned instructions +2024-06-01T06:02:42Z INFO 2416028 [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2024-06-01T06:02:42Z INFO 2416028 [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2024-06-01T06:02:42Z USER 2416028 [ModuleForkPass]: dma_optimization_psum finished after 0.270 seconds +2024-06-01T06:02:42Z INFO 2416028 [ModuleForkPass]: curr_vmrss: 1499mb, ru_maxrss: 1657mb (delta=0mb) +2024-06-01T06:02:42Z INFO 2416028 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 62196 memory location(s), 1 block(s), and 276144 instruction(s). 
Max writers: 256 Max Readers: 66720 +2024-06-01T06:02:42Z USER 2416028 [ModuleForkPass]: Running address_rotation_psum +2024-06-01T06:02:42Z INFO 2416028 [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=62196 blocks=1 instructions=276144 Max writers: 256 Max Readers: 66720 +2024-06-01T06:02:42Z INFO 2403803 (sg00) [DMAOptimizationBase]: [Load shrink]: shrinked 16 GCA remat/cloned instructions +2024-06-01T06:02:42Z INFO 2403803 (sg00) [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 300745088, 7.74894% out of total dma traffic(1.02363e+09) +2024-06-01T06:02:42Z INFO 2403803 (sg00) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2024-06-01T06:02:42Z INFO 2416028 [DMAOptimizationBase]: PSUM Rotation rotated 690 PSUM Banks +2024-06-01T06:02:42Z INFO 2416028 [DMAOptimizationBase]: PSUM Rotation rotated 48 PSUM Banks +2024-06-01T06:02:43Z INFO 2416028 [DMAOptimizationBase]: PSUM Rotation rotated 454 PSUM Banks +2024-06-01T06:02:43Z USER 2416028 [ModuleForkPass]: address_rotation_psum finished after 0.825 seconds +2024-06-01T06:02:43Z INFO 2416028 [ModuleForkPass]: curr_vmrss: 1554mb, ru_maxrss: 1657mb (delta=0mb) +2024-06-01T06:02:43Z INFO 2416028 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 62196 memory location(s), 1 block(s), and 276144 instruction(s). 
Max writers: 256 Max Readers: 66720 +2024-06-01T06:02:43Z USER 2416028 [ModuleForkPass]: Running coloring_allocator_sb +2024-06-01T06:02:43Z INFO 2416028 [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=62196 blocks=1 instructions=276144 Max writers: 256 Max Readers: 66720 +2024-06-01T06:02:43Z INFO 2416028 [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 5033370252 +2024-06-01T06:02:43Z INFO 2416028 [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 4962 bytes +2024-06-01T06:02:43Z INFO 2416028 [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 8644640 +2024-06-01T06:02:43Z INFO 2416028 [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 714 bytes +2024-06-01T06:02:43Z INFO 2416028 [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 0 +2024-06-01T06:02:43Z INFO 2416028 [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 0 bytes +2024-06-01T06:02:43Z INFO 2416028 [ColoringAllocator::Rep]: Allocating functions +2024-06-01T06:02:43Z INFO 2416028 [ColoringAllocator::Rep]: linearize and check +2024-06-01T06:02:43Z INFO 2416028 [SB_Allocator]: allocating SB +2024-06-01T06:02:43Z INFO 2416028 [SB_Allocator]: main loop +2024-06-01T06:02:43Z INFO 2416028 [SB_Allocator]: renumber locations +2024-06-01T06:02:43Z INFO 2416028 [SB_Allocator]: size = 30915 +2024-06-01T06:02:43Z INFO 2416028 [SB_Allocator]: find partners +2024-06-01T06:02:43Z INFO 2416028 [SB_Allocator]: found 30207 accumulation groups +2024-06-01T06:02:43Z INFO 2416028 [SB_Allocator]: largest = _dot.6260-t41980_i55 +2024-06-01T06:02:43Z INFO 2416028 [SB_Allocator]: tensors = 32 +2024-06-01T06:02:43Z INFO 2416028 [SB_Allocator]: requires 115200 bytes/partition +2024-06-01T06:02:43Z INFO 2416028 [SB_Allocator]: expanding partners +2024-06-01T06:02:43Z INFO 2416028 [SB_Allocator]: find first defs +2024-06-01T06:02:43Z INFO 2416028 [SB_Allocator]: find loads +2024-06-01T06:02:43Z INFO 2416028 [SB_Allocator]: 0 pin count 
+2024-06-01T06:02:43Z INFO 2416028 [SB_Allocator]: 8039 remat count +2024-06-01T06:02:43Z INFO 2416028 [SB_Allocator]: build interference graph +2024-06-01T06:02:43Z INFO 2416028 [SB_Allocator]: pass 1 int-tree +2024-06-01T06:02:43Z INFO 2416028 [SB_Allocator]: Num intervals 30915 Num locations 30915 +2024-06-01T06:02:43Z INFO 2416028 [SB_Allocator]: IntervalTree Build Done +2024-06-01T06:02:43Z INFO 2416028 [SB_Allocator]: info.neighbors init Done +2024-06-01T06:02:43Z INFO 2416028 [SB_Allocator]: info.neighbors partners Done +2024-06-01T06:02:43Z INFO 2416028 [SB_Allocator]: IntervalTree readback Done +2024-06-01T06:02:43Z INFO 2416028 [SB_Allocator]: edge: 2159503 +2024-06-01T06:02:43Z INFO 2416028 [SB_Allocator]: mean: 139.706 +2024-06-01T06:02:43Z INFO 2416028 [SB_Allocator]: median: 97.3782 +2024-06-01T06:02:43Z INFO 2416028 [SB_Allocator]: find costs +2024-06-01T06:02:44Z INFO 2416028 [SB_Allocator]: best-of-n loop, heuristic = 0 +2024-06-01T06:02:44Z INFO 2416028 [SB_Allocator]: simplify interference graph +2024-06-01T06:02:44Z INFO 2416028 [SB_Allocator]: initialize safe & unsafe +2024-06-01T06:02:44Z INFO 2416028 [SB_Allocator]: safe = 21190 +2024-06-01T06:02:44Z INFO 2416028 [SB_Allocator]: unsafe = 4863 +2024-06-01T06:02:44Z INFO 2416028 [SB_Allocator]: inf = 4862 +2024-06-01T06:02:44Z INFO 2416028 [SB_Allocator]: total = 30915 +2024-06-01T06:02:44Z INFO 2416028 [SB_Allocator]: simplify +2024-06-01T06:02:44Z INFO 2416028 [SB_Allocator]: simplify_step3_sorted2 #Unsafe 3526 #Pinned 0 #Safe 0 minCost 0.00110361 maxCost 0.305498 locations 30915 +2024-06-01T06:02:44Z INFO 2416028 [SB_Allocator]: new candidates = 1313 +2024-06-01T06:02:44Z INFO 2416028 [SB_Allocator]: (including 2048 infinite cost tensors) +2024-06-01T06:02:44Z INFO 2416028 [SB_Allocator]: select ranges +2024-06-01T06:02:44Z INFO 2416028 [SB_Allocator]: Total: 30915 +2024-06-01T06:02:44Z INFO 2416028 [SB_Allocator]: Spilled: 0.000 (0) +2024-06-01T06:02:44Z INFO 2416028 [SB_Allocator]: 
Allocated: 1.000 (30915) +2024-06-01T06:02:44Z INFO 2416028 [SB_Allocator]: Rover zone: 0.854 (26398) +2024-06-01T06:02:44Z INFO 2416028 [SB_Allocator]: Pre-rover zone: 0.073 (2250) +2024-06-01T06:02:44Z INFO 2416028 [SB_Allocator]: Post-rover zone: 0.073 (2267) +2024-06-01T06:02:44Z INFO 2416028 [SB_Allocator]: Slice zone: 0.000 (0) +2024-06-01T06:02:44Z INFO 2416028 [SB_Allocator]: Blocks nothing: 0.119 (3666) +2024-06-01T06:02:44Z INFO 2416028 [SB_Allocator]: Blocks medium: 0.015 (479) +2024-06-01T06:02:44Z INFO 2416028 [SB_Allocator]: Visited until medium blocking (mean): 0.702 +2024-06-01T06:02:44Z INFO 2416028 [SB_Allocator]: Visited until medium blocking (median): 0.801 +2024-06-01T06:02:44Z INFO 2416028 [SB_Allocator]: Visited until medium blocking (p95): 0.888 +2024-06-01T06:02:44Z INFO 2416028 [SB_Allocator]: Blocks tall: 0.866 (26770) +2024-06-01T06:02:44Z INFO 2416028 [SB_Allocator]: Visited until tall blocking (mean): 0.867 +2024-06-01T06:02:44Z INFO 2416028 [SB_Allocator]: Visited until tall blocking (median): 1.000 +2024-06-01T06:02:44Z INFO 2416028 [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2024-06-01T06:02:44Z INFO 2416028 [SB_Allocator]: Success +2024-06-01T06:02:44Z INFO 2403803 (sg01) [DMAOptimizationBase]: [psum spill optimization]: removed 1024 spill/reload instructions +2024-06-01T06:02:44Z INFO 2403803 (sg01) [DMAOptimizationBase]: [psum spill optimization]: removed 1024 spill/reload memory locations +2024-06-01T06:02:44Z USER 2403803 (sg01) [ModuleForkPass]: dma_optimization_psum finished after 5.621 seconds +2024-06-01T06:02:44Z INFO 2403803 (sg01) [ModuleForkPass]: curr_vmrss: 13041mb, ru_maxrss: 13538mb (delta=0mb) +2024-06-01T06:02:44Z INFO 2416028 [SB_Allocator]: SB spills = 0 tensors +2024-06-01T06:02:44Z INFO 2416028 [SB_Allocator]: size = 0 bytes/partition +2024-06-01T06:02:44Z INFO 2416028 [SB_Allocator]: remats = 0 tensors +2024-06-01T06:02:44Z INFO 2416028 [SB_Allocator]: unpinned = 0 tensors +2024-06-01T06:02:44Z 
INFO 2416028 [SB_Allocator]: size = 0 bytes/partition +2024-06-01T06:02:44Z INFO 2416028 [SB_Allocator]: SB score = 0 +2024-06-01T06:02:44Z INFO 2403803 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 212588 memory location(s), 1 block(s), and 682177 instruction(s). Max writers: 576 Max Readers: 184036 +2024-06-01T06:02:44Z USER 2403803 (sg01) [ModuleForkPass]: Running address_rotation_psum +2024-06-01T06:02:44Z INFO 2416028 [SB_Allocator]: spilling from SB cost about 0 cycles +2024-06-01T06:02:44Z INFO 2416028 [SB_Allocator]: 0 bytes/partition (0%) successfully pinned +2024-06-01T06:02:44Z INFO 2416028 [SB_Allocator]: pinning saved approximately 0 cycles +2024-06-01T06:02:44Z INFO 2416028 [SB_Allocator]: 0% SB utilization after allocation +2024-06-01T06:02:44Z INFO 2403803 (sg01) [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=212588 blocks=1 instructions=682177 Max writers: 576 Max Readers: 184036 +2024-06-01T06:02:44Z INFO 2416028 [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 5033370252 +2024-06-01T06:02:44Z INFO 2416028 [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 4962 bytes +2024-06-01T06:02:44Z INFO 2416028 [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 8644640 +2024-06-01T06:02:44Z INFO 2416028 [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 714 bytes +2024-06-01T06:02:44Z INFO 2416028 [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 0 +2024-06-01T06:02:44Z INFO 2416028 [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 0 bytes +2024-06-01T06:02:44Z USER 2416028 [ModuleForkPass]: coloring_allocator_sb finished after 1.486 seconds +2024-06-01T06:02:44Z INFO 2416028 [ModuleForkPass]: curr_vmrss: 1559mb, ru_maxrss: 1657mb (delta=0mb) +2024-06-01T06:02:44Z INFO 2416028 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 62196 memory location(s), 1 block(s), and 276144 instruction(s). 
Max writers: 256 Max Readers: 66720 +2024-06-01T06:02:44Z USER 2416028 [ModuleForkPass]: Running address_rotation_sb +2024-06-01T06:02:44Z INFO 2416028 [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=62196 blocks=1 instructions=276144 Max writers: 256 Max Readers: 66720 +2024-06-01T06:02:45Z INFO 2416028 [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2024-06-01T06:02:45Z USER 2416028 [ModuleForkPass]: address_rotation_sb finished after 0.486 seconds +2024-06-01T06:02:45Z INFO 2416028 [ModuleForkPass]: curr_vmrss: 1501mb, ru_maxrss: 1657mb (delta=0mb) +2024-06-01T06:02:45Z INFO 2416028 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 62196 memory location(s), 1 block(s), and 276144 instruction(s). Max writers: 256 Max Readers: 66720 +2024-06-01T06:02:45Z USER 2416028 [ModuleForkPass]: Running dma_optimization_sb +2024-06-01T06:02:45Z INFO 2416028 [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=62196 blocks=1 instructions=276144 Max writers: 256 Max Readers: 66720 +2024-06-01T06:02:45Z INFO 2416028 [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 5042014892, 99.6622% input load, 0.00507797% output write, 0.332748% spill/reload [sg0000] +2024-06-01T06:02:45Z INFO 2416028 [DMAOptimizationBase]: removed 0 identical load +2024-06-01T06:02:45Z INFO 2416028 [DMAOptimizationBase]: [Load Merging]: removed 33 remat/cloned instructions +2024-06-01T06:02:45Z INFO 2403803 (sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 4299 spill/reload instructions +2024-06-01T06:02:45Z INFO 2403803 (sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 4298 spill/reload memory locations +2024-06-01T06:02:45Z INFO 2416028 [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2024-06-01T06:02:45Z INFO 2416028 [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 132, 2.618e-06% out of total dma 
traffic(5.02498e+09) +2024-06-01T06:02:45Z INFO 2416028 [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2024-06-01T06:02:45Z INFO 2416028 [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2024-06-01T06:02:45Z INFO 2416028 [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2024-06-01T06:02:45Z INFO 2416028 [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2024-06-01T06:02:45Z INFO 2403803 (sg02) [SB_Allocator]: Total: 134342 +2024-06-01T06:02:45Z INFO 2403803 (sg02) [SB_Allocator]: Spilled: 0.017 (2281) +2024-06-01T06:02:45Z INFO 2403803 (sg02) [SB_Allocator]: Allocated: 0.983 (132061) +2024-06-01T06:02:45Z INFO 2403803 (sg02) [SB_Allocator]: Rover zone: 0.270 (35703) +2024-06-01T06:02:45Z INFO 2403803 (sg02) [SB_Allocator]: Pre-rover zone: 0.009 (1145) +2024-06-01T06:02:45Z INFO 2403803 (sg02) [SB_Allocator]: Post-rover zone: 0.721 (95213) +2024-06-01T06:02:45Z INFO 2403803 (sg02) [SB_Allocator]: Slice zone: 0.000 (0) +2024-06-01T06:02:45Z INFO 2403803 (sg02) [SB_Allocator]: Blocks nothing: 0.000 (0) +2024-06-01T06:02:45Z INFO 2403803 (sg02) [SB_Allocator]: Blocks medium: 0.000 (0) +2024-06-01T06:02:45Z INFO 2403803 (sg02) [SB_Allocator]: Blocks tall: 1.000 (132061) +2024-06-01T06:02:45Z INFO 2403803 (sg02) [SB_Allocator]: Visited until tall blocking (mean): 0.974 +2024-06-01T06:02:45Z INFO 2403803 (sg02) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2024-06-01T06:02:45Z INFO 2403803 (sg02) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2024-06-01T06:02:45Z INFO 2403803 (sg02) [SB_Allocator]: Success +2024-06-01T06:02:46Z INFO 2403803 (sg02) [SB_Allocator]: SB spills = 2281 tensors +2024-06-01T06:02:46Z INFO 2403803 (sg02) [SB_Allocator]: size = 4281992 bytes/partition +2024-06-01T06:02:46Z INFO 2403803 (sg02) [SB_Allocator]: remats = 31 tensors 
+2024-06-01T06:02:46Z INFO 2403803 (sg02) [SB_Allocator]: unpinned = 0 tensors +2024-06-01T06:02:46Z INFO 2403803 (sg02) [SB_Allocator]: size = 0 bytes/partition +2024-06-01T06:02:46Z INFO 2403803 (sg02) [SB_Allocator]: SB score = 3.64866e+07 +2024-06-01T06:02:46Z INFO 2403803 (sg02) [SB_Allocator]: best SB heuristic = 0 +2024-06-01T06:02:46Z INFO 2403803 (sg02) [SB_Allocator]: collect spills +2024-06-01T06:02:46Z INFO 2416028 [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2024-06-01T06:02:46Z INFO 2416028 [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2024-06-01T06:02:46Z INFO 2403803 [post_scheduler]: Time-aware simulation time: 62776810 +2024-06-01T06:02:46Z INFO 2416028 [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2024-06-01T06:02:46Z INFO 2403803 (sg01) [DMAOptimizationBase]: PSUM Rotation rotated 13726 PSUM Banks +2024-06-01T06:02:46Z INFO 2416028 [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2024-06-01T06:02:47Z INFO 2416028 [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2024-06-01T06:02:47Z INFO 2416028 [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2024-06-01T06:02:47Z INFO 2416028 [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2024-06-01T06:02:47Z INFO 2416028 [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2024-06-01T06:02:47Z INFO 2416028 [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload instructions +2024-06-01T06:02:47Z INFO 2416028 [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload memory locations +2024-06-01T06:02:47Z INFO 2403803 [post_scheduler]: Done PosT ScheD Sat Jun 1 06:02:47 2024 +2024-06-01T06:02:47Z USER 2403803 (sg03) 
[ModuleForkPass]: post_sched finished after 17.937 seconds +2024-06-01T06:02:47Z INFO 2403803 (sg03) [ModuleForkPass]: curr_vmrss: 13133mb, ru_maxrss: 13538mb (delta=45mb) +2024-06-01T06:02:47Z INFO 2403803 (sg03) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 66596 memory location(s), 1 block(s), and 353547 instruction(s). Max writers: 512 Max Readers: 80896 +2024-06-01T06:02:47Z USER 2403803 (sg03) [ModuleForkPass]: Running address_rotation_sb +2024-06-01T06:02:47Z INFO 2403803 (sg03) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=66596 blocks=1 instructions=353547 Max writers: 512 Max Readers: 80896 +2024-06-01T06:02:48Z INFO 2403803 (sg02) [SB_Allocator]: insert spills +2024-06-01T06:02:48Z INFO 2403803 (sg01) [DMAOptimizationBase]: PSUM Rotation rotated 1994 PSUM Banks +2024-06-01T06:02:48Z INFO 2416028 [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 0 SpillSaves and Reloads +2024-06-01T06:02:48Z INFO 2416028 [DMAOptimizationBase]: average loaded DMA size 4962 bytes +2024-06-01T06:02:48Z INFO 2416028 [DMAOptimizationBase]: average saved DMA size 714 bytes +2024-06-01T06:02:49Z INFO 2416028 [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 5033370120 +2024-06-01T06:02:49Z INFO 2416028 [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 4962 bytes +2024-06-01T06:02:49Z INFO 2416028 [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 8644640 +2024-06-01T06:02:49Z INFO 2416028 [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 714 bytes +2024-06-01T06:02:49Z INFO 2403803 (sg02) [SB_Allocator]: deleting loads #loadsToDelete: 31 +2024-06-01T06:02:49Z INFO 2403803 (sg02) [SB_Allocator]: deleting locs #locationsToDelete: 31 +2024-06-01T06:02:49Z INFO 2403803 (sg02) [SB_Allocator]: locationsToDelete done +2024-06-01T06:02:49Z INFO 2416028 [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs 
+2024-06-01T06:02:49Z INFO 2416028 [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2024-06-01T06:02:49Z INFO 2416028 [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 132, 2.618e-06% out of total dma traffic +2024-06-01T06:02:49Z INFO 2416028 [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 5042014760, 99.6622% input load, 0.00507797% output write, 0.332748% spill/reload [sg0000] +2024-06-01T06:02:49Z INFO 2416028 [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 5033370120 +2024-06-01T06:02:49Z INFO 2416028 [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 4962 bytes +2024-06-01T06:02:49Z INFO 2416028 [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 8644640 +2024-06-01T06:02:49Z INFO 2416028 [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 714 bytes +2024-06-01T06:02:50Z INFO 2416028 [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 0 +2024-06-01T06:02:50Z INFO 2416028 [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 0 bytes +2024-06-01T06:02:50Z INFO 2416028 [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 4911 bytes +2024-06-01T06:02:50Z INFO 2416028 [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2024-06-01T06:02:50Z INFO 2416028 [DMAOptimizationBase]: DMA optimization re-enable optimization +2024-06-01T06:02:50Z USER 2416028 [ModuleForkPass]: dma_optimization_sb finished after 5.068 seconds +2024-06-01T06:02:50Z INFO 2416028 [ModuleForkPass]: curr_vmrss: 1550mb, ru_maxrss: 1657mb (delta=0mb) +2024-06-01T06:02:50Z INFO 2416028 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 62163 memory location(s), 1 block(s), and 276111 instruction(s). 
Max writers: 256 Max Readers: 66720 +2024-06-01T06:02:50Z USER 2416028 [ModuleForkPass]: Running address_rotation_sb +2024-06-01T06:02:50Z INFO 2416028 [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=62163 blocks=1 instructions=276111 Max writers: 256 Max Readers: 66720 +2024-06-01T06:02:50Z INFO 2403803 (sg03) [DMAOptimizationBase]: PSUM Rotation rotated 5748 PSUM Banks +2024-06-01T06:02:50Z INFO 2416028 [DMAOptimizationBase]: SB Rotation rotated 126 Sb address +2024-06-01T06:02:51Z INFO 2403803 (sg01) [DMAOptimizationBase]: PSUM Rotation rotated 4114 PSUM Banks +2024-06-01T06:02:51Z USER 2403803 (sg01) [ModuleForkPass]: address_rotation_psum finished after 6.519 seconds +2024-06-01T06:02:51Z INFO 2403803 (sg01) [ModuleForkPass]: curr_vmrss: 13012mb, ru_maxrss: 13538mb (delta=0mb) +2024-06-01T06:02:51Z INFO 2403803 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 212588 memory location(s), 1 block(s), and 682177 instruction(s). Max writers: 576 Max Readers: 184036 +2024-06-01T06:02:51Z USER 2403803 (sg01) [ModuleForkPass]: Running coloring_allocator_sb +2024-06-01T06:02:51Z INFO 2416028 [DMAOptimizationBase]: SB Rotation rotated 3619 Sb address +2024-06-01T06:02:51Z INFO 2403803 (sg03) [DMAOptimizationBase]: PSUM Rotation rotated 4306 PSUM Banks +2024-06-01T06:02:51Z INFO 2403803 (sg01) [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=212588 blocks=1 instructions=682177 Max writers: 576 Max Readers: 184036 +2024-06-01T06:02:51Z INFO 2403803 (sg01) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 9089670928 +2024-06-01T06:02:51Z INFO 2403803 (sg01) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 2980 bytes +2024-06-01T06:02:51Z INFO 2403803 (sg01) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 1312604160 +2024-06-01T06:02:51Z INFO 2403803 (sg01) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 3303 bytes +2024-06-01T06:02:52Z INFO 2403803 
(sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 32768 +2024-06-01T06:02:52Z INFO 2403803 (sg01) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 128 bytes +2024-06-01T06:02:52Z INFO 2403803 (sg01) [ColoringAllocator::Rep]: Allocating functions +2024-06-01T06:02:52Z INFO 2403803 (sg01) [ColoringAllocator::Rep]: linearize and check +2024-06-01T06:02:52Z INFO 2403803 (sg00) [DMAOptimizationBase]: [spill optimization round 1]: removed 339 spill/reload instructions +2024-06-01T06:02:52Z INFO 2403803 (sg00) [DMAOptimizationBase]: [spill optimization round 1]: removed 335 spill/reload memory locations +2024-06-01T06:02:52Z INFO 2416028 [DMAOptimizationBase]: SB Rotation rotated 893 Sb address +2024-06-01T06:02:52Z INFO 2403803 (sg02) [SB_Allocator]: main loop +2024-06-01T06:02:52Z INFO 2403803 (sg03) [DMAOptimizationBase]: PSUM Rotation rotated 623 PSUM Banks +2024-06-01T06:02:53Z INFO 2403803 (sg03) [DMAOptimizationBase]: SB Rotation rotated 13 Sb address +2024-06-01T06:02:53Z INFO 2403803 (sg02) [SB_Allocator]: renumber locations +2024-06-01T06:02:53Z INFO 2403803 (sg02) [SB_Allocator]: size = 147483 +2024-06-01T06:02:53Z INFO 2403803 (sg02) [SB_Allocator]: find partners +2024-06-01T06:02:53Z INFO 2416028 [DMAOptimizationBase]: SB Rotation rotated 255 Sb address +2024-06-01T06:02:53Z INFO 2403803 (sg01) [SB_Allocator]: allocating SB +2024-06-01T06:02:53Z INFO 2403803 (sg01) [SB_Allocator]: main loop +2024-06-01T06:02:53Z INFO 2403803 (sg03) [DMAOptimizationBase]: SB Rotation rotated 43 Sb address +2024-06-01T06:02:53Z INFO 2403803 (sg01) [SB_Allocator]: renumber locations +2024-06-01T06:02:53Z INFO 2403803 (sg01) [SB_Allocator]: size = 134342 +2024-06-01T06:02:53Z INFO 2403803 (sg01) [SB_Allocator]: find partners +2024-06-01T06:02:54Z INFO 2403803 (sg02) [SB_Allocator]: found 74858 accumulation groups +2024-06-01T06:02:54Z INFO 2403803 (sg02) [SB_Allocator]: largest = _dot.651-t806_i1590 +2024-06-01T06:02:54Z INFO 2403803 (sg02) 
[SB_Allocator]: tensors = 64 +2024-06-01T06:02:54Z INFO 2403803 (sg02) [SB_Allocator]: requires 147456 bytes/partition +2024-06-01T06:02:54Z INFO 2403803 (sg02) [SB_Allocator]: expanding partners +2024-06-01T06:02:54Z INFO 2416028 [DMAOptimizationBase]: SB Rotation rotated 3210 Sb address +2024-06-01T06:02:54Z INFO 2403803 (sg01) [SB_Allocator]: found 74858 accumulation groups +2024-06-01T06:02:54Z INFO 2403803 (sg01) [SB_Allocator]: largest = _dot.510-t806_i279 +2024-06-01T06:02:54Z INFO 2403803 (sg01) [SB_Allocator]: tensors = 64 +2024-06-01T06:02:54Z INFO 2403803 (sg01) [SB_Allocator]: requires 147456 bytes/partition +2024-06-01T06:02:54Z INFO 2403803 (sg01) [SB_Allocator]: expanding partners +2024-06-01T06:02:54Z INFO 2403803 (sg03) [DMAOptimizationBase]: SB Rotation rotated 14 Sb address +2024-06-01T06:02:54Z INFO 2416028 [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2024-06-01T06:02:54Z USER 2416028 [ModuleForkPass]: address_rotation_sb finished after 4.305 seconds +2024-06-01T06:02:54Z INFO 2416028 [ModuleForkPass]: curr_vmrss: 1531mb, ru_maxrss: 1657mb (delta=0mb) +2024-06-01T06:02:54Z INFO 2416028 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 62163 memory location(s), 1 block(s), and 276111 instruction(s). 
Max writers: 256 Max Readers: 66720 +2024-06-01T06:02:54Z USER 2416028 [ModuleForkPass]: Running coloring_allocator_dram +2024-06-01T06:02:54Z INFO 2416028 [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=62163 blocks=1 instructions=276111 Max writers: 256 Max Readers: 66720 +2024-06-01T06:02:54Z INFO 2416028 [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 5033370120 +2024-06-01T06:02:54Z INFO 2416028 [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 4962 bytes +2024-06-01T06:02:54Z INFO 2416028 [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 8644640 +2024-06-01T06:02:54Z INFO 2416028 [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 714 bytes +2024-06-01T06:02:54Z INFO 2416028 [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 0 +2024-06-01T06:02:54Z INFO 2416028 [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 0 bytes +2024-06-01T06:02:54Z INFO 2416028 [ColoringAllocator::Rep]: Allocating functions +2024-06-01T06:02:54Z INFO 2416028 [ColoringAllocator::Rep]: linearize and check +2024-06-01T06:02:55Z INFO 2416028 [DRAM_Allocator]: allocating spills in DRAM pre_link mode +2024-06-01T06:02:55Z INFO 2416028 [DRAM_Allocator]: reserved space = 6493817196 bytes +2024-06-01T06:02:55Z INFO 2416028 [DRAM_Allocator]: spill space = 16908288 bytes +2024-06-01T06:02:55Z INFO 2416028 [DRAM_Allocator]: aligned spill space = 16908288 bytes +2024-06-01T06:02:55Z INFO 2416028 [DRAM_Allocator]: dram space = 107374182400 bytes +2024-06-01T06:02:55Z INFO 2416028 [DRAM_Allocator]: renumber locations +2024-06-01T06:02:55Z INFO 2416028 [DRAM_Allocator]: size = 129 +2024-06-01T06:02:55Z INFO 2416028 [DRAM_Allocator]: find first defs +2024-06-01T06:02:55Z INFO 2403803 (sg03) [DMAOptimizationBase]: SB Rotation rotated 729 Sb address +2024-06-01T06:02:55Z INFO 2416028 [DRAM_Allocator]: Num intervals 129 Num locations 129 +2024-06-01T06:02:55Z INFO 2416028 [DRAM_Allocator]: 
IntervalTree Build Done +2024-06-01T06:02:55Z INFO 2416028 [DRAM_Allocator]: info.neighbors init Done +2024-06-01T06:02:55Z INFO 2416028 [DRAM_Allocator]: IntervalTree readback Done +2024-06-01T06:02:55Z INFO 2416028 [DRAM_Allocator]: simplify interference graph +2024-06-01T06:02:55Z INFO 2416028 [DRAM_Allocator]: initialize low and high +2024-06-01T06:02:55Z INFO 2416028 [DRAM_Allocator]: lo = 129 +2024-06-01T06:02:55Z INFO 2416028 [DRAM_Allocator]: hi = 0 +2024-06-01T06:02:55Z INFO 2416028 [DRAM_Allocator]: total = 129 +2024-06-01T06:02:55Z INFO 2416028 [DRAM_Allocator]: simplify +2024-06-01T06:02:55Z INFO 2416028 [DRAM_Allocator]: new candidates = 0 +2024-06-01T06:02:55Z INFO 2416028 [DRAM_Allocator]: select ranges +2024-06-01T06:02:55Z INFO 2416028 [DRAM_Allocator]: CC buffer size limit 524288000 +2024-06-01T06:02:55Z INFO 2416028 [DRAM_Allocator]: allreduce_dram_hwm 16777216 +2024-06-01T06:02:55Z INFO 2416028 [DRAM_Allocator]: Real CC buffer size 16777216 +2024-06-01T06:02:55Z INFO 2416028 [DRAM_Allocator]: DRAM hwm after allocation: 16908288 +2024-06-01T06:02:55Z INFO 2416028 [DRAM_Allocator]: DRAM allocation successful +2024-06-01T06:02:55Z INFO 2416028 [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 5033370120 +2024-06-01T06:02:55Z INFO 2416028 [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 4962 bytes +2024-06-01T06:02:55Z INFO 2403803 (sg01) [SB_Allocator]: find first defs +2024-06-01T06:02:55Z INFO 2416028 [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 8644640 +2024-06-01T06:02:55Z INFO 2416028 [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 714 bytes +2024-06-01T06:02:55Z INFO 2416028 [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 0 +2024-06-01T06:02:55Z INFO 2416028 [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 0 bytes +2024-06-01T06:02:55Z USER 2416028 [ModuleForkPass]: coloring_allocator_dram finished after 1.316 seconds +2024-06-01T06:02:55Z INFO 2416028 
[ModuleForkPass]: curr_vmrss: 1534mb, ru_maxrss: 1657mb (delta=0mb) +2024-06-01T06:02:55Z INFO 2403803 (sg02) [SB_Allocator]: find first defs +2024-06-01T06:02:55Z INFO 2416028 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 62163 memory location(s), 1 block(s), and 276111 instruction(s). Max writers: 256 Max Readers: 66720 +2024-06-01T06:02:55Z USER 2416028 [ModuleForkPass]: Running address_rotation_dram +2024-06-01T06:02:55Z INFO 2416028 [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=62163 blocks=1 instructions=276111 Max writers: 256 Max Readers: 66720 +2024-06-01T06:02:55Z INFO 2416028 [DMAOptimizationBase]: Runtime page size at 512MB +2024-06-01T06:02:56Z INFO 2403803 (sg01) [SB_Allocator]: find loads +2024-06-01T06:02:56Z INFO 2416028 [DMAOptimizationBase]: DRAM hwm before rotation 16908288 +2024-06-01T06:02:56Z INFO 2403803 (sg03) [DMAOptimizationBase]: SB Rotation rotated 2854 Sb address +2024-06-01T06:02:56Z INFO 2403803 (sg02) [SB_Allocator]: find loads +2024-06-01T06:02:56Z INFO 2403803 (sg01) [SB_Allocator]: 0 pin count +2024-06-01T06:02:56Z INFO 2403803 (sg01) [SB_Allocator]: 23447 remat count +2024-06-01T06:02:56Z INFO 2403803 (sg01) [SB_Allocator]: build interference graph +2024-06-01T06:02:56Z INFO 2403803 (sg01) [SB_Allocator]: pass 1 int-tree +2024-06-01T06:02:56Z INFO 2416028 [DMAOptimizationBase]: allreduce buffer size 524288000 +2024-06-01T06:02:56Z INFO 2416028 [DMAOptimizationBase]: allreduce hwm 16777216 +2024-06-01T06:02:56Z INFO 2416028 [DMAOptimizationBase]: Real CC buffer size 16777216 +2024-06-01T06:02:56Z INFO 2416028 [DMAOptimizationBase]: DRAM hwm after rotation 16908288 +2024-06-01T06:02:56Z INFO 2416028 [DMAOptimizationBase]: DRAM Rotation rotated 0 Dram address +2024-06-01T06:02:56Z USER 2416028 [ModuleForkPass]: address_rotation_dram finished after 0.984 seconds +2024-06-01T06:02:56Z INFO 2416028 [ModuleForkPass]: curr_vmrss: 1531mb, ru_maxrss: 1657mb (delta=0mb) +2024-06-01T06:02:56Z 
INFO 2416028 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 62163 memory location(s), 1 block(s), and 276111 instruction(s). Max writers: 256 Max Readers: 66720 +2024-06-01T06:02:56Z USER 2416028 [ModuleForkPass]: Running tensorcopy_accel +2024-06-01T06:02:56Z INFO 2416028 [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=62163 blocks=1 instructions=276111 Max writers: 256 Max Readers: 66720 +2024-06-01T06:02:56Z INFO 2416028 [TensorCopyAccel::Impl]: Running peephole optimization pass +2024-06-01T06:02:56Z INFO 2403803 (sg01) [SB_Allocator]: Num intervals 134342 Num locations 134342 +2024-06-01T06:02:56Z INFO 2403803 (sg01) [SB_Allocator]: IntervalTree Build Done +2024-06-01T06:02:57Z INFO 2403803 (sg01) [SB_Allocator]: info.neighbors init Done +2024-06-01T06:02:57Z INFO 2416028 [TensorCopyAccel::Impl]: Accelerated 0 out of 26058 tensorcopy in Function: sg0000 average acceleration factor: -nan +2024-06-01T06:02:57Z USER 2416028 [ModuleForkPass]: tensorcopy_accel finished after 0.090 seconds +2024-06-01T06:02:57Z INFO 2416028 [ModuleForkPass]: curr_vmrss: 1528mb, ru_maxrss: 1657mb (delta=0mb) +2024-06-01T06:02:57Z INFO 2416028 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 62163 memory location(s), 1 block(s), and 276111 instruction(s). Max writers: 256 Max Readers: 66720 +2024-06-01T06:02:57Z USER 2416028 [ModuleForkPass]: Running peephole_opts +2024-06-01T06:02:57Z INFO 2403803 (sg02) [SB_Allocator]: 0 pin count +2024-06-01T06:02:57Z INFO 2403803 (sg02) [SB_Allocator]: 35277 remat count +2024-06-01T06:02:57Z INFO 2403803 (sg02) [SB_Allocator]: build interference graph +2024-06-01T06:02:57Z INFO 2403803 (sg02) [SB_Allocator]: pass 1 int-tree +2024-06-01T06:02:57Z INFO 2416028 [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=62163 blocks=1 instructions=276111 Max writers: 256 Max Readers: 66720 +2024-06-01T06:02:57Z INFO 2416028 [PeepholeOpts]: PeepholeOpts enabled? 
Recip: true Tsp: true Tc: true SplitSelect: true +2024-06-01T06:02:57Z INFO 2403803 (sg03) [DMAOptimizationBase]: SB Rotation rotated 64 Sb address +2024-06-01T06:02:57Z INFO 2416028 [PeepholeOpts]: Split Select: 960 +2024-06-01T06:02:57Z USER 2416028 [ModuleForkPass]: peephole_opts finished after 0.104 seconds +2024-06-01T06:02:57Z INFO 2416028 [ModuleForkPass]: curr_vmrss: 1532mb, ru_maxrss: 1657mb (delta=0mb) +2024-06-01T06:02:57Z INFO 2416028 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 62163 memory location(s), 1 block(s), and 277071 instruction(s). Max writers: 256 Max Readers: 66720 +2024-06-01T06:02:57Z USER 2416028 [ModuleForkPass]: Running lower_kernel +2024-06-01T06:02:57Z INFO 2416028 [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=62163 blocks=1 instructions=277071 Max writers: 256 Max Readers: 66720 +2024-06-01T06:02:57Z INFO 2416028 [LowerKernel]: Started running LowerKernel +2024-06-01T06:02:57Z INFO 2416028 [LowerKernel]: Start of kernel lowering pass, number of insts: 277071, number of allocs: 62163 +2024-06-01T06:02:57Z INFO 2403803 (sg01) [SB_Allocator]: info.neighbors partners Done +2024-06-01T06:02:57Z INFO 2416028 [LowerKernel]: Scan BKs time (s): 0.051899 +2024-06-01T06:02:57Z INFO 2416028 [LowerKernel]: Lower BKs time (s): 6e-06 +2024-06-01T06:02:57Z USER 2416028 [ModuleForkPass]: lower_kernel finished after 0.084 seconds +2024-06-01T06:02:57Z INFO 2416028 [ModuleForkPass]: curr_vmrss: 1532mb, ru_maxrss: 1657mb (delta=0mb) +2024-06-01T06:02:57Z INFO 2416028 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 62163 memory location(s), 1 block(s), and 277071 instruction(s). 
Max writers: 256 Max Readers: 66720 +2024-06-01T06:02:57Z USER 2416028 [ModuleForkPass]: Running build_fdeps +2024-06-01T06:02:57Z INFO 2416028 [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=62163 blocks=1 instructions=277071 Max writers: 256 Max Readers: 66720 +2024-06-01T06:02:57Z INFO 2416028 [build_flow_deps]: Start build fdeps. Invocation: 2Sat Jun 1 06:02:57 2024 +2024-06-01T06:02:57Z INFO 2416028 [build_flow_deps]: Allocs: 62163 instructions: 277071 +2024-06-01T06:02:57Z INFO 2403803 (sg01) [SB_Allocator]: IntervalTree readback Done +2024-06-01T06:02:57Z INFO 2403803 (sg02) [SB_Allocator]: Num intervals 147483 Num locations 147483 +2024-06-01T06:02:57Z INFO 2403803 (sg02) [SB_Allocator]: IntervalTree Build Done +2024-06-01T06:02:57Z INFO 2403803 (sg01) [SB_Allocator]: edge: 68199142 +2024-06-01T06:02:57Z INFO 2403803 (sg01) [SB_Allocator]: mean: 1015.31 +2024-06-01T06:02:57Z INFO 2403803 (sg01) [SB_Allocator]: median: 625.034 +2024-06-01T06:02:57Z INFO 2403803 (sg01) [SB_Allocator]: find costs +2024-06-01T06:02:57Z INFO 2403803 (sg02) [SB_Allocator]: info.neighbors init Done +2024-06-01T06:02:57Z INFO 2403803 (sg00) [DMAOptimizationBase]: [spill optimization round 2]: removed 44 spill/reload instructions +2024-06-01T06:02:57Z INFO 2403803 (sg00) [DMAOptimizationBase]: [spill optimization round 2]: removed 44 spill/reload memory locations +2024-06-01T06:02:57Z INFO 2403803 (sg02) [SB_Allocator]: info.neighbors partners Done +2024-06-01T06:02:57Z INFO 2403803 (sg02) [SB_Allocator]: IntervalTree readback Done +2024-06-01T06:02:57Z INFO 2403803 (sg02) [SB_Allocator]: edge: 8888587 +2024-06-01T06:02:57Z INFO 2403803 (sg02) [SB_Allocator]: mean: 120.537 +2024-06-01T06:02:57Z INFO 2403803 (sg02) [SB_Allocator]: median: 89.3004 +2024-06-01T06:02:57Z INFO 2403803 (sg02) [SB_Allocator]: find costs +2024-06-01T06:02:57Z INFO 2403803 (sg03) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2024-06-01T06:02:57Z USER 2403803 (sg03) 
[ModuleForkPass]: address_rotation_sb finished after 9.889 seconds +2024-06-01T06:02:57Z INFO 2403803 (sg03) [ModuleForkPass]: curr_vmrss: 13251mb, ru_maxrss: 13538mb (delta=0mb) +2024-06-01T06:02:57Z INFO 2403803 (sg03) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 66596 memory location(s), 1 block(s), and 353547 instruction(s). Max writers: 512 Max Readers: 80896 +2024-06-01T06:02:57Z USER 2403803 (sg03) [ModuleForkPass]: Running anti_dependency_analyzer +2024-06-01T06:02:57Z INFO 2403803 (sg03) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=66596 blocks=1 instructions=353547 Max writers: 512 Max Readers: 80896 +2024-06-01T06:02:57Z INFO 2403803 (sg03) [AntiDependencyAnalyzer]: Batch size: 1000 +2024-06-01T06:02:57Z INFO 2403803 (sg03) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2024-06-01T06:02:58Z INFO 2403803 (sg01) [SB_Allocator]: best-of-n loop, heuristic = 0 +2024-06-01T06:02:58Z INFO 2403803 (sg01) [SB_Allocator]: simplify interference graph +2024-06-01T06:02:58Z INFO 2403803 (sg01) [SB_Allocator]: initialize safe & unsafe +2024-06-01T06:02:58Z INFO 2416028 [build_flow_deps]: Build fdeps inserted 821345 edges +2024-06-01T06:02:58Z INFO 2416028 [build_flow_deps]: Done build fdeps 821345 Sat Jun 1 06:02:58 2024 +2024-06-01T06:02:58Z USER 2416028 [ModuleForkPass]: build_fdeps finished after 0.949 seconds +2024-06-01T06:02:58Z INFO 2416028 [ModuleForkPass]: curr_vmrss: 1671mb, ru_maxrss: 1671mb (delta=14mb) +2024-06-01T06:02:58Z INFO 2416028 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 62163 memory location(s), 1 block(s), and 277071 instruction(s). 
Max writers: 256 Max Readers: 66720 +2024-06-01T06:02:58Z USER 2416028 [ModuleForkPass]: Running remove_redundancies +2024-06-01T06:02:58Z INFO 2403803 (sg01) [SB_Allocator]: safe = 3343 +2024-06-01T06:02:58Z INFO 2403803 (sg01) [SB_Allocator]: unsafe = 97022 +2024-06-01T06:02:58Z INFO 2403803 (sg01) [SB_Allocator]: inf = 33977 +2024-06-01T06:02:58Z INFO 2403803 (sg01) [SB_Allocator]: total = 134342 +2024-06-01T06:02:58Z INFO 2403803 (sg01) [SB_Allocator]: simplify +2024-06-01T06:02:58Z INFO 2403803 (sg01) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 96936 #Pinned 0 #Safe 0 minCost 4.24566e-05 maxCost 0.0553019 locations 134342 +2024-06-01T06:02:58Z INFO 2416028 [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=62163 blocks=1 instructions=277071 Max writers: 256 Max Readers: 66720 +2024-06-01T06:02:58Z INFO 2416028 [RemoveRedundancies]: remove_clobbered_writes +2024-06-01T06:02:58Z INFO 2416028 [RemoveRedundancies]: remove_clobbered_writes: 0 +2024-06-01T06:02:58Z INFO 2416028 [RemoveRedundancies]: remove_useless_insts +2024-06-01T06:02:58Z INFO 2403803 (sg02) [SB_Allocator]: best-of-n loop, heuristic = 0 +2024-06-01T06:02:58Z INFO 2403803 (sg02) [SB_Allocator]: simplify interference graph +2024-06-01T06:02:58Z INFO 2403803 (sg02) [SB_Allocator]: initialize safe & unsafe +2024-06-01T06:02:58Z INFO 2403803 (sg02) [SB_Allocator]: safe = 12629 +2024-06-01T06:02:58Z INFO 2403803 (sg02) [SB_Allocator]: unsafe = 550 +2024-06-01T06:02:58Z INFO 2403803 (sg02) [SB_Allocator]: inf = 2243 +2024-06-01T06:02:58Z INFO 2403803 (sg02) [SB_Allocator]: total = 15422 +2024-06-01T06:02:58Z INFO 2403803 (sg02) [SB_Allocator]: simplify +2024-06-01T06:02:58Z INFO 2403803 (sg02) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 470 #Pinned 0 #Safe 0 minCost 0.0167677 maxCost 0.0314575 locations 147483 +2024-06-01T06:02:58Z INFO 2416028 [RemoveRedundancies]: remove Useless Instructions: 0 +2024-06-01T06:02:58Z USER 2416028 [ModuleForkPass]: remove_redundancies 
finished after 0.168 seconds +2024-06-01T06:02:58Z INFO 2416028 [ModuleForkPass]: curr_vmrss: 1661mb, ru_maxrss: 1671mb (delta=0mb) +2024-06-01T06:02:58Z INFO 2403803 (sg02) [SB_Allocator]: new candidates = 453 +2024-06-01T06:02:58Z INFO 2403803 (sg02) [SB_Allocator]: (including 2243 infinite cost tensors) +2024-06-01T06:02:58Z INFO 2403803 (sg02) [SB_Allocator]: select ranges +2024-06-01T06:02:58Z INFO 2416028 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 62163 memory location(s), 1 block(s), and 277071 instruction(s). Max writers: 256 Max Readers: 66720 +2024-06-01T06:02:58Z USER 2416028 [ModuleForkPass]: Running anti_dependency_analyzer +2024-06-01T06:02:58Z INFO 2416028 [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=62163 blocks=1 instructions=277071 Max writers: 256 Max Readers: 66720 +2024-06-01T06:02:58Z INFO 2416028 [AntiDependencyAnalyzer]: Batch size: 1000 +2024-06-01T06:02:58Z INFO 2416028 [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2024-06-01T06:02:58Z INFO 2403803 (sg02) [SB_Allocator]: Total: 15422 +2024-06-01T06:02:58Z INFO 2403803 (sg02) [SB_Allocator]: Spilled: 0.000 (0) +2024-06-01T06:02:58Z INFO 2403803 (sg02) [SB_Allocator]: Allocated: 1.000 (15422) +2024-06-01T06:02:58Z INFO 2403803 (sg02) [SB_Allocator]: Rover zone: 0.942 (14528) +2024-06-01T06:02:58Z INFO 2403803 (sg02) [SB_Allocator]: Pre-rover zone: 0.013 (193) +2024-06-01T06:02:58Z INFO 2403803 (sg02) [SB_Allocator]: Post-rover zone: 0.045 (701) +2024-06-01T06:02:58Z INFO 2403803 (sg02) [SB_Allocator]: Slice zone: 0.000 (0) +2024-06-01T06:02:58Z INFO 2403803 (sg02) [SB_Allocator]: Blocks nothing: 0.010 (160) +2024-06-01T06:02:58Z INFO 2403803 (sg02) [SB_Allocator]: Blocks medium: 0.000 (0) +2024-06-01T06:02:58Z INFO 2403803 (sg02) [SB_Allocator]: Blocks tall: 0.990 (15262) +2024-06-01T06:02:58Z INFO 2403803 (sg02) [SB_Allocator]: Visited until tall blocking (mean): 0.990 +2024-06-01T06:02:58Z INFO 2403803 (sg02) 
[SB_Allocator]: Visited until tall blocking (median): 1.000 +2024-06-01T06:02:58Z INFO 2403803 (sg02) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2024-06-01T06:02:58Z INFO 2403803 (sg02) [SB_Allocator]: Success +2024-06-01T06:02:58Z INFO 2403803 (sg02) [SB_Allocator]: SB spills = 0 tensors +2024-06-01T06:02:58Z INFO 2403803 (sg02) [SB_Allocator]: size = 0 bytes/partition +2024-06-01T06:02:58Z INFO 2403803 (sg02) [SB_Allocator]: remats = 0 tensors +2024-06-01T06:02:58Z INFO 2403803 (sg02) [SB_Allocator]: unpinned = 0 tensors +2024-06-01T06:02:58Z INFO 2403803 (sg02) [SB_Allocator]: size = 0 bytes/partition +2024-06-01T06:02:58Z INFO 2403803 (sg02) [SB_Allocator]: SB score = 0 +2024-06-01T06:02:59Z INFO 2403803 (sg02) [SB_Allocator]: spilling from SB cost about 3.64866e+07 cycles +2024-06-01T06:02:59Z INFO 2403803 (sg02) [SB_Allocator]: number of tensors spilled from SB = 2281 +2024-06-01T06:02:59Z INFO 2403803 (sg02) [SB_Allocator]: total size of spilled tensors = 4281992 bytes/partition +2024-06-01T06:02:59Z INFO 2403803 (sg02) [SB_Allocator]: 0 bytes/partition (0%) successfully pinned +2024-06-01T06:02:59Z INFO 2403803 (sg02) [SB_Allocator]: pinning saved approximately 0 cycles +2024-06-01T06:02:59Z INFO 2403803 (sg02) [SB_Allocator]: 0% SB utilization after allocation +2024-06-01T06:02:59Z INFO 2403803 (sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 12179890576 +2024-06-01T06:02:59Z INFO 2403803 (sg02) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 2585 bytes +2024-06-01T06:02:59Z INFO 2403803 (sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 2231435268 +2024-06-01T06:02:59Z INFO 2403803 (sg02) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 2615 bytes +2024-06-01T06:02:59Z INFO 2403803 (sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 32768 +2024-06-01T06:02:59Z INFO 2403803 (sg02) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 128 bytes 
+2024-06-01T06:02:59Z USER 2403803 (sg02) [ModuleForkPass]: coloring_allocator_sb finished after 38.265 seconds +2024-06-01T06:02:59Z INFO 2403803 (sg02) [ModuleForkPass]: curr_vmrss: 13403mb, ru_maxrss: 13538mb (delta=618mb) +2024-06-01T06:02:59Z INFO 2403803 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 227979 memory location(s), 1 block(s), and 698879 instruction(s). Max writers: 576 Max Readers: 184036 +2024-06-01T06:02:59Z USER 2403803 (sg02) [ModuleForkPass]: Running address_rotation_sb +2024-06-01T06:02:59Z INFO 2403803 (sg02) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=227979 blocks=1 instructions=698879 Max writers: 576 Max Readers: 184036 +2024-06-01T06:02:59Z INFO 2403803 [AntiDependencyAnalyzer-PSUM1-0-32]: Finished analyzing 43259 access patterns a mean/median 1.00125/1 intervals per access pattern and mean/median 5.27522/6.04078 intersections per interval. +2024-06-01T06:02:59Z INFO 2403803 [AntiDependencyAnalyzer-PSUM2-32-128]: Finished analyzing 38180 access patterns a mean/median 1.0011/1 intervals per access pattern and mean/median 5.59882/7.19242 intersections per interval. +2024-06-01T06:02:59Z INFO 2403803 [AntiDependencyAnalyzer-PSUM2-0-32]: Finished analyzing 38378 access patterns a mean/median 1.00109/1 intervals per access pattern and mean/median 5.60753/7.22953 intersections per interval. +2024-06-01T06:02:59Z INFO 2403803 [AntiDependencyAnalyzer-PSUM4-0-32]: Finished analyzing 38550 access patterns a mean/median 1.00109/1 intervals per access pattern and mean/median 5.76007/7.51055 intersections per interval. +2024-06-01T06:02:59Z INFO 2403803 [AntiDependencyAnalyzer-PSUM0-0-32]: Finished analyzing 38833 access patterns a mean/median 1.00116/1 intervals per access pattern and mean/median 5.76132/7.48508 intersections per interval. 
+2024-06-01T06:02:59Z INFO 2403803 [AntiDependencyAnalyzer-PSUM0-32-128]: Finished analyzing 38470 access patterns a mean/median 1.00117/1 intervals per access pattern and mean/median 5.75147/7.4972 intersections per interval. +2024-06-01T06:02:59Z INFO 2403803 [AntiDependencyAnalyzer-PSUM5-0-128]: Finished analyzing 42506 access patterns a mean/median 1.00113/1 intervals per access pattern and mean/median 5.52414/6.78504 intersections per interval. +2024-06-01T06:02:59Z INFO 2403803 [AntiDependencyAnalyzer-PSUM3-32-128]: Finished analyzing 42452 access patterns a mean/median 1.00113/1 intervals per access pattern and mean/median 5.50791/6.6797 intersections per interval. +2024-06-01T06:02:59Z INFO 2403803 [AntiDependencyAnalyzer-PSUM1-32-128]: Finished analyzing 42665 access patterns a mean/median 1.00127/1 intervals per access pattern and mean/median 5.25682/5.97916 intersections per interval. +2024-06-01T06:02:59Z INFO 2403803 [AntiDependencyAnalyzer-PSUM4-32-128]: Finished analyzing 38088 access patterns a mean/median 1.0011/1 intervals per access pattern and mean/median 5.74946/7.49483 intersections per interval. +2024-06-01T06:02:59Z INFO 2403803 [AntiDependencyAnalyzer-PSUM3-0-32]: Finished analyzing 42650 access patterns a mean/median 1.00113/1 intervals per access pattern and mean/median 5.51704/6.72263 intersections per interval. +2024-06-01T06:02:59Z INFO 2403803 [AntiDependencyAnalyzer-PSUM6-0-32]: Finished analyzing 38735 access patterns a mean/median 1.00139/1 intervals per access pattern and mean/median 5.76163/7.59954 intersections per interval. +2024-06-01T06:02:59Z INFO 2403803 [AntiDependencyAnalyzer-PSUM7-32-128]: Finished analyzing 42610 access patterns a mean/median 1.0012/1 intervals per access pattern and mean/median 5.51762/6.74448 intersections per interval. 
+2024-06-01T06:02:59Z INFO 2403803 [AntiDependencyAnalyzer-PSUM6-32-128]: Finished analyzing 38273 access patterns a mean/median 1.00141/1 intervals per access pattern and mean/median 5.75108/7.5702 intersections per interval. +2024-06-01T06:02:59Z INFO 2403803 [AntiDependencyAnalyzer-PSUM7-0-32]: Finished analyzing 43006 access patterns a mean/median 1.00119/1 intervals per access pattern and mean/median 5.52815/6.81599 intersections per interval. +2024-06-01T06:02:59Z INFO 2403803 [AntiDependencyAnalyzer-DRAM]: Finished analyzing 1676 access patterns a mean/median 87.7208/115.263 intervals per access pattern and mean/median 0.499966/2.72985e-05 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias53]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias53]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias17]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias17]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias60]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias12]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. 
+2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias55]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias47]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias54]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias54]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias55]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias47]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias44]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias44]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias58]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. 
+2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias49]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias58]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias49]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias60]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias20]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias20]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias12]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias13]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias43]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. 
+2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias43]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias57]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias13]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias57]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias46]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias46]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias1]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias1]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-PSUM3-0-32]: Finished analyzing 31149 access patterns a mean/median 1.00616/1 intervals per access pattern and mean/median 7.75398/7.00009 intersections per interval. 
+2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-PSUM3-64-128]: Finished analyzing 29312 access patterns a mean/median 1.00655/1 intervals per access pattern and mean/median 4.39563/2.00027 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-PSUM1-64-128]: Finished analyzing 31358 access patterns a mean/median 1.95516/1 intervals per access pattern and mean/median 6.65005/1.1129 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias39]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias2]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias39]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias2]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias22]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias22]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias14]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. 
+2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias14]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias16]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias10]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias16]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias10]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias25]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias41]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias25]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias4]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. 
+2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias4]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias0]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias0]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias32]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias32]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-PSUM6-0-32]: Finished analyzing 31776 access patterns a mean/median 1.00019/1 intervals per access pattern and mean/median 12.2371/10.838 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias18]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-PSUM2-64-128]: Finished analyzing 28256 access patterns a mean/median 1/1 intervals per access pattern and mean/median 3.35647/3.73717 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias18]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. 
+2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-PSUM5-0-32]: Finished analyzing 31294 access patterns a mean/median 1.00614/1.00001 intervals per access pattern and mean/median 13.605/13.4258 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-PSUM4-0-32]: Finished analyzing 31072 access patterns a mean/median 1.0309/1 intervals per access pattern and mean/median 10.5241/9.80315 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-PSUM2-32-64]: Finished analyzing 28258 access patterns a mean/median 1/1 intervals per access pattern and mean/median 4.05527/4.9604 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-PSUM0-0-32]: Finished analyzing 30061 access patterns a mean/median 1/1 intervals per access pattern and mean/median 6.7/8.74237 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-PSUM1-32-64]: Finished analyzing 31360 access patterns a mean/median 1.9551/1 intervals per access pattern and mean/median 6.66051/1.11305 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-PSUM0-64-128]: Finished analyzing 28322 access patterns a mean/median 1/1 intervals per access pattern and mean/median 3.33809/3.92728 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias9]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias9]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias50]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. 
+2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-PSUM6-32-128]: Finished analyzing 29336 access patterns a mean/median 1.0002/1 intervals per access pattern and mean/median 11.3894/10.0489 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias50]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-PSUM4-32-64]: Finished analyzing 29371 access patterns a mean/median 1.03269/1 intervals per access pattern and mean/median 10.4873/9.086 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias5]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias5]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias26]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias51]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias6]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias63]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. 
+2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias51]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-PSUM0-32-64]: Finished analyzing 28324 access patterns a mean/median 1/1 intervals per access pattern and mean/median 4.0373/5.00024 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias6]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias26]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias63]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias37]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias19]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias19]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias31]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. 
+2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias35]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias31]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias35]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias41]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias11]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias48]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias23]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias56]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias29]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. 
+2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias40]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias33]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-DRAM]: Finished analyzing 450 access patterns a mean/median 9.53333/15.8707 intervals per access pattern and mean/median 0/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias62]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias33]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias40]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias48]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias3]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias21]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. 
+2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias3]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias21]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias42]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias42]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias62]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias15]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias8]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias8]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-PSUM7-64-128]: Finished analyzing 36567 access patterns a mean/median 4.36549/1.00009 intervals per access pattern and mean/median 4.34447/1.00388 intersections per interval. 
+2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-PSUM7-0-32]: Finished analyzing 38404 access patterns a mean/median 4.20451/1.00036 intervals per access pattern and mean/median 4.58235/1.00429 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias15]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-PSUM4-64-128]: Finished analyzing 29369 access patterns a mean/median 1.03269/1 intervals per access pattern and mean/median 10.486/9.08565 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias59]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias59]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-PSUM2-0-32]: Finished analyzing 30024 access patterns a mean/median 1/1 intervals per access pattern and mean/median 6.73472/8.99978 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-PSUM7-32-64]: Finished analyzing 36567 access patterns a mean/median 4.36549/1.00009 intervals per access pattern and mean/median 4.34447/1.00388 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias27]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias45]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. 
+2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias45]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias11]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-PSUM1-0-32]: Finished analyzing 33288 access patterns a mean/median 1.89978/1 intervals per access pattern and mean/median 6.89148/1.12113 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias7]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-PSUM3-32-64]: Finished analyzing 29314 access patterns a mean/median 1.00655/1 intervals per access pattern and mean/median 5.10514/3 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias38]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias27]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias34]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias34]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. 
+2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias56]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias37]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias23]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-PSUM5-64-128]: Finished analyzing 29367 access patterns a mean/median 1.00654/1 intervals per access pattern and mean/median 13.5519/13.6676 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias36]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias7]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias36]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias29]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias38]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. 
+2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-PSUM5-32-64]: Finished analyzing 29369 access patterns a mean/median 1.00654/1.00001 intervals per access pattern and mean/median 13.5729/13.6027 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias28]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias28]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias24]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias24]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias61]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias61]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias30]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias52]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. 
+2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias52]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:02:59Z INFO 2416028 [AntiDependencyAnalyzer-Alias30]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:00Z INFO 2403803 (sg00) [DMAOptimizationBase]: [spill optimization round 3]: removed 9 spill/reload instructions +2024-06-01T06:03:00Z INFO 2403803 (sg00) [DMAOptimizationBase]: [spill optimization round 3]: removed 9 spill/reload memory locations +2024-06-01T06:03:00Z INFO 2403803 (sg00) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 609746944, 22.2896% out of total spill/reload dma traffic +2024-06-01T06:03:00Z INFO 2403803 (sg00) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2024-06-01T06:03:00Z INFO 2403803 (sg00) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2024-06-01T06:03:00Z INFO 2403803 (sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2024-06-01T06:03:00Z USER 2403803 (sg02) [ModuleForkPass]: address_rotation_sb finished after 1.207 seconds +2024-06-01T06:03:00Z INFO 2403803 (sg02) [ModuleForkPass]: curr_vmrss: 13371mb, ru_maxrss: 13538mb (delta=0mb) +2024-06-01T06:03:00Z INFO 2403803 (sg00) [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2024-06-01T06:03:00Z INFO 2403803 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 227979 memory location(s), 1 block(s), and 698879 instruction(s). 
Max writers: 576 Max Readers: 184036 +2024-06-01T06:03:00Z USER 2403803 (sg02) [ModuleForkPass]: Running dma_optimization_sb +2024-06-01T06:03:00Z INFO 2403803 (sg02) [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=227979 blocks=1 instructions=698879 Max writers: 576 Max Readers: 184036 +2024-06-01T06:03:00Z INFO 2403803 (sg00) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2024-06-01T06:03:00Z INFO 2403803 (sg02) [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 14411358612, 54.4931% input load, 2.34209% output write, 43.1648% spill/reload [sg0002] +2024-06-01T06:03:01Z INFO 2403803 [AntiDependencyAnalyzer-SB-113-128]: Finished analyzing 672596 access patterns a mean/median 1.0864/1 intervals per access pattern and mean/median 2.94431/1 intersections per interval. +2024-06-01T06:03:01Z INFO 2403803 [AntiDependencyAnalyzer-SB-96-113]: Finished analyzing 673812 access patterns a mean/median 1.08624/1 intervals per access pattern and mean/median 2.94579/1 intersections per interval. +2024-06-01T06:03:01Z INFO 2403803 [AntiDependencyAnalyzer-SB-65-96]: Finished analyzing 673812 access patterns a mean/median 1.08624/1 intervals per access pattern and mean/median 2.94579/1 intersections per interval. +2024-06-01T06:03:01Z INFO 2403803 [AntiDependencyAnalyzer-SB-1-32]: Finished analyzing 673821 access patterns a mean/median 1.08624/1 intervals per access pattern and mean/median 2.94572/1 intersections per interval. +2024-06-01T06:03:01Z INFO 2403803 [AntiDependencyAnalyzer-SB-32-33]: Finished analyzing 673826 access patterns a mean/median 1.08624/1 intervals per access pattern and mean/median 2.9457/1 intersections per interval. +2024-06-01T06:03:01Z INFO 2403803 [AntiDependencyAnalyzer-SB-33-64]: Finished analyzing 673821 access patterns a mean/median 1.08624/1 intervals per access pattern and mean/median 2.94572/1 intersections per interval. 
+2024-06-01T06:03:01Z INFO 2403803 [AntiDependencyAnalyzer-SB-0-1]: Finished analyzing 674148 access patterns a mean/median 1.0862/1 intervals per access pattern and mean/median 2.94437/1 intersections per interval. +2024-06-01T06:03:01Z INFO 2403803 [AntiDependencyAnalyzer-SB-64-65]: Finished analyzing 673821 access patterns a mean/median 1.08624/1 intervals per access pattern and mean/median 2.94572/1 intersections per interval. +2024-06-01T06:03:01Z INFO 2403803 (sg01) [SB_Allocator]: new candidates = 19392 +2024-06-01T06:03:01Z INFO 2403803 (sg01) [SB_Allocator]: (including 5120 infinite cost tensors) +2024-06-01T06:03:01Z INFO 2403803 (sg01) [SB_Allocator]: select ranges +2024-06-01T06:03:01Z USER 2403803 (sg03) [ModuleForkPass]: anti_dependency_analyzer finished after 3.579 seconds +2024-06-01T06:03:01Z INFO 2403803 (sg03) [ModuleForkPass]: curr_vmrss: 13198mb, ru_maxrss: 13538mb (delta=0mb) +2024-06-01T06:03:01Z INFO 2403803 (sg03) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 66596 memory location(s), 1 block(s), and 353547 instruction(s). Max writers: 512 Max Readers: 80896 +2024-06-01T06:03:01Z USER 2403803 (sg03) [ModuleForkPass]: Running dep_opt +2024-06-01T06:03:01Z INFO 2403803 (sg03) [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=66596 blocks=1 instructions=353547 Max writers: 512 Max Readers: 80896 +2024-06-01T06:03:01Z INFO 2403803 (sg03) [build_flow_deps]: Start build fdeps. Invocation: 6Sat Jun 1 06:03:01 2024 +2024-06-01T06:03:01Z INFO 2403803 (sg02) [DMAOptimizationBase]: removed 0 identical load +2024-06-01T06:03:01Z INFO 2403803 (sg03) [build_flow_deps]: Allocs: 66596 instructions: 353547 +2024-06-01T06:03:01Z INFO 2416028 [AntiDependencyAnalyzer-SB-113-128]: Finished analyzing 485278 access patterns a mean/median 1.61022/1 intervals per access pattern and mean/median 3.73607/1.3261 intersections per interval. 
+2024-06-01T06:03:02Z INFO 2416028 [AntiDependencyAnalyzer-SB-80-96]: Finished analyzing 490883 access patterns a mean/median 1.6189/1 intervals per access pattern and mean/median 3.70542/1.33459 intersections per interval. +2024-06-01T06:03:02Z INFO 2416028 [AntiDependencyAnalyzer-SB-48-64]: Finished analyzing 491268 access patterns a mean/median 1.61842/1 intervals per access pattern and mean/median 3.7009/1.30333 intersections per interval. +2024-06-01T06:03:02Z INFO 2416028 [AntiDependencyAnalyzer-SB-112-113]: Finished analyzing 491210 access patterns a mean/median 1.61849/1 intervals per access pattern and mean/median 3.70105/1.29787 intersections per interval. +2024-06-01T06:03:02Z INFO 2416028 [AntiDependencyAnalyzer-SB-97-112]: Finished analyzing 493244 access patterns a mean/median 1.61594/1 intervals per access pattern and mean/median 3.79456/1.28677 intersections per interval. +2024-06-01T06:03:02Z INFO 2416028 [AntiDependencyAnalyzer-SB-64-65]: Finished analyzing 492101 access patterns a mean/median 1.61737/1 intervals per access pattern and mean/median 3.80369/1.30075 intersections per interval. +2024-06-01T06:03:02Z INFO 2416028 [AntiDependencyAnalyzer-SB-1-16]: Finished analyzing 513893 access patterns a mean/median 1.59119/1 intervals per access pattern and mean/median 3.88658/1.21483 intersections per interval. +2024-06-01T06:03:02Z INFO 2416028 [AntiDependencyAnalyzer-SB-33-48]: Finished analyzing 496438 access patterns a mean/median 1.61198/1 intervals per access pattern and mean/median 3.76818/1.2757 intersections per interval. +2024-06-01T06:03:02Z INFO 2416028 [AntiDependencyAnalyzer-SB-32-33]: Finished analyzing 497181 access patterns a mean/median 1.61106/1 intervals per access pattern and mean/median 3.76872/1.27538 intersections per interval. 
+2024-06-01T06:03:02Z INFO 2416028 [AntiDependencyAnalyzer-SB-65-80]: Finished analyzing 492092 access patterns a mean/median 1.61738/1 intervals per access pattern and mean/median 3.80378/1.30084 intersections per interval. +2024-06-01T06:03:02Z INFO 2416028 [AntiDependencyAnalyzer-SB-16-32]: Finished analyzing 491802 access patterns a mean/median 1.61774/1 intervals per access pattern and mean/median 3.69254/1.33768 intersections per interval. +2024-06-01T06:03:02Z INFO 2416028 [AntiDependencyAnalyzer-SB-96-97]: Finished analyzing 493785 access patterns a mean/median 1.61526/1 intervals per access pattern and mean/median 3.79187/1.27636 intersections per interval. +2024-06-01T06:03:02Z INFO 2416028 [AntiDependencyAnalyzer-SB-0-1]: Finished analyzing 514902 access patterns a mean/median 1.60495/1 intervals per access pattern and mean/median 3.84706/1.19966 intersections per interval. +2024-06-01T06:03:02Z USER 2416028 [ModuleForkPass]: anti_dependency_analyzer finished after 3.646 seconds +2024-06-01T06:03:02Z INFO 2416028 [ModuleForkPass]: curr_vmrss: 2045mb, ru_maxrss: 2046mb (delta=375mb) +2024-06-01T06:03:02Z INFO 2416028 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 62163 memory location(s), 1 block(s), and 277071 instruction(s). Max writers: 256 Max Readers: 66720 +2024-06-01T06:03:02Z USER 2416028 [ModuleForkPass]: Running tensor_copy_elim +2024-06-01T06:03:02Z INFO 2416028 [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=62163 blocks=1 instructions=277071 Max writers: 256 Max Readers: 66720 +2024-06-01T06:03:02Z INFO 2416028 [TensorCopyElim]: Tensor CP elimination: 0 +2024-06-01T06:03:02Z USER 2416028 [ModuleForkPass]: tensor_copy_elim finished after 0.723 seconds +2024-06-01T06:03:02Z INFO 2416028 [ModuleForkPass]: curr_vmrss: 1834mb, ru_maxrss: 2046mb (delta=0mb) +2024-06-01T06:03:02Z INFO 2416028 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 62163 memory location(s), 1 block(s), and 277071 instruction(s). 
Max writers: 256 Max Readers: 66720 +2024-06-01T06:03:02Z USER 2416028 [ModuleForkPass]: Running post_sched +2024-06-01T06:03:02Z INFO 2416028 [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=62163 blocks=1 instructions=277071 Max writers: 256 Max Readers: 66720 +2024-06-01T06:03:02Z INFO 2416028 [post_scheduler]: Start PosT ScheD 3 sunda Sat Jun 1 06:03:02 2024 +2024-06-01T06:03:02Z WARNING 2416028 [post_scheduler]: Inserted memset 0 for _dot.417-t32882 +2024-06-01T06:03:02Z WARNING 2416028 [post_scheduler]: Inserted memset 0 for _dot.418-t32884 +2024-06-01T06:03:02Z WARNING 2416028 [post_scheduler]: Inserted memset 0 for _dot.601-t32918 +2024-06-01T06:03:02Z WARNING 2416028 [post_scheduler]: Inserted memset 0 for _dot.602-t32920 +2024-06-01T06:03:02Z WARNING 2416028 [post_scheduler]: Inserted memset 0 for _dot.785-t32954 +2024-06-01T06:03:02Z WARNING 2416028 [post_scheduler]: Inserted memset 0 for _dot.786-t32956 +2024-06-01T06:03:02Z WARNING 2416028 [post_scheduler]: Inserted memset 0 for _dot.969-t32990 +2024-06-01T06:03:02Z WARNING 2416028 [post_scheduler]: Inserted memset 0 for _dot.970-t32992 +2024-06-01T06:03:02Z WARNING 2416028 [post_scheduler]: Inserted memset 0 for _dot.1153-t33026 +2024-06-01T06:03:02Z WARNING 2416028 [post_scheduler]: Inserted memset 0 for _dot.1154-t33028 +2024-06-01T06:03:02Z WARNING 2416028 [post_scheduler]: Inserted memset 0 for _dot.1337-t33062 +2024-06-01T06:03:02Z WARNING 2416028 [post_scheduler]: Inserted memset 0 for _dot.1338-t33064 +2024-06-01T06:03:02Z WARNING 2416028 [post_scheduler]: Inserted memset 0 for _dot.1521-t33098 +2024-06-01T06:03:02Z WARNING 2416028 [post_scheduler]: Inserted memset 0 for _dot.1522-t33100 +2024-06-01T06:03:02Z WARNING 2416028 [post_scheduler]: Inserted memset 0 for _dot.1705-t33134 +2024-06-01T06:03:02Z WARNING 2416028 [post_scheduler]: Inserted memset 0 for _dot.1706-t33136 +2024-06-01T06:03:02Z WARNING 2416028 [post_scheduler]: Inserted memset 0 for _dot.1889-t33170 
+2024-06-01T06:03:02Z WARNING 2416028 [post_scheduler]: Inserted memset 0 for _dot.1890-t33172 +2024-06-01T06:03:02Z WARNING 2416028 [post_scheduler]: Inserted memset 0 for _dot.2073-t33206 +2024-06-01T06:03:02Z WARNING 2416028 [post_scheduler]: Inserted memset 0 for _dot.2074-t33208 +2024-06-01T06:03:02Z WARNING 2416028 [post_scheduler]: Inserted memset 0 for _dot.2257-t33242 +2024-06-01T06:03:02Z WARNING 2416028 [post_scheduler]: Inserted memset 0 for _dot.2258-t33244 +2024-06-01T06:03:02Z WARNING 2416028 [post_scheduler]: Inserted memset 0 for _dot.2441-t33278 +2024-06-01T06:03:02Z WARNING 2416028 [post_scheduler]: Inserted memset 0 for _dot.2442-t33280 +2024-06-01T06:03:02Z WARNING 2416028 [post_scheduler]: Inserted memset 0 for _dot.2625-t33314 +2024-06-01T06:03:02Z WARNING 2416028 [post_scheduler]: Inserted memset 0 for _dot.2626-t33316 +2024-06-01T06:03:02Z WARNING 2416028 [post_scheduler]: Inserted memset 0 for _dot.2809-t33350 +2024-06-01T06:03:02Z WARNING 2416028 [post_scheduler]: Inserted memset 0 for _dot.2810-t33352 +2024-06-01T06:03:02Z WARNING 2416028 [post_scheduler]: Inserted memset 0 for _dot.2993-t33386 +2024-06-01T06:03:02Z WARNING 2416028 [post_scheduler]: Inserted memset 0 for _dot.2994-t33388 +2024-06-01T06:03:02Z WARNING 2416028 [post_scheduler]: Inserted memset 0 for _dot.3177-t33422 +2024-06-01T06:03:02Z WARNING 2416028 [post_scheduler]: Inserted memset 0 for _dot.3178-t33424 +2024-06-01T06:03:02Z WARNING 2416028 [post_scheduler]: Inserted memset 0 for _dot.3361-t33458 +2024-06-01T06:03:02Z WARNING 2416028 [post_scheduler]: Inserted memset 0 for _dot.3362-t33460 +2024-06-01T06:03:02Z WARNING 2416028 [post_scheduler]: Inserted memset 0 for _dot.3545-t33494 +2024-06-01T06:03:02Z WARNING 2416028 [post_scheduler]: Inserted memset 0 for _dot.3546-t33496 +2024-06-01T06:03:02Z WARNING 2416028 [post_scheduler]: Inserted memset 0 for _dot.3729-t33530 +2024-06-01T06:03:02Z WARNING 2416028 [post_scheduler]: Inserted memset 0 for _dot.3730-t33532 
+2024-06-01T06:03:02Z WARNING 2416028 [post_scheduler]: Inserted memset 0 for _dot.3913-t33566 +2024-06-01T06:03:02Z WARNING 2416028 [post_scheduler]: Inserted memset 0 for _dot.3914-t33568 +2024-06-01T06:03:02Z WARNING 2416028 [post_scheduler]: Inserted memset 0 for _dot.4097-t33602 +2024-06-01T06:03:02Z WARNING 2416028 [post_scheduler]: Inserted memset 0 for _dot.4098-t33604 +2024-06-01T06:03:02Z WARNING 2416028 [post_scheduler]: Inserted memset 0 for _dot.4281-t33638 +2024-06-01T06:03:02Z WARNING 2416028 [post_scheduler]: Inserted memset 0 for _dot.4282-t33640 +2024-06-01T06:03:02Z WARNING 2416028 [post_scheduler]: Inserted memset 0 for _dot.4465-t33674 +2024-06-01T06:03:02Z WARNING 2416028 [post_scheduler]: Inserted memset 0 for _dot.4466-t33676 +2024-06-01T06:03:02Z WARNING 2416028 [post_scheduler]: Inserted memset 0 for _dot.4649-t33710 +2024-06-01T06:03:02Z WARNING 2416028 [post_scheduler]: Inserted memset 0 for _dot.4650-t33712 +2024-06-01T06:03:02Z WARNING 2416028 [post_scheduler]: Inserted memset 0 for _dot.4833-t33746 +2024-06-01T06:03:02Z WARNING 2416028 [post_scheduler]: Inserted memset 0 for _dot.4834-t33748 +2024-06-01T06:03:02Z WARNING 2416028 [post_scheduler]: Inserted memset 0 for _dot.5017-t33782 +2024-06-01T06:03:02Z WARNING 2416028 [post_scheduler]: Inserted memset 0 for _dot.5018-t33784 +2024-06-01T06:03:02Z WARNING 2416028 [post_scheduler]: Inserted memset 0 for _dot.5201-t33818 +2024-06-01T06:03:02Z WARNING 2416028 [post_scheduler]: Inserted memset 0 for _dot.5202-t33820 +2024-06-01T06:03:02Z WARNING 2416028 [post_scheduler]: Inserted memset 0 for _dot.5385-t33854 +2024-06-01T06:03:02Z WARNING 2416028 [post_scheduler]: Inserted memset 0 for _dot.5386-t33856 +2024-06-01T06:03:02Z WARNING 2416028 [post_scheduler]: Inserted memset 0 for _dot.5569-t33890 +2024-06-01T06:03:02Z WARNING 2416028 [post_scheduler]: Inserted memset 0 for _dot.5570-t33892 +2024-06-01T06:03:02Z WARNING 2416028 [post_scheduler]: Inserted memset 0 for _dot.5753-t33926 
+2024-06-01T06:03:02Z WARNING 2416028 [post_scheduler]: Inserted memset 0 for _dot.5754-t33928 +2024-06-01T06:03:02Z WARNING 2416028 [post_scheduler]: Inserted memset 0 for _dot.5937-t33962 +2024-06-01T06:03:02Z WARNING 2416028 [post_scheduler]: Inserted memset 0 for _dot.5938-t33964 +2024-06-01T06:03:02Z WARNING 2416028 [post_scheduler]: Inserted memset 0 for _dot.6121-t33998 +2024-06-01T06:03:02Z WARNING 2416028 [post_scheduler]: Inserted memset 0 for _dot.6122-t34000 +2024-06-01T06:03:03Z INFO 2403803 (sg03) [build_flow_deps]: Build fdeps inserted 1111596 edges +2024-06-01T06:03:03Z INFO 2403803 (sg03) [build_flow_deps]: Done build fdeps 1111596 Sat Jun 1 06:03:03 2024 +2024-06-01T06:03:03Z INFO 2403803 (sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 1 spill/reload instructions +2024-06-01T06:03:03Z INFO 2403803 (sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 1 spill/reload memory locations +2024-06-01T06:03:03Z INFO 2403803 (sg00) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 131072, 0.00479141% out of total spill/reload dma traffic +2024-06-01T06:03:03Z INFO 2403803 (sg02) [DMAOptimizationBase]: [Load Merging]: removed 676 remat/cloned instructions +2024-06-01T06:03:04Z INFO 2403803 (sg00) [DMAOptimizationBase]: [remove extra save] removed 10 memlocs and 10 instructions +2024-06-01T06:03:04Z INFO 2403803 (sg00) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload instructions +2024-06-01T06:03:04Z INFO 2403803 (sg00) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload memory locations +2024-06-01T06:03:04Z INFO 2403803 (sg02) [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2024-06-01T06:03:04Z INFO 2403803 (sg02) [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 297533440, 2.06458% out of total dma traffic(7.8532e+09) +2024-06-01T06:03:04Z USER 2403803 (sg03) [ModuleForkPass]: dep_opt finished 
after 3.111 seconds +2024-06-01T06:03:04Z INFO 2403803 (sg03) [ModuleForkPass]: curr_vmrss: 13128mb, ru_maxrss: 13538mb (delta=0mb) +2024-06-01T06:03:04Z INFO 2403803 (sg02) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2024-06-01T06:03:04Z INFO 2403803 (sg03) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 66596 memory location(s), 1 block(s), and 353547 instruction(s). Max writers: 512 Max Readers: 80896 +2024-06-01T06:03:04Z USER 2403803 (sg03) [ModuleForkPass]: Running report_stats +2024-06-01T06:03:04Z INFO 2403803 (sg03) [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=66596 blocks=1 instructions=353547 Max writers: 512 Max Readers: 80896 +2024-06-01T06:03:04Z INFO 2403803 (sg03) [ReportStats]: Data Movement Statistics: sg0003 +┌─────────────┬────────────────────────────────────┬───────┬────────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────┼────────────────────────────────────┼───────┼────────────┤ +│ DMACopy │ ExternalInputParameter -> Internal │ 1 │ 16384 │ +│ DMACopy │ Input -> Internal │ 1 │ 900071424 │ +│ DMACopy │ Internal │ 2 │ 900071424 │ +│ Load │ Const -> Internal │ 1 │ 32768 │ +│ Load │ ExternalInput -> Internal │ 1 │ 4 │ +│ Load │ ExternalInputParameter -> Internal │ 15585 │ 7111991296 │ +│ Load │ Internal │ 576 │ 300023808 │ +│ Save │ Internal │ 576 │ 300023808 │ +│ Save │ Internal -> ExternalOutput │ 63 │ 256032 │ +└─────────────┴────────────────────────────────────┴───────┴────────────┘ + +2024-06-01T06:03:04Z INFO 2403803 (sg03) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 4 │ 1 │ +│ 32 │ 63 │ +│ 128 │ 1 │ +│ 256 │ 1 │ +│ 512 │ 1 │ +│ 2178 │ 32 │ +│ 2304 │ 192 │ +│ 3584 │ 15360 │ +│ 4096 │ 1152 │ +│ 150011904 │ 6 │ +│ 300023808 │ 3 │ +└─────────────────────┴───────┘ + +2024-06-01T06:03:04Z INFO 2403803 (sg03) [ReportStats]: MM Stats: #MatMults 293456 #MatMult-Transposes 80896 
+2024-06-01T06:03:04Z USER 2403803 (sg03) [ModuleForkPass]: report_stats finished after 0.113 seconds +2024-06-01T06:03:04Z INFO 2403803 (sg03) [ModuleForkPass]: curr_vmrss: 13094mb, ru_maxrss: 13538mb (delta=0mb) +2024-06-01T06:03:04Z INFO 2403803 (sg03) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 66596 memory location(s), 1 block(s), and 353547 instruction(s). Max writers: 512 Max Readers: 80896 +2024-06-01T06:03:04Z USER 2403803 (sg03) [ModuleForkPass]: Running assign_trigger_engine +2024-06-01T06:03:04Z INFO 2403803 (sg03) [ModuleForkPass]: Inputs to assign_trigger_engine: modules=1 functions=1 allocs=66596 blocks=1 instructions=353547 Max writers: 512 Max Readers: 80896 +2024-06-01T06:03:04Z INFO 2403803 (sg03) [AssignTriggerEngine]: Assigned trigger engine for 578 DMA instructions +2024-06-01T06:03:04Z USER 2403803 (sg03) [ModuleForkPass]: assign_trigger_engine finished after 0.199 seconds +2024-06-01T06:03:04Z INFO 2403803 (sg03) [ModuleForkPass]: curr_vmrss: 13129mb, ru_maxrss: 13538mb (delta=0mb) +2024-06-01T06:03:04Z INFO 2403803 (sg03) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 66596 memory location(s), 1 block(s), and 353547 instruction(s). Max writers: 512 Max Readers: 80896 +2024-06-01T06:03:04Z USER 2403803 (sg03) [ModuleForkPass]: Running alloc_queues +2024-06-01T06:03:04Z INFO 2403803 (sg03) [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=66596 blocks=1 instructions=353547 Max writers: 512 Max Readers: 80896 +2024-06-01T06:03:05Z USER 2403803 (sg03) [ModuleForkPass]: alloc_queues finished after 0.066 seconds +2024-06-01T06:03:05Z INFO 2403803 (sg03) [ModuleForkPass]: curr_vmrss: 13136mb, ru_maxrss: 13538mb (delta=0mb) +2024-06-01T06:03:05Z INFO 2403803 (sg03) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 66596 memory location(s), 1 block(s), and 353547 instruction(s). 
Max writers: 512 Max Readers: 80896 +2024-06-01T06:03:05Z USER 2403803 (sg03) [ModuleForkPass]: Running dep_reduction +2024-06-01T06:03:05Z INFO 2403803 (sg03) [ModuleForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=66596 blocks=1 instructions=353547 Max writers: 512 Max Readers: 80896 +2024-06-01T06:03:05Z INFO 2403803 (sg03) [DepReduction]: Start Dependency Reduction +2024-06-01T06:03:05Z INFO 2403803 (sg03) [DepReduction]: Processing async instrs... +2024-06-01T06:03:05Z INFO 2403803 (sg03) [DepReduction]: Processing secondary edges per engine... +2024-06-01T06:03:05Z INFO 2403803 (sg00) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 1768 SpillSaves and Reloads +2024-06-01T06:03:05Z INFO 2403803 (sg00) [DMAOptimizationBase]: average loaded DMA size 1402 bytes +2024-06-01T06:03:05Z INFO 2403803 (sg00) [DMAOptimizationBase]: average saved DMA size 949 bytes +2024-06-01T06:03:05Z INFO 2403803 (sg03) [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 303159 +2024-06-01T06:03:05Z INFO 2403803 (sg03) [DepReduction]: Processing redundant descendants... 
+2024-06-01T06:03:06Z INFO 2403803 (sg01) [SB_Allocator]: Total: 134342 +2024-06-01T06:03:06Z INFO 2403803 (sg01) [SB_Allocator]: Spilled: 0.017 (2281) +2024-06-01T06:03:06Z INFO 2403803 (sg01) [SB_Allocator]: Allocated: 0.983 (132061) +2024-06-01T06:03:06Z INFO 2403803 (sg01) [SB_Allocator]: Rover zone: 0.270 (35703) +2024-06-01T06:03:06Z INFO 2403803 (sg01) [SB_Allocator]: Pre-rover zone: 0.009 (1145) +2024-06-01T06:03:06Z INFO 2403803 (sg01) [SB_Allocator]: Post-rover zone: 0.721 (95213) +2024-06-01T06:03:06Z INFO 2403803 (sg01) [SB_Allocator]: Slice zone: 0.000 (0) +2024-06-01T06:03:06Z INFO 2403803 (sg01) [SB_Allocator]: Blocks nothing: 0.000 (0) +2024-06-01T06:03:06Z INFO 2403803 (sg01) [SB_Allocator]: Blocks medium: 0.000 (0) +2024-06-01T06:03:06Z INFO 2403803 (sg01) [SB_Allocator]: Blocks tall: 1.000 (132061) +2024-06-01T06:03:06Z INFO 2403803 (sg01) [SB_Allocator]: Visited until tall blocking (mean): 0.974 +2024-06-01T06:03:06Z INFO 2403803 (sg01) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2024-06-01T06:03:06Z INFO 2403803 (sg01) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2024-06-01T06:03:06Z INFO 2403803 (sg01) [SB_Allocator]: Success +2024-06-01T06:03:06Z INFO 2403803 (sg01) [SB_Allocator]: SB spills = 2281 tensors +2024-06-01T06:03:06Z INFO 2403803 (sg01) [SB_Allocator]: size = 4281992 bytes/partition +2024-06-01T06:03:06Z INFO 2403803 (sg01) [SB_Allocator]: remats = 31 tensors +2024-06-01T06:03:06Z INFO 2403803 (sg01) [SB_Allocator]: unpinned = 0 tensors +2024-06-01T06:03:06Z INFO 2403803 (sg01) [SB_Allocator]: size = 0 bytes/partition +2024-06-01T06:03:06Z INFO 2403803 (sg01) [SB_Allocator]: SB score = 3.64866e+07 +2024-06-01T06:03:06Z INFO 2403803 (sg01) [SB_Allocator]: best SB heuristic = 0 +2024-06-01T06:03:06Z INFO 2403803 (sg01) [SB_Allocator]: collect spills +2024-06-01T06:03:07Z INFO 2416028 [post_scheduler]: Time-aware hwm post-sched +2024-06-01T06:03:07Z INFO 2403803 (sg00) [DMAOptimizationBase]: DMA SpillSave 
Coalescing Round 1 combined 776 SpillSaves and Reloads +2024-06-01T06:03:07Z INFO 2403803 (sg02) [DMAOptimizationBase]: [spill optimization round 0]: removed 8471 spill/reload instructions +2024-06-01T06:03:07Z INFO 2403803 (sg02) [DMAOptimizationBase]: [spill optimization round 0]: removed 7994 spill/reload memory locations +2024-06-01T06:03:07Z INFO 2403803 (sg00) [DMAOptimizationBase]: average loaded DMA size 1454 bytes +2024-06-01T06:03:07Z INFO 2403803 (sg03) [DepReduction]: Processing redundant descendants, Done. Num edges removed 17733 +2024-06-01T06:03:07Z INFO 2403803 (sg03) [DepReduction]: Processing async instrs, Done. Num edges removed 320892 +2024-06-01T06:03:07Z INFO 2403803 (sg00) [DMAOptimizationBase]: average saved DMA size 969 bytes +2024-06-01T06:03:08Z INFO 2403803 (sg01) [SB_Allocator]: insert spills +2024-06-01T06:03:09Z INFO 2403803 (sg01) [SB_Allocator]: deleting loads #loadsToDelete: 31 +2024-06-01T06:03:09Z INFO 2403803 (sg01) [SB_Allocator]: deleting locs #locationsToDelete: 31 +2024-06-01T06:03:09Z INFO 2403803 (sg01) [SB_Allocator]: locationsToDelete done +2024-06-01T06:03:09Z INFO 2403803 (sg01) [SB_Allocator]: main loop +2024-06-01T06:03:09Z INFO 2403803 (sg00) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 2 combined 24 SpillSaves and Reloads +2024-06-01T06:03:09Z INFO 2403803 (sg00) [DMAOptimizationBase]: average loaded DMA size 1464 bytes +2024-06-01T06:03:09Z INFO 2403803 (sg00) [DMAOptimizationBase]: average saved DMA size 970 bytes +2024-06-01T06:03:09Z INFO 2403803 (sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 1854982408 +2024-06-01T06:03:09Z INFO 2403803 (sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 1464 bytes +2024-06-01T06:03:10Z INFO 2403803 (sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 1114178320 +2024-06-01T06:03:10Z INFO 2403803 (sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 970 bytes 
+2024-06-01T06:03:10Z INFO 2403803 (sg02) [DMAOptimizationBase]: [spill optimization round 1]: removed 307 spill/reload instructions +2024-06-01T06:03:10Z INFO 2403803 (sg02) [DMAOptimizationBase]: [spill optimization round 1]: removed 297 spill/reload memory locations +2024-06-01T06:03:10Z INFO 2403803 (sg01) [SB_Allocator]: renumber locations +2024-06-01T06:03:10Z INFO 2403803 (sg01) [SB_Allocator]: size = 147483 +2024-06-01T06:03:10Z INFO 2403803 (sg01) [SB_Allocator]: find partners +2024-06-01T06:03:11Z INFO 2403803 (sg00) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2024-06-01T06:03:11Z INFO 2403803 (sg01) [SB_Allocator]: found 74858 accumulation groups +2024-06-01T06:03:11Z INFO 2403803 (sg00) [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 129040384, 4.71714% out of total spill/reload dma traffic +2024-06-01T06:03:11Z INFO 2403803 (sg00) [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 1039663488, 26.7878% out of total dma traffic +2024-06-01T06:03:11Z INFO 2403803 (sg00) [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 2841447448, 25.4406% input load, 4.29082% output write, 70.2686% spill/reload [sg0000] +2024-06-01T06:03:11Z INFO 2403803 (sg01) [SB_Allocator]: largest = _dot.510-t806_i279 +2024-06-01T06:03:11Z INFO 2403803 (sg01) [SB_Allocator]: tensors = 64 +2024-06-01T06:03:11Z INFO 2403803 (sg01) [SB_Allocator]: requires 147456 bytes/partition +2024-06-01T06:03:11Z INFO 2403803 (sg01) [SB_Allocator]: expanding partners +2024-06-01T06:03:11Z INFO 2403803 (sg03) [DepReduction]: Num Async removed: 0 +2024-06-01T06:03:11Z INFO 2403803 (sg03) [DepReduction]: Finished dependency reduction: 2489903 removed, new total 100176 +2024-06-01T06:03:11Z INFO 2403803 (sg03) [DepReduction]: Finished Dependency Reduction +2024-06-01T06:03:11Z USER 2403803 (sg03) [ModuleForkPass]: dep_reduction finished after 6.787 seconds +2024-06-01T06:03:11Z INFO 2403803 (sg03) 
[ModuleForkPass]: curr_vmrss: 13252mb, ru_maxrss: 13538mb (delta=0mb) +2024-06-01T06:03:11Z INFO 2403803 (sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 1782302984 +2024-06-01T06:03:11Z INFO 2403803 (sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 1463 bytes +2024-06-01T06:03:12Z INFO 2403803 (sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 1059128080 +2024-06-01T06:03:12Z INFO 2403803 (sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 963 bytes +2024-06-01T06:03:12Z INFO 2403803 (sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 16384 +2024-06-01T06:03:12Z INFO 2403803 (sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 128 bytes +2024-06-01T06:03:12Z INFO 2403803 (sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 1225 bytes +2024-06-01T06:03:12Z INFO 2403803 (sg00) [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2024-06-01T06:03:12Z INFO 2403803 (sg00) [DMAOptimizationBase]: DMA optimization re-enable optimization +2024-06-01T06:03:12Z USER 2403803 (sg00) [ModuleForkPass]: dma_optimization_sb finished after 30.867 seconds +2024-06-01T06:03:12Z INFO 2403803 (sg00) [ModuleForkPass]: curr_vmrss: 12671mb, ru_maxrss: 13538mb (delta=0mb) +2024-06-01T06:03:12Z INFO 2403803 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 147823 memory location(s), 1 block(s), and 330096 instruction(s). 
Max writers: 5184 Max Readers: 93924 +2024-06-01T06:03:12Z USER 2403803 (sg00) [ModuleForkPass]: Running address_rotation_sb +2024-06-01T06:03:12Z INFO 2403803 (sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=147823 blocks=1 instructions=330096 Max writers: 5184 Max Readers: 93924 +2024-06-01T06:03:12Z INFO 2403803 (sg03) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 66596 memory location(s), 1 block(s), and 353547 instruction(s). Max writers: 512 Max Readers: 80896 +2024-06-01T06:03:12Z USER 2403803 (sg03) [ModuleForkPass]: Running bir_racecheck +2024-06-01T06:03:12Z INFO 2403803 (sg03) [ModuleForkPass]: Inputs to bir_racecheck: modules=1 functions=1 allocs=66596 blocks=1 instructions=353547 Max writers: 512 Max Readers: 80896 +2024-06-01T06:03:13Z INFO 2403803 (sg00) [DMAOptimizationBase]: SB Rotation rotated 1395 Sb address +2024-06-01T06:03:13Z INFO 2403803 (sg02) [DMAOptimizationBase]: [spill optimization round 2]: removed 36 spill/reload instructions +2024-06-01T06:03:13Z INFO 2403803 (sg02) [DMAOptimizationBase]: [spill optimization round 2]: removed 36 spill/reload memory locations +2024-06-01T06:03:14Z INFO 2403803 (sg02) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 2472935424, 39.7538% out of total spill/reload dma traffic +2024-06-01T06:03:14Z INFO 2403803 (sg01) [SB_Allocator]: find first defs +2024-06-01T06:03:14Z INFO 2403803 (sg01) [SB_Allocator]: find loads +2024-06-01T06:03:15Z INFO 2403803 (sg01) [SB_Allocator]: 0 pin count +2024-06-01T06:03:15Z INFO 2403803 (sg01) [SB_Allocator]: 35277 remat count +2024-06-01T06:03:15Z INFO 2403803 (sg01) [SB_Allocator]: build interference graph +2024-06-01T06:03:15Z INFO 2403803 (sg01) [SB_Allocator]: pass 1 int-tree +2024-06-01T06:03:15Z INFO 2403803 (sg00) [DMAOptimizationBase]: SB Rotation rotated 4047 Sb address +2024-06-01T06:03:15Z INFO 2403803 (sg01) [SB_Allocator]: Num intervals 147483 Num locations 147483 +2024-06-01T06:03:15Z INFO 
2403803 (sg01) [SB_Allocator]: IntervalTree Build Done +2024-06-01T06:03:16Z INFO 2403803 (sg01) [SB_Allocator]: info.neighbors init Done +2024-06-01T06:03:16Z INFO 2403803 (sg01) [SB_Allocator]: info.neighbors partners Done +2024-06-01T06:03:16Z INFO 2403803 (sg01) [SB_Allocator]: IntervalTree readback Done +2024-06-01T06:03:16Z INFO 2403803 (sg01) [SB_Allocator]: edge: 8888587 +2024-06-01T06:03:16Z INFO 2403803 (sg01) [SB_Allocator]: mean: 120.537 +2024-06-01T06:03:16Z INFO 2403803 (sg01) [SB_Allocator]: median: 89.3004 +2024-06-01T06:03:16Z INFO 2403803 (sg01) [SB_Allocator]: find costs +2024-06-01T06:03:16Z INFO 2403803 (sg02) [DMAOptimizationBase]: [Allocation optimization]: removed 202 spill/reload instructions +2024-06-01T06:03:16Z INFO 2403803 (sg02) [DMAOptimizationBase]: [Allocation optimization]: removed 177 spill/reload memory locations +2024-06-01T06:03:16Z INFO 2403803 (sg02) [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 105381892, 1.69407% out of total spill/reload dma traffic +2024-06-01T06:03:17Z INFO 2403803 (sg01) [SB_Allocator]: best-of-n loop, heuristic = 0 +2024-06-01T06:03:17Z INFO 2403803 (sg01) [SB_Allocator]: simplify interference graph +2024-06-01T06:03:17Z INFO 2403803 (sg01) [SB_Allocator]: initialize safe & unsafe +2024-06-01T06:03:17Z INFO 2403803 (sg01) [SB_Allocator]: safe = 12629 +2024-06-01T06:03:17Z INFO 2403803 (sg01) [SB_Allocator]: unsafe = 550 +2024-06-01T06:03:17Z INFO 2403803 (sg01) [SB_Allocator]: inf = 2243 +2024-06-01T06:03:17Z INFO 2403803 (sg01) [SB_Allocator]: total = 15422 +2024-06-01T06:03:17Z INFO 2403803 (sg01) [SB_Allocator]: simplify +2024-06-01T06:03:17Z INFO 2403803 (sg01) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 470 #Pinned 0 #Safe 0 minCost 0.0167677 maxCost 0.0314575 locations 147483 +2024-06-01T06:03:17Z INFO 2403803 (sg01) [SB_Allocator]: new candidates = 453 +2024-06-01T06:03:17Z INFO 2403803 (sg01) [SB_Allocator]: (including 2243 infinite cost tensors) 
+2024-06-01T06:03:17Z INFO 2403803 (sg01) [SB_Allocator]: select ranges +2024-06-01T06:03:17Z INFO 2403803 (sg02) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2024-06-01T06:03:17Z INFO 2403803 (sg01) [SB_Allocator]: Total: 15422 +2024-06-01T06:03:17Z INFO 2403803 (sg01) [SB_Allocator]: Spilled: 0.000 (0) +2024-06-01T06:03:17Z INFO 2403803 (sg01) [SB_Allocator]: Allocated: 1.000 (15422) +2024-06-01T06:03:17Z INFO 2403803 (sg01) [SB_Allocator]: Rover zone: 0.942 (14528) +2024-06-01T06:03:17Z INFO 2403803 (sg01) [SB_Allocator]: Pre-rover zone: 0.013 (193) +2024-06-01T06:03:17Z INFO 2403803 (sg01) [SB_Allocator]: Post-rover zone: 0.045 (701) +2024-06-01T06:03:17Z INFO 2403803 (sg01) [SB_Allocator]: Slice zone: 0.000 (0) +2024-06-01T06:03:17Z INFO 2403803 (sg01) [SB_Allocator]: Blocks nothing: 0.010 (160) +2024-06-01T06:03:17Z INFO 2403803 (sg01) [SB_Allocator]: Blocks medium: 0.000 (0) +2024-06-01T06:03:17Z INFO 2403803 (sg01) [SB_Allocator]: Blocks tall: 0.990 (15262) +2024-06-01T06:03:17Z INFO 2403803 (sg01) [SB_Allocator]: Visited until tall blocking (mean): 0.990 +2024-06-01T06:03:17Z INFO 2403803 (sg01) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2024-06-01T06:03:17Z INFO 2403803 (sg01) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2024-06-01T06:03:17Z INFO 2403803 (sg01) [SB_Allocator]: Success +2024-06-01T06:03:17Z INFO 2403803 (sg00) [DMAOptimizationBase]: SB Rotation rotated 2823 Sb address +2024-06-01T06:03:17Z USER 2403803 (sg03) [ModuleForkPass]: bir_racecheck finished after 5.247 seconds +2024-06-01T06:03:17Z INFO 2403803 (sg03) [ModuleForkPass]: curr_vmrss: 13279mb, ru_maxrss: 13538mb (delta=0mb) +2024-06-01T06:03:18Z INFO 2403803 (sg03) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 66596 memory location(s), 1 block(s), and 353547 instruction(s). 
Max writers: 512 Max Readers: 80896 +2024-06-01T06:03:18Z INFO 2403803 (sg01) [SB_Allocator]: SB spills = 0 tensors +2024-06-01T06:03:18Z INFO 2403803 (sg01) [SB_Allocator]: size = 0 bytes/partition +2024-06-01T06:03:18Z INFO 2403803 (sg01) [SB_Allocator]: remats = 0 tensors +2024-06-01T06:03:18Z INFO 2403803 (sg01) [SB_Allocator]: unpinned = 0 tensors +2024-06-01T06:03:18Z INFO 2403803 (sg01) [SB_Allocator]: size = 0 bytes/partition +2024-06-01T06:03:18Z INFO 2403803 (sg01) [SB_Allocator]: SB score = 0 +2024-06-01T06:03:18Z INFO 2403803 (sg01) [SB_Allocator]: spilling from SB cost about 3.64866e+07 cycles +2024-06-01T06:03:18Z INFO 2403803 (sg01) [SB_Allocator]: number of tensors spilled from SB = 2281 +2024-06-01T06:03:18Z INFO 2403803 (sg01) [SB_Allocator]: total size of spilled tensors = 4281992 bytes/partition +2024-06-01T06:03:18Z INFO 2403803 (sg01) [SB_Allocator]: 0 bytes/partition (0%) successfully pinned +2024-06-01T06:03:18Z INFO 2403803 (sg01) [SB_Allocator]: pinning saved approximately 0 cycles +2024-06-01T06:03:18Z INFO 2403803 (sg01) [SB_Allocator]: 0% SB utilization after allocation +2024-06-01T06:03:18Z INFO 2403803 (sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 12179890576 +2024-06-01T06:03:18Z INFO 2403803 (sg01) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 2585 bytes +2024-06-01T06:03:18Z INFO 2403803 (sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 2231435268 +2024-06-01T06:03:18Z INFO 2403803 (sg01) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 2615 bytes +2024-06-01T06:03:18Z INFO 2403803 (sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 32768 +2024-06-01T06:03:18Z INFO 2403803 (sg01) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 128 bytes +2024-06-01T06:03:19Z USER 2403803 (sg01) [ModuleForkPass]: coloring_allocator_sb finished after 27.375 seconds +2024-06-01T06:03:19Z INFO 2403803 (sg01) [ModuleForkPass]: curr_vmrss: 12653mb, 
ru_maxrss: 13538mb (delta=0mb) +2024-06-01T06:03:19Z INFO 2403803 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 227979 memory location(s), 1 block(s), and 698879 instruction(s). Max writers: 576 Max Readers: 184036 +2024-06-01T06:03:19Z USER 2403803 (sg01) [ModuleForkPass]: Running address_rotation_sb +2024-06-01T06:03:19Z INFO 2403803 (sg01) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=227979 blocks=1 instructions=698879 Max writers: 576 Max Readers: 184036 +2024-06-01T06:03:20Z INFO 2403803 (sg02) [DMAOptimizationBase]: [spill optimization round 0]: removed 7 spill/reload instructions +2024-06-01T06:03:20Z INFO 2403803 (sg02) [DMAOptimizationBase]: [spill optimization round 0]: removed 7 spill/reload memory locations +2024-06-01T06:03:20Z INFO 2403803 (sg02) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 917504, 0.0147494% out of total spill/reload dma traffic +2024-06-01T06:03:20Z INFO 2403803 (sg00) [DMAOptimizationBase]: SB Rotation rotated 202 Sb address +2024-06-01T06:03:21Z INFO 2416028 [post_scheduler]: Time-aware simulation time: 24867321 +2024-06-01T06:03:22Z INFO 2403803 (sg02) [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2024-06-01T06:03:22Z INFO 2416028 [post_scheduler]: Done PosT ScheD Sat Jun 1 06:03:22 2024 +2024-06-01T06:03:22Z INFO 2403803 (sg02) [DMAOptimizationBase]: [remove_memset_spill]: removed 326 spill/reload instructions +2024-06-01T06:03:22Z INFO 2403803 (sg02) [DMAOptimizationBase]: [remove_memset_spill]: removed 3 spill/reload memory locations +2024-06-01T06:03:22Z USER 2416028 [ModuleForkPass]: post_sched finished after 19.671 seconds +2024-06-01T06:03:22Z INFO 2416028 [ModuleForkPass]: curr_vmrss: 2111mb, ru_maxrss: 2111mb (delta=65mb) +2024-06-01T06:03:22Z INFO 2416028 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 62163 memory location(s), 1 block(s), and 277135 instruction(s). 
Max writers: 257 Max Readers: 66720 +2024-06-01T06:03:22Z USER 2416028 [ModuleForkPass]: Running address_rotation_sb +2024-06-01T06:03:22Z INFO 2416028 [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=62163 blocks=1 instructions=277135 Max writers: 257 Max Readers: 66720 +2024-06-01T06:03:23Z INFO 2403803 (sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2024-06-01T06:03:23Z USER 2403803 (sg01) [ModuleForkPass]: address_rotation_sb finished after 4.075 seconds +2024-06-01T06:03:23Z INFO 2403803 (sg01) [ModuleForkPass]: curr_vmrss: 12449mb, ru_maxrss: 13538mb (delta=0mb) +2024-06-01T06:03:23Z INFO 2403803 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 227979 memory location(s), 1 block(s), and 698879 instruction(s). Max writers: 576 Max Readers: 184036 +2024-06-01T06:03:23Z USER 2403803 (sg01) [ModuleForkPass]: Running dma_optimization_sb +2024-06-01T06:03:23Z INFO 2403803 (sg01) [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=227979 blocks=1 instructions=698879 Max writers: 576 Max Readers: 184036 +2024-06-01T06:03:24Z INFO 2403803 (sg01) [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 14411358612, 54.4931% input load, 2.34209% output write, 43.1648% spill/reload [sg0001] +2024-06-01T06:03:24Z INFO 2403803 (sg00) [DMAOptimizationBase]: SB Rotation rotated 12238 Sb address +2024-06-01T06:03:24Z INFO 2416028 [DMAOptimizationBase]: PSUM Rotation rotated 10000 PSUM Banks +2024-06-01T06:03:24Z INFO 2403803 (sg01) [DMAOptimizationBase]: removed 0 identical load +2024-06-01T06:03:25Z INFO 2416028 [DMAOptimizationBase]: PSUM Rotation rotated 12673 PSUM Banks +2024-06-01T06:03:25Z INFO 2416028 [DMAOptimizationBase]: PSUM Rotation rotated 2327 PSUM Banks +2024-06-01T06:03:26Z INFO 2403803 (sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2024-06-01T06:03:26Z USER 2403803 (sg00) [ModuleForkPass]: address_rotation_sb finished after 13.654 seconds 
+2024-06-01T06:03:26Z INFO 2403803 (sg00) [ModuleForkPass]: curr_vmrss: 12469mb, ru_maxrss: 13538mb (delta=0mb) +2024-06-01T06:03:26Z INFO 2403803 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 147823 memory location(s), 1 block(s), and 330096 instruction(s). Max writers: 5184 Max Readers: 93924 +2024-06-01T06:03:26Z USER 2403803 (sg00) [ModuleForkPass]: Running coloring_allocator_dram +2024-06-01T06:03:26Z INFO 2403803 (sg02) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 1608 SpillSaves and Reloads +2024-06-01T06:03:26Z INFO 2403803 (sg00) [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=147823 blocks=1 instructions=330096 Max writers: 5184 Max Readers: 93924 +2024-06-01T06:03:26Z INFO 2416028 [DMAOptimizationBase]: SB Rotation rotated 66 Sb address +2024-06-01T06:03:26Z INFO 2403803 (sg00) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 1782302984 +2024-06-01T06:03:26Z INFO 2403803 (sg00) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 1463 bytes +2024-06-01T06:03:26Z INFO 2403803 (sg02) [DMAOptimizationBase]: average loaded DMA size 2761 bytes +2024-06-01T06:03:26Z INFO 2403803 (sg00) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 1059128080 +2024-06-01T06:03:26Z INFO 2403803 (sg00) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 963 bytes +2024-06-01T06:03:26Z INFO 2403803 (sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 16384 +2024-06-01T06:03:26Z INFO 2403803 (sg00) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 128 bytes +2024-06-01T06:03:26Z INFO 2403803 (sg00) [ColoringAllocator::Rep]: Allocating functions +2024-06-01T06:03:26Z INFO 2403803 (sg00) [ColoringAllocator::Rep]: linearize and check +2024-06-01T06:03:26Z INFO 2403803 (sg02) [DMAOptimizationBase]: average saved DMA size 2668 bytes +2024-06-01T06:03:27Z INFO 2416028 [DMAOptimizationBase]: SB Rotation rotated 255 Sb address 
+2024-06-01T06:03:27Z INFO 2416028 [DMAOptimizationBase]: SB Rotation rotated 586 Sb address +2024-06-01T06:03:27Z INFO 2403803 (sg00) [DRAM_Allocator]: allocating spills in DRAM pre_link mode +2024-06-01T06:03:27Z INFO 2403803 (sg00) [DRAM_Allocator]: reserved space = 743015444 bytes +2024-06-01T06:03:27Z INFO 2403803 (sg00) [DRAM_Allocator]: spill space = 1234576384 bytes +2024-06-01T06:03:27Z INFO 2403803 (sg00) [DRAM_Allocator]: aligned spill space = 1234579456 bytes +2024-06-01T06:03:27Z INFO 2403803 (sg00) [DRAM_Allocator]: dram space = 107374182400 bytes +2024-06-01T06:03:27Z INFO 2403803 (sg00) [DRAM_Allocator]: renumber locations +2024-06-01T06:03:27Z INFO 2403803 (sg00) [DRAM_Allocator]: size = 1848 +2024-06-01T06:03:27Z INFO 2403803 (sg00) [DRAM_Allocator]: find first defs +2024-06-01T06:03:28Z INFO 2416028 [DMAOptimizationBase]: SB Rotation rotated 806 Sb address +2024-06-01T06:03:28Z INFO 2403803 (sg00) [DRAM_Allocator]: Num intervals 1848 Num locations 1848 +2024-06-01T06:03:28Z INFO 2403803 (sg00) [DRAM_Allocator]: IntervalTree Build Done +2024-06-01T06:03:28Z INFO 2403803 (sg00) [DRAM_Allocator]: info.neighbors init Done +2024-06-01T06:03:28Z INFO 2403803 (sg00) [DRAM_Allocator]: IntervalTree readback Done +2024-06-01T06:03:28Z INFO 2403803 (sg00) [DRAM_Allocator]: simplify interference graph +2024-06-01T06:03:28Z INFO 2403803 (sg00) [DRAM_Allocator]: initialize low and high +2024-06-01T06:03:28Z INFO 2403803 (sg00) [DRAM_Allocator]: lo = 1848 +2024-06-01T06:03:28Z INFO 2403803 (sg00) [DRAM_Allocator]: hi = 0 +2024-06-01T06:03:28Z INFO 2403803 (sg00) [DRAM_Allocator]: total = 1848 +2024-06-01T06:03:28Z INFO 2403803 (sg00) [DRAM_Allocator]: simplify +2024-06-01T06:03:28Z INFO 2403803 (sg00) [DRAM_Allocator]: new candidates = 0 +2024-06-01T06:03:28Z INFO 2403803 (sg00) [DRAM_Allocator]: select ranges +2024-06-01T06:03:28Z INFO 2403803 (sg00) [DRAM_Allocator]: CC buffer size limit 524288000 +2024-06-01T06:03:28Z INFO 2403803 (sg00) [DRAM_Allocator]: 
allreduce_dram_hwm 836894720 +2024-06-01T06:03:28Z INFO 2403803 (sg00) [DRAM_Allocator]: Real CC buffer size 524288000 +2024-06-01T06:03:28Z INFO 2403803 (sg02) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 1 combined 708 SpillSaves and Reloads +2024-06-01T06:03:28Z INFO 2403803 (sg00) [DRAM_Allocator]: DRAM hwm after allocation: 1043738624 +2024-06-01T06:03:28Z INFO 2403803 (sg00) [DRAM_Allocator]: DRAM allocation successful +2024-06-01T06:03:28Z INFO 2403803 (sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 1782302984 +2024-06-01T06:03:28Z INFO 2403803 (sg00) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 1463 bytes +2024-06-01T06:03:28Z INFO 2403803 (sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 1059128080 +2024-06-01T06:03:28Z INFO 2403803 (sg00) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 963 bytes +2024-06-01T06:03:28Z INFO 2403803 (sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 16384 +2024-06-01T06:03:28Z INFO 2403803 (sg00) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 128 bytes +2024-06-01T06:03:28Z USER 2403803 (sg00) [ModuleForkPass]: coloring_allocator_dram finished after 2.726 seconds +2024-06-01T06:03:28Z INFO 2403803 (sg00) [ModuleForkPass]: curr_vmrss: 12474mb, ru_maxrss: 13538mb (delta=0mb) +2024-06-01T06:03:28Z INFO 2403803 (sg02) [DMAOptimizationBase]: average loaded DMA size 2779 bytes +2024-06-01T06:03:29Z INFO 2403803 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 147823 memory location(s), 1 block(s), and 330096 instruction(s). 
Max writers: 5184 Max Readers: 93924 +2024-06-01T06:03:29Z USER 2403803 (sg00) [ModuleForkPass]: Running address_rotation_dram +2024-06-01T06:03:29Z INFO 2403803 (sg00) [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=147823 blocks=1 instructions=330096 Max writers: 5184 Max Readers: 93924 +2024-06-01T06:03:29Z INFO 2403803 (sg00) [DMAOptimizationBase]: Runtime page size at 512MB +2024-06-01T06:03:29Z INFO 2403803 (sg01) [DMAOptimizationBase]: [Load Merging]: removed 676 remat/cloned instructions +2024-06-01T06:03:29Z INFO 2416028 [DMAOptimizationBase]: SB Rotation rotated 863 Sb address +2024-06-01T06:03:29Z INFO 2403803 (sg02) [DMAOptimizationBase]: average saved DMA size 2753 bytes +2024-06-01T06:03:29Z INFO 2403803 (sg00) [DMAOptimizationBase]: DRAM hwm before rotation 1043738624 +2024-06-01T06:03:29Z INFO 2416028 [DMAOptimizationBase]: moved 0 MM forward +2024-06-01T06:03:29Z INFO 2403803 (sg00) [DMAOptimizationBase]: allreduce buffer size 524288000 +2024-06-01T06:03:29Z INFO 2403803 (sg00) [DMAOptimizationBase]: allreduce hwm 836894720 +2024-06-01T06:03:29Z INFO 2403803 (sg00) [DMAOptimizationBase]: Real CC buffer size 524288000 +2024-06-01T06:03:30Z INFO 2403803 (sg00) [DMAOptimizationBase]: DRAM hwm after rotation 1043738624 +2024-06-01T06:03:30Z INFO 2403803 (sg00) [DMAOptimizationBase]: DRAM Rotation rotated 0 Dram address +2024-06-01T06:03:30Z USER 2403803 (sg00) [ModuleForkPass]: address_rotation_dram finished after 1.006 seconds +2024-06-01T06:03:30Z INFO 2403803 (sg00) [ModuleForkPass]: curr_vmrss: 12465mb, ru_maxrss: 13538mb (delta=0mb) +2024-06-01T06:03:30Z INFO 2403803 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 147823 memory location(s), 1 block(s), and 330096 instruction(s). 
Max writers: 5184 Max Readers: 93924 +2024-06-01T06:03:30Z USER 2403803 (sg00) [ModuleForkPass]: Running tensorcopy_accel +2024-06-01T06:03:30Z INFO 2403803 (sg01) [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2024-06-01T06:03:30Z INFO 2403803 (sg00) [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=147823 blocks=1 instructions=330096 Max writers: 5184 Max Readers: 93924 +2024-06-01T06:03:30Z INFO 2403803 (sg00) [TensorCopyAccel::Impl]: Running peephole optimization pass +2024-06-01T06:03:30Z INFO 2403803 (sg00) [TensorCopyAccel::Impl]: Accelerated 1552 out of 64113 tensorcopy in Function: sg0000 average acceleration factor: 1 +2024-06-01T06:03:30Z USER 2403803 (sg00) [ModuleForkPass]: tensorcopy_accel finished after 0.164 seconds +2024-06-01T06:03:30Z INFO 2403803 (sg00) [ModuleForkPass]: curr_vmrss: 12459mb, ru_maxrss: 13538mb (delta=0mb) +2024-06-01T06:03:30Z INFO 2403803 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 147823 memory location(s), 1 block(s), and 330096 instruction(s). Max writers: 5184 Max Readers: 93924 +2024-06-01T06:03:30Z USER 2403803 (sg00) [ModuleForkPass]: Running peephole_opts +2024-06-01T06:03:30Z INFO 2403803 (sg01) [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 297533440, 2.06458% out of total dma traffic(7.8532e+09) +2024-06-01T06:03:30Z INFO 2403803 (sg00) [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=147823 blocks=1 instructions=330096 Max writers: 5184 Max Readers: 93924 +2024-06-01T06:03:30Z INFO 2403803 (sg00) [PeepholeOpts]: PeepholeOpts enabled? 
Recip: true Tsp: true Tc: true SplitSelect: true +2024-06-01T06:03:30Z INFO 2416028 [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2024-06-01T06:03:30Z INFO 2403803 (sg00) [PeepholeOpts]: Split Select: 11520 +2024-06-01T06:03:30Z INFO 2403803 (sg01) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2024-06-01T06:03:30Z INFO 2403803 (sg00) [PeepholeOpts]: TSP -> ACT: 14735 +2024-06-01T06:03:30Z INFO 2403803 (sg00) [PeepholeOpts]: COPY -> ACT: 0 +2024-06-01T06:03:30Z INFO 2403803 (sg00) [PeepholeOpts]: RECIPROCAL -> ACT: 0 +2024-06-01T06:03:30Z USER 2403803 (sg00) [ModuleForkPass]: peephole_opts finished after 0.675 seconds +2024-06-01T06:03:30Z INFO 2403803 (sg00) [ModuleForkPass]: curr_vmrss: 12499mb, ru_maxrss: 13538mb (delta=0mb) +2024-06-01T06:03:31Z INFO 2403803 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 147823 memory location(s), 1 block(s), and 341616 instruction(s). Max writers: 5184 Max Readers: 93924 +2024-06-01T06:03:31Z USER 2403803 (sg00) [ModuleForkPass]: Running lower_kernel +2024-06-01T06:03:31Z INFO 2403803 (sg00) [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=147823 blocks=1 instructions=341616 Max writers: 5184 Max Readers: 93924 +2024-06-01T06:03:31Z INFO 2403803 (sg00) [LowerKernel]: Started running LowerKernel +2024-06-01T06:03:31Z INFO 2403803 (sg00) [LowerKernel]: Start of kernel lowering pass, number of insts: 341616, number of allocs: 147823 +2024-06-01T06:03:31Z INFO 2403803 (sg00) [LowerKernel]: Scan BKs time (s): 0.122863 +2024-06-01T06:03:31Z INFO 2403803 (sg00) [LowerKernel]: Lower BKs time (s): 7e-06 +2024-06-01T06:03:31Z USER 2403803 (sg00) [ModuleForkPass]: lower_kernel finished after 0.066 seconds +2024-06-01T06:03:31Z INFO 2403803 (sg00) [ModuleForkPass]: curr_vmrss: 12461mb, ru_maxrss: 13538mb (delta=0mb) +2024-06-01T06:03:31Z INFO 2403803 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 147823 memory location(s), 1 
block(s), and 341616 instruction(s). Max writers: 5184 Max Readers: 93924 +2024-06-01T06:03:31Z USER 2403803 (sg00) [ModuleForkPass]: Running build_fdeps +2024-06-01T06:03:31Z INFO 2403803 (sg00) [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=147823 blocks=1 instructions=341616 Max writers: 5184 Max Readers: 93924 +2024-06-01T06:03:31Z INFO 2403803 (sg00) [build_flow_deps]: Start build fdeps. Invocation: 7Sat Jun 1 06:03:31 2024 +2024-06-01T06:03:31Z INFO 2403803 (sg00) [build_flow_deps]: Allocs: 147823 instructions: 341616 +2024-06-01T06:03:31Z INFO 2403803 (sg02) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 2 combined 0 SpillSaves and Reloads +2024-06-01T06:03:31Z INFO 2416028 [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2024-06-01T06:03:31Z USER 2416028 [ModuleForkPass]: address_rotation_sb finished after 8.828 seconds +2024-06-01T06:03:31Z INFO 2416028 [ModuleForkPass]: curr_vmrss: 2008mb, ru_maxrss: 2111mb (delta=0mb) +2024-06-01T06:03:31Z INFO 2416028 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 62163 memory location(s), 1 block(s), and 277135 instruction(s). 
Max writers: 257 Max Readers: 66720 +2024-06-01T06:03:31Z USER 2416028 [ModuleForkPass]: Running anti_dependency_analyzer +2024-06-01T06:03:31Z INFO 2416028 [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=62163 blocks=1 instructions=277135 Max writers: 257 Max Readers: 66720 +2024-06-01T06:03:31Z INFO 2416028 [AntiDependencyAnalyzer]: Batch size: 1000 +2024-06-01T06:03:31Z INFO 2416028 [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2024-06-01T06:03:31Z INFO 2403803 (sg02) [DMAOptimizationBase]: average loaded DMA size 2779 bytes +2024-06-01T06:03:31Z INFO 2403803 (sg02) [DMAOptimizationBase]: average saved DMA size 2753 bytes +2024-06-01T06:03:31Z INFO 2403803 (sg02) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 9560399632 +2024-06-01T06:03:31Z INFO 2403803 (sg02) [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 2779 bytes +2024-06-01T06:03:31Z INFO 2403803 (sg02) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 1963245568 +2024-06-01T06:03:31Z INFO 2403803 (sg02) [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 2753 bytes +2024-06-01T06:03:32Z INFO 2403803 (sg02) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2024-06-01T06:03:32Z INFO 2403803 (sg02) [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 141778560, 2.27917% out of total spill/reload dma traffic +2024-06-01T06:03:32Z INFO 2403803 (sg02) [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 3018546820, 20.9456% out of total dma traffic +2024-06-01T06:03:32Z INFO 2403803 (sg02) [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 11392811792, 66.331% input load, 2.96263% output write, 30.7064% spill/reload [sg0002] +2024-06-01T06:03:32Z INFO 2403803 (sg02) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 9486189328 +2024-06-01T06:03:32Z INFO 2403803 (sg02) 
[DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 2799 bytes +2024-06-01T06:03:32Z INFO 2403803 (sg02) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 1906589696 +2024-06-01T06:03:32Z INFO 2403803 (sg02) [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 2876 bytes +2024-06-01T06:03:33Z INFO 2403803 (sg02) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 32768 +2024-06-01T06:03:33Z INFO 2403803 (sg02) [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 128 bytes +2024-06-01T06:03:33Z INFO 2403803 (sg02) [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 2811 bytes +2024-06-01T06:03:33Z INFO 2403803 (sg02) [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2024-06-01T06:03:33Z INFO 2403803 (sg02) [DMAOptimizationBase]: DMA optimization re-enable optimization +2024-06-01T06:03:33Z USER 2403803 (sg02) [ModuleForkPass]: dma_optimization_sb finished after 32.225 seconds +2024-06-01T06:03:33Z INFO 2403803 (sg02) [ModuleForkPass]: curr_vmrss: 12496mb, ru_maxrss: 13538mb (delta=0mb) +2024-06-01T06:03:33Z INFO 2403803 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 217043 memory location(s), 1 block(s), and 688010 instruction(s). Max writers: 576 Max Readers: 184036 +2024-06-01T06:03:33Z USER 2403803 (sg02) [ModuleForkPass]: Running address_rotation_sb +2024-06-01T06:03:33Z INFO 2403803 (sg02) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=217043 blocks=1 instructions=688010 Max writers: 576 Max Readers: 184036 +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-PSUM2-32-64]: Finished analyzing 29124 access patterns a mean/median 1.00175/1 intervals per access pattern and mean/median 11.5892/10.2461 intersections per interval. 
+2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-PSUM4-64-128]: Finished analyzing 30244 access patterns a mean/median 1.79493/1 intervals per access pattern and mean/median 9.00097/3.5492 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-PSUM4-32-64]: Finished analyzing 30246 access patterns a mean/median 1.79488/1 intervals per access pattern and mean/median 9.00166/3.5559 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-PSUM1-64-128]: Finished analyzing 30474 access patterns a mean/median 1.98011/1 intervals per access pattern and mean/median 6.45486/1.1209 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-PSUM0-32-64]: Finished analyzing 28519 access patterns a mean/median 1.00168/1 intervals per access pattern and mean/median 11.7432/9.74546 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-PSUM2-64-128]: Finished analyzing 29122 access patterns a mean/median 1.00175/1 intervals per access pattern and mean/median 11.5023/9.96681 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-PSUM2-0-32]: Finished analyzing 31044 access patterns a mean/median 1.00164/1 intervals per access pattern and mean/median 11.8641/10.7074 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-PSUM3-0-32]: Finished analyzing 30855 access patterns a mean/median 1.00574/1 intervals per access pattern and mean/median 8.30699/9.0001 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-PSUM3-64-128]: Finished analyzing 29005 access patterns a mean/median 1.0061/1 intervals per access pattern and mean/median 4.96543/4.00015 intersections per interval. 
+2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-PSUM3-32-64]: Finished analyzing 29007 access patterns a mean/median 1.0061/1 intervals per access pattern and mean/median 5.67339/5.00009 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-PSUM1-32-64]: Finished analyzing 30476 access patterns a mean/median 1.98005/1 intervals per access pattern and mean/median 6.46532/1.12123 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-PSUM1-0-32]: Finished analyzing 32909 access patterns a mean/median 1.90759/1 intervals per access pattern and mean/median 6.86027/1.17789 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-PSUM0-0-32]: Finished analyzing 29939 access patterns a mean/median 1.0016/1 intervals per access pattern and mean/median 11.8675/9.75308 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-PSUM5-64-128]: Finished analyzing 30538 access patterns a mean/median 1.00501/1 intervals per access pattern and mean/median 8.53911/7.27074 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-PSUM4-0-32]: Finished analyzing 31864 access patterns a mean/median 1.75452/1 intervals per access pattern and mean/median 9.53712/3.87894 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-PSUM7-32-64]: Finished analyzing 35256 access patterns a mean/median 3.83518/1.0002 intervals per access pattern and mean/median 4.58516/1.00545 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-PSUM6-0-32]: Finished analyzing 30685 access patterns a mean/median 1.00235/1 intervals per access pattern and mean/median 12.6116/11.9605 intersections per interval. 
+2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-PSUM6-32-128]: Finished analyzing 28795 access patterns a mean/median 1.0025/1 intervals per access pattern and mean/median 12.2813/12.5905 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-PSUM5-32-64]: Finished analyzing 30540 access patterns a mean/median 1.00501/1 intervals per access pattern and mean/median 8.9055/8.04726 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-PSUM0-64-128]: Finished analyzing 28517 access patterns a mean/median 1.00168/1 intervals per access pattern and mean/median 11.7236/9.74254 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias57]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias45]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias56]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias57]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias45]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias56]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. 
+2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias44]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias46]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias44]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias42]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias40]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias63]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias46]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias42]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias40]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. 
+2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-DRAM]: Finished analyzing 450 access patterns a mean/median 9.53333/15.8707 intervals per access pattern and mean/median 0/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias63]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias32]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias22]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias32]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias13]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias2]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias38]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias22]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. 
+2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias50]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias29]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias2]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias29]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias50]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias38]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias13]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias0]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias47]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. 
+2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias0]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias23]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias37]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias23]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias37]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias47]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias20]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias18]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias28]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. 
+2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias5]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias28]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias20]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias35]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias5]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias18]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias35]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias19]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias7]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. 
+2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias36]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias19]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias7]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias6]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias36]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias6]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias25]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias24]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias24]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. 
+2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias25]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias59]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias15]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias15]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias54]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias41]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias12]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias4]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias59]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. 
+2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias8]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-PSUM7-64-128]: Finished analyzing 35256 access patterns a mean/median 3.83518/1.0002 intervals per access pattern and mean/median 4.58516/1.00545 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias41]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias4]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias12]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias8]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-PSUM5-0-32]: Finished analyzing 32941 access patterns a mean/median 1.00464/1 intervals per access pattern and mean/median 10.4153/9.52225 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias33]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias43]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. 
+2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias33]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias43]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias17]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias17]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias61]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias61]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias54]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias58]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias58]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. 
+2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias60]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias60]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias53]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias55]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias55]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias53]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias52]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias52]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias48]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. 
+2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias48]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias51]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias51]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias62]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias62]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-PSUM7-0-32]: Finished analyzing 36895 access patterns a mean/median 3.70923/1.00012 intervals per access pattern and mean/median 4.87374/1.00567 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias49]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias49]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias39]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. 
+2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias39]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias34]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias34]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias11]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias11]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias21]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias21]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias10]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias10]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. 
+2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias26]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias26]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias9]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias9]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias3]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias31]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias31]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias14]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias14]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. 
+2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias30]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias1]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias30]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias1]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias16]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias16]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias27]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias27]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. +2024-06-01T06:03:33Z INFO 2416028 [AntiDependencyAnalyzer-Alias3]: Finished analyzing 24 access patterns a mean/median 1/1 intervals per access pattern and mean/median 22/0 intersections per interval. 
+2024-06-01T06:03:34Z INFO 2416028 [AntiDependencyAnalyzer-SB-113-128]: Finished analyzing 485342 access patterns a mean/median 1.61014/1 intervals per access pattern and mean/median 3.76102/1.35336 intersections per interval. +2024-06-01T06:03:34Z INFO 2416028 [AntiDependencyAnalyzer-SB-97-112]: Finished analyzing 493855 access patterns a mean/median 1.61518/1 intervals per access pattern and mean/median 3.80869/1.31664 intersections per interval. +2024-06-01T06:03:34Z INFO 2416028 [AntiDependencyAnalyzer-SB-0-1]: Finished analyzing 514840 access patterns a mean/median 1.60502/1 intervals per access pattern and mean/median 3.85822/1.25031 intersections per interval. +2024-06-01T06:03:34Z INFO 2416028 [AntiDependencyAnalyzer-SB-80-96]: Finished analyzing 490947 access patterns a mean/median 1.61882/1 intervals per access pattern and mean/median 3.73144/1.34715 intersections per interval. +2024-06-01T06:03:34Z INFO 2416028 [AntiDependencyAnalyzer-SB-65-80]: Finished analyzing 492150 access patterns a mean/median 1.61731/1 intervals per access pattern and mean/median 3.82651/1.30954 intersections per interval. +2024-06-01T06:03:34Z INFO 2416028 [AntiDependencyAnalyzer-SB-96-97]: Finished analyzing 494706 access patterns a mean/median 1.61412/1 intervals per access pattern and mean/median 3.80682/1.31607 intersections per interval. +2024-06-01T06:03:34Z INFO 2416028 [AntiDependencyAnalyzer-SB-1-16]: Finished analyzing 513836 access patterns a mean/median 1.59125/1 intervals per access pattern and mean/median 3.89805/1.26976 intersections per interval. +2024-06-01T06:03:34Z INFO 2416028 [AntiDependencyAnalyzer-SB-48-64]: Finished analyzing 491204 access patterns a mean/median 1.6185/1 intervals per access pattern and mean/median 3.7271/1.35241 intersections per interval. 
+2024-06-01T06:03:34Z INFO 2416028 [AntiDependencyAnalyzer-SB-112-113]: Finished analyzing 491274 access patterns a mean/median 1.61841/1 intervals per access pattern and mean/median 3.72717/1.33931 intersections per interval. +2024-06-01T06:03:34Z INFO 2416028 [AntiDependencyAnalyzer-SB-32-33]: Finished analyzing 496264 access patterns a mean/median 1.61219/1 intervals per access pattern and mean/median 3.79441/1.33153 intersections per interval. +2024-06-01T06:03:34Z INFO 2416028 [AntiDependencyAnalyzer-SB-16-32]: Finished analyzing 491738 access patterns a mean/median 1.61782/1 intervals per access pattern and mean/median 3.71873/1.35426 intersections per interval. +2024-06-01T06:03:34Z INFO 2416028 [AntiDependencyAnalyzer-SB-33-48]: Finished analyzing 495826 access patterns a mean/median 1.61273/1 intervals per access pattern and mean/median 3.79513/1.33184 intersections per interval. +2024-06-01T06:03:34Z INFO 2416028 [AntiDependencyAnalyzer-SB-64-65]: Finished analyzing 492159 access patterns a mean/median 1.6173/1 intervals per access pattern and mean/median 3.82643/1.30946 intersections per interval. +2024-06-01T06:03:34Z USER 2416028 [ModuleForkPass]: anti_dependency_analyzer finished after 2.784 seconds +2024-06-01T06:03:34Z INFO 2416028 [ModuleForkPass]: curr_vmrss: 1956mb, ru_maxrss: 2111mb (delta=0mb) +2024-06-01T06:03:34Z INFO 2416028 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 62163 memory location(s), 1 block(s), and 277135 instruction(s). Max writers: 257 Max Readers: 66720 +2024-06-01T06:03:34Z USER 2416028 [ModuleForkPass]: Running dep_opt +2024-06-01T06:03:34Z INFO 2416028 [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=62163 blocks=1 instructions=277135 Max writers: 257 Max Readers: 66720 +2024-06-01T06:03:34Z INFO 2416028 [build_flow_deps]: Start build fdeps. 
Invocation: 3Sat Jun 1 06:03:34 2024 +2024-06-01T06:03:34Z INFO 2416028 [build_flow_deps]: Allocs: 62163 instructions: 277135 +2024-06-01T06:03:34Z INFO 2403803 (sg00) [build_flow_deps]: Build fdeps inserted 816813 edges +2024-06-01T06:03:34Z INFO 2403803 (sg00) [build_flow_deps]: Done build fdeps 816813 Sat Jun 1 06:03:34 2024 +2024-06-01T06:03:34Z USER 2403803 (sg00) [ModuleForkPass]: build_fdeps finished after 3.441 seconds +2024-06-01T06:03:34Z INFO 2403803 (sg00) [ModuleForkPass]: curr_vmrss: 12548mb, ru_maxrss: 13538mb (delta=0mb) +2024-06-01T06:03:34Z INFO 2403803 (sg02) [DMAOptimizationBase]: SB Rotation rotated 1223 Sb address +2024-06-01T06:03:34Z INFO 2403803 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 147823 memory location(s), 1 block(s), and 341616 instruction(s). Max writers: 5184 Max Readers: 93924 +2024-06-01T06:03:34Z USER 2403803 (sg00) [ModuleForkPass]: Running remove_redundancies +2024-06-01T06:03:34Z INFO 2403803 (sg01) [DMAOptimizationBase]: [spill optimization round 0]: removed 8471 spill/reload instructions +2024-06-01T06:03:34Z INFO 2403803 (sg01) [DMAOptimizationBase]: [spill optimization round 0]: removed 7994 spill/reload memory locations +2024-06-01T06:03:34Z INFO 2403803 (sg00) [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=147823 blocks=1 instructions=341616 Max writers: 5184 Max Readers: 93924 +2024-06-01T06:03:34Z INFO 2403803 (sg00) [RemoveRedundancies]: remove_clobbered_writes +2024-06-01T06:03:35Z INFO 2403803 (sg00) [RemoveRedundancies]: remove_clobbered_writes: 615 +2024-06-01T06:03:35Z INFO 2403803 (sg00) [RemoveRedundancies]: remove_useless_insts +2024-06-01T06:03:35Z INFO 2403803 (sg00) [RemoveRedundancies]: remove Useless Instructions: 0 +2024-06-01T06:03:35Z USER 2403803 (sg00) [ModuleForkPass]: remove_redundancies finished after 0.554 seconds +2024-06-01T06:03:35Z INFO 2403803 (sg00) [ModuleForkPass]: curr_vmrss: 12546mb, ru_maxrss: 13538mb (delta=0mb) 
+2024-06-01T06:03:35Z INFO 2403803 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 147823 memory location(s), 1 block(s), and 341001 instruction(s). Max writers: 5184 Max Readers: 93924 +2024-06-01T06:03:35Z USER 2403803 (sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2024-06-01T06:03:35Z INFO 2403803 (sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=147823 blocks=1 instructions=341001 Max writers: 5184 Max Readers: 93924 +2024-06-01T06:03:35Z INFO 2403803 (sg00) [AntiDependencyAnalyzer]: Batch size: 1000 +2024-06-01T06:03:35Z INFO 2403803 (sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2024-06-01T06:03:35Z INFO 2416028 [build_flow_deps]: Build fdeps inserted 814407 edges +2024-06-01T06:03:35Z INFO 2416028 [build_flow_deps]: Done build fdeps 814407 Sat Jun 1 06:03:35 2024 +2024-06-01T06:03:36Z USER 2416028 [ModuleForkPass]: dep_opt finished after 2.006 seconds +2024-06-01T06:03:36Z INFO 2416028 [ModuleForkPass]: curr_vmrss: 1900mb, ru_maxrss: 2111mb (delta=0mb) +2024-06-01T06:03:36Z INFO 2416028 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 62163 memory location(s), 1 block(s), and 277135 instruction(s). 
Max writers: 257 Max Readers: 66720 +2024-06-01T06:03:36Z USER 2416028 [ModuleForkPass]: Running report_stats +2024-06-01T06:03:36Z INFO 2416028 [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=62163 blocks=1 instructions=277135 Max writers: 257 Max Readers: 66720 +2024-06-01T06:03:36Z INFO 2416028 [ReportStats]: Data Movement Statistics: sg0000 +┌─────────────┬────────────────────────────────────┬───────┬────────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────┼────────────────────────────────────┼───────┼────────────┤ +│ DMACopy │ ExternalInputParameter -> Internal │ 1 │ 393216 │ +│ Load │ Const -> Internal │ 3 │ 49408 │ +│ Load │ ExternalInput -> Internal │ 36 │ 264 │ +│ Load │ ExternalInputParameter -> Internal │ 7970 │ 5024931840 │ +│ Load │ Internal │ 64 │ 8388608 │ +│ Save │ Internal │ 256 │ 8388608 │ +│ Save │ Internal -> ExternalOutput │ 63 │ 256032 │ +└─────────────┴────────────────────────────────────┴───────┴────────────┘ + +2024-06-01T06:03:36Z INFO 2416028 [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 4 │ 36 │ +│ 32 │ 63 │ +│ 64 │ 1 │ +│ 128 │ 1 │ +│ 256 │ 1 │ +│ 512 │ 2113 │ +│ 2048 │ 1280 │ +│ 5250 │ 32 │ +│ 5376 │ 64 │ +│ 7168 │ 3072 │ +│ 8192 │ 1729 │ +│ 131072 │ 3 │ +└─────────────────────┴───────┘ + +2024-06-01T06:03:36Z INFO 2416028 [ReportStats]: MM Stats: #MatMults 223233 #MatMult-Transposes 66785 +2024-06-01T06:03:36Z USER 2416028 [ModuleForkPass]: report_stats finished after 0.076 seconds +2024-06-01T06:03:36Z INFO 2416028 [ModuleForkPass]: curr_vmrss: 1860mb, ru_maxrss: 2111mb (delta=0mb) +2024-06-01T06:03:36Z INFO 2416028 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 62163 memory location(s), 1 block(s), and 277135 instruction(s). 
Max writers: 257 Max Readers: 66720 +2024-06-01T06:03:36Z USER 2416028 [ModuleForkPass]: Running assign_trigger_engine +2024-06-01T06:03:36Z INFO 2416028 [ModuleForkPass]: Inputs to assign_trigger_engine: modules=1 functions=1 allocs=62163 blocks=1 instructions=277135 Max writers: 257 Max Readers: 66720 +2024-06-01T06:03:36Z INFO 2416028 [AssignTriggerEngine]: Assigned trigger engine for 320 DMA instructions +2024-06-01T06:03:36Z USER 2416028 [ModuleForkPass]: assign_trigger_engine finished after 0.116 seconds +2024-06-01T06:03:36Z INFO 2416028 [ModuleForkPass]: curr_vmrss: 1875mb, ru_maxrss: 2111mb (delta=0mb) +2024-06-01T06:03:36Z INFO 2416028 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 62163 memory location(s), 1 block(s), and 277135 instruction(s). Max writers: 257 Max Readers: 66720 +2024-06-01T06:03:36Z USER 2416028 [ModuleForkPass]: Running alloc_queues +2024-06-01T06:03:36Z INFO 2416028 [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=62163 blocks=1 instructions=277135 Max writers: 257 Max Readers: 66720 +2024-06-01T06:03:36Z USER 2416028 [ModuleForkPass]: alloc_queues finished after 0.046 seconds +2024-06-01T06:03:36Z INFO 2416028 [ModuleForkPass]: curr_vmrss: 1875mb, ru_maxrss: 2111mb (delta=0mb) +2024-06-01T06:03:36Z INFO 2416028 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 62163 memory location(s), 1 block(s), and 277135 instruction(s). Max writers: 257 Max Readers: 66720 +2024-06-01T06:03:36Z USER 2416028 [BackendDriver]: mod_parallel_pass finished after 69.533 seconds +2024-06-01T06:03:36Z INFO 2416028 [BackendDriver]: curr_vmrss: 1875mb, ru_maxrss: 2111mb (delta=1495mb) +2024-06-01T06:03:36Z INFO 2416028 [BackendDriver]: Output has 1 module(s), 1 function(s), 62163 memory location(s), 1 block(s), and 277135 instruction(s). 
Max writers: 257 Max Readers: 66720 +2024-06-01T06:03:36Z USER 2416028 [BackendDriver]: Running dep_reduction +2024-06-01T06:03:36Z INFO 2416028 [BackendDriver]: Inputs to dep_reduction: modules=1 functions=1 allocs=62163 blocks=1 instructions=277135 Max writers: 257 Max Readers: 66720 +2024-06-01T06:03:36Z INFO 2416028 [DepReduction]: Start Dependency Reduction +2024-06-01T06:03:36Z INFO 2416028 [DepReduction]: Processing async instrs... +2024-06-01T06:03:36Z INFO 2416028 [DepReduction]: Processing secondary edges per engine... +2024-06-01T06:03:36Z INFO 2403803 (sg02) [DMAOptimizationBase]: SB Rotation rotated 8187 Sb address +2024-06-01T06:03:36Z INFO 2403803 (sg01) [DMAOptimizationBase]: [spill optimization round 1]: removed 307 spill/reload instructions +2024-06-01T06:03:36Z INFO 2403803 (sg01) [DMAOptimizationBase]: [spill optimization round 1]: removed 297 spill/reload memory locations +2024-06-01T06:03:36Z INFO 2416028 [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 224835 +2024-06-01T06:03:36Z INFO 2416028 [DepReduction]: Processing redundant descendants... +2024-06-01T06:03:37Z INFO 2403803 [AntiDependencyAnalyzer-PSUM1-0-32]: Finished analyzing 31305 access patterns a mean/median 1.00767/1 intervals per access pattern and mean/median 3.14836/1.71493 intersections per interval. +2024-06-01T06:03:37Z INFO 2403803 [AntiDependencyAnalyzer-PSUM3-64-128]: Finished analyzing 32120 access patterns a mean/median 1.00747/1 intervals per access pattern and mean/median 3.65357/2.22414 intersections per interval. +2024-06-01T06:03:37Z INFO 2403803 [AntiDependencyAnalyzer-PSUM3-32-64]: Finished analyzing 32124 access patterns a mean/median 1.00747/1 intervals per access pattern and mean/median 3.65668/2.25399 intersections per interval. 
+2024-06-01T06:03:37Z INFO 2403803 [AntiDependencyAnalyzer-PSUM3-0-32]: Finished analyzing 32454 access patterns a mean/median 1.0074/1 intervals per access pattern and mean/median 3.70677/2.30934 intersections per interval. +2024-06-01T06:03:37Z INFO 2403803 [AntiDependencyAnalyzer-PSUM7-32-128]: Finished analyzing 32207 access patterns a mean/median 1.00745/1 intervals per access pattern and mean/median 3.48152/2.0739 intersections per interval. +2024-06-01T06:03:37Z INFO 2403803 [AntiDependencyAnalyzer-PSUM5-0-32]: Finished analyzing 31368 access patterns a mean/median 1.00765/1 intervals per access pattern and mean/median 3.14035/1.76143 intersections per interval. +2024-06-01T06:03:37Z INFO 2403803 [AntiDependencyAnalyzer-PSUM0-64-128]: Finished analyzing 27687 access patterns a mean/median 1.00867/1 intervals per access pattern and mean/median 3.08937/1.40205 intersections per interval. +2024-06-01T06:03:37Z INFO 2403803 [AntiDependencyAnalyzer-PSUM5-32-128]: Finished analyzing 31038 access patterns a mean/median 1.00773/1 intervals per access pattern and mean/median 3.09371/1.73629 intersections per interval. +2024-06-01T06:03:37Z INFO 2403803 [AntiDependencyAnalyzer-PSUM0-32-64]: Finished analyzing 27691 access patterns a mean/median 1.00867/1 intervals per access pattern and mean/median 3.08946/1.40211 intersections per interval. +2024-06-01T06:03:37Z INFO 2403803 [AntiDependencyAnalyzer-PSUM4-32-64]: Finished analyzing 27629 access patterns a mean/median 1.00869/1 intervals per access pattern and mean/median 2.959/1.36526 intersections per interval. +2024-06-01T06:03:37Z INFO 2403803 [AntiDependencyAnalyzer-PSUM6-0-32]: Finished analyzing 29071 access patterns a mean/median 1.00826/1 intervals per access pattern and mean/median 2.73387/1.31485 intersections per interval. 
+2024-06-01T06:03:37Z INFO 2403803 [AntiDependencyAnalyzer-PSUM6-32-128]: Finished analyzing 28741 access patterns a mean/median 1.00835/1 intervals per access pattern and mean/median 2.67387/1.31959 intersections per interval. +2024-06-01T06:03:37Z INFO 2403803 [AntiDependencyAnalyzer-PSUM2-32-64]: Finished analyzing 28683 access patterns a mean/median 1.00837/1 intervals per access pattern and mean/median 3.15808/1.41371 intersections per interval. +2024-06-01T06:03:37Z INFO 2403803 [AntiDependencyAnalyzer-PSUM1-32-64]: Finished analyzing 30975 access patterns a mean/median 1.00775/1 intervals per access pattern and mean/median 3.10303/1.69501 intersections per interval. +2024-06-01T06:03:37Z INFO 2403803 [AntiDependencyAnalyzer-PSUM1-64-128]: Finished analyzing 30971 access patterns a mean/median 1.00775/1 intervals per access pattern and mean/median 3.10311/1.69385 intersections per interval. +2024-06-01T06:03:37Z INFO 2403803 [AntiDependencyAnalyzer-PSUM7-0-32]: Finished analyzing 32537 access patterns a mean/median 1.00738/1 intervals per access pattern and mean/median 3.52496/2.06323 intersections per interval. +2024-06-01T06:03:37Z INFO 2403803 [AntiDependencyAnalyzer-PSUM2-64-128]: Finished analyzing 28679 access patterns a mean/median 1.00837/1 intervals per access pattern and mean/median 3.15804/1.41349 intersections per interval. +2024-06-01T06:03:37Z INFO 2403803 [AntiDependencyAnalyzer-PSUM0-0-32]: Finished analyzing 28021 access patterns a mean/median 1.00856/1 intervals per access pattern and mean/median 3.15505/1.41016 intersections per interval. +2024-06-01T06:03:37Z INFO 2403803 [AntiDependencyAnalyzer-PSUM4-64-128]: Finished analyzing 27625 access patterns a mean/median 1.00869/1 intervals per access pattern and mean/median 2.95895/1.36495 intersections per interval. 
+2024-06-01T06:03:37Z INFO 2403803 [AntiDependencyAnalyzer-PSUM2-0-32]: Finished analyzing 29013 access patterns a mean/median 1.00827/1 intervals per access pattern and mean/median 3.20177/1.40912 intersections per interval. +2024-06-01T06:03:37Z INFO 2403803 [AntiDependencyAnalyzer-PSUM4-0-32]: Finished analyzing 27959 access patterns a mean/median 1.00858/1 intervals per access pattern and mean/median 3.01722/1.35725 intersections per interval. +2024-06-01T06:03:37Z INFO 2416028 [DepReduction]: Processing redundant descendants, Done. Num edges removed 10275 +2024-06-01T06:03:37Z INFO 2416028 [DepReduction]: Processing async instrs, Done. Num edges removed 235110 +2024-06-01T06:03:37Z INFO 2403803 [AntiDependencyAnalyzer-DRAM]: Finished analyzing 11119 access patterns a mean/median 90.1435/112.471 intervals per access pattern and mean/median 1.76648/0.486629 intersections per interval. +2024-06-01T06:03:37Z INFO 2403803 (sg01) [DMAOptimizationBase]: [spill optimization round 2]: removed 36 spill/reload instructions +2024-06-01T06:03:37Z INFO 2403803 (sg01) [DMAOptimizationBase]: [spill optimization round 2]: removed 36 spill/reload memory locations +2024-06-01T06:03:37Z INFO 2403803 (sg01) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 2472935424, 39.7538% out of total spill/reload dma traffic +2024-06-01T06:03:38Z INFO 2403803 (sg02) [DMAOptimizationBase]: SB Rotation rotated 2539 Sb address +2024-06-01T06:03:38Z INFO 2403803 [AntiDependencyAnalyzer-SB-113-128]: Finished analyzing 584332 access patterns a mean/median 1.06637/1 intervals per access pattern and mean/median 1.88775/1.00001 intersections per interval. +2024-06-01T06:03:38Z INFO 2403803 [AntiDependencyAnalyzer-SB-64-65]: Finished analyzing 610517 access patterns a mean/median 1.06353/1 intervals per access pattern and mean/median 1.89267/1.00001 intersections per interval. 
+2024-06-01T06:03:38Z INFO 2403803 [AntiDependencyAnalyzer-SB-33-64]: Finished analyzing 622696 access patterns a mean/median 1.06228/1 intervals per access pattern and mean/median 1.87089/1 intersections per interval. +2024-06-01T06:03:38Z INFO 2403803 [AntiDependencyAnalyzer-SB-32-33]: Finished analyzing 622701 access patterns a mean/median 1.06228/1 intervals per access pattern and mean/median 1.87088/1.00001 intersections per interval. +2024-06-01T06:03:38Z INFO 2403803 [AntiDependencyAnalyzer-SB-96-97]: Finished analyzing 610633 access patterns a mean/median 1.06351/1 intervals per access pattern and mean/median 1.89269/1 intersections per interval. +2024-06-01T06:03:38Z INFO 2403803 [AntiDependencyAnalyzer-SB-65-96]: Finished analyzing 610512 access patterns a mean/median 1.06353/1 intervals per access pattern and mean/median 1.89267/1.00001 intersections per interval. +2024-06-01T06:03:38Z INFO 2403803 [AntiDependencyAnalyzer-SB-0-1]: Finished analyzing 623007 access patterns a mean/median 1.06225/1 intervals per access pattern and mean/median 1.87057/1.00001 intersections per interval. +2024-06-01T06:03:38Z INFO 2403803 [AntiDependencyAnalyzer-SB-97-113]: Finished analyzing 610512 access patterns a mean/median 1.06353/1 intervals per access pattern and mean/median 1.89267/1.00001 intersections per interval. +2024-06-01T06:03:38Z INFO 2403803 [AntiDependencyAnalyzer-SB-1-32]: Finished analyzing 622696 access patterns a mean/median 1.06228/1 intervals per access pattern and mean/median 1.87089/1 intersections per interval. +2024-06-01T06:03:38Z USER 2403803 (sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 3.013 seconds +2024-06-01T06:03:38Z INFO 2403803 (sg00) [ModuleForkPass]: curr_vmrss: 12941mb, ru_maxrss: 13538mb (delta=0mb) +2024-06-01T06:03:38Z INFO 2403803 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 147823 memory location(s), 1 block(s), and 341001 instruction(s). 
Max writers: 5184 Max Readers: 93924 +2024-06-01T06:03:38Z USER 2403803 (sg00) [ModuleForkPass]: Running tensor_copy_elim +2024-06-01T06:03:38Z INFO 2403803 (sg00) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=147823 blocks=1 instructions=341001 Max writers: 5184 Max Readers: 93924 +2024-06-01T06:03:38Z INFO 2403803 (sg00) [TensorCopyElim]: Tensor CP elimination: 0 +2024-06-01T06:03:40Z USER 2403803 (sg00) [ModuleForkPass]: tensor_copy_elim finished after 1.781 seconds +2024-06-01T06:03:40Z INFO 2403803 (sg00) [ModuleForkPass]: curr_vmrss: 12753mb, ru_maxrss: 13538mb (delta=0mb) +2024-06-01T06:03:40Z INFO 2416028 [DepReduction]: Num Async removed: 0 +2024-06-01T06:03:40Z INFO 2416028 [DepReduction]: Finished dependency reduction: 1891258 removed, new total 76966 +2024-06-01T06:03:40Z INFO 2416028 [DepReduction]: Finished Dependency Reduction +2024-06-01T06:03:40Z USER 2416028 [BackendDriver]: dep_reduction finished after 3.821 seconds +2024-06-01T06:03:40Z INFO 2416028 [BackendDriver]: curr_vmrss: 1898mb, ru_maxrss: 2111mb (delta=0mb) +2024-06-01T06:03:40Z INFO 2416028 [BackendDriver]: Output has 1 module(s), 1 function(s), 62163 memory location(s), 1 block(s), and 277135 instruction(s). Max writers: 257 Max Readers: 66720 +2024-06-01T06:03:40Z USER 2416028 [BackendDriver]: Running bir_racecheck +2024-06-01T06:03:40Z INFO 2416028 [BackendDriver]: Inputs to bir_racecheck: modules=1 functions=1 allocs=62163 blocks=1 instructions=277135 Max writers: 257 Max Readers: 66720 +2024-06-01T06:03:40Z INFO 2403803 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 147823 memory location(s), 1 block(s), and 341001 instruction(s). 
Max writers: 5184 Max Readers: 93924 +2024-06-01T06:03:40Z USER 2403803 (sg00) [ModuleForkPass]: Running post_sched +2024-06-01T06:03:40Z INFO 2403803 (sg00) [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=147823 blocks=1 instructions=341001 Max writers: 5184 Max Readers: 93924 +2024-06-01T06:03:40Z INFO 2403803 [post_scheduler]: Start PosT ScheD 3 sunda Sat Jun 1 06:03:40 2024 +2024-06-01T06:03:41Z INFO 2403803 (sg01) [DMAOptimizationBase]: [Allocation optimization]: removed 202 spill/reload instructions +2024-06-01T06:03:41Z INFO 2403803 (sg01) [DMAOptimizationBase]: [Allocation optimization]: removed 177 spill/reload memory locations +2024-06-01T06:03:41Z INFO 2403803 (sg01) [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 105381892, 1.69407% out of total spill/reload dma traffic +2024-06-01T06:03:41Z INFO 2403803 (sg02) [DMAOptimizationBase]: SB Rotation rotated 117 Sb address +2024-06-01T06:03:42Z INFO 2403803 (sg01) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2024-06-01T06:03:44Z USER 2416028 [BackendDriver]: bir_racecheck finished after 3.684 seconds +2024-06-01T06:03:44Z INFO 2416028 [BackendDriver]: curr_vmrss: 2353mb, ru_maxrss: 2353mb (delta=242mb) +2024-06-01T06:03:44Z INFO 2416028 [BackendDriver]: Output has 1 module(s), 1 function(s), 62163 memory location(s), 1 block(s), and 277135 instruction(s). 
Max writers: 257 Max Readers: 66720 +2024-06-01T06:03:44Z USER 2416028 [BackendDriver]: Running lower_dma +2024-06-01T06:03:44Z INFO 2416028 [BackendDriver]: Inputs to lower_dma: modules=1 functions=1 allocs=62163 blocks=1 instructions=277135 Max writers: 257 Max Readers: 66720 +2024-06-01T06:03:44Z USER 2416028 [BackendDriver]: lower_dma finished after 0.490 seconds +2024-06-01T06:03:44Z INFO 2416028 [BackendDriver]: curr_vmrss: 1848mb, ru_maxrss: 2353mb (delta=0mb) +2024-06-01T06:03:44Z INFO 2416028 [BackendDriver]: Output has 1 module(s), 1 function(s), 62227 memory location(s), 1 block(s), and 277198 instruction(s). Max writers: 257 Max Readers: 66720 +2024-06-01T06:03:44Z USER 2416028 [BackendDriver]: Running coalesce_dma_blocks +2024-06-01T06:03:44Z INFO 2416028 [BackendDriver]: Inputs to coalesce_dma_blocks: modules=1 functions=1 allocs=62227 blocks=1 instructions=277198 Max writers: 257 Max Readers: 66720 +2024-06-01T06:03:44Z INFO 2416028 [CoalesceDmaBlocks]: Coaleseced 116 DMA triggers +2024-06-01T06:03:44Z USER 2416028 [BackendDriver]: coalesce_dma_blocks finished after 0.351 seconds +2024-06-01T06:03:44Z INFO 2416028 [BackendDriver]: curr_vmrss: 1854mb, ru_maxrss: 2353mb (delta=0mb) +2024-06-01T06:03:45Z INFO 2416028 [BackendDriver]: Output has 1 module(s), 1 function(s), 62227 memory location(s), 1 block(s), and 277082 instruction(s). 
Max writers: 257 Max Readers: 66720 +2024-06-01T06:03:45Z USER 2416028 [BackendDriver]: Running alloc_semaphores +2024-06-01T06:03:45Z INFO 2416028 [BackendDriver]: Inputs to alloc_semaphores: modules=1 functions=1 allocs=62227 blocks=1 instructions=277082 Max writers: 257 Max Readers: 66720 +2024-06-01T06:03:45Z USER 2416028 [BackendDriver]: alloc_semaphores finished after 0.364 seconds +2024-06-01T06:03:45Z INFO 2416028 [BackendDriver]: curr_vmrss: 1854mb, ru_maxrss: 2353mb (delta=0mb) +2024-06-01T06:03:45Z INFO 2416028 [BackendDriver]: Output has 1 module(s), 1 function(s), 62227 memory location(s), 1 block(s), and 277082 instruction(s). Max writers: 257 Max Readers: 66720 +2024-06-01T06:03:45Z USER 2416028 [BackendDriver]: Running expand_inst_late +2024-06-01T06:03:45Z INFO 2416028 [BackendDriver]: Inputs to expand_inst_late: modules=1 functions=1 allocs=62227 blocks=1 instructions=277082 Max writers: 257 Max Readers: 66720 +2024-06-01T06:03:45Z USER 2416028 [BackendDriver]: expand_inst_late finished after 0.097 seconds +2024-06-01T06:03:45Z INFO 2416028 [BackendDriver]: curr_vmrss: 1854mb, ru_maxrss: 2353mb (delta=0mb) +2024-06-01T06:03:45Z INFO 2416028 [BackendDriver]: Output has 1 module(s), 1 function(s), 62227 memory location(s), 1 block(s), and 277082 instruction(s). Max writers: 257 Max Readers: 66720 +2024-06-01T06:03:45Z USER 2416028 [BackendDriver]: Running lower_sync +2024-06-01T06:03:45Z INFO 2416028 [BackendDriver]: Inputs to lower_sync: modules=1 functions=1 allocs=62227 blocks=1 instructions=277082 Max writers: 257 Max Readers: 66720 +2024-06-01T06:03:45Z USER 2416028 [BackendDriver]: lower_sync finished after 0.177 seconds +2024-06-01T06:03:45Z INFO 2416028 [BackendDriver]: curr_vmrss: 1860mb, ru_maxrss: 2353mb (delta=0mb) +2024-06-01T06:03:45Z INFO 2416028 [BackendDriver]: Output has 1 module(s), 1 function(s), 62227 memory location(s), 1 block(s), and 284741 instruction(s). 
Max writers: 257 Max Readers: 66720 +2024-06-01T06:03:45Z USER 2416028 [BackendDriver]: Running lower_act +2024-06-01T06:03:45Z INFO 2416028 [BackendDriver]: Inputs to lower_act: modules=1 functions=1 allocs=62227 blocks=1 instructions=284741 Max writers: 257 Max Readers: 66720 +2024-06-01T06:03:45Z USER 2416028 [BackendDriver]: lower_act finished after 0.093 seconds +2024-06-01T06:03:45Z INFO 2416028 [BackendDriver]: curr_vmrss: 1861mb, ru_maxrss: 2353mb (delta=0mb) +2024-06-01T06:03:45Z INFO 2416028 [BackendDriver]: Output has 1 module(s), 1 function(s), 62227 memory location(s), 1 block(s), and 284871 instruction(s). Max writers: 257 Max Readers: 66720 +2024-06-01T06:03:45Z USER 2416028 [BackendDriver]: Running lower_dve +2024-06-01T06:03:45Z INFO 2416028 [BackendDriver]: Inputs to lower_dve: modules=1 functions=1 allocs=62227 blocks=1 instructions=284871 Max writers: 257 Max Readers: 66720 +2024-06-01T06:03:45Z INFO 2416028 [LowerDVE]: Loading DVE opcodes table dve_info.json from /root/anaconda3/envs/masp_fastapi/lib/python3.9/site-packages/neuronxcc/dve/dve_bin/dve_info.json +2024-06-01T06:03:46Z USER 2416028 [BackendDriver]: lower_dve finished after 0.678 seconds +2024-06-01T06:03:46Z INFO 2416028 [BackendDriver]: curr_vmrss: 1898mb, ru_maxrss: 2353mb (delta=0mb) +2024-06-01T06:03:46Z INFO 2416028 [BackendDriver]: Output has 1 module(s), 1 function(s), 62227 memory location(s), 1 block(s), and 284871 instruction(s). 
Max writers: 257 Max Readers: 66720 +2024-06-01T06:03:46Z USER 2416028 [BackendDriver]: Running lower_ap +2024-06-01T06:03:46Z INFO 2416028 [BackendDriver]: Inputs to lower_ap: modules=1 functions=1 allocs=62227 blocks=1 instructions=284871 Max writers: 257 Max Readers: 66720 +2024-06-01T06:03:46Z INFO 2403803 [post_scheduler]: Time-aware hwm post-sched +2024-06-01T06:03:46Z INFO 2403803 (sg01) [DMAOptimizationBase]: [spill optimization round 0]: removed 7 spill/reload instructions +2024-06-01T06:03:46Z INFO 2403803 (sg01) [DMAOptimizationBase]: [spill optimization round 0]: removed 7 spill/reload memory locations +2024-06-01T06:03:46Z USER 2416028 [BackendDriver]: lower_ap finished after 0.183 seconds +2024-06-01T06:03:46Z INFO 2416028 [BackendDriver]: curr_vmrss: 1898mb, ru_maxrss: 2353mb (delta=0mb) +2024-06-01T06:03:46Z INFO 2416028 [BackendDriver]: Output has 1 module(s), 1 function(s), 62227 memory location(s), 1 block(s), and 284871 instruction(s). Max writers: 257 Max Readers: 66720 +2024-06-01T06:03:46Z USER 2416028 [BackendDriver]: Running alloc_regs +2024-06-01T06:03:46Z INFO 2416028 [BackendDriver]: Inputs to alloc_regs: modules=1 functions=1 allocs=62227 blocks=1 instructions=284871 Max writers: 257 Max Readers: 66720 +2024-06-01T06:03:46Z INFO 2416028 [AllocRegs]: allocating REG +2024-06-01T06:03:46Z INFO 2416028 [AllocRegs]: main loop iteration 1 +2024-06-01T06:03:46Z USER 2416028 [BackendDriver]: alloc_regs finished after 0.027 seconds +2024-06-01T06:03:46Z INFO 2416028 [BackendDriver]: curr_vmrss: 1898mb, ru_maxrss: 2353mb (delta=0mb) +2024-06-01T06:03:46Z INFO 2416028 [BackendDriver]: Output has 1 module(s), 1 function(s), 62227 memory location(s), 1 block(s), and 284871 instruction(s). 
Max writers: 257 Max Readers: 66720 +2024-06-01T06:03:46Z USER 2416028 [BackendDriver]: Running birverifier +2024-06-01T06:03:46Z INFO 2416028 [BackendDriver]: Inputs to birverifier: modules=1 functions=1 allocs=62227 blocks=1 instructions=284871 Max writers: 257 Max Readers: 66720 +2024-06-01T06:03:47Z INFO 2403803 (sg01) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 917504, 0.0147494% out of total spill/reload dma traffic +2024-06-01T06:03:47Z USER 2416028 [BackendDriver]: birverifier finished after 0.599 seconds +2024-06-01T06:03:47Z INFO 2416028 [BackendDriver]: curr_vmrss: 1904mb, ru_maxrss: 2353mb (delta=0mb) +2024-06-01T06:03:47Z INFO 2403803 (sg02) [DMAOptimizationBase]: SB Rotation rotated 16807 Sb address +2024-06-01T06:03:47Z INFO 2416028 [BackendDriver]: Output has 1 module(s), 1 function(s), 62227 memory location(s), 1 block(s), and 284871 instruction(s). Max writers: 257 Max Readers: 66720 +2024-06-01T06:03:47Z USER 2416028 [BackendDriver]: Running codegen +2024-06-01T06:03:47Z INFO 2416028 [BackendDriver]: Inputs to codegen: modules=1 functions=1 allocs=62227 blocks=1 instructions=284871 Max writers: 257 Max Readers: 66720 +2024-06-01T06:03:47Z INFO 2416028 [Codegen]: Total compiler allocated DRAM tensors: 0.0157471 GB +2024-06-01T06:03:47Z INFO 2416028 [Codegen]: Total un-allocated DRAM tensors by kind: +2024-06-01T06:03:47Z INFO 2416028 [Codegen]: +┌────────────────────────┬─────────────┐ +│ TensorKind │ Size (GB) │ +├────────────────────────┼─────────────┤ +│ ExternalInput │ 7.07805e-08 │ +│ ExternalInputParameter │ 4.67983 │ +│ ExternalOutput │ 0.000238448 │ +│ Const │ 9.17912e-05 │ +│ Pointer │ 4.76837e-07 │ +└────────────────────────┴─────────────┘ + +2024-06-01T06:03:47Z INFO 2416028 [Codegen]: Total runtime managed DRAM tensors: 4.68016 GB +2024-06-01T06:03:49Z INFO 2416028 [Codegen]: Instruction Stats: +2024-06-01T06:03:49Z INFO 2416028 [Codegen]: +┌───────────────────────────┬────────┐ +│ Opcode │ Count │ 
+├───────────────────────────┼────────┤ +│ MATMUL │ 223298 │ +│ LDWEIGHTS │ 220644 │ +│ ACTIVATE │ 29323 │ +│ TENSOR_TENSOR │ 8645 │ +│ PSEUDO_DMA_TRIGGER │ 8341 │ +│ EVENT_SEMAPHORE │ 7659 │ +│ LOAD_MASK_SELECT │ 1359 │ +│ UNKNOWN(0x8d) │ 1280 │ +│ COPY │ 1263 │ +│ STREAM_TRANSPOSE │ 1024 │ +│ COPY_PREDICATED │ 960 │ +│ TENSOR_SCALAR │ 875 │ +│ TENSOR_REDUCE │ 512 │ +│ CAST │ 340 │ +│ STREAM_SHUFFLE │ 335 │ +│ TENSOR_SCALAR_ADDR │ 194 │ +│ MEMSET │ 170 │ +│ ACT_TABLE_LOAD │ 130 │ +│ UNKNOWN(0x8a) │ 96 │ +│ UNKNOWN(0x8b) │ 96 │ +│ IOTA │ 86 │ +│ UNKNOWN(0xd6) │ 64 │ +│ PSEUDO_BRANCH_LABEL │ 64 │ +│ PSEUDO_TRIGGER_COLLECTIVE │ 64 │ +│ PSEUDO_DMA_MEMCPY │ 64 │ +│ UNKNOWN(0x8f) │ 64 │ +│ NOP │ 63 │ +│ RECIPROCAL │ 32 │ +│ TENSOR_SCALAR │ 22 │ +└───────────────────────────┴────────┘ + +2024-06-01T06:03:49Z INFO 2416028 [Codegen]: +┌────────────┬────────┐ +│ Engine │ Count │ +├────────────┼────────┤ +│ Unassigned │ 0 │ +│ Pool │ 947 │ +│ Activation │ 30623 │ +│ PE │ 446816 │ +│ DMA │ 0 │ +│ DVE │ 17348 │ +│ SP │ 11338 │ +└────────────┴────────┘ + +2024-06-01T06:03:49Z INFO 2416028 [Codegen]: Total instructions: 507072 (0.0302238 GB) +2024-06-01T06:03:49Z INFO 2416028 [Codegen]: Total Dynamic DMA instruction count: 0 +2024-06-01T06:03:49Z USER 2416028 [Codegen]: isa_gen finished after 1.779 seconds +2024-06-01T06:03:49Z INFO 2416028 [Codegen]: Number of DMA descriptors on each queue: +2024-06-01T06:03:49Z INFO 2416028 [Codegen]: +┌───────────────────┬─────────┐ +│ Queue │ Count │ +├───────────────────┼─────────┤ +│ qActSpillReload0 │ 8192 │ +│ qPoolIO0 │ 16002 │ +│ qPoolPIOParam0 │ 64 │ +│ qPoolSpillReload0 │ 2016 │ +│ qSPIO0 │ 102 │ +│ qSPPIO0 │ 2048 │ +│ qSPPIOParam0 │ 2025696 │ +│ qSPSpillReload0 │ 672 │ +└───────────────────┴─────────┘ + +2024-06-01T06:03:49Z INFO 2416028 [Codegen]: Total descriptors: 2054792 (0.0306188 GB) +2024-06-01T06:03:49Z USER 2416028 [Codegen]: dma_desc_gen finished after 0.161 seconds +2024-06-01T06:03:49Z INFO 2416028 [Codegen]: Estimated 
peak DRAM usage: 4.75675 GB +2024-06-01T06:03:49Z INFO 2416028 [Codegen]: Generating debug info +2024-06-01T06:03:49Z INFO 2403803 (sg01) [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2024-06-01T06:03:50Z INFO 2403803 (sg01) [DMAOptimizationBase]: [remove_memset_spill]: removed 326 spill/reload instructions +2024-06-01T06:03:50Z INFO 2403803 (sg01) [DMAOptimizationBase]: [remove_memset_spill]: removed 3 spill/reload memory locations +2024-06-01T06:03:51Z USER 2416028 [Codegen]: debug_info_gen finished after 2.145 seconds +2024-06-01T06:03:51Z USER 2416028 [BackendDriver]: codegen finished after 4.320 seconds +2024-06-01T06:03:51Z INFO 2416028 [BackendDriver]: curr_vmrss: 2143mb, ru_maxrss: 2353mb (delta=0mb) +2024-06-01T06:03:51Z INFO 2416028 [BackendDriver]: Output has 1 module(s), 1 function(s), 62227 memory location(s), 1 block(s), and 284871 instruction(s). Max writers: 257 Max Readers: 66720 +2024-06-01T06:03:51Z USER 2416028 [BackendDriver]: Running neff_packager +2024-06-01T06:03:51Z INFO 2416028 [BackendDriver]: Inputs to neff_packager: modules=1 functions=1 allocs=62227 blocks=1 instructions=284871 Max writers: 257 Max Readers: 66720 +2024-06-01T06:03:52Z INFO 2416028 [NeffFileWriter]: IR signature: df65c4fe0d6b721a3eef1f8151948a77 for neff artifacts +2024-06-01T06:03:52Z USER 2416028 [BackendDriver]: neff_packager finished after 0.419 seconds +2024-06-01T06:03:52Z INFO 2416028 [BackendDriver]: curr_vmrss: 1905mb, ru_maxrss: 2353mb (delta=0mb) +2024-06-01T06:03:52Z INFO 2416028 [BackendDriver]: Output has 1 module(s), 1 function(s), 62227 memory location(s), 1 block(s), and 284871 instruction(s). 
Max writers: 257 Max Readers: 66720 +2024-06-01T06:03:52Z INFO 2403803 (sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2024-06-01T06:03:52Z USER 2403803 (sg02) [ModuleForkPass]: address_rotation_sb finished after 19.624 seconds +2024-06-01T06:03:52Z INFO 2403803 (sg02) [ModuleForkPass]: curr_vmrss: 12971mb, ru_maxrss: 13538mb (delta=0mb) +2024-06-01T06:03:53Z INFO 2403803 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 217043 memory location(s), 1 block(s), and 688010 instruction(s). Max writers: 576 Max Readers: 184036 +2024-06-01T06:03:53Z USER 2403803 (sg02) [ModuleForkPass]: Running coloring_allocator_dram +2024-06-01T06:03:53Z INFO 2403803 (sg02) [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=217043 blocks=1 instructions=688010 Max writers: 576 Max Readers: 184036 +2024-06-01T06:03:53Z INFO 2403803 (sg02) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 9486189328 +2024-06-01T06:03:53Z INFO 2403803 (sg02) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 2799 bytes +2024-06-01T06:03:53Z INFO 2403803 [post_scheduler]: Time-aware simulation time: 59312803 +2024-06-01T06:03:53Z INFO 2403803 (sg02) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 1906589696 +2024-06-01T06:03:53Z INFO 2403803 (sg02) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 2876 bytes +2024-06-01T06:03:53Z INFO 2403803 (sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 32768 +2024-06-01T06:03:53Z INFO 2403803 (sg02) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 128 bytes +2024-06-01T06:03:53Z INFO 2403803 (sg02) [ColoringAllocator::Rep]: Allocating functions +2024-06-01T06:03:53Z INFO 2403803 (sg02) [ColoringAllocator::Rep]: linearize and check +2024-06-01T06:03:54Z INFO 2403803 [post_scheduler]: Done PosT ScheD Sat Jun 1 06:03:54 2024 +2024-06-01T06:03:54Z USER 2403803 (sg00) [ModuleForkPass]: post_sched finished after 14.075 seconds 
+2024-06-01T06:03:54Z INFO 2403803 (sg00) [ModuleForkPass]: curr_vmrss: 12950mb, ru_maxrss: 13538mb (delta=0mb) +2024-06-01T06:03:54Z INFO 2403803 (sg02) [DRAM_Allocator]: allocating spills in DRAM pre_link mode +2024-06-01T06:03:54Z INFO 2403803 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 147823 memory location(s), 1 block(s), and 341001 instruction(s). Max writers: 5184 Max Readers: 93924 +2024-06-01T06:03:54Z USER 2403803 (sg00) [ModuleForkPass]: Running address_rotation_sb +2024-06-01T06:03:54Z INFO 2403803 (sg02) [DRAM_Allocator]: reserved space = 1431150352 bytes +2024-06-01T06:03:54Z INFO 2403803 (sg02) [DRAM_Allocator]: spill space = 2465464320 bytes +2024-06-01T06:03:54Z INFO 2403803 (sg02) [DRAM_Allocator]: aligned spill space = 2465464320 bytes +2024-06-01T06:03:54Z INFO 2403803 (sg02) [DRAM_Allocator]: dram space = 107374182400 bytes +2024-06-01T06:03:54Z INFO 2403803 (sg02) [DRAM_Allocator]: renumber locations +2024-06-01T06:03:54Z INFO 2403803 (sg02) [DRAM_Allocator]: size = 2475 +2024-06-01T06:03:54Z INFO 2403803 (sg02) [DRAM_Allocator]: find first defs +2024-06-01T06:03:54Z INFO 2403803 (sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=147823 blocks=1 instructions=341001 Max writers: 5184 Max Readers: 93924 +2024-06-01T06:03:55Z INFO 2403803 (sg01) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 1608 SpillSaves and Reloads +2024-06-01T06:03:55Z INFO 2403803 (sg01) [DMAOptimizationBase]: average loaded DMA size 2761 bytes +2024-06-01T06:03:56Z INFO 2403803 (sg01) [DMAOptimizationBase]: average saved DMA size 2668 bytes +2024-06-01T06:03:57Z INFO 2403803 (sg02) [DRAM_Allocator]: Num intervals 2475 Num locations 2475 +2024-06-01T06:03:57Z INFO 2403803 (sg02) [DRAM_Allocator]: IntervalTree Build Done +2024-06-01T06:03:57Z INFO 2403803 (sg02) [DRAM_Allocator]: info.neighbors init Done +2024-06-01T06:03:57Z INFO 2403803 (sg02) [DRAM_Allocator]: IntervalTree readback Done 
+2024-06-01T06:03:57Z INFO 2403803 (sg02) [DRAM_Allocator]: simplify interference graph +2024-06-01T06:03:57Z INFO 2403803 (sg02) [DRAM_Allocator]: initialize low and high +2024-06-01T06:03:57Z INFO 2403803 (sg02) [DRAM_Allocator]: lo = 2472 +2024-06-01T06:03:57Z INFO 2403803 (sg02) [DRAM_Allocator]: hi = 3 +2024-06-01T06:03:57Z INFO 2403803 (sg02) [DRAM_Allocator]: total = 2475 +2024-06-01T06:03:57Z INFO 2403803 (sg02) [DRAM_Allocator]: simplify +2024-06-01T06:03:57Z INFO 2403803 (sg02) [DRAM_Allocator]: new candidates = 0 +2024-06-01T06:03:57Z INFO 2403803 (sg02) [DRAM_Allocator]: select ranges +2024-06-01T06:03:57Z INFO 2403803 (sg02) [DRAM_Allocator]: CC buffer size limit 524288000 +2024-06-01T06:03:57Z INFO 2403803 (sg02) [DRAM_Allocator]: allreduce_dram_hwm 836894720 +2024-06-01T06:03:57Z INFO 2403803 (sg02) [DRAM_Allocator]: Real CC buffer size 524288000 +2024-06-01T06:03:57Z INFO 2403803 (sg02) [DRAM_Allocator]: DRAM hwm after allocation: 1373765632 +2024-06-01T06:03:57Z INFO 2403803 (sg02) [DRAM_Allocator]: DRAM allocation successful +2024-06-01T06:03:58Z INFO 2403803 (sg01) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 1 combined 708 SpillSaves and Reloads +2024-06-01T06:03:58Z INFO 2403803 (sg01) [DMAOptimizationBase]: average loaded DMA size 2779 bytes +2024-06-01T06:03:58Z INFO 2403803 (sg01) [DMAOptimizationBase]: average saved DMA size 2753 bytes +2024-06-01T06:03:58Z INFO 2403803 (sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 9486189328 +2024-06-01T06:03:58Z INFO 2403803 (sg02) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 2799 bytes +2024-06-01T06:03:58Z INFO 2403803 (sg00) [DMAOptimizationBase]: PSUM Rotation rotated 10031 PSUM Banks +2024-06-01T06:03:58Z INFO 2403803 (sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 1906589696 +2024-06-01T06:03:58Z INFO 2403803 (sg02) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 2876 bytes +2024-06-01T06:03:58Z INFO 2403803 (sg02) 
[ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 32768 +2024-06-01T06:03:58Z INFO 2403803 (sg02) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 128 bytes +2024-06-01T06:03:58Z USER 2403803 (sg02) [ModuleForkPass]: coloring_allocator_dram finished after 5.839 seconds +2024-06-01T06:03:58Z INFO 2403803 (sg02) [ModuleForkPass]: curr_vmrss: 12857mb, ru_maxrss: 13538mb (delta=0mb) +2024-06-01T06:03:59Z INFO 2403803 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 217043 memory location(s), 1 block(s), and 688010 instruction(s). Max writers: 576 Max Readers: 184036 +2024-06-01T06:03:59Z USER 2403803 (sg02) [ModuleForkPass]: Running address_rotation_dram +2024-06-01T06:03:59Z INFO 2403803 (sg02) [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=217043 blocks=1 instructions=688010 Max writers: 576 Max Readers: 184036 +2024-06-01T06:03:59Z INFO 2403803 (sg02) [DMAOptimizationBase]: Runtime page size at 512MB +2024-06-01T06:03:59Z INFO 2403803 (sg00) [DMAOptimizationBase]: PSUM Rotation rotated 10169 PSUM Banks +2024-06-01T06:03:59Z INFO 2403803 (sg01) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 2 combined 0 SpillSaves and Reloads +2024-06-01T06:04:00Z INFO 2403803 (sg01) [DMAOptimizationBase]: average loaded DMA size 2779 bytes +2024-06-01T06:04:00Z INFO 2403803 (sg01) [DMAOptimizationBase]: average saved DMA size 2753 bytes +2024-06-01T06:04:00Z INFO 2403803 (sg02) [DMAOptimizationBase]: DRAM hwm before rotation 1373765632 +2024-06-01T06:04:00Z INFO 2403803 (sg00) [DMAOptimizationBase]: PSUM Rotation rotated 1505 PSUM Banks +2024-06-01T06:04:00Z INFO 2403803 (sg01) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 9560399632 +2024-06-01T06:04:00Z INFO 2403803 (sg01) [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 2779 bytes +2024-06-01T06:04:00Z INFO 2403803 (sg02) [DMAOptimizationBase]: allreduce buffer size 524288000 +2024-06-01T06:04:00Z 
INFO 2403803 (sg02) [DMAOptimizationBase]: allreduce hwm 836894720 +2024-06-01T06:04:00Z INFO 2403803 (sg02) [DMAOptimizationBase]: Real CC buffer size 524288000 +2024-06-01T06:04:01Z INFO 2403803 (sg01) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 1963245568 +2024-06-01T06:04:01Z INFO 2403803 (sg01) [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 2753 bytes +2024-06-01T06:04:01Z INFO 2403803 (sg02) [DMAOptimizationBase]: DRAM hwm after rotation 1373765632 +2024-06-01T06:04:01Z INFO 2403803 (sg02) [DMAOptimizationBase]: DRAM Rotation rotated 1 Dram address +2024-06-01T06:04:01Z USER 2403803 (sg02) [ModuleForkPass]: address_rotation_dram finished after 2.524 seconds +2024-06-01T06:04:01Z INFO 2403803 (sg02) [ModuleForkPass]: curr_vmrss: 12746mb, ru_maxrss: 13538mb (delta=0mb) +2024-06-01T06:04:01Z INFO 2403803 (sg00) [DMAOptimizationBase]: SB Rotation rotated 465 Sb address +2024-06-01T06:04:01Z INFO 2403803 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 217043 memory location(s), 1 block(s), and 688010 instruction(s). 
Max writers: 576 Max Readers: 184036 +2024-06-01T06:04:01Z USER 2403803 (sg02) [ModuleForkPass]: Running tensorcopy_accel +2024-06-01T06:04:01Z INFO 2403803 (sg02) [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=217043 blocks=1 instructions=688010 Max writers: 576 Max Readers: 184036 +2024-06-01T06:04:01Z INFO 2403803 (sg02) [TensorCopyAccel::Impl]: Running peephole optimization pass +2024-06-01T06:04:02Z INFO 2403803 (sg02) [TensorCopyAccel::Impl]: Accelerated 1280 out of 92594 tensorcopy in Function: sg0002 average acceleration factor: 1 +2024-06-01T06:04:02Z USER 2403803 (sg02) [ModuleForkPass]: tensorcopy_accel finished after 0.408 seconds +2024-06-01T06:04:02Z INFO 2403803 (sg02) [ModuleForkPass]: curr_vmrss: 12729mb, ru_maxrss: 13538mb (delta=0mb) +2024-06-01T06:04:02Z INFO 2403803 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 217043 memory location(s), 1 block(s), and 688010 instruction(s). Max writers: 576 Max Readers: 184036 +2024-06-01T06:04:02Z USER 2403803 (sg02) [ModuleForkPass]: Running peephole_opts +2024-06-01T06:04:02Z INFO 2403803 (sg02) [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=217043 blocks=1 instructions=688010 Max writers: 576 Max Readers: 184036 +2024-06-01T06:04:02Z INFO 2403803 (sg02) [PeepholeOpts]: PeepholeOpts enabled? 
Recip: true Tsp: true Tc: true SplitSelect: true +2024-06-01T06:04:03Z INFO 2403803 (sg02) [PeepholeOpts]: Split Select: 11520 +2024-06-01T06:04:03Z INFO 2403803 (sg02) [PeepholeOpts]: TSP -> ACT: 17280 +2024-06-01T06:04:03Z INFO 2403803 (sg02) [PeepholeOpts]: COPY -> ACT: 0 +2024-06-01T06:04:03Z INFO 2403803 (sg02) [PeepholeOpts]: RECIPROCAL -> ACT: 0 +2024-06-01T06:04:03Z USER 2403803 (sg02) [ModuleForkPass]: peephole_opts finished after 1.317 seconds +2024-06-01T06:04:03Z INFO 2403803 (sg02) [ModuleForkPass]: curr_vmrss: 12785mb, ru_maxrss: 13538mb (delta=0mb) +2024-06-01T06:04:03Z INFO 2403803 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 217043 memory location(s), 1 block(s), and 699530 instruction(s). Max writers: 576 Max Readers: 184036 +2024-06-01T06:04:03Z USER 2403803 (sg02) [ModuleForkPass]: Running lower_kernel +2024-06-01T06:04:03Z INFO 2403803 (sg02) [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=217043 blocks=1 instructions=699530 Max writers: 576 Max Readers: 184036 +2024-06-01T06:04:03Z INFO 2403803 (sg02) [LowerKernel]: Started running LowerKernel +2024-06-01T06:04:03Z INFO 2403803 (sg01) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2024-06-01T06:04:03Z INFO 2403803 (sg02) [LowerKernel]: Start of kernel lowering pass, number of insts: 699530, number of allocs: 217043 +2024-06-01T06:04:04Z INFO 2403803 (sg02) [LowerKernel]: Scan BKs time (s): 0.482435 +2024-06-01T06:04:04Z INFO 2403803 (sg02) [LowerKernel]: Lower BKs time (s): 5e-06 +2024-06-01T06:04:04Z USER 2403803 (sg02) [ModuleForkPass]: lower_kernel finished after 0.259 seconds +2024-06-01T06:04:04Z INFO 2403803 (sg02) [ModuleForkPass]: curr_vmrss: 12774mb, ru_maxrss: 13538mb (delta=0mb) +2024-06-01T06:04:04Z INFO 2403803 (sg01) [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 141778560, 2.27917% out of total spill/reload dma traffic +2024-06-01T06:04:04Z INFO 2403803 (sg01) 
[DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 3018546820, 20.9456% out of total dma traffic +2024-06-01T06:04:04Z INFO 2403803 (sg01) [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 11392811792, 66.331% input load, 2.96263% output write, 30.7064% spill/reload [sg0001] +2024-06-01T06:04:04Z INFO 2403803 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 217043 memory location(s), 1 block(s), and 699530 instruction(s). Max writers: 576 Max Readers: 184036 +2024-06-01T06:04:04Z USER 2403803 (sg02) [ModuleForkPass]: Running build_fdeps +2024-06-01T06:04:04Z INFO 2403803 (sg02) [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=217043 blocks=1 instructions=699530 Max writers: 576 Max Readers: 184036 +2024-06-01T06:04:04Z INFO 2403803 (sg02) [build_flow_deps]: Start build fdeps. Invocation: 8Sat Jun 1 06:04:04 2024 +2024-06-01T06:04:04Z INFO 2403803 (sg01) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 9486189328 +2024-06-01T06:04:04Z INFO 2403803 (sg01) [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 2799 bytes +2024-06-01T06:04:04Z INFO 2403803 (sg00) [DMAOptimizationBase]: SB Rotation rotated 722 Sb address +2024-06-01T06:04:04Z INFO 2403803 (sg01) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 1906589696 +2024-06-01T06:04:04Z INFO 2403803 (sg01) [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 2876 bytes +2024-06-01T06:04:04Z INFO 2403803 (sg02) [build_flow_deps]: Allocs: 217043 instructions: 699530 +2024-06-01T06:04:05Z INFO 2403803 (sg01) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 32768 +2024-06-01T06:04:05Z INFO 2403803 (sg01) [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 128 bytes +2024-06-01T06:04:05Z INFO 2403803 (sg01) [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 2811 bytes +2024-06-01T06:04:05Z INFO 2403803 (sg01) 
[DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2024-06-01T06:04:05Z INFO 2403803 (sg01) [DMAOptimizationBase]: DMA optimization re-enable optimization +2024-06-01T06:04:05Z USER 2403803 (sg01) [ModuleForkPass]: dma_optimization_sb finished after 41.561 seconds +2024-06-01T06:04:05Z INFO 2403803 (sg01) [ModuleForkPass]: curr_vmrss: 12796mb, ru_maxrss: 13538mb (delta=0mb) +2024-06-01T06:04:05Z INFO 2403803 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 217043 memory location(s), 1 block(s), and 688010 instruction(s). Max writers: 576 Max Readers: 184036 +2024-06-01T06:04:05Z USER 2403803 (sg01) [ModuleForkPass]: Running address_rotation_sb +2024-06-01T06:04:05Z INFO 2403803 (sg01) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=217043 blocks=1 instructions=688010 Max writers: 576 Max Readers: 184036 +2024-06-01T06:04:05Z INFO 2403803 (sg00) [DMAOptimizationBase]: SB Rotation rotated 2578 Sb address +2024-06-01T06:04:06Z INFO 2403803 (sg01) [DMAOptimizationBase]: SB Rotation rotated 1223 Sb address +2024-06-01T06:04:09Z INFO 2403803 (sg00) [DMAOptimizationBase]: SB Rotation rotated 672 Sb address +2024-06-01T06:04:09Z INFO 2403803 (sg02) [build_flow_deps]: Build fdeps inserted 1975620 edges +2024-06-01T06:04:09Z INFO 2403803 (sg02) [build_flow_deps]: Done build fdeps 1975620 Sat Jun 1 06:04:09 2024 +2024-06-01T06:04:09Z USER 2403803 (sg02) [ModuleForkPass]: build_fdeps finished after 5.158 seconds +2024-06-01T06:04:09Z INFO 2403803 (sg02) [ModuleForkPass]: curr_vmrss: 13051mb, ru_maxrss: 13538mb (delta=0mb) +2024-06-01T06:04:09Z INFO 2403803 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 217043 memory location(s), 1 block(s), and 699530 instruction(s). 
Max writers: 576 Max Readers: 184036 +2024-06-01T06:04:09Z USER 2403803 (sg02) [ModuleForkPass]: Running remove_redundancies +2024-06-01T06:04:09Z INFO 2403803 (sg02) [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=217043 blocks=1 instructions=699530 Max writers: 576 Max Readers: 184036 +2024-06-01T06:04:09Z INFO 2403803 (sg02) [RemoveRedundancies]: remove_clobbered_writes +2024-06-01T06:04:10Z INFO 2403803 (sg02) [RemoveRedundancies]: remove_clobbered_writes: 627 +2024-06-01T06:04:10Z INFO 2403803 (sg02) [RemoveRedundancies]: remove_useless_insts +2024-06-01T06:04:11Z INFO 2403803 (sg01) [DMAOptimizationBase]: SB Rotation rotated 8187 Sb address +2024-06-01T06:04:12Z INFO 2403803 (sg00) [DMAOptimizationBase]: SB Rotation rotated 12475 Sb address +2024-06-01T06:04:12Z INFO 2403803 (sg02) [RemoveRedundancies]: remove Useless Instructions: 0 +2024-06-01T06:04:12Z USER 2403803 (sg02) [ModuleForkPass]: remove_redundancies finished after 2.669 seconds +2024-06-01T06:04:12Z INFO 2403803 (sg02) [ModuleForkPass]: curr_vmrss: 13037mb, ru_maxrss: 13538mb (delta=0mb) +2024-06-01T06:04:12Z INFO 2403803 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 217043 memory location(s), 1 block(s), and 698903 instruction(s). 
Max writers: 576 Max Readers: 184036 +2024-06-01T06:04:12Z USER 2403803 (sg02) [ModuleForkPass]: Running anti_dependency_analyzer +2024-06-01T06:04:12Z INFO 2403803 (sg02) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=217043 blocks=1 instructions=698903 Max writers: 576 Max Readers: 184036 +2024-06-01T06:04:12Z INFO 2403803 (sg02) [AntiDependencyAnalyzer]: Batch size: 1000 +2024-06-01T06:04:12Z INFO 2403803 (sg02) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2024-06-01T06:04:13Z INFO 2403803 (sg00) [DMAOptimizationBase]: SB Rotation rotated 58 Sb address +2024-06-01T06:04:14Z INFO 2403803 (sg01) [DMAOptimizationBase]: SB Rotation rotated 2539 Sb address +2024-06-01T06:04:14Z INFO 2403803 (sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2024-06-01T06:04:14Z USER 2403803 (sg00) [ModuleForkPass]: address_rotation_sb finished after 20.069 seconds +2024-06-01T06:04:14Z INFO 2403803 (sg00) [ModuleForkPass]: curr_vmrss: 13085mb, ru_maxrss: 13538mb (delta=0mb) +2024-06-01T06:04:14Z INFO 2403803 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 147823 memory location(s), 1 block(s), and 341001 instruction(s). 
Max writers: 5184 Max Readers: 93924 +2024-06-01T06:04:14Z USER 2403803 (sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2024-06-01T06:04:14Z INFO 2403803 (sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=147823 blocks=1 instructions=341001 Max writers: 5184 Max Readers: 93924 +2024-06-01T06:04:14Z INFO 2403803 (sg00) [AntiDependencyAnalyzer]: Batch size: 1000 +2024-06-01T06:04:14Z INFO 2403803 (sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2024-06-01T06:04:16Z INFO 2403803 (sg01) [DMAOptimizationBase]: SB Rotation rotated 117 Sb address +2024-06-01T06:04:16Z INFO 2403803 [AntiDependencyAnalyzer-PSUM4-32-128]: Finished analyzing 29158 access patterns a mean/median 1.00885/1 intervals per access pattern and mean/median 3.18739/1.40741 intersections per interval. +2024-06-01T06:04:16Z INFO 2403803 [AntiDependencyAnalyzer-PSUM0-32-64]: Finished analyzing 29263 access patterns a mean/median 1.00841/1 intervals per access pattern and mean/median 3.45973/1.51287 intersections per interval. +2024-06-01T06:04:16Z INFO 2403803 [AntiDependencyAnalyzer-PSUM6-0-32]: Finished analyzing 30150 access patterns a mean/median 1.00716/1 intervals per access pattern and mean/median 3.45682/1.4518 intersections per interval. +2024-06-01T06:04:16Z INFO 2403803 [AntiDependencyAnalyzer-PSUM7-32-128]: Finished analyzing 30238 access patterns a mean/median 1.00804/1 intervals per access pattern and mean/median 3.97346/1.80588 intersections per interval. +2024-06-01T06:04:16Z INFO 2403803 [AntiDependencyAnalyzer-PSUM4-0-32]: Finished analyzing 29389 access patterns a mean/median 1.00878/1 intervals per access pattern and mean/median 3.23674/1.43134 intersections per interval. +2024-06-01T06:04:16Z INFO 2403803 [AntiDependencyAnalyzer-PSUM0-0-32]: Finished analyzing 29593 access patterns a mean/median 1.00831/1.00001 intervals per access pattern and mean/median 3.54318/1.5131 intersections per interval. 
+2024-06-01T06:04:16Z INFO 2403803 [AntiDependencyAnalyzer-PSUM2-32-64]: Finished analyzing 30336 access patterns a mean/median 1.0088/1 intervals per access pattern and mean/median 3.1529/1.38027 intersections per interval. +2024-06-01T06:04:16Z INFO 2403803 [AntiDependencyAnalyzer-PSUM2-64-128]: Finished analyzing 30332 access patterns a mean/median 1.0088/1 intervals per access pattern and mean/median 3.15286/1.38015 intersections per interval. +2024-06-01T06:04:16Z INFO 2403803 [AntiDependencyAnalyzer-PSUM6-32-64]: Finished analyzing 29589 access patterns a mean/median 1.0073/1 intervals per access pattern and mean/median 3.2962/1.41386 intersections per interval. +2024-06-01T06:04:16Z INFO 2403803 [AntiDependencyAnalyzer-PSUM3-0-32]: Finished analyzing 31071 access patterns a mean/median 1.00753/1 intervals per access pattern and mean/median 3.50861/1.96377 intersections per interval. +2024-06-01T06:04:16Z INFO 2403803 [AntiDependencyAnalyzer-PSUM5-0-32]: Finished analyzing 30109 access patterns a mean/median 1.00817/1 intervals per access pattern and mean/median 3.31692/1.79134 intersections per interval. +2024-06-01T06:04:16Z INFO 2403803 [AntiDependencyAnalyzer-PSUM5-32-64]: Finished analyzing 29746 access patterns a mean/median 1.00827/1 intervals per access pattern and mean/median 3.2725/1.63498 intersections per interval. +2024-06-01T06:04:16Z INFO 2403803 [AntiDependencyAnalyzer-PSUM2-0-32]: Finished analyzing 30666 access patterns a mean/median 1.00871/1 intervals per access pattern and mean/median 3.19305/1.38793 intersections per interval. +2024-06-01T06:04:16Z INFO 2403803 [AntiDependencyAnalyzer-PSUM6-64-128]: Finished analyzing 29585 access patterns a mean/median 1.0073/1 intervals per access pattern and mean/median 2.71788/1.31189 intersections per interval. 
+2024-06-01T06:04:16Z INFO 2403803 [AntiDependencyAnalyzer-PSUM0-64-128]: Finished analyzing 29259 access patterns a mean/median 1.00841/1 intervals per access pattern and mean/median 3.03398/1.43302 intersections per interval. +2024-06-01T06:04:16Z INFO 2403803 [AntiDependencyAnalyzer-PSUM5-64-128]: Finished analyzing 29742 access patterns a mean/median 1.00827/1 intervals per access pattern and mean/median 3.2726/1.6377 intersections per interval. +2024-06-01T06:04:16Z INFO 2403803 [AntiDependencyAnalyzer-PSUM7-0-32]: Finished analyzing 30733 access patterns a mean/median 1.00791/1 intervals per access pattern and mean/median 4.11712/1.99453 intersections per interval. +2024-06-01T06:04:16Z INFO 2403803 [AntiDependencyAnalyzer-PSUM3-64-128]: Finished analyzing 30869 access patterns a mean/median 1.00758/1 intervals per access pattern and mean/median 3.46893/1.94813 intersections per interval. +2024-06-01T06:04:16Z INFO 2403803 [AntiDependencyAnalyzer-PSUM3-32-64]: Finished analyzing 30873 access patterns a mean/median 1.00758/1 intervals per access pattern and mean/median 3.47347/1.95195 intersections per interval. +2024-06-01T06:04:16Z INFO 2403803 [AntiDependencyAnalyzer-PSUM1-32-128]: Finished analyzing 29885 access patterns a mean/median 1.00703/1 intervals per access pattern and mean/median 2.94415/1.5555 intersections per interval. +2024-06-01T06:04:16Z INFO 2403803 [AntiDependencyAnalyzer-PSUM1-0-32]: Finished analyzing 30017 access patterns a mean/median 1.007/1 intervals per access pattern and mean/median 2.95799/1.55256 intersections per interval. +2024-06-01T06:04:17Z INFO 2403803 [AntiDependencyAnalyzer-DRAM]: Finished analyzing 11119 access patterns a mean/median 90.1435/112.171 intervals per access pattern and mean/median 1.76648/0.487394 intersections per interval. 
+2024-06-01T06:04:17Z INFO 2403803 [AntiDependencyAnalyzer-PSUM0-64-128]: Finished analyzing 70946 access patterns a mean/median 1.00457/1 intervals per access pattern and mean/median 4.56222/5.08612 intersections per interval. +2024-06-01T06:04:17Z INFO 2403803 [AntiDependencyAnalyzer-PSUM0-0-32]: Finished analyzing 71614 access patterns a mean/median 1.00452/1 intervals per access pattern and mean/median 4.58543/5.11404 intersections per interval. +2024-06-01T06:04:17Z INFO 2403803 [AntiDependencyAnalyzer-PSUM0-32-64]: Finished analyzing 70954 access patterns a mean/median 1.00457/1 intervals per access pattern and mean/median 4.56195/5.0855 intersections per interval. +2024-06-01T06:04:17Z INFO 2403803 [AntiDependencyAnalyzer-PSUM1-0-32]: Finished analyzing 72560 access patterns a mean/median 1.00413/1 intervals per access pattern and mean/median 4.44765/4.42362 intersections per interval. +2024-06-01T06:04:17Z INFO 2403803 [AntiDependencyAnalyzer-PSUM5-0-32]: Finished analyzing 72572 access patterns a mean/median 1.00413/1 intervals per access pattern and mean/median 4.43323/4.48669 intersections per interval. +2024-06-01T06:04:17Z INFO 2403803 [AntiDependencyAnalyzer-PSUM4-0-32]: Finished analyzing 71604 access patterns a mean/median 1.00452/1 intervals per access pattern and mean/median 4.66589/5.64798 intersections per interval. +2024-06-01T06:04:17Z INFO 2403803 [AntiDependencyAnalyzer-PSUM3-0-32]: Finished analyzing 71465 access patterns a mean/median 1.0042/1 intervals per access pattern and mean/median 4.56634/4.96784 intersections per interval. +2024-06-01T06:04:17Z INFO 2403803 [AntiDependencyAnalyzer-PSUM3-64-128]: Finished analyzing 70802 access patterns a mean/median 1.00424/1 intervals per access pattern and mean/median 4.54057/4.88996 intersections per interval. 
+2024-06-01T06:04:17Z INFO 2403803 [AntiDependencyAnalyzer-PSUM5-32-64]: Finished analyzing 71912 access patterns a mean/median 1.00417/1 intervals per access pattern and mean/median 4.40643/4.3905 intersections per interval. +2024-06-01T06:04:17Z INFO 2403803 [AntiDependencyAnalyzer-PSUM1-32-64]: Finished analyzing 71900 access patterns a mean/median 1.00417/1 intervals per access pattern and mean/median 4.42359/4.35832 intersections per interval. +2024-06-01T06:04:17Z INFO 2403803 [AntiDependencyAnalyzer-PSUM5-64-128]: Finished analyzing 71907 access patterns a mean/median 1.00417/1 intervals per access pattern and mean/median 4.40659/4.39094 intersections per interval. +2024-06-01T06:04:17Z INFO 2403803 [AntiDependencyAnalyzer-PSUM7-0-32]: Finished analyzing 71519 access patterns a mean/median 1.00419/1 intervals per access pattern and mean/median 4.47871/4.54291 intersections per interval. +2024-06-01T06:04:17Z INFO 2403803 [AntiDependencyAnalyzer-PSUM3-32-64]: Finished analyzing 70805 access patterns a mean/median 1.00424/1 intervals per access pattern and mean/median 4.54047/4.88972 intersections per interval. +2024-06-01T06:04:17Z INFO 2403803 [AntiDependencyAnalyzer-PSUM1-64-128]: Finished analyzing 71895 access patterns a mean/median 1.00417/1 intervals per access pattern and mean/median 4.42375/4.35882 intersections per interval. +2024-06-01T06:04:17Z INFO 2403803 [AntiDependencyAnalyzer-PSUM2-64-128]: Finished analyzing 72080 access patterns a mean/median 1.0045/1 intervals per access pattern and mean/median 4.6104/5.06522 intersections per interval. +2024-06-01T06:04:17Z INFO 2403803 [AntiDependencyAnalyzer-PSUM2-32-64]: Finished analyzing 72085 access patterns a mean/median 1.00449/1 intervals per access pattern and mean/median 4.61023/5.0647 intersections per interval. 
+2024-06-01T06:04:17Z INFO 2403803 [AntiDependencyAnalyzer-PSUM4-64-128]: Finished analyzing 70939 access patterns a mean/median 1.00457/1 intervals per access pattern and mean/median 4.64131/5.53323 intersections per interval. +2024-06-01T06:04:17Z INFO 2403803 [AntiDependencyAnalyzer-PSUM2-0-32]: Finished analyzing 72745 access patterns a mean/median 1.00445/1 intervals per access pattern and mean/median 4.63124/5.11254 intersections per interval. +2024-06-01T06:04:17Z INFO 2403803 [AntiDependencyAnalyzer-PSUM4-32-64]: Finished analyzing 70944 access patterns a mean/median 1.00457/1 intervals per access pattern and mean/median 4.64114/5.53284 intersections per interval. +2024-06-01T06:04:17Z INFO 2403803 [AntiDependencyAnalyzer-PSUM7-32-64]: Finished analyzing 70859 access patterns a mean/median 1.00423/1 intervals per access pattern and mean/median 4.45564/4.46454 intersections per interval. +2024-06-01T06:04:17Z INFO 2403803 [AntiDependencyAnalyzer-PSUM7-64-128]: Finished analyzing 70849 access patterns a mean/median 1.00423/1 intervals per access pattern and mean/median 4.45597/4.46517 intersections per interval. +2024-06-01T06:04:17Z INFO 2403803 [AntiDependencyAnalyzer-PSUM6-64-128]: Finished analyzing 72102 access patterns a mean/median 1.00449/1 intervals per access pattern and mean/median 4.76919/5.69189 intersections per interval. +2024-06-01T06:04:17Z INFO 2403803 [AntiDependencyAnalyzer-PSUM6-32-64]: Finished analyzing 72107 access patterns a mean/median 1.00449/1 intervals per access pattern and mean/median 4.76901/5.69126 intersections per interval. +2024-06-01T06:04:17Z INFO 2403803 [AntiDependencyAnalyzer-PSUM6-0-32]: Finished analyzing 72767 access patterns a mean/median 1.00445/1 intervals per access pattern and mean/median 4.79197/5.81305 intersections per interval. 
+2024-06-01T06:04:17Z INFO 2403803 [AntiDependencyAnalyzer-DRAM]: Finished analyzing 13798 access patterns a mean/median 87.3006/127.934 intervals per access pattern and mean/median 1.98368/1 intersections per interval. +2024-06-01T06:04:17Z INFO 2403803 [AntiDependencyAnalyzer-SB-64-65]: Finished analyzing 610499 access patterns a mean/median 1.06353/1 intervals per access pattern and mean/median 1.90515/1 intersections per interval. +2024-06-01T06:04:17Z INFO 2403803 [AntiDependencyAnalyzer-SB-0-1]: Finished analyzing 623015 access patterns a mean/median 1.06225/1 intervals per access pattern and mean/median 1.88282/0.999992 intersections per interval. +2024-06-01T06:04:17Z INFO 2403803 [AntiDependencyAnalyzer-SB-97-113]: Finished analyzing 610494 access patterns a mean/median 1.06353/1 intervals per access pattern and mean/median 1.90516/0.999993 intersections per interval. +2024-06-01T06:04:17Z INFO 2403803 [AntiDependencyAnalyzer-SB-65-96]: Finished analyzing 610494 access patterns a mean/median 1.06353/1 intervals per access pattern and mean/median 1.90516/0.999993 intersections per interval. +2024-06-01T06:04:17Z INFO 2403803 [AntiDependencyAnalyzer-SB-113-128]: Finished analyzing 584314 access patterns a mean/median 1.06638/1 intervals per access pattern and mean/median 1.90364/1.00001 intersections per interval. +2024-06-01T06:04:17Z INFO 2403803 [AntiDependencyAnalyzer-SB-96-97]: Finished analyzing 610625 access patterns a mean/median 1.06352/1 intervals per access pattern and mean/median 1.90519/1 intersections per interval. +2024-06-01T06:04:17Z INFO 2403803 [AntiDependencyAnalyzer-SB-32-33]: Finished analyzing 622719 access patterns a mean/median 1.06228/1 intervals per access pattern and mean/median 1.88316/1.00001 intersections per interval. 
+2024-06-01T06:04:17Z INFO 2403803 [AntiDependencyAnalyzer-SB-1-32]: Finished analyzing 622714 access patterns a mean/median 1.06228/1 intervals per access pattern and mean/median 1.88316/1.00001 intersections per interval. +2024-06-01T06:04:17Z INFO 2403803 [AntiDependencyAnalyzer-SB-33-64]: Finished analyzing 622714 access patterns a mean/median 1.06228/1 intervals per access pattern and mean/median 1.88316/1.00001 intersections per interval. +2024-06-01T06:04:17Z USER 2403803 (sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 3.024 seconds +2024-06-01T06:04:17Z INFO 2403803 (sg00) [ModuleForkPass]: curr_vmrss: 13681mb, ru_maxrss: 13681mb (delta=143mb) +2024-06-01T06:04:18Z INFO 2403803 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 147823 memory location(s), 1 block(s), and 341001 instruction(s). Max writers: 5184 Max Readers: 93924 +2024-06-01T06:04:18Z USER 2403803 (sg00) [ModuleForkPass]: Running dep_opt +2024-06-01T06:04:18Z INFO 2403803 (sg00) [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=147823 blocks=1 instructions=341001 Max writers: 5184 Max Readers: 93924 +2024-06-01T06:04:18Z INFO 2403803 (sg00) [build_flow_deps]: Start build fdeps. 
Invocation: 9Sat Jun 1 06:04:18 2024 +2024-06-01T06:04:18Z INFO 2403803 (sg00) [build_flow_deps]: Allocs: 147823 instructions: 341001 +2024-06-01T06:04:18Z INFO 2403803 (sg01) [DMAOptimizationBase]: SB Rotation rotated 16807 Sb address +2024-06-01T06:04:21Z INFO 2403803 (sg00) [build_flow_deps]: Build fdeps inserted 795646 edges +2024-06-01T06:04:21Z INFO 2403803 (sg00) [build_flow_deps]: Done build fdeps 795646 Sat Jun 1 06:04:21 2024 +2024-06-01T06:04:21Z INFO 2403803 (sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2024-06-01T06:04:21Z USER 2403803 (sg01) [ModuleForkPass]: address_rotation_sb finished after 15.808 seconds +2024-06-01T06:04:21Z INFO 2403803 (sg01) [ModuleForkPass]: curr_vmrss: 13606mb, ru_maxrss: 13681mb (delta=143mb) +2024-06-01T06:04:21Z INFO 2403803 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 217043 memory location(s), 1 block(s), and 688010 instruction(s). Max writers: 576 Max Readers: 184036 +2024-06-01T06:04:21Z USER 2403803 (sg01) [ModuleForkPass]: Running coloring_allocator_dram +2024-06-01T06:04:21Z INFO 2403803 (sg01) [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=217043 blocks=1 instructions=688010 Max writers: 576 Max Readers: 184036 +2024-06-01T06:04:21Z INFO 2403803 (sg01) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 9486189328 +2024-06-01T06:04:21Z INFO 2403803 (sg01) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 2799 bytes +2024-06-01T06:04:21Z INFO 2403803 (sg01) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 1906589696 +2024-06-01T06:04:21Z INFO 2403803 (sg01) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 2876 bytes +2024-06-01T06:04:21Z INFO 2403803 (sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 32768 +2024-06-01T06:04:21Z INFO 2403803 (sg01) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 128 bytes +2024-06-01T06:04:21Z INFO 2403803 (sg01) 
[ColoringAllocator::Rep]: Allocating functions +2024-06-01T06:04:21Z INFO 2403803 (sg01) [ColoringAllocator::Rep]: linearize and check +2024-06-01T06:04:21Z USER 2403803 (sg00) [ModuleForkPass]: dep_opt finished after 3.818 seconds +2024-06-01T06:04:21Z INFO 2403803 (sg00) [ModuleForkPass]: curr_vmrss: 13448mb, ru_maxrss: 13681mb (delta=0mb) +2024-06-01T06:04:21Z INFO 2403803 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 147823 memory location(s), 1 block(s), and 341001 instruction(s). Max writers: 5184 Max Readers: 93924 +2024-06-01T06:04:21Z USER 2403803 (sg00) [ModuleForkPass]: Running report_stats +2024-06-01T06:04:21Z INFO 2403803 (sg00) [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=147823 blocks=1 instructions=341001 Max writers: 5184 Max Readers: 93924 +2024-06-01T06:04:22Z INFO 2403803 [AntiDependencyAnalyzer-SB-0-1]: Finished analyzing 1305767 access patterns a mean/median 1.07406/1 intervals per access pattern and mean/median 2.29639/1 intersections per interval. +2024-06-01T06:04:22Z INFO 2403803 [AntiDependencyAnalyzer-SB-64-96]: Finished analyzing 1293418 access patterns a mean/median 1.07477/1 intervals per access pattern and mean/median 2.31706/1 intersections per interval. +2024-06-01T06:04:22Z INFO 2403803 [AntiDependencyAnalyzer-SB-1-64]: Finished analyzing 1305287 access patterns a mean/median 1.07409/1 intervals per access pattern and mean/median 2.29707/1 intersections per interval. +2024-06-01T06:04:22Z INFO 2403803 [AntiDependencyAnalyzer-SB-96-97]: Finished analyzing 1293738 access patterns a mean/median 1.07475/1 intervals per access pattern and mean/median 2.31662/1 intersections per interval. +2024-06-01T06:04:22Z INFO 2403803 [AntiDependencyAnalyzer-SB-113-128]: Finished analyzing 1266574 access patterns a mean/median 1.07635/1 intervals per access pattern and mean/median 2.32753/1 intersections per interval. 
+2024-06-01T06:04:22Z INFO 2403803 [AntiDependencyAnalyzer-SB-97-113]: Finished analyzing 1293418 access patterns a mean/median 1.07477/1 intervals per access pattern and mean/median 2.31706/1 intersections per interval. +2024-06-01T06:04:22Z INFO 2403803 (sg00) [ReportStats]: Data Movement Statistics: sg0000 +┌──────────────┬────────────────────────────────────┬───────┬────────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├──────────────┼────────────────────────────────────┼───────┼────────────┤ +│ DMACopy │ ExternalInputParameter -> Internal │ 1 │ 16384 │ +│ DMACopy │ Internal -> Output │ 2 │ 600047616 │ +│ Load │ Const -> Internal │ 3 │ 49408 │ +│ Load │ ExternalInput -> Internal │ 26 │ 18440 │ +│ Load │ ExternalInputParameter -> Internal │ 1849 │ 722796544 │ +│ Load │ Internal │ 7433 │ 1019133952 │ +│ Save │ Internal │ 2055 │ 760119296 │ +│ Save │ Internal -> ExternalOutput │ 108 │ 37502976 │ +│ Save │ Internal -> Output │ 5220 │ 84418320 │ +│ Save (Spill) │ Internal │ 1625 │ 177087488 │ +└──────────────┴────────────────────────────────────┴───────┴────────────┘ + +2024-06-01T06:04:22Z INFO 2403803 (sg00) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 8 │ 16 │ +│ 72 │ 17 │ +│ 113 │ 288 │ +│ 128 │ 4933 │ +│ 256 │ 2 │ +│ 482 │ 1152 │ +│ 488 │ 213 │ +│ 512 │ 916 │ +│ 760 │ 31 │ +│ 904 │ 32 │ +│ 964 │ 2 │ +│ 1024 │ 7360 │ +│ 1536 │ 2 │ +│ 1784 │ 27 │ +│ 1928 │ 128 │ +│ 2048 │ 224 │ +│ 3832 │ 16 │ +│ 4096 │ 2961 │ +│ 150011904 │ 4 │ +└─────────────────────┴───────┘ + +2024-06-01T06:04:22Z INFO 2403803 (sg00) [ReportStats]: MM Stats: #MatMults 173945 #MatMult-Transposes 99108 +2024-06-01T06:04:22Z USER 2403803 (sg00) [ModuleForkPass]: report_stats finished after 0.165 seconds +2024-06-01T06:04:22Z INFO 2403803 (sg00) [ModuleForkPass]: curr_vmrss: 13383mb, ru_maxrss: 13681mb (delta=0mb) +2024-06-01T06:04:22Z INFO 2403803 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 147823 memory 
location(s), 1 block(s), and 341001 instruction(s). Max writers: 5184 Max Readers: 93924 +2024-06-01T06:04:22Z USER 2403803 (sg00) [ModuleForkPass]: Running assign_trigger_engine +2024-06-01T06:04:22Z INFO 2403803 (sg00) [ModuleForkPass]: Inputs to assign_trigger_engine: modules=1 functions=1 allocs=147823 blocks=1 instructions=341001 Max writers: 5184 Max Readers: 93924 +2024-06-01T06:04:22Z USER 2403803 (sg02) [ModuleForkPass]: anti_dependency_analyzer finished after 9.562 seconds +2024-06-01T06:04:22Z INFO 2403803 (sg02) [ModuleForkPass]: curr_vmrss: 13344mb, ru_maxrss: 13681mb (delta=143mb) +2024-06-01T06:04:22Z INFO 2403803 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 217043 memory location(s), 1 block(s), and 698903 instruction(s). Max writers: 576 Max Readers: 184036 +2024-06-01T06:04:22Z USER 2403803 (sg02) [ModuleForkPass]: Running tensor_copy_elim +2024-06-01T06:04:22Z INFO 2403803 (sg02) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=217043 blocks=1 instructions=698903 Max writers: 576 Max Readers: 184036 +2024-06-01T06:04:22Z INFO 2403803 (sg00) [AssignTriggerEngine]: Assigned trigger engine for 3682 DMA instructions +2024-06-01T06:04:22Z USER 2403803 (sg00) [ModuleForkPass]: assign_trigger_engine finished after 0.464 seconds +2024-06-01T06:04:22Z INFO 2403803 (sg00) [ModuleForkPass]: curr_vmrss: 13298mb, ru_maxrss: 13681mb (delta=0mb) +2024-06-01T06:04:22Z INFO 2403803 (sg01) [DRAM_Allocator]: allocating spills in DRAM pre_link mode +2024-06-01T06:04:22Z INFO 2403803 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 147823 memory location(s), 1 block(s), and 341001 instruction(s). 
Max writers: 5184 Max Readers: 93924 +2024-06-01T06:04:22Z USER 2403803 (sg00) [ModuleForkPass]: Running alloc_queues +2024-06-01T06:04:22Z INFO 2403803 (sg01) [DRAM_Allocator]: reserved space = 1431150352 bytes +2024-06-01T06:04:22Z INFO 2403803 (sg01) [DRAM_Allocator]: spill space = 2465464320 bytes +2024-06-01T06:04:22Z INFO 2403803 (sg01) [DRAM_Allocator]: aligned spill space = 2465464320 bytes +2024-06-01T06:04:22Z INFO 2403803 (sg01) [DRAM_Allocator]: dram space = 107374182400 bytes +2024-06-01T06:04:22Z INFO 2403803 (sg01) [DRAM_Allocator]: renumber locations +2024-06-01T06:04:22Z INFO 2403803 (sg01) [DRAM_Allocator]: size = 2475 +2024-06-01T06:04:22Z INFO 2403803 (sg01) [DRAM_Allocator]: find first defs +2024-06-01T06:04:22Z INFO 2403803 (sg00) [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=147823 blocks=1 instructions=341001 Max writers: 5184 Max Readers: 93924 +2024-06-01T06:04:23Z USER 2403803 (sg00) [ModuleForkPass]: alloc_queues finished after 0.245 seconds +2024-06-01T06:04:23Z INFO 2403803 (sg00) [ModuleForkPass]: curr_vmrss: 13300mb, ru_maxrss: 13681mb (delta=0mb) +2024-06-01T06:04:23Z INFO 2403803 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 147823 memory location(s), 1 block(s), and 341001 instruction(s). 
Max writers: 5184 Max Readers: 93924 +2024-06-01T06:04:23Z USER 2403803 (sg00) [ModuleForkPass]: Running dep_reduction +2024-06-01T06:04:23Z INFO 2403803 (sg00) [ModuleForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=147823 blocks=1 instructions=341001 Max writers: 5184 Max Readers: 93924 +2024-06-01T06:04:23Z INFO 2403803 (sg00) [DepReduction]: Start Dependency Reduction +2024-06-01T06:04:23Z INFO 2403803 (sg01) [DRAM_Allocator]: Num intervals 2475 Num locations 2475 +2024-06-01T06:04:23Z INFO 2403803 (sg01) [DRAM_Allocator]: IntervalTree Build Done +2024-06-01T06:04:23Z INFO 2403803 (sg01) [DRAM_Allocator]: info.neighbors init Done +2024-06-01T06:04:23Z INFO 2403803 (sg01) [DRAM_Allocator]: IntervalTree readback Done +2024-06-01T06:04:23Z INFO 2403803 (sg01) [DRAM_Allocator]: simplify interference graph +2024-06-01T06:04:23Z INFO 2403803 (sg01) [DRAM_Allocator]: initialize low and high +2024-06-01T06:04:23Z INFO 2403803 (sg00) [DepReduction]: Processing async instrs... +2024-06-01T06:04:23Z INFO 2403803 (sg00) [DepReduction]: Processing secondary edges per engine... +2024-06-01T06:04:23Z INFO 2403803 (sg01) [DRAM_Allocator]: lo = 2472 +2024-06-01T06:04:23Z INFO 2403803 (sg01) [DRAM_Allocator]: hi = 3 +2024-06-01T06:04:23Z INFO 2403803 (sg01) [DRAM_Allocator]: total = 2475 +2024-06-01T06:04:23Z INFO 2403803 (sg01) [DRAM_Allocator]: simplify +2024-06-01T06:04:23Z INFO 2403803 (sg01) [DRAM_Allocator]: new candidates = 0 +2024-06-01T06:04:23Z INFO 2403803 (sg01) [DRAM_Allocator]: select ranges +2024-06-01T06:04:23Z INFO 2403803 (sg01) [DRAM_Allocator]: CC buffer size limit 524288000 +2024-06-01T06:04:24Z INFO 2403803 (sg01) [DRAM_Allocator]: allreduce_dram_hwm 836894720 +2024-06-01T06:04:24Z INFO 2403803 (sg01) [DRAM_Allocator]: Real CC buffer size 524288000 +2024-06-01T06:04:24Z INFO 2403803 (sg00) [DepReduction]: Processing secondary edges per engine, Done. 
Num edges removed 238281 +2024-06-01T06:04:24Z INFO 2403803 (sg00) [DepReduction]: Processing redundant descendants... +2024-06-01T06:04:24Z INFO 2403803 (sg01) [DRAM_Allocator]: DRAM hwm after allocation: 1373765632 +2024-06-01T06:04:24Z INFO 2403803 (sg01) [DRAM_Allocator]: DRAM allocation successful +2024-06-01T06:04:24Z INFO 2403803 (sg02) [TensorCopyElim]: Tensor CP elimination: 0 +2024-06-01T06:04:24Z INFO 2403803 (sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 9486189328 +2024-06-01T06:04:24Z INFO 2403803 (sg01) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 2799 bytes +2024-06-01T06:04:24Z INFO 2403803 (sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 1906589696 +2024-06-01T06:04:24Z INFO 2403803 (sg01) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 2876 bytes +2024-06-01T06:04:24Z INFO 2403803 (sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 32768 +2024-06-01T06:04:24Z INFO 2403803 (sg01) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 128 bytes +2024-06-01T06:04:24Z USER 2403803 (sg01) [ModuleForkPass]: coloring_allocator_dram finished after 3.252 seconds +2024-06-01T06:04:24Z INFO 2403803 (sg01) [ModuleForkPass]: curr_vmrss: 13365mb, ru_maxrss: 13681mb (delta=0mb) +2024-06-01T06:04:24Z INFO 2403803 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 217043 memory location(s), 1 block(s), and 688010 instruction(s). Max writers: 576 Max Readers: 184036 +2024-06-01T06:04:24Z USER 2403803 (sg01) [ModuleForkPass]: Running address_rotation_dram +2024-06-01T06:04:24Z INFO 2403803 (sg01) [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=217043 blocks=1 instructions=688010 Max writers: 576 Max Readers: 184036 +2024-06-01T06:04:24Z INFO 2403803 (sg01) [DMAOptimizationBase]: Runtime page size at 512MB +2024-06-01T06:04:25Z INFO 2403803 (sg00) [DepReduction]: Processing redundant descendants, Done. 
Num edges removed 19804 +2024-06-01T06:04:25Z INFO 2403803 (sg00) [DepReduction]: Processing async instrs, Done. Num edges removed 258085 +2024-06-01T06:04:25Z USER 2403803 (sg02) [ModuleForkPass]: tensor_copy_elim finished after 3.504 seconds +2024-06-01T06:04:25Z INFO 2403803 (sg02) [ModuleForkPass]: curr_vmrss: 13370mb, ru_maxrss: 13681mb (delta=0mb) +2024-06-01T06:04:25Z INFO 2403803 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 217043 memory location(s), 1 block(s), and 698903 instruction(s). Max writers: 576 Max Readers: 184036 +2024-06-01T06:04:25Z USER 2403803 (sg02) [ModuleForkPass]: Running post_sched +2024-06-01T06:04:25Z INFO 2403803 (sg02) [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=217043 blocks=1 instructions=698903 Max writers: 576 Max Readers: 184036 +2024-06-01T06:04:25Z INFO 2403803 [post_scheduler]: Start PosT ScheD 3 sunda Sat Jun 1 06:04:25 2024 +2024-06-01T06:04:26Z INFO 2403803 (sg01) [DMAOptimizationBase]: DRAM hwm before rotation 1373765632 +2024-06-01T06:04:27Z INFO 2403803 (sg01) [DMAOptimizationBase]: allreduce buffer size 524288000 +2024-06-01T06:04:27Z INFO 2403803 (sg01) [DMAOptimizationBase]: allreduce hwm 836894720 +2024-06-01T06:04:27Z INFO 2403803 (sg01) [DMAOptimizationBase]: Real CC buffer size 524288000 +2024-06-01T06:04:28Z INFO 2403803 (sg01) [DMAOptimizationBase]: DRAM hwm after rotation 1373765632 +2024-06-01T06:04:28Z INFO 2403803 (sg01) [DMAOptimizationBase]: DRAM Rotation rotated 1 Dram address +2024-06-01T06:04:28Z USER 2403803 (sg01) [ModuleForkPass]: address_rotation_dram finished after 4.187 seconds +2024-06-01T06:04:28Z INFO 2403803 (sg01) [ModuleForkPass]: curr_vmrss: 13431mb, ru_maxrss: 13681mb (delta=0mb) +2024-06-01T06:04:29Z INFO 2403803 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 217043 memory location(s), 1 block(s), and 688010 instruction(s). 
Max writers: 576 Max Readers: 184036 +2024-06-01T06:04:29Z USER 2403803 (sg01) [ModuleForkPass]: Running tensorcopy_accel +2024-06-01T06:04:29Z INFO 2403803 (sg01) [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=217043 blocks=1 instructions=688010 Max writers: 576 Max Readers: 184036 +2024-06-01T06:04:29Z INFO 2403803 (sg01) [TensorCopyAccel::Impl]: Running peephole optimization pass +2024-06-01T06:04:29Z INFO 2403803 (sg00) [DepReduction]: Num Async removed: 0 +2024-06-01T06:04:29Z INFO 2403803 (sg00) [DepReduction]: Finished dependency reduction: 1888130 removed, new total 170403 +2024-06-01T06:04:29Z INFO 2403803 (sg00) [DepReduction]: Finished Dependency Reduction +2024-06-01T06:04:29Z USER 2403803 (sg00) [ModuleForkPass]: dep_reduction finished after 6.081 seconds +2024-06-01T06:04:29Z INFO 2403803 (sg00) [ModuleForkPass]: curr_vmrss: 13428mb, ru_maxrss: 13681mb (delta=0mb) +2024-06-01T06:04:29Z INFO 2403803 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 147823 memory location(s), 1 block(s), and 341001 instruction(s). Max writers: 5184 Max Readers: 93924 +2024-06-01T06:04:29Z USER 2403803 (sg00) [ModuleForkPass]: Running bir_racecheck +2024-06-01T06:04:29Z INFO 2403803 (sg00) [ModuleForkPass]: Inputs to bir_racecheck: modules=1 functions=1 allocs=147823 blocks=1 instructions=341001 Max writers: 5184 Max Readers: 93924 +2024-06-01T06:04:29Z INFO 2403803 (sg01) [TensorCopyAccel::Impl]: Accelerated 1280 out of 92594 tensorcopy in Function: sg0001 average acceleration factor: 1 +2024-06-01T06:04:29Z USER 2403803 (sg01) [ModuleForkPass]: tensorcopy_accel finished after 0.566 seconds +2024-06-01T06:04:29Z INFO 2403803 (sg01) [ModuleForkPass]: curr_vmrss: 13481mb, ru_maxrss: 13681mb (delta=0mb) +2024-06-01T06:04:29Z INFO 2403803 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 217043 memory location(s), 1 block(s), and 688010 instruction(s). 
Max writers: 576 Max Readers: 184036 +2024-06-01T06:04:29Z USER 2403803 (sg01) [ModuleForkPass]: Running peephole_opts +2024-06-01T06:04:29Z INFO 2403803 (sg01) [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=217043 blocks=1 instructions=688010 Max writers: 576 Max Readers: 184036 +2024-06-01T06:04:29Z INFO 2403803 (sg01) [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: true SplitSelect: true +2024-06-01T06:04:30Z INFO 2403803 (sg01) [PeepholeOpts]: Split Select: 11520 +2024-06-01T06:04:30Z INFO 2403803 (sg01) [PeepholeOpts]: TSP -> ACT: 17280 +2024-06-01T06:04:30Z INFO 2403803 (sg01) [PeepholeOpts]: COPY -> ACT: 0 +2024-06-01T06:04:30Z INFO 2403803 (sg01) [PeepholeOpts]: RECIPROCAL -> ACT: 0 +2024-06-01T06:04:30Z USER 2403803 (sg01) [ModuleForkPass]: peephole_opts finished after 1.256 seconds +2024-06-01T06:04:30Z INFO 2403803 (sg01) [ModuleForkPass]: curr_vmrss: 13667mb, ru_maxrss: 13681mb (delta=0mb) +2024-06-01T06:04:31Z INFO 2403803 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 217043 memory location(s), 1 block(s), and 699530 instruction(s). 
Max writers: 576 Max Readers: 184036 +2024-06-01T06:04:31Z USER 2403803 (sg01) [ModuleForkPass]: Running lower_kernel +2024-06-01T06:04:31Z INFO 2403803 (sg01) [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=217043 blocks=1 instructions=699530 Max writers: 576 Max Readers: 184036 +2024-06-01T06:04:31Z INFO 2403803 (sg01) [LowerKernel]: Started running LowerKernel +2024-06-01T06:04:31Z INFO 2403803 (sg01) [LowerKernel]: Start of kernel lowering pass, number of insts: 699530, number of allocs: 217043 +2024-06-01T06:04:31Z INFO 2403803 (sg01) [LowerKernel]: Scan BKs time (s): 0.147243 +2024-06-01T06:04:31Z INFO 2403803 (sg01) [LowerKernel]: Lower BKs time (s): 5e-06 +2024-06-01T06:04:31Z USER 2403803 (sg01) [ModuleForkPass]: lower_kernel finished after 0.082 seconds +2024-06-01T06:04:31Z INFO 2403803 (sg01) [ModuleForkPass]: curr_vmrss: 13688mb, ru_maxrss: 13688mb (delta=7mb) +2024-06-01T06:04:31Z INFO 2403803 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 217043 memory location(s), 1 block(s), and 699530 instruction(s). Max writers: 576 Max Readers: 184036 +2024-06-01T06:04:31Z USER 2403803 (sg01) [ModuleForkPass]: Running build_fdeps +2024-06-01T06:04:31Z INFO 2403803 (sg01) [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=217043 blocks=1 instructions=699530 Max writers: 576 Max Readers: 184036 +2024-06-01T06:04:31Z INFO 2403803 (sg01) [build_flow_deps]: Start build fdeps. Invocation: 10Sat Jun 1 06:04:31 2024 +2024-06-01T06:04:31Z INFO 2403803 (sg01) [build_flow_deps]: Allocs: 217043 instructions: 699530 +2024-06-01T06:04:33Z USER 2403803 (sg00) [ModuleForkPass]: bir_racecheck finished after 4.219 seconds +2024-06-01T06:04:33Z INFO 2403803 (sg00) [ModuleForkPass]: curr_vmrss: 13987mb, ru_maxrss: 13987mb (delta=306mb) +2024-06-01T06:04:34Z INFO 2403803 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 147823 memory location(s), 1 block(s), and 341001 instruction(s). 
Max writers: 5184 Max Readers: 93924 +2024-06-01T06:04:35Z INFO 2403803 (sg01) [build_flow_deps]: Build fdeps inserted 1975620 edges +2024-06-01T06:04:35Z INFO 2403803 (sg01) [build_flow_deps]: Done build fdeps 1975620 Sat Jun 1 06:04:35 2024 +2024-06-01T06:04:35Z USER 2403803 (sg01) [ModuleForkPass]: build_fdeps finished after 4.366 seconds +2024-06-01T06:04:35Z INFO 2403803 (sg01) [ModuleForkPass]: curr_vmrss: 13868mb, ru_maxrss: 13987mb (delta=299mb) +2024-06-01T06:04:35Z INFO 2403803 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 217043 memory location(s), 1 block(s), and 699530 instruction(s). Max writers: 576 Max Readers: 184036 +2024-06-01T06:04:35Z USER 2403803 (sg01) [ModuleForkPass]: Running remove_redundancies +2024-06-01T06:04:35Z INFO 2403803 (sg01) [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=217043 blocks=1 instructions=699530 Max writers: 576 Max Readers: 184036 +2024-06-01T06:04:35Z INFO 2403803 (sg01) [RemoveRedundancies]: remove_clobbered_writes +2024-06-01T06:04:36Z INFO 2403803 (sg01) [RemoveRedundancies]: remove_clobbered_writes: 627 +2024-06-01T06:04:36Z INFO 2403803 (sg01) [RemoveRedundancies]: remove_useless_insts +2024-06-01T06:04:37Z INFO 2403803 (sg01) [RemoveRedundancies]: remove Useless Instructions: 0 +2024-06-01T06:04:37Z USER 2403803 (sg01) [ModuleForkPass]: remove_redundancies finished after 1.698 seconds +2024-06-01T06:04:37Z INFO 2403803 (sg01) [ModuleForkPass]: curr_vmrss: 13785mb, ru_maxrss: 13987mb (delta=0mb) +2024-06-01T06:04:37Z INFO 2403803 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 217043 memory location(s), 1 block(s), and 698903 instruction(s). 
Max writers: 576 Max Readers: 184036 +2024-06-01T06:04:37Z USER 2403803 (sg01) [ModuleForkPass]: Running anti_dependency_analyzer +2024-06-01T06:04:37Z INFO 2403803 (sg01) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=217043 blocks=1 instructions=698903 Max writers: 576 Max Readers: 184036 +2024-06-01T06:04:37Z INFO 2403803 (sg01) [AntiDependencyAnalyzer]: Batch size: 1000 +2024-06-01T06:04:37Z INFO 2403803 (sg01) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2024-06-01T06:04:40Z INFO 2403803 [post_scheduler]: Time-aware hwm post-sched +2024-06-01T06:04:43Z INFO 2403803 [AntiDependencyAnalyzer-PSUM5-64-128]: Finished analyzing 71907 access patterns a mean/median 1.00417/1 intervals per access pattern and mean/median 4.40659/4.39094 intersections per interval. +2024-06-01T06:04:43Z INFO 2403803 [AntiDependencyAnalyzer-PSUM5-0-32]: Finished analyzing 72572 access patterns a mean/median 1.00413/1 intervals per access pattern and mean/median 4.43323/4.48669 intersections per interval. +2024-06-01T06:04:43Z INFO 2403803 [AntiDependencyAnalyzer-PSUM5-32-64]: Finished analyzing 71912 access patterns a mean/median 1.00417/1 intervals per access pattern and mean/median 4.40643/4.3905 intersections per interval. +2024-06-01T06:04:43Z INFO 2403803 [AntiDependencyAnalyzer-PSUM7-64-128]: Finished analyzing 70849 access patterns a mean/median 1.00423/1 intervals per access pattern and mean/median 4.45597/4.46517 intersections per interval. +2024-06-01T06:04:43Z INFO 2403803 [AntiDependencyAnalyzer-PSUM7-0-32]: Finished analyzing 71519 access patterns a mean/median 1.00419/1 intervals per access pattern and mean/median 4.47871/4.54291 intersections per interval. +2024-06-01T06:04:43Z INFO 2403803 [AntiDependencyAnalyzer-PSUM3-32-64]: Finished analyzing 70805 access patterns a mean/median 1.00424/1 intervals per access pattern and mean/median 4.54047/4.88972 intersections per interval. 
+2024-06-01T06:04:43Z INFO 2403803 [AntiDependencyAnalyzer-PSUM0-0-32]: Finished analyzing 71614 access patterns a mean/median 1.00452/1 intervals per access pattern and mean/median 4.58543/5.11404 intersections per interval. +2024-06-01T06:04:43Z INFO 2403803 [AntiDependencyAnalyzer-PSUM0-32-64]: Finished analyzing 70954 access patterns a mean/median 1.00457/1 intervals per access pattern and mean/median 4.56195/5.0855 intersections per interval. +2024-06-01T06:04:43Z INFO 2403803 [AntiDependencyAnalyzer-PSUM3-64-128]: Finished analyzing 70802 access patterns a mean/median 1.00424/1 intervals per access pattern and mean/median 4.54057/4.88996 intersections per interval. +2024-06-01T06:04:43Z INFO 2403803 [AntiDependencyAnalyzer-PSUM7-32-64]: Finished analyzing 70859 access patterns a mean/median 1.00423/1 intervals per access pattern and mean/median 4.45564/4.46454 intersections per interval. +2024-06-01T06:04:43Z INFO 2403803 [AntiDependencyAnalyzer-PSUM0-64-128]: Finished analyzing 70946 access patterns a mean/median 1.00457/1 intervals per access pattern and mean/median 4.56222/5.08612 intersections per interval. +2024-06-01T06:04:43Z INFO 2403803 [AntiDependencyAnalyzer-PSUM3-0-32]: Finished analyzing 71465 access patterns a mean/median 1.0042/1 intervals per access pattern and mean/median 4.56634/4.96784 intersections per interval. +2024-06-01T06:04:43Z INFO 2403803 [AntiDependencyAnalyzer-PSUM4-32-64]: Finished analyzing 70944 access patterns a mean/median 1.00457/1 intervals per access pattern and mean/median 4.64114/5.53284 intersections per interval. +2024-06-01T06:04:43Z INFO 2403803 [AntiDependencyAnalyzer-PSUM4-64-128]: Finished analyzing 70939 access patterns a mean/median 1.00457/1 intervals per access pattern and mean/median 4.64131/5.53323 intersections per interval. 
+2024-06-01T06:04:43Z INFO 2403803 [AntiDependencyAnalyzer-PSUM4-0-32]: Finished analyzing 71604 access patterns a mean/median 1.00452/1 intervals per access pattern and mean/median 4.66589/5.64798 intersections per interval. +2024-06-01T06:04:43Z INFO 2403803 [AntiDependencyAnalyzer-PSUM1-0-32]: Finished analyzing 72560 access patterns a mean/median 1.00413/1 intervals per access pattern and mean/median 4.44765/4.42362 intersections per interval. +2024-06-01T06:04:43Z INFO 2403803 [AntiDependencyAnalyzer-PSUM1-32-64]: Finished analyzing 71900 access patterns a mean/median 1.00417/1 intervals per access pattern and mean/median 4.42359/4.35832 intersections per interval. +2024-06-01T06:04:43Z INFO 2403803 [AntiDependencyAnalyzer-PSUM1-64-128]: Finished analyzing 71895 access patterns a mean/median 1.00417/1 intervals per access pattern and mean/median 4.42375/4.35882 intersections per interval. +2024-06-01T06:04:43Z INFO 2403803 [AntiDependencyAnalyzer-PSUM2-64-128]: Finished analyzing 72080 access patterns a mean/median 1.0045/1 intervals per access pattern and mean/median 4.6104/5.06522 intersections per interval. +2024-06-01T06:04:43Z INFO 2403803 [AntiDependencyAnalyzer-PSUM2-32-64]: Finished analyzing 72085 access patterns a mean/median 1.00449/1 intervals per access pattern and mean/median 4.61023/5.0647 intersections per interval. +2024-06-01T06:04:43Z INFO 2403803 [AntiDependencyAnalyzer-PSUM2-0-32]: Finished analyzing 72745 access patterns a mean/median 1.00445/1 intervals per access pattern and mean/median 4.63124/5.11254 intersections per interval. +2024-06-01T06:04:43Z INFO 2403803 [AntiDependencyAnalyzer-PSUM6-0-32]: Finished analyzing 72767 access patterns a mean/median 1.00445/1 intervals per access pattern and mean/median 4.79197/5.81305 intersections per interval. 
+2024-06-01T06:04:43Z INFO 2403803 [AntiDependencyAnalyzer-PSUM6-32-64]: Finished analyzing 72107 access patterns a mean/median 1.00449/1 intervals per access pattern and mean/median 4.76901/5.69126 intersections per interval. +2024-06-01T06:04:43Z INFO 2403803 [AntiDependencyAnalyzer-PSUM6-64-128]: Finished analyzing 72102 access patterns a mean/median 1.00449/1 intervals per access pattern and mean/median 4.76919/5.69189 intersections per interval. +2024-06-01T06:04:44Z INFO 2403803 [AntiDependencyAnalyzer-DRAM]: Finished analyzing 13798 access patterns a mean/median 87.3006/127.934 intervals per access pattern and mean/median 1.98368/1 intersections per interval. +2024-06-01T06:04:44Z INFO 2403803 [AntiDependencyAnalyzer-SB-0-1]: Finished analyzing 1305767 access patterns a mean/median 1.07406/1 intervals per access pattern and mean/median 2.29639/1 intersections per interval. +2024-06-01T06:04:44Z INFO 2403803 [AntiDependencyAnalyzer-SB-1-64]: Finished analyzing 1305287 access patterns a mean/median 1.07409/1 intervals per access pattern and mean/median 2.29707/1 intersections per interval. +2024-06-01T06:04:44Z INFO 2403803 [AntiDependencyAnalyzer-SB-64-96]: Finished analyzing 1293418 access patterns a mean/median 1.07477/1 intervals per access pattern and mean/median 2.31706/1 intersections per interval. +2024-06-01T06:04:44Z INFO 2403803 [AntiDependencyAnalyzer-SB-97-113]: Finished analyzing 1293418 access patterns a mean/median 1.07477/1 intervals per access pattern and mean/median 2.31706/1 intersections per interval. +2024-06-01T06:04:44Z INFO 2403803 [AntiDependencyAnalyzer-SB-113-128]: Finished analyzing 1266574 access patterns a mean/median 1.07635/1 intervals per access pattern and mean/median 2.32753/1 intersections per interval. +2024-06-01T06:04:44Z INFO 2403803 [AntiDependencyAnalyzer-SB-96-97]: Finished analyzing 1293738 access patterns a mean/median 1.07475/1 intervals per access pattern and mean/median 2.31662/1 intersections per interval. 
+2024-06-01T06:04:44Z USER 2403803 (sg01) [ModuleForkPass]: anti_dependency_analyzer finished after 7.061 seconds +2024-06-01T06:04:44Z INFO 2403803 (sg01) [ModuleForkPass]: curr_vmrss: 14338mb, ru_maxrss: 14338mb (delta=351mb) +2024-06-01T06:04:44Z INFO 2403803 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 217043 memory location(s), 1 block(s), and 698903 instruction(s). Max writers: 576 Max Readers: 184036 +2024-06-01T06:04:44Z USER 2403803 (sg01) [ModuleForkPass]: Running tensor_copy_elim +2024-06-01T06:04:45Z INFO 2403803 (sg01) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=217043 blocks=1 instructions=698903 Max writers: 576 Max Readers: 184036 +2024-06-01T06:04:47Z INFO 2403803 (sg01) [TensorCopyElim]: Tensor CP elimination: 0 +2024-06-01T06:04:49Z USER 2403803 (sg01) [ModuleForkPass]: tensor_copy_elim finished after 4.800 seconds +2024-06-01T06:04:49Z INFO 2403803 (sg01) [ModuleForkPass]: curr_vmrss: 14059mb, ru_maxrss: 14338mb (delta=0mb) +2024-06-01T06:04:49Z INFO 2403803 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 217043 memory location(s), 1 block(s), and 698903 instruction(s). 
Max writers: 576 Max Readers: 184036 +2024-06-01T06:04:49Z USER 2403803 (sg01) [ModuleForkPass]: Running post_sched +2024-06-01T06:04:50Z INFO 2403803 (sg01) [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=217043 blocks=1 instructions=698903 Max writers: 576 Max Readers: 184036 +2024-06-01T06:04:50Z INFO 2403803 [post_scheduler]: Start PosT ScheD 3 sunda Sat Jun 1 06:04:50 2024 +2024-06-01T06:04:58Z INFO 2403803 [post_scheduler]: Time-aware simulation time: 119783503 +2024-06-01T06:05:00Z INFO 2403803 [post_scheduler]: Done PosT ScheD Sat Jun 1 06:05:00 2024 +2024-06-01T06:05:01Z USER 2403803 (sg02) [ModuleForkPass]: post_sched finished after 35.082 seconds +2024-06-01T06:05:01Z INFO 2403803 (sg02) [ModuleForkPass]: curr_vmrss: 14296mb, ru_maxrss: 14338mb (delta=657mb) +2024-06-01T06:05:01Z INFO 2403803 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 217043 memory location(s), 1 block(s), and 698903 instruction(s). Max writers: 576 Max Readers: 184036 +2024-06-01T06:05:01Z USER 2403803 (sg02) [ModuleForkPass]: Running address_rotation_sb +2024-06-01T06:05:01Z INFO 2403803 (sg02) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=217043 blocks=1 instructions=698903 Max writers: 576 Max Readers: 184036 +2024-06-01T06:05:05Z INFO 2403803 [post_scheduler]: Time-aware hwm post-sched +2024-06-01T06:05:07Z INFO 2403803 (sg02) [DMAOptimizationBase]: PSUM Rotation rotated 16366 PSUM Banks +2024-06-01T06:05:09Z INFO 2403803 (sg02) [DMAOptimizationBase]: PSUM Rotation rotated 14280 PSUM Banks +2024-06-01T06:05:11Z INFO 2403803 (sg02) [DMAOptimizationBase]: PSUM Rotation rotated 2173 PSUM Banks +2024-06-01T06:05:14Z INFO 2403803 (sg02) [DMAOptimizationBase]: SB Rotation rotated 200 Sb address +2024-06-01T06:05:17Z INFO 2403803 (sg02) [DMAOptimizationBase]: SB Rotation rotated 1144 Sb address +2024-06-01T06:05:20Z INFO 2403803 (sg02) [DMAOptimizationBase]: SB Rotation rotated 2197 Sb address +2024-06-01T06:05:20Z 
INFO 2403803 [post_scheduler]: Time-aware simulation time: 119783503 +2024-06-01T06:05:22Z INFO 2403803 (sg02) [DMAOptimizationBase]: SB Rotation rotated 889 Sb address +2024-06-01T06:05:23Z INFO 2403803 [post_scheduler]: Done PosT ScheD Sat Jun 1 06:05:23 2024 +2024-06-01T06:05:23Z USER 2403803 (sg01) [ModuleForkPass]: post_sched finished after 33.240 seconds +2024-06-01T06:05:23Z INFO 2403803 (sg01) [ModuleForkPass]: curr_vmrss: 14308mb, ru_maxrss: 14338mb (delta=0mb) +2024-06-01T06:05:23Z INFO 2403803 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 217043 memory location(s), 1 block(s), and 698903 instruction(s). Max writers: 576 Max Readers: 184036 +2024-06-01T06:05:23Z USER 2403803 (sg01) [ModuleForkPass]: Running address_rotation_sb +2024-06-01T06:05:23Z INFO 2403803 (sg01) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=217043 blocks=1 instructions=698903 Max writers: 576 Max Readers: 184036 +2024-06-01T06:05:25Z INFO 2403803 (sg02) [DMAOptimizationBase]: SB Rotation rotated 15563 Sb address +2024-06-01T06:05:28Z INFO 2403803 (sg02) [DMAOptimizationBase]: SB Rotation rotated 63 Sb address +2024-06-01T06:05:29Z INFO 2403803 (sg01) [DMAOptimizationBase]: PSUM Rotation rotated 16366 PSUM Banks +2024-06-01T06:05:29Z INFO 2403803 (sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2024-06-01T06:05:29Z USER 2403803 (sg02) [ModuleForkPass]: address_rotation_sb finished after 28.769 seconds +2024-06-01T06:05:29Z INFO 2403803 (sg02) [ModuleForkPass]: curr_vmrss: 14061mb, ru_maxrss: 14338mb (delta=0mb) +2024-06-01T06:05:30Z INFO 2403803 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 217043 memory location(s), 1 block(s), and 698903 instruction(s). 
Max writers: 576 Max Readers: 184036 +2024-06-01T06:05:30Z USER 2403803 (sg02) [ModuleForkPass]: Running anti_dependency_analyzer +2024-06-01T06:05:30Z INFO 2403803 (sg02) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=217043 blocks=1 instructions=698903 Max writers: 576 Max Readers: 184036 +2024-06-01T06:05:30Z INFO 2403803 (sg02) [AntiDependencyAnalyzer]: Batch size: 1000 +2024-06-01T06:05:30Z INFO 2403803 (sg02) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2024-06-01T06:05:31Z INFO 2403803 (sg01) [DMAOptimizationBase]: PSUM Rotation rotated 14280 PSUM Banks +2024-06-01T06:05:33Z INFO 2403803 (sg01) [DMAOptimizationBase]: PSUM Rotation rotated 2173 PSUM Banks +2024-06-01T06:05:33Z INFO 2403803 [AntiDependencyAnalyzer-PSUM1-64-128]: Finished analyzing 72393 access patterns a mean/median 1.00435/1 intervals per access pattern and mean/median 4.41914/4.42926 intersections per interval. +2024-06-01T06:05:33Z INFO 2403803 [AntiDependencyAnalyzer-PSUM1-32-64]: Finished analyzing 72403 access patterns a mean/median 1.00435/1 intervals per access pattern and mean/median 4.41882/4.42837 intersections per interval. +2024-06-01T06:05:33Z INFO 2403803 [AntiDependencyAnalyzer-PSUM1-0-32]: Finished analyzing 73228 access patterns a mean/median 1.0043/1 intervals per access pattern and mean/median 4.44464/4.4508 intersections per interval. +2024-06-01T06:05:33Z INFO 2403803 [AntiDependencyAnalyzer-PSUM6-0-32]: Finished analyzing 71247 access patterns a mean/median 1.00413/1 intervals per access pattern and mean/median 4.75241/5.94147 intersections per interval. +2024-06-01T06:05:33Z INFO 2403803 [AntiDependencyAnalyzer-PSUM7-64-128]: Finished analyzing 72518 access patterns a mean/median 1.00467/1 intervals per access pattern and mean/median 4.46349/4.47278 intersections per interval. 
+2024-06-01T06:05:33Z INFO 2403803 [AntiDependencyAnalyzer-PSUM6-64-128]: Finished analyzing 70381 access patterns a mean/median 1.00418/1 intervals per access pattern and mean/median 4.72625/5.82962 intersections per interval. +2024-06-01T06:05:33Z INFO 2403803 [AntiDependencyAnalyzer-PSUM7-32-64]: Finished analyzing 72523 access patterns a mean/median 1.00467/1 intervals per access pattern and mean/median 4.46333/4.47282 intersections per interval. +2024-06-01T06:05:33Z INFO 2403803 [AntiDependencyAnalyzer-PSUM6-32-64]: Finished analyzing 70389 access patterns a mean/median 1.00418/1 intervals per access pattern and mean/median 4.72596/5.82902 intersections per interval. +2024-06-01T06:05:33Z INFO 2403803 [AntiDependencyAnalyzer-PSUM7-0-32]: Finished analyzing 73249 access patterns a mean/median 1.00463/1 intervals per access pattern and mean/median 4.4938/4.5833 intersections per interval. +2024-06-01T06:05:33Z INFO 2403803 [AntiDependencyAnalyzer-PSUM0-0-32]: Finished analyzing 70270 access patterns a mean/median 1.00393/1 intervals per access pattern and mean/median 4.68285/5.33894 intersections per interval. +2024-06-01T06:05:33Z INFO 2403803 [AntiDependencyAnalyzer-PSUM0-32-64]: Finished analyzing 69775 access patterns a mean/median 1.00396/1 intervals per access pattern and mean/median 4.66024/5.42372 intersections per interval. +2024-06-01T06:05:33Z INFO 2403803 [AntiDependencyAnalyzer-PSUM4-64-128]: Finished analyzing 69668 access patterns a mean/median 1.00426/1 intervals per access pattern and mean/median 4.4879/4.92501 intersections per interval. +2024-06-01T06:05:33Z INFO 2403803 [AntiDependencyAnalyzer-PSUM4-32-64]: Finished analyzing 69673 access patterns a mean/median 1.00426/1 intervals per access pattern and mean/median 4.48773/4.92454 intersections per interval. 
+2024-06-01T06:05:33Z INFO 2403803 [AntiDependencyAnalyzer-PSUM4-0-32]: Finished analyzing 70564 access patterns a mean/median 1.00421/1 intervals per access pattern and mean/median 4.52122/5.02716 intersections per interval. +2024-06-01T06:05:33Z INFO 2403803 [AntiDependencyAnalyzer-PSUM5-32-128]: Finished analyzing 72633 access patterns a mean/median 1.00483/1 intervals per access pattern and mean/median 4.39118/4.24928 intersections per interval. +2024-06-01T06:05:33Z INFO 2403803 [AntiDependencyAnalyzer-PSUM0-64-128]: Finished analyzing 69767 access patterns a mean/median 1.00396/1 intervals per access pattern and mean/median 4.66053/5.42445 intersections per interval. +2024-06-01T06:05:33Z INFO 2403803 [AntiDependencyAnalyzer-PSUM5-0-32]: Finished analyzing 73029 access patterns a mean/median 1.00481/1 intervals per access pattern and mean/median 4.40831/4.32458 intersections per interval. +2024-06-01T06:05:33Z INFO 2403803 [AntiDependencyAnalyzer-PSUM3-32-64]: Finished analyzing 73122 access patterns a mean/median 1.00435/1 intervals per access pattern and mean/median 4.55737/4.90732 intersections per interval. +2024-06-01T06:05:33Z INFO 2403803 [AntiDependencyAnalyzer-PSUM3-0-32]: Finished analyzing 73584 access patterns a mean/median 1.00432/1 intervals per access pattern and mean/median 4.5717/4.94711 intersections per interval. +2024-06-01T06:05:33Z INFO 2403803 [AntiDependencyAnalyzer-PSUM3-64-128]: Finished analyzing 73117 access patterns a mean/median 1.00435/1 intervals per access pattern and mean/median 4.55753/4.90729 intersections per interval. +2024-06-01T06:05:33Z INFO 2403803 [AntiDependencyAnalyzer-PSUM2-64-128]: Finished analyzing 71043 access patterns a mean/median 1.00431/1 intervals per access pattern and mean/median 4.74671/5.94614 intersections per interval. 
+2024-06-01T06:05:33Z INFO 2403803 [AntiDependencyAnalyzer-PSUM2-32-64]: Finished analyzing 71048 access patterns a mean/median 1.00431/1 intervals per access pattern and mean/median 4.74653/5.94564 intersections per interval. +2024-06-01T06:05:33Z INFO 2403803 [AntiDependencyAnalyzer-PSUM2-0-32]: Finished analyzing 71675 access patterns a mean/median 1.00427/1 intervals per access pattern and mean/median 4.76848/6.01744 intersections per interval. +2024-06-01T06:05:34Z INFO 2403803 (sg01) [DMAOptimizationBase]: SB Rotation rotated 200 Sb address +2024-06-01T06:05:34Z INFO 2403803 [AntiDependencyAnalyzer-DRAM]: Finished analyzing 13798 access patterns a mean/median 87.3006/127.97 intervals per access pattern and mean/median 1.98368/0.999997 intersections per interval. +2024-06-01T06:05:36Z INFO 2403803 [AntiDependencyAnalyzer-SB-64-96]: Finished analyzing 1293367 access patterns a mean/median 1.07477/1 intervals per access pattern and mean/median 2.32361/1.00001 intersections per interval. +2024-06-01T06:05:36Z INFO 2403803 [AntiDependencyAnalyzer-SB-97-113]: Finished analyzing 1293367 access patterns a mean/median 1.07477/1 intervals per access pattern and mean/median 2.32361/1.00001 intersections per interval. +2024-06-01T06:05:36Z INFO 2403803 [AntiDependencyAnalyzer-SB-0-1]: Finished analyzing 1305818 access patterns a mean/median 1.07406/1 intervals per access pattern and mean/median 2.30288/0.999997 intersections per interval. +2024-06-01T06:05:36Z INFO 2403803 [AntiDependencyAnalyzer-SB-96-97]: Finished analyzing 1293687 access patterns a mean/median 1.07475/1 intervals per access pattern and mean/median 2.32316/1 intersections per interval. +2024-06-01T06:05:36Z INFO 2403803 [AntiDependencyAnalyzer-SB-113-128]: Finished analyzing 1266523 access patterns a mean/median 1.07635/1 intervals per access pattern and mean/median 2.33589/0.999996 intersections per interval. 
+2024-06-01T06:05:36Z INFO 2403803 [AntiDependencyAnalyzer-SB-1-64]: Finished analyzing 1305338 access patterns a mean/median 1.07408/1 intervals per access pattern and mean/median 2.30352/1 intersections per interval. +2024-06-01T06:05:36Z USER 2403803 (sg02) [ModuleForkPass]: anti_dependency_analyzer finished after 6.657 seconds +2024-06-01T06:05:36Z INFO 2403803 (sg02) [ModuleForkPass]: curr_vmrss: 14178mb, ru_maxrss: 14338mb (delta=0mb) +2024-06-01T06:05:37Z INFO 2403803 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 217043 memory location(s), 1 block(s), and 698903 instruction(s). Max writers: 576 Max Readers: 184036 +2024-06-01T06:05:37Z USER 2403803 (sg02) [ModuleForkPass]: Running dep_opt +2024-06-01T06:05:37Z INFO 2403803 (sg02) [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=217043 blocks=1 instructions=698903 Max writers: 576 Max Readers: 184036 +2024-06-01T06:05:37Z INFO 2403803 (sg02) [build_flow_deps]: Start build fdeps. Invocation: 11Sat Jun 1 06:05:37 2024 +2024-06-01T06:05:37Z INFO 2403803 (sg02) [build_flow_deps]: Allocs: 217043 instructions: 698903 +2024-06-01T06:05:38Z INFO 2403803 (sg01) [DMAOptimizationBase]: SB Rotation rotated 1144 Sb address +2024-06-01T06:05:41Z INFO 2403803 (sg01) [DMAOptimizationBase]: SB Rotation rotated 2197 Sb address +2024-06-01T06:05:41Z INFO 2403803 (sg02) [build_flow_deps]: Build fdeps inserted 1943323 edges +2024-06-01T06:05:41Z INFO 2403803 (sg02) [build_flow_deps]: Done build fdeps 1943323 Sat Jun 1 06:05:41 2024 +2024-06-01T06:05:43Z USER 2403803 (sg02) [ModuleForkPass]: dep_opt finished after 6.118 seconds +2024-06-01T06:05:43Z INFO 2403803 (sg02) [ModuleForkPass]: curr_vmrss: 13845mb, ru_maxrss: 14338mb (delta=0mb) +2024-06-01T06:05:43Z INFO 2403803 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 217043 memory location(s), 1 block(s), and 698903 instruction(s). 
Max writers: 576 Max Readers: 184036 +2024-06-01T06:05:43Z USER 2403803 (sg02) [ModuleForkPass]: Running report_stats +2024-06-01T06:05:43Z INFO 2403803 (sg02) [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=217043 blocks=1 instructions=698903 Max writers: 576 Max Readers: 184036 +2024-06-01T06:05:43Z INFO 2403803 (sg02) [ReportStats]: Data Movement Statistics: sg0002 +┌──────────────┬────────────────────────────────────┬───────┬────────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├──────────────┼────────────────────────────────────┼───────┼────────────┤ +│ DMACopy │ ExternalInputParameter -> Internal │ 2 │ 32768 │ +│ DMACopy │ Input -> Internal │ 1 │ 900071424 │ +│ DMACopy │ Internal -> Output │ 2 │ 600047616 │ +│ Load │ Const -> Internal │ 81 │ 1343488 │ +│ Load │ ExternalInputParameter -> Internal │ 16649 │ 7471169536 │ +│ Load │ Input -> Internal │ 663 │ 84418320 │ +│ Load │ Internal │ 8889 │ 1888166912 │ +│ Save │ Internal │ 2607 │ 1056604160 │ +│ Save │ Internal -> ExternalOutput │ 108 │ 37502976 │ +│ Save │ Internal -> Output │ 576 │ 300023808 │ +│ Save (Spill) │ Internal │ 2291 │ 512458752 │ +└──────────────┴────────────────────────────────────┴───────┴────────────┘ + +2024-06-01T06:05:43Z INFO 2403803 (sg02) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 113 │ 16 │ +│ 128 │ 388 │ +│ 256 │ 1 │ +│ 482 │ 1152 │ +│ 488 │ 253 │ +│ 512 │ 852 │ +│ 904 │ 32 │ +│ 1024 │ 7277 │ +│ 1137 │ 8 │ +│ 1152 │ 8 │ +│ 1265 │ 59 │ +│ 1928 │ 128 │ +│ 2048 │ 211 │ +│ 2289 │ 205 │ +│ 3584 │ 15360 │ +│ 4096 │ 5916 │ +│ 150011904 │ 4 │ +│ 300023808 │ 3 │ +└─────────────────────┴───────┘ + +2024-06-01T06:05:43Z INFO 2403803 (sg02) [ReportStats]: MM Stats: #MatMults 474564 #MatMult-Transposes 189220 +2024-06-01T06:05:43Z USER 2403803 (sg02) [ModuleForkPass]: report_stats finished after 0.423 seconds +2024-06-01T06:05:43Z INFO 2403803 (sg02) [ModuleForkPass]: curr_vmrss: 13761mb, ru_maxrss: 
14338mb (delta=0mb) +2024-06-01T06:05:43Z INFO 2403803 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 217043 memory location(s), 1 block(s), and 698903 instruction(s). Max writers: 576 Max Readers: 184036 +2024-06-01T06:05:43Z USER 2403803 (sg02) [ModuleForkPass]: Running assign_trigger_engine +2024-06-01T06:05:43Z INFO 2403803 (sg02) [ModuleForkPass]: Inputs to assign_trigger_engine: modules=1 functions=1 allocs=217043 blocks=1 instructions=698903 Max writers: 576 Max Readers: 184036 +2024-06-01T06:05:44Z INFO 2403803 (sg02) [AssignTriggerEngine]: Assigned trigger engine for 5476 DMA instructions +2024-06-01T06:05:44Z USER 2403803 (sg02) [ModuleForkPass]: assign_trigger_engine finished after 0.681 seconds +2024-06-01T06:05:44Z INFO 2403803 (sg02) [ModuleForkPass]: curr_vmrss: 13795mb, ru_maxrss: 14338mb (delta=0mb) +2024-06-01T06:05:44Z INFO 2403803 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 217043 memory location(s), 1 block(s), and 698903 instruction(s). Max writers: 576 Max Readers: 184036 +2024-06-01T06:05:44Z USER 2403803 (sg02) [ModuleForkPass]: Running alloc_queues +2024-06-01T06:05:44Z INFO 2403803 (sg02) [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=217043 blocks=1 instructions=698903 Max writers: 576 Max Readers: 184036 +2024-06-01T06:05:45Z USER 2403803 (sg02) [ModuleForkPass]: alloc_queues finished after 0.436 seconds +2024-06-01T06:05:45Z INFO 2403803 (sg02) [ModuleForkPass]: curr_vmrss: 13789mb, ru_maxrss: 14338mb (delta=0mb) +2024-06-01T06:05:45Z INFO 2403803 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 217043 memory location(s), 1 block(s), and 698903 instruction(s). 
Max writers: 576 Max Readers: 184036 +2024-06-01T06:05:45Z USER 2403803 (sg02) [ModuleForkPass]: Running dep_reduction +2024-06-01T06:05:45Z INFO 2403803 (sg02) [ModuleForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=217043 blocks=1 instructions=698903 Max writers: 576 Max Readers: 184036 +2024-06-01T06:05:45Z INFO 2403803 (sg02) [DepReduction]: Start Dependency Reduction +2024-06-01T06:05:45Z INFO 2403803 (sg01) [DMAOptimizationBase]: SB Rotation rotated 889 Sb address +2024-06-01T06:05:46Z INFO 2403803 (sg02) [DepReduction]: Processing async instrs... +2024-06-01T06:05:46Z INFO 2403803 (sg02) [DepReduction]: Processing secondary edges per engine... +2024-06-01T06:05:47Z INFO 2403803 (sg02) [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 560853 +2024-06-01T06:05:47Z INFO 2403803 (sg02) [DepReduction]: Processing redundant descendants... +2024-06-01T06:05:49Z INFO 2403803 (sg01) [DMAOptimizationBase]: SB Rotation rotated 15563 Sb address +2024-06-01T06:05:50Z INFO 2403803 (sg02) [DepReduction]: Processing redundant descendants, Done. Num edges removed 39590 +2024-06-01T06:05:50Z INFO 2403803 (sg02) [DepReduction]: Processing async instrs, Done. Num edges removed 600443 +2024-06-01T06:05:52Z INFO 2403803 (sg01) [DMAOptimizationBase]: SB Rotation rotated 63 Sb address +2024-06-01T06:05:54Z INFO 2403803 (sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2024-06-01T06:05:54Z USER 2403803 (sg01) [ModuleForkPass]: address_rotation_sb finished after 31.213 seconds +2024-06-01T06:05:54Z INFO 2403803 (sg01) [ModuleForkPass]: curr_vmrss: 13861mb, ru_maxrss: 14338mb (delta=0mb) +2024-06-01T06:05:54Z INFO 2403803 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 217043 memory location(s), 1 block(s), and 698903 instruction(s). 
Max writers: 576 Max Readers: 184036 +2024-06-01T06:05:54Z USER 2403803 (sg01) [ModuleForkPass]: Running anti_dependency_analyzer +2024-06-01T06:05:54Z INFO 2403803 (sg01) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=217043 blocks=1 instructions=698903 Max writers: 576 Max Readers: 184036 +2024-06-01T06:05:54Z INFO 2403803 (sg01) [AntiDependencyAnalyzer]: Batch size: 1000 +2024-06-01T06:05:54Z INFO 2403803 (sg01) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2024-06-01T06:05:57Z INFO 2403803 (sg02) [DepReduction]: Num Async removed: 0 +2024-06-01T06:05:57Z INFO 2403803 (sg02) [DepReduction]: Finished dependency reduction: 4245596 removed, new total 266838 +2024-06-01T06:05:57Z INFO 2403803 (sg02) [DepReduction]: Finished Dependency Reduction +2024-06-01T06:05:57Z USER 2403803 (sg02) [ModuleForkPass]: dep_reduction finished after 12.608 seconds +2024-06-01T06:05:57Z INFO 2403803 (sg02) [ModuleForkPass]: curr_vmrss: 14272mb, ru_maxrss: 14338mb (delta=0mb) +2024-06-01T06:05:58Z INFO 2403803 [AntiDependencyAnalyzer-PSUM1-0-32]: Finished analyzing 73228 access patterns a mean/median 1.0043/1 intervals per access pattern and mean/median 4.44464/4.4508 intersections per interval. +2024-06-01T06:05:58Z INFO 2403803 [AntiDependencyAnalyzer-PSUM1-32-64]: Finished analyzing 72403 access patterns a mean/median 1.00435/1 intervals per access pattern and mean/median 4.41882/4.42837 intersections per interval. +2024-06-01T06:05:58Z INFO 2403803 [AntiDependencyAnalyzer-PSUM0-0-32]: Finished analyzing 70270 access patterns a mean/median 1.00393/1 intervals per access pattern and mean/median 4.68285/5.33894 intersections per interval. +2024-06-01T06:05:58Z INFO 2403803 [AntiDependencyAnalyzer-PSUM1-64-128]: Finished analyzing 72393 access patterns a mean/median 1.00435/1 intervals per access pattern and mean/median 4.41914/4.42926 intersections per interval. 
+2024-06-01T06:05:58Z INFO 2403803 [AntiDependencyAnalyzer-PSUM6-32-64]: Finished analyzing 70389 access patterns a mean/median 1.00418/1 intervals per access pattern and mean/median 4.72596/5.82902 intersections per interval. +2024-06-01T06:05:58Z INFO 2403803 [AntiDependencyAnalyzer-PSUM6-64-128]: Finished analyzing 70381 access patterns a mean/median 1.00418/1 intervals per access pattern and mean/median 4.72625/5.82962 intersections per interval. +2024-06-01T06:05:58Z INFO 2403803 [AntiDependencyAnalyzer-PSUM0-32-64]: Finished analyzing 69775 access patterns a mean/median 1.00396/1 intervals per access pattern and mean/median 4.66024/5.42372 intersections per interval. +2024-06-01T06:05:58Z INFO 2403803 [AntiDependencyAnalyzer-PSUM7-0-32]: Finished analyzing 73249 access patterns a mean/median 1.00463/1 intervals per access pattern and mean/median 4.4938/4.5833 intersections per interval. +2024-06-01T06:05:58Z INFO 2403803 [AntiDependencyAnalyzer-PSUM6-0-32]: Finished analyzing 71247 access patterns a mean/median 1.00413/1 intervals per access pattern and mean/median 4.75241/5.94147 intersections per interval. +2024-06-01T06:05:58Z INFO 2403803 [AntiDependencyAnalyzer-PSUM4-0-32]: Finished analyzing 70564 access patterns a mean/median 1.00421/1 intervals per access pattern and mean/median 4.52122/5.02716 intersections per interval. +2024-06-01T06:05:58Z INFO 2403803 [AntiDependencyAnalyzer-PSUM7-64-128]: Finished analyzing 72518 access patterns a mean/median 1.00467/1 intervals per access pattern and mean/median 4.46349/4.47278 intersections per interval. +2024-06-01T06:05:58Z INFO 2403803 [AntiDependencyAnalyzer-PSUM4-64-128]: Finished analyzing 69668 access patterns a mean/median 1.00426/1 intervals per access pattern and mean/median 4.4879/4.92501 intersections per interval. 
+2024-06-01T06:05:58Z INFO 2403803 [AntiDependencyAnalyzer-PSUM4-32-64]: Finished analyzing 69673 access patterns a mean/median 1.00426/1 intervals per access pattern and mean/median 4.48773/4.92454 intersections per interval. +2024-06-01T06:05:58Z INFO 2403803 [AntiDependencyAnalyzer-PSUM7-32-64]: Finished analyzing 72523 access patterns a mean/median 1.00467/1 intervals per access pattern and mean/median 4.46333/4.47282 intersections per interval. +2024-06-01T06:05:58Z INFO 2403803 [AntiDependencyAnalyzer-PSUM0-64-128]: Finished analyzing 69767 access patterns a mean/median 1.00396/1 intervals per access pattern and mean/median 4.66053/5.42445 intersections per interval. +2024-06-01T06:05:58Z INFO 2403803 [AntiDependencyAnalyzer-PSUM5-32-128]: Finished analyzing 72633 access patterns a mean/median 1.00483/1 intervals per access pattern and mean/median 4.39118/4.24928 intersections per interval. +2024-06-01T06:05:58Z INFO 2403803 [AntiDependencyAnalyzer-PSUM3-0-32]: Finished analyzing 73584 access patterns a mean/median 1.00432/1 intervals per access pattern and mean/median 4.5717/4.94711 intersections per interval. +2024-06-01T06:05:58Z INFO 2403803 [AntiDependencyAnalyzer-PSUM2-0-32]: Finished analyzing 71675 access patterns a mean/median 1.00427/1 intervals per access pattern and mean/median 4.76848/6.01744 intersections per interval. +2024-06-01T06:05:58Z INFO 2403803 [AntiDependencyAnalyzer-PSUM3-64-128]: Finished analyzing 73117 access patterns a mean/median 1.00435/1 intervals per access pattern and mean/median 4.55753/4.90729 intersections per interval. +2024-06-01T06:05:58Z INFO 2403803 [AntiDependencyAnalyzer-PSUM5-0-32]: Finished analyzing 73029 access patterns a mean/median 1.00481/1 intervals per access pattern and mean/median 4.40831/4.32458 intersections per interval. 
+2024-06-01T06:05:58Z INFO 2403803 [AntiDependencyAnalyzer-PSUM3-32-64]: Finished analyzing 73122 access patterns a mean/median 1.00435/1 intervals per access pattern and mean/median 4.55737/4.90732 intersections per interval. +2024-06-01T06:05:58Z INFO 2403803 [AntiDependencyAnalyzer-PSUM2-64-128]: Finished analyzing 71043 access patterns a mean/median 1.00431/1 intervals per access pattern and mean/median 4.74671/5.94614 intersections per interval. +2024-06-01T06:05:58Z INFO 2403803 [AntiDependencyAnalyzer-PSUM2-32-64]: Finished analyzing 71048 access patterns a mean/median 1.00431/1 intervals per access pattern and mean/median 4.74653/5.94564 intersections per interval. +2024-06-01T06:05:58Z INFO 2403803 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 217043 memory location(s), 1 block(s), and 698903 instruction(s). Max writers: 576 Max Readers: 184036 +2024-06-01T06:05:58Z USER 2403803 (sg02) [ModuleForkPass]: Running bir_racecheck +2024-06-01T06:05:58Z INFO 2403803 (sg02) [ModuleForkPass]: Inputs to bir_racecheck: modules=1 functions=1 allocs=217043 blocks=1 instructions=698903 Max writers: 576 Max Readers: 184036 +2024-06-01T06:05:59Z INFO 2403803 [AntiDependencyAnalyzer-DRAM]: Finished analyzing 13798 access patterns a mean/median 87.3006/127.97 intervals per access pattern and mean/median 1.98368/0.999997 intersections per interval. +2024-06-01T06:06:00Z INFO 2403803 [AntiDependencyAnalyzer-SB-0-1]: Finished analyzing 1305818 access patterns a mean/median 1.07406/1 intervals per access pattern and mean/median 2.30288/0.999997 intersections per interval. +2024-06-01T06:06:00Z INFO 2403803 [AntiDependencyAnalyzer-SB-96-97]: Finished analyzing 1293687 access patterns a mean/median 1.07475/1 intervals per access pattern and mean/median 2.32316/1 intersections per interval. 
+2024-06-01T06:06:00Z INFO 2403803 [AntiDependencyAnalyzer-SB-1-64]: Finished analyzing 1305338 access patterns a mean/median 1.07408/1 intervals per access pattern and mean/median 2.30352/1 intersections per interval. +2024-06-01T06:06:00Z INFO 2403803 [AntiDependencyAnalyzer-SB-64-96]: Finished analyzing 1293367 access patterns a mean/median 1.07477/1 intervals per access pattern and mean/median 2.32361/1.00001 intersections per interval. +2024-06-01T06:06:00Z INFO 2403803 [AntiDependencyAnalyzer-SB-97-113]: Finished analyzing 1293367 access patterns a mean/median 1.07477/1 intervals per access pattern and mean/median 2.32361/1.00001 intersections per interval. +2024-06-01T06:06:00Z INFO 2403803 [AntiDependencyAnalyzer-SB-113-128]: Finished analyzing 1266523 access patterns a mean/median 1.07635/1 intervals per access pattern and mean/median 2.33589/0.999996 intersections per interval. +2024-06-01T06:06:01Z USER 2403803 (sg01) [ModuleForkPass]: anti_dependency_analyzer finished after 6.512 seconds +2024-06-01T06:06:01Z INFO 2403803 (sg01) [ModuleForkPass]: curr_vmrss: 14430mb, ru_maxrss: 14430mb (delta=92mb) +2024-06-01T06:06:01Z INFO 2403803 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 217043 memory location(s), 1 block(s), and 698903 instruction(s). Max writers: 576 Max Readers: 184036 +2024-06-01T06:06:01Z USER 2403803 (sg01) [ModuleForkPass]: Running dep_opt +2024-06-01T06:06:01Z INFO 2403803 (sg01) [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=217043 blocks=1 instructions=698903 Max writers: 576 Max Readers: 184036 +2024-06-01T06:06:01Z INFO 2403803 (sg01) [build_flow_deps]: Start build fdeps. 
Invocation: 12Sat Jun 1 06:06:01 2024 +2024-06-01T06:06:02Z INFO 2403803 (sg01) [build_flow_deps]: Allocs: 217043 instructions: 698903 +2024-06-01T06:06:07Z INFO 2403803 (sg01) [build_flow_deps]: Build fdeps inserted 1943323 edges +2024-06-01T06:06:07Z INFO 2403803 (sg01) [build_flow_deps]: Done build fdeps 1943323 Sat Jun 1 06:06:07 2024 +2024-06-01T06:06:09Z USER 2403803 (sg01) [ModuleForkPass]: dep_opt finished after 7.695 seconds +2024-06-01T06:06:09Z INFO 2403803 (sg01) [ModuleForkPass]: curr_vmrss: 14858mb, ru_maxrss: 14858mb (delta=428mb) +2024-06-01T06:06:09Z INFO 2403803 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 217043 memory location(s), 1 block(s), and 698903 instruction(s). Max writers: 576 Max Readers: 184036 +2024-06-01T06:06:09Z USER 2403803 (sg01) [ModuleForkPass]: Running report_stats +2024-06-01T06:06:09Z INFO 2403803 (sg01) [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=217043 blocks=1 instructions=698903 Max writers: 576 Max Readers: 184036 +2024-06-01T06:06:09Z INFO 2403803 (sg01) [ReportStats]: Data Movement Statistics: sg0001 +┌──────────────┬────────────────────────────────────┬───────┬────────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├──────────────┼────────────────────────────────────┼───────┼────────────┤ +│ DMACopy │ ExternalInputParameter -> Internal │ 2 │ 32768 │ +│ DMACopy │ Input -> Internal │ 1 │ 900071424 │ +│ DMACopy │ Internal -> Output │ 2 │ 600047616 │ +│ Load │ Const -> Internal │ 81 │ 1343488 │ +│ Load │ ExternalInputParameter -> Internal │ 16649 │ 7471169536 │ +│ Load │ Input -> Internal │ 663 │ 84418320 │ +│ Load │ Internal │ 8889 │ 1888166912 │ +│ Save │ Internal │ 2607 │ 1056604160 │ +│ Save │ Internal -> ExternalOutput │ 108 │ 37502976 │ +│ Save │ Internal -> Output │ 576 │ 300023808 │ +│ Save (Spill) │ Internal │ 2291 │ 512458752 │ +└──────────────┴────────────────────────────────────┴───────┴────────────┘ + +2024-06-01T06:06:09Z INFO 2403803 (sg01) [ReportStats]: 
+┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 113 │ 16 │ +│ 128 │ 388 │ +│ 256 │ 1 │ +│ 482 │ 1152 │ +│ 488 │ 253 │ +│ 512 │ 852 │ +│ 904 │ 32 │ +│ 1024 │ 7277 │ +│ 1137 │ 8 │ +│ 1152 │ 8 │ +│ 1265 │ 59 │ +│ 1928 │ 128 │ +│ 2048 │ 211 │ +│ 2289 │ 205 │ +│ 3584 │ 15360 │ +│ 4096 │ 5916 │ +│ 150011904 │ 4 │ +│ 300023808 │ 3 │ +└─────────────────────┴───────┘ + +2024-06-01T06:06:09Z INFO 2403803 (sg01) [ReportStats]: MM Stats: #MatMults 474564 #MatMult-Transposes 189220 +2024-06-01T06:06:09Z USER 2403803 (sg01) [ModuleForkPass]: report_stats finished after 0.593 seconds +2024-06-01T06:06:09Z INFO 2403803 (sg01) [ModuleForkPass]: curr_vmrss: 14841mb, ru_maxrss: 14858mb (delta=0mb) +2024-06-01T06:06:10Z INFO 2403803 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 217043 memory location(s), 1 block(s), and 698903 instruction(s). Max writers: 576 Max Readers: 184036 +2024-06-01T06:06:10Z USER 2403803 (sg01) [ModuleForkPass]: Running assign_trigger_engine +2024-06-01T06:06:10Z INFO 2403803 (sg01) [ModuleForkPass]: Inputs to assign_trigger_engine: modules=1 functions=1 allocs=217043 blocks=1 instructions=698903 Max writers: 576 Max Readers: 184036 +2024-06-01T06:06:10Z USER 2403803 (sg02) [ModuleForkPass]: bir_racecheck finished after 12.065 seconds +2024-06-01T06:06:10Z INFO 2403803 (sg02) [ModuleForkPass]: curr_vmrss: 14367mb, ru_maxrss: 14858mb (delta=520mb) +2024-06-01T06:06:10Z INFO 2403803 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 217043 memory location(s), 1 block(s), and 698903 instruction(s). 
Max writers: 576 Max Readers: 184036 +2024-06-01T06:06:10Z INFO 2403803 (sg01) [AssignTriggerEngine]: Assigned trigger engine for 5476 DMA instructions +2024-06-01T06:06:10Z USER 2403803 (sg01) [ModuleForkPass]: assign_trigger_engine finished after 0.889 seconds +2024-06-01T06:06:10Z INFO 2403803 (sg01) [ModuleForkPass]: curr_vmrss: 14044mb, ru_maxrss: 14858mb (delta=0mb) +2024-06-01T06:06:11Z INFO 2403803 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 217043 memory location(s), 1 block(s), and 698903 instruction(s). Max writers: 576 Max Readers: 184036 +2024-06-01T06:06:11Z USER 2403803 (sg01) [ModuleForkPass]: Running alloc_queues +2024-06-01T06:06:11Z INFO 2403803 (sg01) [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=217043 blocks=1 instructions=698903 Max writers: 576 Max Readers: 184036 +2024-06-01T06:06:11Z USER 2403803 (sg01) [ModuleForkPass]: alloc_queues finished after 0.173 seconds +2024-06-01T06:06:11Z INFO 2403803 (sg01) [ModuleForkPass]: curr_vmrss: 13791mb, ru_maxrss: 14858mb (delta=0mb) +2024-06-01T06:06:11Z INFO 2403803 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 217043 memory location(s), 1 block(s), and 698903 instruction(s). Max writers: 576 Max Readers: 184036 +2024-06-01T06:06:11Z USER 2403803 (sg01) [ModuleForkPass]: Running dep_reduction +2024-06-01T06:06:11Z INFO 2403803 (sg01) [ModuleForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=217043 blocks=1 instructions=698903 Max writers: 576 Max Readers: 184036 +2024-06-01T06:06:11Z INFO 2403803 (sg01) [DepReduction]: Start Dependency Reduction +2024-06-01T06:06:12Z INFO 2403803 (sg01) [DepReduction]: Processing async instrs... +2024-06-01T06:06:12Z INFO 2403803 (sg01) [DepReduction]: Processing secondary edges per engine... +2024-06-01T06:06:13Z INFO 2403803 (sg01) [DepReduction]: Processing secondary edges per engine, Done. 
Num edges removed 560853 +2024-06-01T06:06:13Z INFO 2403803 (sg01) [DepReduction]: Processing redundant descendants... +2024-06-01T06:06:16Z INFO 2403803 (sg01) [DepReduction]: Processing redundant descendants, Done. Num edges removed 39590 +2024-06-01T06:06:16Z INFO 2403803 (sg01) [DepReduction]: Processing async instrs, Done. Num edges removed 600443 +2024-06-01T06:06:23Z INFO 2403803 (sg01) [DepReduction]: Num Async removed: 0 +2024-06-01T06:06:23Z INFO 2403803 (sg01) [DepReduction]: Finished dependency reduction: 4245596 removed, new total 266838 +2024-06-01T06:06:23Z INFO 2403803 (sg01) [DepReduction]: Finished Dependency Reduction +2024-06-01T06:06:23Z USER 2403803 (sg01) [ModuleForkPass]: dep_reduction finished after 12.196 seconds +2024-06-01T06:06:23Z INFO 2403803 (sg01) [ModuleForkPass]: curr_vmrss: 13718mb, ru_maxrss: 14858mb (delta=0mb) +2024-06-01T06:06:23Z INFO 2403803 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 217043 memory location(s), 1 block(s), and 698903 instruction(s). Max writers: 576 Max Readers: 184036 +2024-06-01T06:06:23Z USER 2403803 (sg01) [ModuleForkPass]: Running bir_racecheck +2024-06-01T06:06:23Z INFO 2403803 (sg01) [ModuleForkPass]: Inputs to bir_racecheck: modules=1 functions=1 allocs=217043 blocks=1 instructions=698903 Max writers: 576 Max Readers: 184036 +2024-06-01T06:06:34Z USER 2403803 (sg01) [ModuleForkPass]: bir_racecheck finished after 10.273 seconds +2024-06-01T06:06:34Z INFO 2403803 (sg01) [ModuleForkPass]: curr_vmrss: 14525mb, ru_maxrss: 14858mb (delta=0mb) +2024-06-01T06:06:34Z INFO 2403803 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 217043 memory location(s), 1 block(s), and 698903 instruction(s). 
Max writers: 576 Max Readers: 184036 +2024-06-01T06:06:34Z USER 2403803 [ModuleForkPass]: Compilation status: Total modules: 4, Passed: 4, Failed: 0 +2024-06-01T06:06:34Z USER 2403803 [BackendDriver]: mod_parallel_pass finished after 353.154 seconds +2024-06-01T06:06:34Z INFO 2403803 [BackendDriver]: curr_vmrss: 13693mb, ru_maxrss: 14858mb (delta=13857mb) +2024-06-01T06:06:35Z INFO 2403803 [BackendDriver]: Output has 4 module(s), 4 function(s), 648505 memory location(s), 4 block(s), and 2092354 instruction(s). Max writers: 5184 Max Readers: 184036 +2024-06-01T06:06:35Z USER 2403803 [BackendDriver]: Running bir_linker +2024-06-01T06:06:35Z INFO 2403803 [BackendDriver]: Inputs to bir_linker: modules=4 functions=4 allocs=648505 blocks=4 instructions=2092354 Max writers: 5184 Max Readers: 184036 +2024-06-01T06:06:35Z INFO 2403803 (sgLnk) [BirLinker]: bir_linker cwd: "/root/llava_mistral_0531/app/neuronxcc-42xre48h" +2024-06-01T06:06:35Z INFO 2403803 (sgLnk) [BirLinker]: Num intermediates 66 +2024-06-01T06:06:35Z INFO 2403803 (sgLnk) [BirLinker]: Num Module Definitions 4 +2024-06-01T06:06:35Z INFO 2403803 (sgLnk) [BirLinker]: Linking to a call-graph structure +2024-06-01T06:06:38Z INFO 2403803 (sgLnk) [BirLinker]: Added a new SpillReload Que qPoolPIOParam0 +2024-06-01T06:07:02Z INFO 2403803 (sgLnk) [BirLinker]: tensor_map verification successful. 
+2024-06-01T06:07:02Z INFO 2403803 (sgLnk) [BirLinker]: Writing updated tensor_map sgLnk/sg00/tensor_map.json +2024-06-01T06:07:02Z INFO 2403803 (sgLnk) [BirLinker]: bir_linker chdir to "/root/llava_mistral_0531/app/neuronxcc-42xre48h/sgLnk/sg00" +2024-06-01T06:07:04Z INFO 2403803 (sgLnk) [BirLinker]: PostLink Stats: #MatMults 15178885 #MatMult-Transposes 6045824 +2024-06-01T06:07:04Z INFO 2403803 (sgLnk) [BirLinker]: Total Intermediate MMTs 447552 #out: 285732 #inp: 161820 #symmetric: 0 +2024-06-01T06:07:04Z INFO 2403803 (sgLnk) [BirLinker]: Total Intermediate IOs with MMTs: 36 #out: 33 #inp: 3 #both: 0 +2024-06-01T06:07:04Z INFO 2403803 (sgLnk) [BirLinker]: releasing pre-link modules +2024-06-01T06:07:29Z INFO 2403803 (sgLnk) [BirLinker]: linking Done. +2024-06-01T06:07:29Z USER 2403803 [BackendDriver]: bir_linker finished after 53.858 seconds +2024-06-01T06:07:29Z INFO 2403803 [BackendDriver]: curr_vmrss: 23095mb, ru_maxrss: 23104mb (delta=8246mb) +2024-06-01T06:07:30Z INFO 2403803 [BackendDriver]: Output has 1 module(s), 5 function(s), 648930 memory location(s), 5 block(s), and 2092406 instruction(s). Max writers: 5184 Max Readers: 184036 +2024-06-01T06:07:30Z USER 2403803 [BackendDriver]: Running postlnk_dma_report +2024-06-01T06:07:31Z INFO 2403803 [BackendDriver]: Inputs to postlnk_dma_report: modules=1 functions=5 allocs=648930 blocks=5 instructions=2092406 Max writers: 5184 Max Readers: 184036 +2024-06-01T06:07:31Z INFO 2403803 (sgLnk) [DMAReport]: DMA Report: Bytes loaded or saved 33216928348, 69.0878% input load, 2.40007% output write, 28.5121% spill/reload +2024-06-01T06:07:31Z USER 2403803 [BackendDriver]: postlnk_dma_report finished after 0.370 seconds +2024-06-01T06:07:31Z INFO 2403803 [BackendDriver]: curr_vmrss: 12325mb, ru_maxrss: 23104mb (delta=0mb) +2024-06-01T06:07:31Z INFO 2403803 [BackendDriver]: Output has 1 module(s), 5 function(s), 648930 memory location(s), 5 block(s), and 2092406 instruction(s). 
Max writers: 5184 Max Readers: 184036 +2024-06-01T06:07:31Z USER 2403803 [BackendDriver]: Running report_stats +2024-06-01T06:07:31Z INFO 2403803 [BackendDriver]: Inputs to report_stats: modules=1 functions=5 allocs=648930 blocks=5 instructions=2092406 Max writers: 5184 Max Readers: 184036 +2024-06-01T06:07:31Z INFO 2403803 (sgLnk) [ReportStats]: Data Movement Statistics: main +┌─────────────┬──────┬───────┬───────┐ +│ Instruction │ Kind │ Count │ Bytes │ +└─────────────┴──────┴───────┴───────┘ + +2024-06-01T06:07:31Z INFO 2403803 (sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +└─────────────────────┴───────┘ + +2024-06-01T06:07:31Z INFO 2403803 (sgLnk) [ReportStats]: Data Movement Statistics: sg0000 +┌──────────────┬────────────────────────────────────┬───────┬────────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├──────────────┼────────────────────────────────────┼───────┼────────────┤ +│ DMACopy │ ExternalInputParameter -> Internal │ 1 │ 16384 │ +│ DMACopy │ Internal -> Output │ 2 │ 600047616 │ +│ Load │ Const -> Internal │ 3 │ 49408 │ +│ Load │ ExternalInput -> Internal │ 26 │ 18440 │ +│ Load │ ExternalInputParameter -> Internal │ 1849 │ 722796544 │ +│ Load │ Internal │ 7433 │ 1019133952 │ +│ Save │ Internal │ 2055 │ 760119296 │ +│ Save │ Internal -> ExternalOutput │ 108 │ 37502976 │ +│ Save │ Internal -> Output │ 5220 │ 84418320 │ +│ Save (Spill) │ Internal │ 1625 │ 177087488 │ +└──────────────┴────────────────────────────────────┴───────┴────────────┘ + +2024-06-01T06:07:31Z INFO 2403803 (sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 8 │ 16 │ +│ 72 │ 17 │ +│ 113 │ 288 │ +│ 128 │ 4933 │ +│ 256 │ 2 │ +│ 482 │ 1152 │ +│ 488 │ 213 │ +│ 512 │ 916 │ +│ 760 │ 31 │ +│ 904 │ 32 │ +│ 964 │ 2 │ +│ 1024 │ 7360 │ +│ 1536 │ 2 │ +│ 1784 │ 27 │ +│ 1928 │ 128 │ +│ 2048 │ 224 │ +│ 3832 │ 16 │ +│ 4096 │ 2961 │ +│ 150011904 │ 4 │ 
+└─────────────────────┴───────┘ + +2024-06-01T06:07:32Z INFO 2403803 (sgLnk) [ReportStats]: Data Movement Statistics: sg0001 +┌──────────────┬────────────────────────────────────┬───────┬────────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├──────────────┼────────────────────────────────────┼───────┼────────────┤ +│ DMACopy │ ExternalInputParameter -> Internal │ 2 │ 32768 │ +│ DMACopy │ Input -> Internal │ 1 │ 900071424 │ +│ DMACopy │ Internal -> Output │ 2 │ 600047616 │ +│ Load │ Const -> Internal │ 81 │ 1343488 │ +│ Load │ ExternalInputParameter -> Internal │ 16649 │ 7471169536 │ +│ Load │ Input -> Internal │ 663 │ 84418320 │ +│ Load │ Internal │ 8889 │ 1888166912 │ +│ Save │ Internal │ 2607 │ 1056604160 │ +│ Save │ Internal -> ExternalOutput │ 108 │ 37502976 │ +│ Save │ Internal -> Output │ 576 │ 300023808 │ +│ Save (Spill) │ Internal │ 2291 │ 512458752 │ +└──────────────┴────────────────────────────────────┴───────┴────────────┘ + +2024-06-01T06:07:32Z INFO 2403803 (sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 113 │ 16 │ +│ 128 │ 388 │ +│ 256 │ 1 │ +│ 482 │ 1152 │ +│ 488 │ 253 │ +│ 512 │ 852 │ +│ 904 │ 32 │ +│ 1024 │ 7277 │ +│ 1137 │ 8 │ +│ 1152 │ 8 │ +│ 1265 │ 59 │ +│ 1928 │ 128 │ +│ 2048 │ 211 │ +│ 2289 │ 205 │ +│ 3584 │ 15360 │ +│ 4096 │ 5916 │ +│ 150011904 │ 4 │ +│ 300023808 │ 3 │ +└─────────────────────┴───────┘ + +2024-06-01T06:07:32Z INFO 2403803 (sgLnk) [ReportStats]: Data Movement Statistics: sg0002 +┌──────────────┬────────────────────────────────────┬───────┬────────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├──────────────┼────────────────────────────────────┼───────┼────────────┤ +│ DMACopy │ ExternalInputParameter -> Internal │ 2 │ 32768 │ +│ DMACopy │ Input -> Internal │ 1 │ 900071424 │ +│ DMACopy │ Internal -> Output │ 2 │ 600047616 │ +│ Load │ Const -> Internal │ 81 │ 1343488 │ +│ Load │ ExternalInputParameter -> Internal │ 16649 │ 7471169536 │ +│ Load │ Input 
-> Internal │ 663 │ 84418320 │ +│ Load │ Internal │ 8889 │ 1888166912 │ +│ Save │ Internal │ 2607 │ 1056604160 │ +│ Save │ Internal -> ExternalOutput │ 108 │ 37502976 │ +│ Save │ Internal -> Output │ 576 │ 300023808 │ +│ Save (Spill) │ Internal │ 2291 │ 512458752 │ +└──────────────┴────────────────────────────────────┴───────┴────────────┘ + +2024-06-01T06:07:32Z INFO 2403803 (sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 113 │ 16 │ +│ 128 │ 388 │ +│ 256 │ 1 │ +│ 482 │ 1152 │ +│ 488 │ 253 │ +│ 512 │ 852 │ +│ 904 │ 32 │ +│ 1024 │ 7277 │ +│ 1137 │ 8 │ +│ 1152 │ 8 │ +│ 1265 │ 59 │ +│ 1928 │ 128 │ +│ 2048 │ 211 │ +│ 2289 │ 205 │ +│ 3584 │ 15360 │ +│ 4096 │ 5916 │ +│ 150011904 │ 4 │ +│ 300023808 │ 3 │ +└─────────────────────┴───────┘ + +2024-06-01T06:07:32Z INFO 2403803 (sgLnk) [ReportStats]: Data Movement Statistics: sg0003 +┌─────────────┬────────────────────────────────────┬───────┬────────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────┼────────────────────────────────────┼───────┼────────────┤ +│ DMACopy │ ExternalInputParameter -> Internal │ 1 │ 16384 │ +│ DMACopy │ Input -> Internal │ 1 │ 900071424 │ +│ DMACopy │ Internal │ 2 │ 900071424 │ +│ Load │ Const -> Internal │ 1 │ 32768 │ +│ Load │ ExternalInput -> Internal │ 1 │ 4 │ +│ Load │ ExternalInputParameter -> Internal │ 15585 │ 7111991296 │ +│ Load │ Internal │ 576 │ 300023808 │ +│ Save │ Internal │ 576 │ 300023808 │ +│ Save │ Internal -> ExternalOutput │ 63 │ 256032 │ +└─────────────┴────────────────────────────────────┴───────┴────────────┘ + +2024-06-01T06:07:32Z INFO 2403803 (sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 4 │ 1 │ +│ 32 │ 63 │ +│ 128 │ 1 │ +│ 256 │ 1 │ +│ 512 │ 1 │ +│ 2178 │ 32 │ +│ 2304 │ 192 │ +│ 3584 │ 15360 │ +│ 4096 │ 1152 │ +│ 150011904 │ 6 │ +│ 300023808 │ 3 │ +└─────────────────────┴───────┘ + +2024-06-01T06:07:32Z 
INFO 2403803 (sgLnk) [ReportStats]: MM Stats: #MatMults 1416529 #MatMult-Transposes 558444 +2024-06-01T06:07:32Z USER 2403803 [BackendDriver]: report_stats finished after 1.582 seconds +2024-06-01T06:07:32Z INFO 2403803 [BackendDriver]: curr_vmrss: 12325mb, ru_maxrss: 23104mb (delta=0mb) +2024-06-01T06:07:32Z INFO 2403803 [BackendDriver]: Output has 1 module(s), 5 function(s), 648930 memory location(s), 5 block(s), and 2092406 instruction(s). Max writers: 5184 Max Readers: 184036 +2024-06-01T06:07:32Z USER 2403803 [BackendDriver]: Running coloring_allocator_dram_post_lnk +2024-06-01T06:07:33Z INFO 2403803 [BackendDriver]: Inputs to coloring_allocator_dram_post_lnk: modules=1 functions=5 allocs=648930 blocks=5 instructions=2092406 Max writers: 5184 Max Readers: 184036 +2024-06-01T06:07:33Z INFO 2403803 (sgLnk) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 28044242732 +2024-06-01T06:07:33Z INFO 2403803 (sgLnk) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 2820 bytes +2024-06-01T06:07:33Z INFO 2403803 (sgLnk) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 5172587312 +2024-06-01T06:07:33Z INFO 2403803 (sgLnk) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 2063 bytes +2024-06-01T06:07:33Z INFO 2403803 (sgLnk) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 98304 +2024-06-01T06:07:33Z INFO 2403803 (sgLnk) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 128 bytes +2024-06-01T06:07:33Z INFO 2403803 (sgLnk) [ColoringAllocator::Rep]: Allocating functions +2024-06-01T06:07:33Z INFO 2403803 (sgLnk) [ColoringAllocator::Rep]: linearize and check +2024-06-01T06:07:33Z INFO 2403803 (sgLnk) [DRAM_Allocator]: allocating spills in DRAM post_link mode +2024-06-01T06:07:33Z INFO 2403803 (sgLnk) [DRAM_Allocator]: reserved space = 5056654376 bytes +2024-06-01T06:07:33Z INFO 2403803 (sgLnk) [DRAM_Allocator]: spill space = 18985918224 bytes +2024-06-01T06:07:33Z INFO 2403803 (sgLnk) [DRAM_Allocator]: 
aligned spill space = 18985922560 bytes +2024-06-01T06:07:33Z INFO 2403803 (sgLnk) [DRAM_Allocator]: dram space = 107374182400 bytes +2024-06-01T06:07:33Z INFO 2403803 (sgLnk) [DRAM_Allocator]: renumber locations +2024-06-01T06:07:33Z INFO 2403803 (sgLnk) [DRAM_Allocator]: size = 66 +2024-06-01T06:07:33Z INFO 2403803 (sgLnk) [DRAM_Allocator]: find first defs +2024-06-01T06:07:33Z INFO 2403803 (sgLnk) [DRAM_Allocator]: Num intervals 66 Num locations 66 +2024-06-01T06:07:33Z INFO 2403803 (sgLnk) [DRAM_Allocator]: IntervalTree Build Done +2024-06-01T06:07:33Z INFO 2403803 (sgLnk) [DRAM_Allocator]: info.neighbors init Done +2024-06-01T06:07:33Z INFO 2403803 (sgLnk) [DRAM_Allocator]: IntervalTree readback Done +2024-06-01T06:07:33Z INFO 2403803 (sgLnk) [DRAM_Allocator]: simplify interference graph +2024-06-01T06:07:33Z INFO 2403803 (sgLnk) [DRAM_Allocator]: initialize low and high +2024-06-01T06:07:33Z INFO 2403803 (sgLnk) [DRAM_Allocator]: lo = 66 +2024-06-01T06:07:33Z INFO 2403803 (sgLnk) [DRAM_Allocator]: hi = 0 +2024-06-01T06:07:33Z INFO 2403803 (sgLnk) [DRAM_Allocator]: total = 66 +2024-06-01T06:07:33Z INFO 2403803 (sgLnk) [DRAM_Allocator]: simplify +2024-06-01T06:07:33Z INFO 2403803 (sgLnk) [DRAM_Allocator]: new candidates = 0 +2024-06-01T06:07:33Z INFO 2403803 (sgLnk) [DRAM_Allocator]: select ranges +2024-06-01T06:07:33Z INFO 2403803 (sgLnk) [DRAM_Allocator]: All sub-graph DRAM hwm: 1373765632 +2024-06-01T06:07:33Z INFO 2403803 (sgLnk) [DRAM_Allocator]: CC buffer size limit 524288000 +2024-06-01T06:07:33Z INFO 2403803 (sgLnk) [DRAM_Allocator]: allreduce_dram_hwm 1373765632 +2024-06-01T06:07:33Z INFO 2403803 (sgLnk) [DRAM_Allocator]: Real CC buffer size 1373765632 +2024-06-01T06:07:33Z INFO 2403803 (sgLnk) [DRAM_Allocator]: DRAM hwm after allocation: 3521249280 +2024-06-01T06:07:33Z INFO 2403803 (sgLnk) [DRAM_Allocator]: DRAM allocation successful +2024-06-01T06:07:33Z INFO 2403803 (sgLnk) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 28044242732 
+2024-06-01T06:07:33Z INFO 2403803 (sgLnk) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 2820 bytes +2024-06-01T06:07:34Z INFO 2403803 (sgLnk) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 5172587312 +2024-06-01T06:07:34Z INFO 2403803 (sgLnk) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 2063 bytes +2024-06-01T06:07:34Z INFO 2403803 (sgLnk) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 98304 +2024-06-01T06:07:34Z INFO 2403803 (sgLnk) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 128 bytes +2024-06-01T06:07:35Z INFO 2403803 (sgLnk) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 28044242732 +2024-06-01T06:07:35Z INFO 2403803 (sgLnk) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 2820 bytes +2024-06-01T06:07:36Z INFO 2403803 (sgLnk) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 5172587312 +2024-06-01T06:07:36Z INFO 2403803 (sgLnk) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 2063 bytes +2024-06-01T06:07:37Z INFO 2403803 (sgLnk) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 98304 +2024-06-01T06:07:37Z INFO 2403803 (sgLnk) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 128 bytes +2024-06-01T06:07:37Z INFO 2403803 (sgLnk) [ColoringAllocator::Rep]: Allocating functions +2024-06-01T06:07:37Z INFO 2403803 (sgLnk) [ColoringAllocator::Rep]: linearize and check +2024-06-01T06:07:38Z INFO 2403803 (sgLnk) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 28044242732 +2024-06-01T06:07:38Z INFO 2403803 (sgLnk) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 2820 bytes +2024-06-01T06:07:38Z INFO 2403803 (sgLnk) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 5172587312 +2024-06-01T06:07:38Z INFO 2403803 (sgLnk) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 2063 bytes +2024-06-01T06:07:39Z INFO 2403803 (sgLnk) [ColoringAllocator::Rep]: INFO: Post GCA 
DRAM bytes DMACopyed 98304 +2024-06-01T06:07:39Z INFO 2403803 (sgLnk) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 128 bytes +2024-06-01T06:07:39Z INFO 2403803 (sgLnk) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 28044242732 +2024-06-01T06:07:39Z INFO 2403803 (sgLnk) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 2820 bytes +2024-06-01T06:07:39Z INFO 2403803 (sgLnk) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 5172587312 +2024-06-01T06:07:39Z INFO 2403803 (sgLnk) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 2063 bytes +2024-06-01T06:07:39Z INFO 2403803 (sgLnk) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 98304 +2024-06-01T06:07:39Z INFO 2403803 (sgLnk) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 128 bytes +2024-06-01T06:07:39Z INFO 2403803 (sgLnk) [ColoringAllocator::Rep]: Allocating functions +2024-06-01T06:07:39Z INFO 2403803 (sgLnk) [ColoringAllocator::Rep]: linearize and check +2024-06-01T06:07:42Z INFO 2403803 (sgLnk) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 28044242732 +2024-06-01T06:07:42Z INFO 2403803 (sgLnk) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 2820 bytes +2024-06-01T06:07:43Z INFO 2403803 (sgLnk) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 5172587312 +2024-06-01T06:07:43Z INFO 2403803 (sgLnk) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 2063 bytes +2024-06-01T06:07:43Z INFO 2403803 (sgLnk) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 98304 +2024-06-01T06:07:43Z INFO 2403803 (sgLnk) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 128 bytes +2024-06-01T06:07:44Z INFO 2403803 (sgLnk) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 28044242732 +2024-06-01T06:07:44Z INFO 2403803 (sgLnk) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 2820 bytes +2024-06-01T06:07:44Z INFO 2403803 (sgLnk) 
[ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 5172587312 +2024-06-01T06:07:44Z INFO 2403803 (sgLnk) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 2063 bytes +2024-06-01T06:07:44Z INFO 2403803 (sgLnk) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 98304 +2024-06-01T06:07:44Z INFO 2403803 (sgLnk) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 128 bytes +2024-06-01T06:07:44Z INFO 2403803 (sgLnk) [ColoringAllocator::Rep]: Allocating functions +2024-06-01T06:07:44Z INFO 2403803 (sgLnk) [ColoringAllocator::Rep]: linearize and check +2024-06-01T06:07:46Z INFO 2403803 (sgLnk) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 28044242732 +2024-06-01T06:07:46Z INFO 2403803 (sgLnk) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 2820 bytes +2024-06-01T06:07:47Z INFO 2403803 (sgLnk) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 5172587312 +2024-06-01T06:07:47Z INFO 2403803 (sgLnk) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 2063 bytes +2024-06-01T06:07:47Z INFO 2403803 (sgLnk) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 98304 +2024-06-01T06:07:47Z INFO 2403803 (sgLnk) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 128 bytes +2024-06-01T06:07:48Z INFO 2403803 (sgLnk) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 28044242732 +2024-06-01T06:07:48Z INFO 2403803 (sgLnk) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 2820 bytes +2024-06-01T06:07:48Z INFO 2403803 (sgLnk) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 5172587312 +2024-06-01T06:07:48Z INFO 2403803 (sgLnk) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 2063 bytes +2024-06-01T06:07:49Z INFO 2403803 (sgLnk) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 98304 +2024-06-01T06:07:49Z INFO 2403803 (sgLnk) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 128 bytes 
+2024-06-01T06:07:49Z INFO 2403803 (sgLnk) [ColoringAllocator::Rep]: Allocating functions +2024-06-01T06:07:49Z INFO 2403803 (sgLnk) [ColoringAllocator::Rep]: linearize and check +2024-06-01T06:07:50Z INFO 2403803 (sgLnk) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 28044242732 +2024-06-01T06:07:50Z INFO 2403803 (sgLnk) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 2820 bytes +2024-06-01T06:07:50Z INFO 2403803 (sgLnk) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 5172587312 +2024-06-01T06:07:50Z INFO 2403803 (sgLnk) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 2063 bytes +2024-06-01T06:07:51Z INFO 2403803 (sgLnk) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 98304 +2024-06-01T06:07:51Z INFO 2403803 (sgLnk) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 128 bytes +2024-06-01T06:07:51Z USER 2403803 [BackendDriver]: coloring_allocator_dram_post_lnk finished after 18.146 seconds +2024-06-01T06:07:51Z INFO 2403803 [BackendDriver]: curr_vmrss: 12331mb, ru_maxrss: 23104mb (delta=0mb) +2024-06-01T06:07:51Z INFO 2403803 [BackendDriver]: Output has 1 module(s), 5 function(s), 648930 memory location(s), 5 block(s), and 2092406 instruction(s). Max writers: 5184 Max Readers: 184036 +2024-06-01T06:07:51Z USER 2403803 [BackendDriver]: Running lower_dma +2024-06-01T06:07:51Z INFO 2403803 [BackendDriver]: Inputs to lower_dma: modules=1 functions=5 allocs=648930 blocks=5 instructions=2092406 Max writers: 5184 Max Readers: 184036 +2024-06-01T06:08:38Z USER 2403803 [BackendDriver]: lower_dma finished after 47.585 seconds +2024-06-01T06:08:38Z INFO 2403803 [BackendDriver]: curr_vmrss: 15702mb, ru_maxrss: 23104mb (delta=0mb) +2024-06-01T06:08:38Z INFO 2403803 [BackendDriver]: Output has 1 module(s), 5 function(s), 648931 memory location(s), 5 block(s), and 2092421 instruction(s). 
Max writers: 5184 Max Readers: 184036 +2024-06-01T06:08:38Z USER 2403803 [BackendDriver]: Running alloc_semaphores +2024-06-01T06:08:39Z INFO 2403803 [BackendDriver]: Inputs to alloc_semaphores: modules=1 functions=5 allocs=648931 blocks=5 instructions=2092421 Max writers: 5184 Max Readers: 184036 +2024-06-01T06:08:41Z USER 2403803 [BackendDriver]: alloc_semaphores finished after 2.759 seconds +2024-06-01T06:08:41Z INFO 2403803 [BackendDriver]: curr_vmrss: 15727mb, ru_maxrss: 23104mb (delta=0mb) +2024-06-01T06:08:41Z INFO 2403803 [BackendDriver]: Output has 1 module(s), 5 function(s), 648931 memory location(s), 5 block(s), and 2092421 instruction(s). Max writers: 5184 Max Readers: 184036 +2024-06-01T06:08:41Z USER 2403803 [BackendDriver]: Running expand_inst_late +2024-06-01T06:08:41Z INFO 2403803 [BackendDriver]: Inputs to expand_inst_late: modules=1 functions=5 allocs=648931 blocks=5 instructions=2092421 Max writers: 5184 Max Readers: 184036 +2024-06-01T06:08:42Z USER 2403803 [BackendDriver]: expand_inst_late finished after 0.564 seconds +2024-06-01T06:08:42Z INFO 2403803 [BackendDriver]: curr_vmrss: 15727mb, ru_maxrss: 23104mb (delta=0mb) +2024-06-01T06:08:42Z INFO 2403803 [BackendDriver]: Output has 1 module(s), 5 function(s), 648931 memory location(s), 5 block(s), and 2092421 instruction(s). Max writers: 5184 Max Readers: 184036 +2024-06-01T06:08:42Z USER 2403803 [BackendDriver]: Running lower_sync +2024-06-01T06:08:42Z INFO 2403803 [BackendDriver]: Inputs to lower_sync: modules=1 functions=5 allocs=648931 blocks=5 instructions=2092421 Max writers: 5184 Max Readers: 184036 +2024-06-01T06:08:44Z USER 2403803 [BackendDriver]: lower_sync finished after 1.859 seconds +2024-06-01T06:08:44Z INFO 2403803 [BackendDriver]: curr_vmrss: 16099mb, ru_maxrss: 23104mb (delta=0mb) +2024-06-01T06:08:44Z INFO 2403803 [BackendDriver]: Output has 1 module(s), 5 function(s), 648931 memory location(s), 5 block(s), and 2212919 instruction(s). 
Max writers: 5184 Max Readers: 184036 +2024-06-01T06:08:44Z USER 2403803 [BackendDriver]: Running lower_act +2024-06-01T06:08:45Z INFO 2403803 [BackendDriver]: Inputs to lower_act: modules=1 functions=5 allocs=648931 blocks=5 instructions=2212919 Max writers: 5184 Max Readers: 184036 +2024-06-01T06:08:46Z USER 2403803 [BackendDriver]: lower_act finished after 1.778 seconds +2024-06-01T06:08:46Z INFO 2403803 [BackendDriver]: curr_vmrss: 16102mb, ru_maxrss: 23104mb (delta=0mb) +2024-06-01T06:08:46Z INFO 2403803 [BackendDriver]: Output has 1 module(s), 5 function(s), 648931 memory location(s), 5 block(s), and 2213407 instruction(s). Max writers: 5184 Max Readers: 184036 +2024-06-01T06:08:46Z USER 2403803 [BackendDriver]: Running lower_dve +2024-06-01T06:08:47Z INFO 2403803 [BackendDriver]: Inputs to lower_dve: modules=1 functions=5 allocs=648931 blocks=5 instructions=2213407 Max writers: 5184 Max Readers: 184036 +2024-06-01T06:08:47Z INFO 2403803 (sgLnk) [LowerDVE]: Loading DVE opcodes table dve_info.json from /root/anaconda3/envs/masp_fastapi/lib/python3.9/site-packages/neuronxcc/dve/dve_bin/dve_info.json +2024-06-01T06:08:52Z USER 2403803 [BackendDriver]: lower_dve finished after 5.950 seconds +2024-06-01T06:08:52Z INFO 2403803 [BackendDriver]: curr_vmrss: 16340mb, ru_maxrss: 23104mb (delta=0mb) +2024-06-01T06:08:53Z INFO 2403803 [BackendDriver]: Output has 1 module(s), 5 function(s), 648931 memory location(s), 5 block(s), and 2213407 instruction(s). 
Max writers: 5184 Max Readers: 184036 +2024-06-01T06:08:53Z USER 2403803 [BackendDriver]: Running lower_ap +2024-06-01T06:08:53Z INFO 2403803 [BackendDriver]: Inputs to lower_ap: modules=1 functions=5 allocs=648931 blocks=5 instructions=2213407 Max writers: 5184 Max Readers: 184036 +2024-06-01T06:08:54Z USER 2403803 [BackendDriver]: lower_ap finished after 1.323 seconds +2024-06-01T06:08:54Z INFO 2403803 [BackendDriver]: curr_vmrss: 16340mb, ru_maxrss: 23104mb (delta=0mb) +2024-06-01T06:08:55Z INFO 2403803 [BackendDriver]: Output has 1 module(s), 5 function(s), 648931 memory location(s), 5 block(s), and 2213407 instruction(s). Max writers: 5184 Max Readers: 184036 +2024-06-01T06:08:55Z USER 2403803 [BackendDriver]: Running alloc_regs +2024-06-01T06:08:55Z INFO 2403803 [BackendDriver]: Inputs to alloc_regs: modules=1 functions=5 allocs=648931 blocks=5 instructions=2213407 Max writers: 5184 Max Readers: 184036 +2024-06-01T06:08:55Z INFO 2403803 (sgLnk) [AllocRegs]: allocating REG +2024-06-01T06:08:55Z INFO 2403803 (sgLnk) [AllocRegs]: main loop iteration 1 +2024-06-01T06:08:55Z INFO 2403803 (sgLnk) [AllocRegs]: allocating REG +2024-06-01T06:08:55Z INFO 2403803 (sgLnk) [AllocRegs]: main loop iteration 1 +2024-06-01T06:08:55Z INFO 2403803 (sgLnk) [AllocRegs]: allocating REG +2024-06-01T06:08:55Z INFO 2403803 (sgLnk) [AllocRegs]: main loop iteration 1 +2024-06-01T06:08:55Z INFO 2403803 (sgLnk) [AllocRegs]: allocating REG +2024-06-01T06:08:55Z INFO 2403803 (sgLnk) [AllocRegs]: main loop iteration 1 +2024-06-01T06:08:56Z INFO 2403803 (sgLnk) [AllocRegs]: allocating REG +2024-06-01T06:08:56Z INFO 2403803 (sgLnk) [AllocRegs]: main loop iteration 1 +2024-06-01T06:08:56Z USER 2403803 [BackendDriver]: alloc_regs finished after 0.733 seconds +2024-06-01T06:08:56Z INFO 2403803 [BackendDriver]: curr_vmrss: 16340mb, ru_maxrss: 23104mb (delta=0mb) +2024-06-01T06:08:56Z INFO 2403803 [BackendDriver]: Output has 1 module(s), 5 function(s), 648931 memory location(s), 5 block(s), and 
2213407 instruction(s). Max writers: 5184 Max Readers: 184036 +2024-06-01T06:08:56Z USER 2403803 [BackendDriver]: Running birverifier +2024-06-01T06:08:57Z INFO 2403803 [BackendDriver]: Inputs to birverifier: modules=1 functions=5 allocs=648931 blocks=5 instructions=2213407 Max writers: 5184 Max Readers: 184036 +2024-06-01T06:09:03Z USER 2403803 [BackendDriver]: birverifier finished after 7.182 seconds +2024-06-01T06:09:03Z INFO 2403803 [BackendDriver]: curr_vmrss: 16770mb, ru_maxrss: 23104mb (delta=0mb) +2024-06-01T06:09:04Z INFO 2403803 [BackendDriver]: Output has 1 module(s), 5 function(s), 648931 memory location(s), 5 block(s), and 2213407 instruction(s). Max writers: 5184 Max Readers: 184036 +2024-06-01T06:09:04Z USER 2403803 [BackendDriver]: Running codegen +2024-06-01T06:09:04Z INFO 2403803 [BackendDriver]: Inputs to codegen: modules=1 functions=5 allocs=648931 blocks=5 instructions=2213407 Max writers: 5184 Max Readers: 184036 +2024-06-01T06:09:05Z INFO 2403803 (sgLnk) [Codegen]: Total compiler allocated DRAM tensors: 3.27942 GB +2024-06-01T06:09:05Z INFO 2403803 (sgLnk) [Codegen]: Total un-allocated DRAM tensors by kind: +2024-06-01T06:09:05Z INFO 2403803 (sgLnk) [Codegen]: +┌────────────────────────┬─────────────┐ +│ TensorKind │ Size (GB) │ +├────────────────────────┼─────────────┤ +│ ExternalInput │ 8.59052e-06 │ +│ ExternalInputParameter │ 3.59145 │ +│ ExternalOutput │ 1.11791 │ +│ Const │ 0.000168085 │ +│ Pointer │ 7.45058e-09 │ +└────────────────────────┴─────────────┘ + +2024-06-01T06:09:05Z INFO 2403803 (sgLnk) [Codegen]: Total runtime managed DRAM tensors: 4.70954 GB +2024-06-01T06:09:17Z INFO 2403803 (sgLnk) [Codegen]: Instruction Stats: +2024-06-01T06:09:17Z INFO 2403803 (sgLnk) [Codegen]: +┌───────────────────────────┬─────────┐ +│ Opcode │ Count │ +├───────────────────────────┼─────────┤ +│ MATMUL │ 1416534 │ +│ LDWEIGHTS │ 1401658 │ +│ ACTIVATE │ 356734 │ +│ EVENT_SEMAPHORE │ 120498 │ +│ PSEUDO_DMA_TRIGGER │ 98867 │ +│ MEMSET │ 47919 │ +│ 
COPY_PREDICATED │ 34560 │ +│ UNKNOWN(0x8d) │ 34560 │ +│ TENSOR_TENSOR │ 27663 │ +│ COPY │ 27459 │ +│ UNKNOWN(0x8b) │ 13056 │ +│ UNKNOWN(0x24) │ 6912 │ +│ TENSOR_REDUCE │ 6912 │ +│ RECIPROCAL │ 6912 │ +│ UNKNOWN(0x8a) │ 6016 │ +│ CAST │ 3482 │ +│ UNKNOWN(0x8f) │ 1920 │ +│ TENSOR_SCALAR │ 1440 │ +│ TENSOR_SCALAR_ADDR │ 586 │ +│ PSEUDO_DMA_MEMCPY │ 512 │ +│ PSEUDO_BRANCH_LABEL │ 512 │ +│ UNKNOWN(0xd6) │ 512 │ +│ ACT_TABLE_LOAD │ 488 │ +│ UNKNOWN(0xd3) │ 165 │ +│ TENSOR_SCALAR │ 142 │ +│ STREAM_SHUFFLE │ 92 │ +│ LOAD_MASK_SELECT │ 92 │ +│ IOTA │ 69 │ +│ UNKNOWN(0xd2) │ 20 │ +│ NOP │ 15 │ +│ UNKNOWN(0xcf) │ 15 │ +│ PSEUDO_TRIGGER_COLLECTIVE │ 12 │ +└───────────────────────────┴─────────┘ + +2024-06-01T06:09:17Z INFO 2403803 (sgLnk) [Codegen]: +┌────────────┬─────────┐ +│ Engine │ Count │ +├────────────┼─────────┤ +│ Unassigned │ 0 │ +│ Pool │ 13220 │ +│ Activation │ 397395 │ +│ PE │ 2844928 │ +│ DMA │ 0 │ +│ DVE │ 231983 │ +│ SP │ 128833 │ +└────────────┴─────────┘ + +2024-06-01T06:09:17Z INFO 2403803 (sgLnk) [Codegen]: Total instructions: 3616359 (0.215552 GB) +2024-06-01T06:09:17Z INFO 2403803 (sgLnk) [Codegen]: Total Dynamic DMA instruction count: 0 +2024-06-01T06:09:17Z USER 2403803 (sgLnk) [Codegen]: isa_gen finished after 12.040 seconds +2024-06-01T06:09:25Z INFO 2403803 (sgLnk) [Codegen]: Number of DMA descriptors on each queue: +2024-06-01T06:09:25Z INFO 2403803 (sgLnk) [Codegen]: +┌───────────────────────────┬───────────┐ +│ Queue │ Count │ +├───────────────────────────┼───────────┤ +│ qActSpillReload0_defId_0 │ 629056 │ +│ qActSpillReload0_defId_1 │ 940928 │ +│ qActSpillReload0_defId_2 │ 940928 │ +│ qActSpillReload0_defId_3 │ 146496 │ +│ qDVESpillReload0_defId_0 │ 312960 │ +│ qDVESpillReload0_defId_1 │ 308992 │ +│ qDVESpillReload0_defId_2 │ 308992 │ +│ qPoolIO0 │ 1187970 │ +│ qPoolPIOParam0 │ 5868996 │ +│ qPoolSpillReload0_defId_0 │ 256 │ +│ qPoolSpillReload0_defId_1 │ 146496 │ +│ qPoolSpillReload0_defId_2 │ 146496 │ +│ qPoolSpillReload0_defId_3 │ 146496 │ +│ 
qSPIO0 │ 4687926 │ +│ qSPPIO0 │ 256 │ +│ qSPPIOParam0 │ 141592764 │ +│ qSPSpillReload0_defId_0 │ 1933332 │ +│ qSPSpillReload0_defId_1 │ 2176960 │ +│ qSPSpillReload0_defId_2 │ 2176960 │ +│ qSPSpillReload0_defId_3 │ 146752 │ +└───────────────────────────┴───────────┘ + +2024-06-01T06:09:25Z INFO 2403803 (sgLnk) [Codegen]: Total descriptors: 163800012 (2.44081 GB) +2024-06-01T06:09:28Z USER 2403803 (sgLnk) [Codegen]: dma_desc_gen finished after 10.677 seconds +2024-06-01T06:09:28Z INFO 2403803 (sgLnk) [Codegen]: Estimated peak DRAM usage: 10.6453 GB +2024-06-01T06:09:28Z INFO 2403803 (sgLnk) [Codegen]: Generating debug info +2024-06-01T06:09:39Z USER 2403803 (sgLnk) [Codegen]: debug_info_gen finished after 11.666 seconds +2024-06-01T06:09:43Z USER 2403803 [BackendDriver]: codegen finished after 38.862 seconds +2024-06-01T06:09:43Z INFO 2403803 [BackendDriver]: curr_vmrss: 20979mb, ru_maxrss: 23104mb (delta=0mb) +2024-06-01T06:09:44Z INFO 2403803 [BackendDriver]: Output has 1 module(s), 5 function(s), 648931 memory location(s), 5 block(s), and 2213407 instruction(s). Max writers: 5184 Max Readers: 184036 +2024-06-01T06:09:44Z USER 2403803 [BackendDriver]: Running neff_packager +2024-06-01T06:09:44Z INFO 2403803 [BackendDriver]: Inputs to neff_packager: modules=1 functions=5 allocs=648931 blocks=5 instructions=2213407 Max writers: 5184 Max Readers: 184036 +2024-06-01T06:09:49Z INFO 2403803 [NeffFileWriter]: IR signature: 3892b0b737d362bedc185c123929f432 for neff artifacts +2024-06-01T06:09:49Z USER 2403803 [BackendDriver]: neff_packager finished after 4.847 seconds +2024-06-01T06:09:49Z INFO 2403803 [BackendDriver]: curr_vmrss: 17208mb, ru_maxrss: 23104mb (delta=0mb) +2024-06-01T06:09:49Z INFO 2403803 [BackendDriver]: Output has 1 module(s), 5 function(s), 648931 memory location(s), 5 block(s), and 2213407 instruction(s). 
Max writers: 5184 Max Readers: 184036 +2024-06-01T06:10:14Z INFO 2398881 [job.WalrusDriver.0]: new_lnkState: {"model": ["/tmp/root/neuroncc_compile_workdir/6dc0ff72-9752-4e1f-8880-eae65c0e6f3a/model.MODULE_3143bb21695f957f3b75+2c9e451d.hlo.pb"], "tensormap": "tensor_map.json", "bir": "walrus_bir.out.json", "lorean_sg_key": null, "input_name_map": null, "output_name_map": null, "constant_tensors": null, "cached_wavegraph": "walrus_bir.out.json", "state_dir": "/root/llava_mistral_0531/app/neuronxcc-42xre48h/sgLnk/sg00", "state_id": "sg00"} +2024-06-01T06:10:14Z INFO 2398881 [job.WalrusDriver.0]: MTBackend: completed successfully. +2024-06-01T06:10:14Z INFO 2398881 [pipeline.Pipeline.0]: Finished job job.WalrusDriver.0 +2024-06-01T06:10:14Z INFO 2398881 [pipeline.Pipeline.0]: Starting job job.BIRLinker.0 +2024-06-01T06:10:14Z INFO 2398881 [job.BIRLinker.0]: Replay this job by calling: /root/anaconda3/envs/masp_fastapi/bin/neuronx-cc compile --framework XLA --state '{"model": ["/tmp/root/neuroncc_compile_workdir/6dc0ff72-9752-4e1f-8880-eae65c0e6f3a/model.MODULE_3143bb21695f957f3b75+2c9e451d.hlo.pb"], "tensormap": "tensor_map.json", "bir": "walrus_bir.out.json", "lorean_sg_key": null, "input_name_map": null, "output_name_map": null, "constant_tensors": null, "cached_wavegraph": "walrus_bir.out.json", "state_dir": "/root/llava_mistral_0531/app/neuronxcc-42xre48h/sgLnk/sg00", "state_id": "sg00"}' --pipeline BIRLinker +2024-06-01T06:10:14Z INFO 2398881 [job.BIRLinker.0]: BIRLinker cwd: /root/llava_mistral_0531/app/neuronxcc-42xre48h +2024-06-01T06:10:14Z INFO 2398881 [job.BIRLinker.0]: Linking already done. 
+2024-06-01T06:10:14Z INFO 2398881 [pipeline.Pipeline.0]: Finished job job.BIRLinker.0 +2024-06-01T06:10:14Z INFO 2398881 [pipeline.Pipeline.0]: Starting job job.Kelper.0 +2024-06-01T06:10:14Z INFO 2398881 [job.Kelper.0]: Skipping neff generation which was already performed by neff_packager +2024-06-01T06:10:14Z INFO 2398881 [pipeline.Pipeline.0]: Finished job job.Kelper.0 +2024-06-01T06:10:14Z INFO 2398881 [pipeline.Pipeline.0]: Finished pipeline Pipeline +2024-06-01T06:10:14Z INFO 2398881 [pipeline.Pipeline.0]: Job #0 finished +2024-06-01T06:10:14Z INFO 2398673 [root]: Subcommand returned with exitcode=0 diff --git a/app/main.py b/app/main.py new file mode 100644 index 0000000000000000000000000000000000000000..8b41a984a1493e0216008b8491d6e69ad2e7ce52 --- /dev/null +++ b/app/main.py @@ -0,0 +1,241 @@ +import time +import os +import base64 +from io import BytesIO +import concurrent.futures +import logging +import numpy as np +from PIL import Image +import torch +import torch.nn as nn +import torch_neuronx +import transformers +from transformers import AutoConfig, AutoTokenizer +from llava.constants import MM_TOKEN_INDEX, DEFAULT_VIDEO_TOKEN, DEFAULT_VIDEO_START_TOKEN, DEFAULT_VIDEO_END_TOKEN +from llava.conversation import conv_templates +from llava.model.utils import LayerNorm +from llava.mm_utils import tokenizer_image_token +from llava.model.multimodal_encoder.processor import Blip2ImageTrainProcessor + +from transformers_neuronx import MistralForSampling, NeuronConfig, constants + +from typing import Dict, Optional, Any, List +from fastapi import FastAPI, Request, HTTPException + +# Suppress transformers logging +transformers.logging.set_verbosity_error() + +NUM_SEGMENTS = 10 # Number of frame segments to use +WEIGHT_ROOT = '/root/llava_mistral_0531/inf2_dir_0531/' # Root directory for model weights + +CONFIG_DIR = os.path.join(WEIGHT_ROOT, "llava-mistral_videollava_092") # Tokenizer directory +NEURON_VISION_PATH = os.path.join(WEIGHT_ROOT, "inf2_weights", 
"neuron_eva_vit_batch1.pth") # Vision model weights (Neuron format) +NEURON_BERT_PATH = os.path.join(WEIGHT_ROOT, "inf2_weights", "neuron_bert.pth") # BERT model weights (Neuron format) +PROJECTOR_PATH = os.path.join(WEIGHT_ROOT, "inf2_weights", 'neuron_projector.pt') # Projector weights +EMBED_TOKEN_PATH = os.path.join(WEIGHT_ROOT, "inf2_weights", 'embed_tokens.pth') # Embedding weights +QUERY_TOKEN_PATH = os.path.join(WEIGHT_ROOT, "inf2_weights", 'query_tokens.pth') +LAYERNORM_SAVE_PATH = os.path.join(WEIGHT_ROOT, "inf2_weights", 'ln_state_dict.pth') +POSITION_ENCODING_SAVE_PATH = os.path.join(WEIGHT_ROOT, "inf2_weights", 'frame_position_encoding.pth') +COMPILED_MODEL_PATH = os.path.join(WEIGHT_ROOT, 'mistral-compiled-bf16-b16') + +os.environ['NEURON_CC_FLAGS'] = f"--enable-experimental-spmd --tensorizer-options='--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2'" + +class MistralModel: + def __init__(self, model_name): + self.neuron_config = NeuronConfig( + attention_layout=constants.LAYOUT_BSH, + collectives_layout=constants.LAYOUT_BSH + ) + self.model_name = model_name + self.amp = 'bf16' + self.batch_size = 16 + self.tp_degree = 4 + self.n_positions = [2289+512] + self.context_length_estimate = [2289] + context_unroll = None + + self.model = MistralForSampling.from_pretrained( + self.model_name, + amp=self.amp, + batch_size=self.batch_size, + tp_degree=self.tp_degree, + n_positions=self.n_positions, + context_length_estimate=self.context_length_estimate, + context_unroll=context_unroll, + neuron_config=self.neuron_config + ) + self.model.load(COMPILED_MODEL_PATH) + self.model.to_neuron() + #self.model.save(COMPILED_MODEL_PATH) + + self.tokenizer = AutoTokenizer.from_pretrained(model_name) + + + def generate(self, inputs: torch.tensor, parameters: Optional[Dict[str, Any]] = None) -> List[str]: + try: + max_new_tokens = parameters.get("max_new_tokens", 90) + top_k = parameters.get("top_k", 4) + top_p = parameters.get("top_p", None) + temperature = 
parameters.get("temperature", 0.01) + no_repeat_ngram_size = parameters.get("no_repeat_ngram_size", None) + + with torch.inference_mode(): + generated_sequence = self.model.sample(inputs, + sequence_length=self.context_length_estimate[0]+max_new_tokens, + start_ids=None, top_k=top_k, top_p=top_p, temperature=temperature, + no_repeat_ngram_size=no_repeat_ngram_size) + + decoded_output = [self.tokenizer.decode(tok) for tok in generated_sequence] + generated_text=[str(item).strip("").strip() for item in decoded_output] + #generated_text = str(decoded_output) + return generated_text + except Exception as e: + logging.error(f"Error generating text: {e}") + raise + + +# Create FastAPI app +app = FastAPI() +mistral_model = MistralModel(model_name=CONFIG_DIR) # Load Mistral model + +processor = Blip2ImageTrainProcessor(image_size=224, is_training=False) + +def generate_input_ids(tokenizer): + conv = conv_templates['thoth'].copy() # Copy the conversation template + qs = "Please describe this video in detail." 
+ qs = DEFAULT_VIDEO_START_TOKEN + DEFAULT_VIDEO_TOKEN + DEFAULT_VIDEO_END_TOKEN + '\n' + qs # Prepend video tokens + conv.append_message(conv.roles[0], qs) # Add the question to the conversation + conv.append_message(conv.roles[1], None) # Add a placeholder for the response + prompt = conv.get_prompt() # Get the conversation prompt + input_ids = tokenizer_image_token(prompt, tokenizer, MM_TOKEN_INDEX, return_tensors='pt').unsqueeze(0) # Tokenize and convert to tensor + return input_ids + + +def process_anyres_image(packed_data): + byte_data = base64.b64decode(packed_data) + image = Image.open(BytesIO(byte_data)).convert('RGB') + new_image = Image.new('RGB', (224, 224), (0, 0, 0)) + new_image.paste(image.resize((224, 224)), (0, 0)) + torch_stack = processor.preprocess(new_image).unsqueeze(0) + res = vision_module_neuron(torch_stack) + return res.repeat(7,1,1) + + +# Load model configuration and tokenizer +config = AutoConfig.from_pretrained(CONFIG_DIR, trust_remote_code=True) +tokenizer = mistral_model.tokenizer + +# Load embedding weights and set up embedding module +padding_idx = config.pad_token_id +embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx) +embed_weight = torch.load(EMBED_TOKEN_PATH) +embed_tokens.load_state_dict(embed_weight) +embed_tokens = embed_tokens.eval() + +input_ids = generate_input_ids(tokenizer)[0] # Generate input IDs and conversation template +vision_token_indice = torch.where(input_ids == MM_TOKEN_INDEX)[0][0] # Get index of vision token +pre_text_token = embed_tokens(input_ids[:vision_token_indice]) # Embed tokens before vision token +post_text_token = embed_tokens(input_ids[vision_token_indice + 1:]) # Embed tokens after vision token + +#layer norm +vision_width = 1408 +ln_vision = LayerNorm(vision_width) +ln_vision_weight = torch.load(LAYERNORM_SAVE_PATH) +ln_vision.load_state_dict(ln_vision_weight) +ln_vision = ln_vision.eval() + +num_query_token = 32 +query_tokens = nn.Parameter( + torch.zeros(1, 
num_query_token, 768) +) +query_tokens.data.normal_(mean=0.0, std=0.02) +query_tokens_weight = torch.load(QUERY_TOKEN_PATH)['query_tokens'] +query_tokens.data = query_tokens_weight + +frame_position_encoding = nn.Embedding(10, 768) +frame_position_encoding_weight = torch.load(POSITION_ENCODING_SAVE_PATH) +frame_position_encoding.load_state_dict(frame_position_encoding_weight) + +with torch_neuronx.experimental.neuron_cores_context(start_nc=0, nc_count=4): + vision_module_neuron = torch.jit.load(NEURON_VISION_PATH) + neuron_bert = torch.jit.load(NEURON_BERT_PATH) + projector = torch.jit.load(PROJECTOR_PATH) +vision_module_neuron = vision_module_neuron.eval() +neuron_bert = neuron_bert.eval() +projector = projector.eval() + +@app.post("/generate") +async def generate(request: Request) -> Dict[str, List[str]]: + try: + s1 = time.time() + request_payload = await request.json() + request_payload_keys = request_payload.keys() + s11 = time.time() + print("request_payload_keys time: ", s11-s1) + if "images" in request_payload_keys: # If input is a list of images + packed_data = request_payload.get("images") + s12 = time.time() + print("packed_data time: ", s12-s11) + with concurrent.futures.ThreadPoolExecutor(16) as executor: + new_images = list(executor.map(process_anyres_image, packed_data)) + image_features = torch.vstack(new_images) + print("s1 - image_features time: ", time.time() - s1) + image_features_list = list(torch.split(image_features, 70, dim=0)) + print("image_features_list: ", image_features_list[0].shape) + + input_embeds_list = [] + for image_feature in image_features_list: + s2 = time.time() + with torch.inference_mode(): # Enable inference mode + image_feature = ln_vision(image_feature) + attn_mask = torch.ones(image_feature.size()[:-1], dtype=torch.int64) + query_tokens_inputs = query_tokens.expand(image_feature.shape[0], -1, -1) + s21 = time.time() + print("ln_vision time: ", s21 - s2) + + image_feature = neuron_bert( + query_tokens_inputs, + 
image_feature, + attn_mask + )["last_hidden_state"] + s22 = time.time() + print("neuron bert time: ", s22 - s21) + + frame_ids = torch.arange(10, dtype=torch.long, device=image_feature.device).unsqueeze(1) + frame_ids = frame_ids.repeat(1, 7).flatten(0, 1) # [num_frames * num_patches] + image_feature += frame_position_encoding(frame_ids).unsqueeze(-2) # [num_frames, 1, 768] + projected_features = projector(image_feature) + s23 = time.time() + print("projector time: ", s23 - s22) + + image_feature = projected_features.flatten(0, 1) + input_embeds = torch.cat([pre_text_token, image_feature, post_text_token]).unsqueeze(0) + print("s2 - image_feature prepare time: ", time.time() - s23) + input_embeds_list.append(input_embeds) + + else: + raise HTTPException(status_code=400, detail="Please provide correct input") + + s3 = time.time() + input_embeds_batch = torch.cat(input_embeds_list, dim=0) + parameters = request_payload.get("parameters", {}) # Get additional parameters + generated_text = mistral_model.generate(input_embeds_batch, parameters) # Generate text using Mistral model + + print("s3 - generated_text time: ", time.time() - s3) + print("total inference time: ", time.time() - s1) + return {"generated_text": generated_text} + + except Exception as e: + raise HTTPException(status_code=500, detail=f"Error generating text: {str(e)}") + +""" +build: +docker build -t masp_image:latest . 
+ +up: +docker-compose -f docker-compose-inf2-48xl.yaml up + +down: +docker-compose -f docker-compose-inf2-48xl.yaml down + +""" diff --git a/app/requirements.txt b/app/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..cffbbb2a457970686b712f28d900f278caa72a8e --- /dev/null +++ b/app/requirements.txt @@ -0,0 +1,10 @@ +git+https://github.com/davidshtian/transformers-neuronx.git +uvicorn[standard] +fastapi +msgpack +neuronx-cc==2.* +torch-neuronx==1.13.* +torchvision +omegaconf +timm +sentencepiece \ No newline at end of file diff --git a/app/tmp/main_old.py b/app/tmp/main_old.py new file mode 100644 index 0000000000000000000000000000000000000000..de612b13cf181c1faafe5232f3577555edc72968 --- /dev/null +++ b/app/tmp/main_old.py @@ -0,0 +1,270 @@ +from typing import Dict +from fastapi import FastAPI, Request, HTTPException +from backend_model import MistralModel +from llava.model.multimodal_encoder.processor import Blip2ImageTrainProcessor +from transformers import AutoTokenizer +from llava.constants import MM_TOKEN_INDEX, DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, \ + DEFAULT_VIDEO_PATCH_TOKEN, DEFAULT_VIDEO_TOKEN, DEFAULT_VIDEO_START_TOKEN, DEFAULT_VIDEO_END_TOKEN +from llava.conversation import conv_templates +from llava.model.multimodal_encoder.qformer import BertConfig, BertLMHeadModel, BertModel +from llava.model.multimodal_projector.builder import build_vision_projector +from llava.model.utils import LayerNorm +from llava.model.multimodal_encoder.eva_clip_encoder import EvaClipVisionTower +from llava.mm_utils import tokenizer_image_token, process_images_v2 +import torch +import numpy as np +from PIL import Image +import os +import msgpack +from io import BytesIO +import base64 +import torch.nn as nn +from transformers import AutoConfig +from collections import OrderedDict + +import torch_neuronx + +NUM_SEGMENTS = 10 + +def generate_input_ids(tokenizer): + conv = conv_templates['thoth'].copy() + qs 
= "Describe the following video in detail." + qs = DEFAULT_VIDEO_START_TOKEN + DEFAULT_VIDEO_TOKEN + DEFAULT_VIDEO_END_TOKEN + '\n' + qs + conv.append_message(conv.roles[0], qs) + conv.append_message(conv.roles[1], None) + prompt = conv.get_prompt() + input_ids = tokenizer_image_token(prompt, tokenizer, MM_TOKEN_INDEX, return_tensors='pt').unsqueeze(0) + return input_ids, conv + + +def generate_images(frame_folder, image_processor, model_cfg): + images = load_frames(frame_folder) + if len(images) > NUM_SEGMENTS: + images = uniform_sample(images, NUM_SEGMENTS) + return process_images_v2(images, image_processor, model_cfg) + + +def uniform_sample(frames, num_segments): + indices = np.linspace(start=0, stop=len(frames) - 1, num=num_segments).astype(int) + frames = [frames[ind] for ind in indices] + return frames + + +def load_frames(frames_dir): + results = [] + image_files = [(int(os.path.splitext(img)[0]), img) for img in os.listdir(frames_dir) if img.endswith('jpg')] + image_files = sorted(image_files, key=lambda img: img[0]) + + for frame_name in image_files: + image_path = f"{frames_dir}/{frame_name[1]}" + image = Image.open(image_path).convert('RGB') + results.append(image) + return results + + +class MASPVision(torch.nn.Module): + + def __init__(self, config): + super().__init__() + # device = 'cuda:0' + device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + device_map = {"": 0} + + vision_tower = EvaClipVisionTower("eva-vit-g", config, delay_load=True) + vision_tower.load_model(device_map=device_map) + vision_tower.to(device=device, dtype=torch.float16) + + image_processor = Blip2ImageTrainProcessor( + image_size=config.img_size, + is_training=False) + + cross_attention_freq = 2 + vision_width = vision_tower.hidden_size + num_query_token = config.num_query_token + ln_vision = LayerNorm(vision_width) + encoder_config = BertConfig.from_pretrained("bert-base-uncased") + encoder_config.encoder_width = vision_width + # insert cross-attention 
layer every other block + encoder_config.add_cross_attention = True + encoder_config.cross_attention_freq = cross_attention_freq + encoder_config.query_length = num_query_token + # Qformer = BertLMHeadModel(config=encoder_config) + self.bert = BertModel(encoder_config, add_pooling_layer=False) + self.bert.embeddings.word_embeddings = None + self.bert.embeddings.position_embeddings = None + + for layer in self.bert.encoder.layer: + layer.output = None + layer.intermediate = None + + query_tokens = nn.Parameter( + torch.zeros(1, num_query_token, encoder_config.hidden_size) + ) + query_tokens.data.normal_(mean=0.0, std=encoder_config.initializer_range) + + frame_position_encoding = nn.Embedding( + config.max_num_segments, + encoder_config.hidden_size + ) + + mm_projector = build_vision_projector(config) + + self.vision_tower = vision_tower + # self.qformer = Qformer + self.projector = mm_projector + self.query_tokens = query_tokens + self.ln_vision = ln_vision + self.frame_position_encoding = frame_position_encoding + + def forward(self, images): + # images: [num_frames, patches, 3, image_size, image_size] + image_features = self.vision_tower(images.flatten(0, 1)) + image_features = self.ln_vision(image_features) + attn_mask = torch.ones(image_features.size()[:-1], dtype=torch.long).to( + image_features.device) # [num_frames * num_patches, 256] + query_tokens = self.query_tokens.expand(image_features.shape[0], -1, -1) # [num_frames * num_patches, 32, 768] + dtype_ = self.vision_tower.dtype + image_features = self.bert( + query_embeds=query_tokens.to(dtype_), + encoder_hidden_states=image_features.to(dtype_), + encoder_attention_mask=attn_mask, + return_dict=True + ).last_hidden_state.to(dtype_) + frame_ids = torch.arange(images.shape[0], dtype=torch.long, device=image_features.device).unsqueeze(1) + frame_ids = frame_ids.repeat(1, images.shape[1]).flatten(0, 1) # [num_frames * num_patches] + image_features += self.frame_position_encoding(frame_ids).unsqueeze(-2) # 
[num_frames, 1, 768] + return self.projector(image_features) + + # zheng add + + def forward_features(self, a, b): + # images: [num_frames, patches, 3, image_size, image_size] + images = a + image_features = b + image_features = self.ln_vision(image_features) + attn_mask = torch.ones(image_features.size()[:-1], dtype=torch.long).to( + image_features.device) # [num_frames * num_patches, 256] + query_tokens = self.query_tokens.expand(image_features.shape[0], -1, -1) # [num_frames * num_patches, 32, 768] + dtype_ = self.vision_tower.dtype + image_features = self.bert( + query_embeds=query_tokens.to(dtype_), + encoder_hidden_states=image_features.to(dtype_), + encoder_attention_mask=attn_mask, + return_dict=True + ).last_hidden_state.to(dtype_) + frame_ids = torch.arange(images.shape[0], dtype=torch.long, device=image_features.device).unsqueeze(1) + frame_ids = frame_ids.repeat(1, images.shape[1]).flatten(0, 1) # [num_frames * num_patches] + image_features += self.frame_position_encoding(frame_ids).unsqueeze(-2) # [num_frames, 1, 768] + return self.projector(image_features) + + +WEIGHT_ROOT = '/root/masp_models_inf2' +tokenizer_dir = '../tokenizer_dir' +NEURON_VISION_PATH = os.path.join(WEIGHT_ROOT, "./neuron_eva_vit_base.pt") +VISION_STATE_DICT = os.path.join(WEIGHT_ROOT, 'new_vision_state_dict.pth') +EMBED_TOKEN_PATH = os.path.join(WEIGHT_ROOT, 'embed_tokens.pth') +EVA_VIT_PATH = os.path.join(WEIGHT_ROOT, 'eva_vit_g.pth') +app = FastAPI() +mistral_model = MistralModel() + +config = AutoConfig.from_pretrained(tokenizer_dir, trust_remote_code=True) + +config.vit_model_path = EVA_VIT_PATH +tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir) + +tokenizer.add_tokens( + [DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, DEFAULT_VIDEO_START_TOKEN, DEFAULT_VIDEO_END_TOKEN], + special_tokens=True) + +input_ids, conv = generate_input_ids(tokenizer) +input_ids = input_ids[0].to('cpu') # [token_len] + +image_processor = Blip2ImageTrainProcessor( + image_size=config.img_size, + 
is_training=False) +vision_module = MASPVision(config=config) +# new_vision_state_dict = torch.load('new_vision_state_dict.pth') +new_vision_state_dict = torch.load(VISION_STATE_DICT, map_location='cpu') + +vision_module.load_state_dict(new_vision_state_dict) +vision_module = vision_module.eval() +vision_module = vision_module.to('cpu') + +# vision_module.to(torch.float16) +# zheng add +vision_module.to(torch.float32) + +vision_module_neuron = torch.jit.load(NEURON_VISION_PATH) +vision_module_neuron = vision_module_neuron.eval() + +padding_idx = config.pad_token_id +embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx) +embed_weight = torch.load(EMBED_TOKEN_PATH) +embed_tokens.load_state_dict(embed_weight) +embed_tokens = embed_tokens.eval() +embed_tokens.to(torch.float16).to('cpu') + +@app.post("/generate") +async def generate(request: Request) -> Dict[str, str]: + """ + Generate text using the Mistral model. + + Args: + request (Request): The incoming request object. + + Returns: + Dict[str, str]: A dictionary containing the generated text or an error message. 
+ """ + try: + request_payload = await request.json() + packed_data = request_payload.get("images") + parameters = request_payload.get("parameters", {}) + #unpacked_data = msgpack.unpackb(packed_data, raw=False) + unpacked_data = [base64.b64decode(item) for item in packed_data] + input_images = [Image.open(BytesIO(byte_data)).convert('RGB') for byte_data in unpacked_data] + + input_images = uniform_sample(input_images, NUM_SEGMENTS) + input_images = process_images_v2(input_images, image_processor, config) + + with torch.inference_mode(): + # get image feature + + # image_features = vision_module(images).flatten(0, 1) # [num_frames * num_patches * num_query_token, 4096] + image_features = torch.Tensor() # init a tensor + for image in input_images: + output = vision_module_neuron(image) + output = output[:, 1:].to(torch.float32) + if len(image_features) == 0: + image_features = output + else: + image_features = torch.cat([image_features, output], dim=0) + + # zheng [70, 256, 1408] + image_features = vision_module.forward_features(input_images, image_features) + image_features = image_features.flatten(0, 1) + print(image_features.shape) # zheng [70, 32, 4096] + + image_features.to(device='cpu', dtype=torch.float16) + image_features_numpy = image_features.detach().cpu().numpy() + # image_features_saved = np.load('image_features_numpy.npy') + # print(np.sum(image_features_numpy -image_features_saved )) + # image_features_numpy = image_features.detach().cpu().numpy + # np.save('image_features_numpy.npy', image_features_numpy) + # print('images features shape', image_features.shape) + # image_features = loaded(images).flatten(0, 1) + # concat with text features + vision_token_indice = torch.where(input_ids == MM_TOKEN_INDEX)[0][0] + pre_text_token = embed_tokens(input_ids[:vision_token_indice]) # zheng [32, 4096] + post_text_token = embed_tokens(input_ids[vision_token_indice + 1:]) + + inputs_embeds = torch.cat([pre_text_token, image_features, post_text_token]).unsqueeze( 
+ 0) # [1, num_token, 4096] + inputs = inputs_embeds.detach().cpu().numpy().tolist() + + if not inputs: + raise HTTPException(status_code=400, detail="No input provided") + + generated_text = mistral_model.generate(inputs, parameters) + return {"generated_text": generated_text} + except Exception as e: + raise HTTPException(status_code=500, detail=f"Error generating text: {str(e)}") diff --git a/app/tmp/mistral_standalone.py b/app/tmp/mistral_standalone.py new file mode 100644 index 0000000000000000000000000000000000000000..d7da15ee43e00cdc2db25393f048cbdfb92c1a77 --- /dev/null +++ b/app/tmp/mistral_standalone.py @@ -0,0 +1,27 @@ +import logging +from typing import Union, List, Optional, Dict, Any, Literal +import torch +import torch.nn.functional as F +from transformers import AutoTokenizer +from transformers_neuronx import MistralForSampling, GQA, NeuronConfig +import time +import math + + +model_name = './checkpoint-3000' +amp = 'bf16' +batch_size = 1 +tp_degree = 8 +n_positions = 8192 +neuron_config = NeuronConfig(group_query_attention=GQA.SHARD_OVER_HEADS) + + +model = MistralForSampling.from_pretrained( + model_name, + amp=amp, + batch_size=batch_size, + tp_degree=tp_degree, + n_positions=n_positions, + neuron_config=neuron_config +) +model.to_neuron() \ No newline at end of file diff --git a/app/tmp/predict_vision.py b/app/tmp/predict_vision.py new file mode 100644 index 0000000000000000000000000000000000000000..a3b2f3e5f7a53de693b671a42b5f2ec55e2245f0 --- /dev/null +++ b/app/tmp/predict_vision.py @@ -0,0 +1,270 @@ +from llava.model.multimodal_encoder.processor import Blip2ImageTrainProcessor +from transformers import AutoTokenizer +from llava.constants import MM_TOKEN_INDEX, DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, \ + DEFAULT_VIDEO_PATCH_TOKEN, DEFAULT_VIDEO_TOKEN, DEFAULT_VIDEO_START_TOKEN, DEFAULT_VIDEO_END_TOKEN +from llava.conversation import conv_templates +from llava.model.multimodal_encoder.qformer import BertConfig, 
BertLMHeadModel, BertModel +from llava.model.multimodal_projector.builder import build_vision_projector +from llava.model.utils import LayerNorm +from llava.model.multimodal_encoder.eva_clip_encoder import EvaClipVisionTower +import torch +from llava.mm_utils import tokenizer_image_token, process_images_v2, KeywordsStoppingCriteria +import numpy as np +from PIL import Image +import os +import torch.nn as nn +from transformers import AutoConfig +from collections import OrderedDict + +import torch_neuronx + +NUM_SEGMENTS = 10 + +def generate_input_ids(tokenizer): + conv = conv_templates['thoth'].copy() + qs = "Describe the following video in detail." + qs = DEFAULT_VIDEO_START_TOKEN + DEFAULT_VIDEO_TOKEN + DEFAULT_VIDEO_END_TOKEN + '\n' + qs + conv.append_message(conv.roles[0], qs) + conv.append_message(conv.roles[1], None) + prompt = conv.get_prompt() + input_ids = tokenizer_image_token(prompt, tokenizer, MM_TOKEN_INDEX, return_tensors='pt').unsqueeze(0) + return input_ids, conv + + +def generate_images(frame_folder, image_processor, model_cfg): + images = load_frames(frame_folder) + if len(images) > NUM_SEGMENTS: + images = uniform_sample(images, NUM_SEGMENTS) + return process_images_v2(images, image_processor, model_cfg) + + +def uniform_sample(frames, num_segments): + indices = np.linspace(start=0, stop=len(frames) - 1, num=num_segments).astype(int) + frames = [frames[ind] for ind in indices] + return frames + + +def load_frames(frames_dir): + results = [] + image_files = [(int(os.path.splitext(img)[0]), img) for img in os.listdir(frames_dir) if img.endswith('jpg')] + image_files = sorted(image_files, key=lambda img: img[0]) + + for frame_name in image_files: + image_path = f"{frames_dir}/{frame_name[1]}" + image = Image.open(image_path).convert('RGB') + results.append(image) + return results + + +class MASPVision(torch.nn.Module): + + def __init__(self, config): + super().__init__() + # device = 'cuda:0' + device = torch.device("cuda:0" if 
torch.cuda.is_available() else "cpu") + device_map = {"": 0} + config.vit_model_path = 'eva_vit_g.pth' + vision_tower = EvaClipVisionTower("eva-vit-g", config, delay_load=True) + vision_tower.load_model(device_map=device_map) + vision_tower.to(device=device, dtype=torch.float16) + + image_processor = Blip2ImageTrainProcessor( + image_size=config.img_size, + is_training=False) + + cross_attention_freq = 2 + vision_width = vision_tower.hidden_size + num_query_token = config.num_query_token + ln_vision = LayerNorm(vision_width) + encoder_config = BertConfig.from_pretrained("bert-base-uncased") + encoder_config.encoder_width = vision_width + # insert cross-attention layer every other block + encoder_config.add_cross_attention = True + encoder_config.cross_attention_freq = cross_attention_freq + encoder_config.query_length = num_query_token + # Qformer = BertLMHeadModel(config=encoder_config) + self.bert = BertModel(encoder_config, add_pooling_layer=False) + self.bert.embeddings.word_embeddings = None + self.bert.embeddings.position_embeddings = None + + for layer in self.bert.encoder.layer: + layer.output = None + layer.intermediate = None + + query_tokens = nn.Parameter( + torch.zeros(1, num_query_token, encoder_config.hidden_size) + ) + query_tokens.data.normal_(mean=0.0, std=encoder_config.initializer_range) + + frame_position_encoding = nn.Embedding( + config.max_num_segments, + encoder_config.hidden_size + ) + + mm_projector = build_vision_projector(config) + + self.vision_tower = vision_tower + # self.qformer = Qformer + self.projector = mm_projector + self.query_tokens = query_tokens + self.ln_vision = ln_vision + self.frame_position_encoding = frame_position_encoding + + def forward(self, images): + # images: [num_frames, patches, 3, image_size, image_size] + image_features = self.vision_tower(images.flatten(0, 1)) + image_features = self.ln_vision(image_features) + attn_mask = torch.ones(image_features.size()[:-1], dtype=torch.long).to( + 
image_features.device) # [num_frames * num_patches, 256] + query_tokens = self.query_tokens.expand(image_features.shape[0], -1, -1) # [num_frames * num_patches, 32, 768] + dtype_ = self.vision_tower.dtype + image_features = self.bert( + query_embeds=query_tokens.to(dtype_), + encoder_hidden_states=image_features.to(dtype_), + encoder_attention_mask=attn_mask, + return_dict=True + ).last_hidden_state.to(dtype_) + frame_ids = torch.arange(images.shape[0], dtype=torch.long, device=image_features.device).unsqueeze(1) + frame_ids = frame_ids.repeat(1, images.shape[1]).flatten(0, 1) # [num_frames * num_patches] + image_features += self.frame_position_encoding(frame_ids).unsqueeze(-2) # [num_frames, 1, 768] + return self.projector(image_features) + + # zheng add + + def forward_features(self, a, b): + # images: [num_frames, patches, 3, image_size, image_size] + images = a + image_features = b + image_features = self.ln_vision(image_features) + attn_mask = torch.ones(image_features.size()[:-1], dtype=torch.long).to( + image_features.device) # [num_frames * num_patches, 256] + query_tokens = self.query_tokens.expand(image_features.shape[0], -1, -1) # [num_frames * num_patches, 32, 768] + dtype_ = self.vision_tower.dtype + image_features = self.bert( + query_embeds=query_tokens.to(dtype_), + encoder_hidden_states=image_features.to(dtype_), + encoder_attention_mask=attn_mask, + return_dict=True + ).last_hidden_state.to(dtype_) + frame_ids = torch.arange(images.shape[0], dtype=torch.long, device=image_features.device).unsqueeze(1) + frame_ids = frame_ids.repeat(1, images.shape[1]).flatten(0, 1) # [num_frames * num_patches] + image_features += self.frame_position_encoding(frame_ids).unsqueeze(-2) # [num_frames, 1, 768] + return self.projector(image_features) + + +if __name__ == '__main__': + frame_folder = './v12044gd0000cl5c6rfog65i2eoqcqig' + tokenizer_dir = '../tokenizer_dir' + # device = 'cuda:0' + device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + + 
tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir) + config = AutoConfig.from_pretrained(tokenizer_dir, trust_remote_code=True) + tokenizer.add_tokens( + [DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, DEFAULT_VIDEO_START_TOKEN, DEFAULT_VIDEO_END_TOKEN], + special_tokens=True) + + image_processor = Blip2ImageTrainProcessor( + image_size=config.img_size, + is_training=False) + + input_ids, conv = generate_input_ids(tokenizer) + + # images = generate_images(frame_folder, image_processor, config).to(device).half() # [num_frames, patches, 3, image_size, image_size] + + # zheng + images = generate_images(frame_folder, image_processor, config).to(device) + + vision_module = MASPVision(config=config) + + input_ids = input_ids[0].to(device) # [token_len] + # new_vision_state_dict = torch.load('new_vision_state_dict.pth') + new_vision_state_dict = torch.load('new_vision_state_dict.pth', map_location=device) + + # vision_state_dict = torch.load('masp_vision_statedict.pth', map_location="cuda:0") + # new_vision_state_dict = OrderedDict() + # for k, v in vision_state_dict.items(): + # if 'qformer' in k: + # new_key = k[8:] + # new_vision_state_dict[new_key] = v + # else: + # new_vision_state_dict[k] = v + + vision_module.load_state_dict(new_vision_state_dict) + vision_module = vision_module.eval() + vision_module = vision_module.to(device) + + # vision_module.to(torch.float16) + # zheng add + vision_module.to(torch.float32) + + # zheng add + vision_module_neuron = torch.jit.load("./neuron_eva_vit_base.pt") + vision_module_neuron = vision_module_neuron.eval() + # output=vision_module_neuron(images) + + padding_idx = config.pad_token_id + embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx) + embed_weight = torch.load('embed_tokens.pth') + embed_tokens.load_state_dict(embed_weight) + embed_tokens = embed_tokens.eval() + embed_tokens.to(torch.float16).to(device) + # vision_module = vision_module.eval() + # vision_state_dict = 
vision_module.state_dict() + # torch.save(vision_state_dict, 'masp_vision_statedict.pth') + + # infernece + # print(images.shape) # [10, 7, 3, 224, 224] + # dummy_images = torch.rand(10, 7, 3, 224, 224).to(model.device) + # scripted_vision_module = torch.jit.script(vision_module) + # print('begin to trace') + # traced_vision_module = torch.jit.trace(vision_module, (images)) + # traced_vision_module.save('traced_vision_module.pt') + # loaded = torch.jit.load('traced_vision_module.pt') + + import time + + start = time.time() + with torch.inference_mode(): + # get image feature + + # image_features = vision_module(images).flatten(0, 1) # [num_frames * num_patches * num_query_token, 4096] + + image_features = torch.Tensor() # init a tensor + for image in images: + output = vision_module_neuron(image) + output = output[:, 1:].to(torch.float32) + if len(image_features) == 0: + image_features = output + else: + image_features = torch.cat([image_features, output], dim=0) + + # zheng [70, 256, 1408] + image_features = vision_module.forward_features(images, image_features) + image_features = image_features.flatten(0, 1) + print(image_features.shape) # zheng [70, 32, 4096] + + image_features.to(device=device, dtype=torch.float16) + image_features_numpy = image_features.detach().cpu().numpy() + # image_features_saved = np.load('image_features_numpy.npy') + # print(np.sum(image_features_numpy -image_features_saved )) + # image_features_numpy = image_features.detach().cpu().numpy + # np.save('image_features_numpy.npy', image_features_numpy) + # print('images features shape', image_features.shape) + # image_features = loaded(images).flatten(0, 1) + # concat with text features + vision_token_indice = torch.where(input_ids == MM_TOKEN_INDEX)[0][0] + pre_text_token = embed_tokens(input_ids[:vision_token_indice]) # zheng [32, 4096] + post_text_token = embed_tokens(input_ids[vision_token_indice + 1:]) + + inputs_embeds = torch.cat([pre_text_token, image_features, 
post_text_token]).unsqueeze( + 0) # [1, num_token, 4096] + + print("Inference time:", time.time() - start) + + input_embeds_numpy = inputs_embeds.detach().cpu().numpy() + image_embeds_saved = np.load('inputs_embeds.npy') + diff = np.sum(input_embeds_numpy - image_embeds_saved) + print('diff with saved in the disk', diff) + # print('inputs embeds numpy shape', input_embeds_numpy.shape) + # np.save('inputs_embeds.npy', input_embeds_numpy) \ No newline at end of file diff --git a/app/tmp/qformer-tian.py b/app/tmp/qformer-tian.py new file mode 100644 index 0000000000000000000000000000000000000000..ffd6fa5cc9321859aef735970adbeb211c95866e --- /dev/null +++ b/app/tmp/qformer-tian.py @@ -0,0 +1,1151 @@ +""" +Adapted from salesforce@LAVIS. Below is the original copyright: + * Copyright (c) 2023, salesforce.com, inc. + * All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * For full license text, see LICENSE.txt file in the repo root or https://opensource.org/licenses/BSD-3-Clause + * By Junnan Li + * Based on huggingface code base + * https://github.com/huggingface/transformers/blob/v4.15.0/src/transformers/models/bert +""" + +import math +import os +import warnings +from dataclasses import dataclass +from typing import Optional, Tuple, Dict, Any + +import torch +from torch import Tensor, device, dtype, nn +import torch.utils.checkpoint +from torch import nn +from torch.nn import CrossEntropyLoss +import torch.nn.functional as F + +from transformers.activations import ACT2FN +from transformers.file_utils import ( + ModelOutput, +) +from transformers.modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPoolingAndCrossAttentions, + CausalLMOutputWithCrossAttentions, + MaskedLMOutput, + MultipleChoiceModelOutput, + NextSentencePredictorOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +) +from transformers.modeling_utils import ( + PreTrainedModel, + apply_chunking_to_forward, + 
find_pruneable_heads_and_indices, + prune_linear_layer, +) +from transformers.utils import logging +from transformers.models.bert.configuration_bert import BertConfig +from llava.model.utils import LayerNorm + +logger = logging.get_logger(__name__) + + +class BertEmbeddings(nn.Module): + """Construct the embeddings from word and position embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = None + self.position_embeddings = None + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + # self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.LayerNorm = LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)) + ) + self.position_embedding_type = getattr( + config, "position_embedding_type", "absolute" + ) + + self.config = config + + def forward( + self, + input_ids=None, + position_ids=None, + query_embeds=None, + past_key_values_length=0, + ): + seq_length = 0 + + position_ids = self.position_ids[ + :, past_key_values_length : seq_length + past_key_values_length + ].clone() + + embeddings = query_embeds + + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class BertSelfAttention(nn.Module): + def __init__(self, config, is_cross_attention): + super().__init__() + self.config = config + if config.hidden_size % config.num_attention_heads != 0 and not hasattr( + config, "embedding_size" + ): + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention " + "heads (%d)" % (config.hidden_size, config.num_attention_heads) + ) + + self.num_attention_heads = config.num_attention_heads + 
self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + if is_cross_attention: + self.key = nn.Linear(config.encoder_width, self.all_head_size) + self.value = nn.Linear(config.encoder_width, self.all_head_size) + else: + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = getattr( + config, "position_embedding_type", "absolute" + ) + if ( + self.position_embedding_type == "relative_key" + or self.position_embedding_type == "relative_key_query" + ): + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding( + 2 * config.max_position_embeddings - 1, self.attention_head_size + ) + self.save_attention = False + + def save_attn_gradients(self, attn_gradients): + self.attn_gradients = attn_gradients + + def get_attn_gradients(self): + return self.attn_gradients + + def save_attention_map(self, attention_map): + self.attention_map = attention_map + + def get_attention_map(self): + return self.attention_map + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + ( + self.num_attention_heads, + self.attention_head_size, + ) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. 
+ is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + mixed_query_layer = self.query(hidden_states) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + if ( + self.position_embedding_type == "relative_key" + or self.position_embedding_type == "relative_key_query" + ): + seq_length = hidden_states.size()[1] + position_ids_l = torch.arange( + seq_length, dtype=torch.long, device=hidden_states.device + ).view(-1, 1) + position_ids_r = torch.arange( + seq_length, dtype=torch.long, device=hidden_states.device + ).view(1, -1) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding( + distance + self.max_position_embeddings - 1 + ) + positional_embedding = positional_embedding.to( + dtype=query_layer.dtype + ) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum( + "bhld,lrd->bhlr", query_layer, positional_embedding + ) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum( + 
"bhld,lrd->bhlr", query_layer, positional_embedding + ) + relative_position_scores_key = torch.einsum( + "bhrd,lrd->bhlr", key_layer, positional_embedding + ) + attention_scores = ( + attention_scores + + relative_position_scores_query + + relative_position_scores_key + ) + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(dim=-1)(attention_scores.to(torch.float32)).to(attention_scores.dtype) + + if is_cross_attention and self.save_attention: + self.save_attention_map(attention_probs) + attention_probs.register_hook(self.save_attn_gradients) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs_dropped = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs_dropped = attention_probs_dropped * head_mask + + context_layer = torch.matmul(attention_probs_dropped, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(*new_context_layer_shape) + + outputs = ( + (context_layer, attention_probs) if output_attentions else (context_layer,) + ) + + outputs = outputs + (past_key_value,) + return outputs + + +class BertSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + # self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.LayerNorm = LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, 
hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertAttention(nn.Module): + def __init__(self, config, is_cross_attention=False): + super().__init__() + self.self = BertSelfAttention(config, is_cross_attention) + self.output = BertSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, + self.self.num_attention_heads, + self.self.attention_head_size, + self.pruned_heads, + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = ( + self.self.attention_head_size * self.self.num_attention_heads + ) + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + + outputs = (attention_output,) + self_outputs[ + 1: + ] # add attentions if we output them + return outputs + + +class BertIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn 
= ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class BertOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + # self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.LayerNorm = LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertLayer(nn.Module): + def __init__(self, config, layer_num): + super().__init__() + self.config = config + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = BertAttention(config) + self.layer_num = layer_num + if ( + self.config.add_cross_attention + and layer_num % self.config.cross_attention_freq == 0 + ): + self.crossattention = BertAttention( + config, is_cross_attention=self.config.add_cross_attention + ) + self.has_cross_attention = True + else: + self.has_cross_attention = False + self.intermediate = BertIntermediate(config) + self.output = BertOutput(config) + + self.intermediate_query = BertIntermediate(config) + self.output_query = BertOutput(config) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + query_length=0, + ): + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = ( + past_key_value[:2] if past_key_value is not None else None + ) + 
self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + outputs = self_attention_outputs[1:-1] + + present_key_value = self_attention_outputs[-1] + + if query_length > 0: + query_attention_output = attention_output[:, :query_length, :] + + if self.has_cross_attention: + assert ( + encoder_hidden_states is not None + ), "encoder_hidden_states must be given for cross-attention layers" + cross_attention_outputs = self.crossattention( + query_attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + output_attentions=output_attentions, + ) + query_attention_output = cross_attention_outputs[0] + outputs = ( + outputs + cross_attention_outputs[1:-1] + ) # add cross attentions if we output attention weights + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk_query, + self.chunk_size_feed_forward, + self.seq_len_dim, + query_attention_output, + ) + if attention_output.shape[1] > query_length: + layer_output_text = apply_chunking_to_forward( + self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output[:, query_length:, :], + ) + layer_output = torch.cat([layer_output, layer_output_text], dim=1) + else: + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output, + ) + outputs = (layer_output,) + outputs + + outputs = outputs + (present_key_value,) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + def feed_forward_chunk_query(self, attention_output): + intermediate_output = self.intermediate_query(attention_output) + layer_output = 
self.output_query(intermediate_output, attention_output) + return layer_output + + +class BertEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList( + [BertLayer(config, i) for i in range(config.num_hidden_layers)] + ) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + query_length=0, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = ( + () if output_attentions and self.config.add_cross_attention else None + ) + + next_decoder_cache = () if use_cache else None + + for i in range(self.config.num_hidden_layers): + layer_module = self.layer[i] + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None + + if getattr(self.config, "gradient_checkpointing", False) and self.training: + + if use_cache: + logger.warn( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
+ ) + use_cache = False + + def create_custom_forward(module): + def custom_forward(*inputs): + return module( + *inputs, past_key_value, output_attentions, query_length + ) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer_module), + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + query_length, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +class BertPooler(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. 
+ first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class BertPredictionHeadTransform(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + # self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.LayerNorm = LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class BertLMPredictionHead(nn.Module): + def __init__(self, config): + super().__init__() + self.transform = BertPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + + # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` + self.decoder.bias = self.bias + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +class BertOnlyMLMHead(nn.Module): + def __init__(self, config): + super().__init__() + self.predictions = BertLMPredictionHead(config) + + def forward(self, sequence_output): + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +class BertPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
+ """ + + config_class = BertConfig + base_model_prefix = "bert" + _keys_to_ignore_on_load_missing = [r"position_ids"] + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Embedding)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + +class BertModel(BertPreTrainedModel): + """ + The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of + cross-attention is added between the self-attention layers, following the architecture described in `Attention is + all you need `__ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, + Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. + argument and :obj:`add_cross_attention` set to :obj:`True`; an :obj:`encoder_hidden_states` is then expected as an + input to the forward pass. + """ + + def __init__(self, config, add_pooling_layer=False): + super().__init__(config) + self.config = config + + self.embeddings = BertEmbeddings(config) + + self.encoder = BertEncoder(config) + + self.pooler = BertPooler(config) if add_pooling_layer else None + + self.init_weights() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. 
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + def get_extended_attention_mask( + self, + attention_mask: Tensor, + input_shape: Tuple[int], + device: device, + is_decoder: bool, + has_query: bool = False, + ) -> Tensor: + """ + Makes broadcastable attention and causal masks so that future and masked tokens are ignored. + + Arguments: + attention_mask (:obj:`torch.Tensor`): + Mask with ones indicating tokens to attend to, zeros for tokens to ignore. + input_shape (:obj:`Tuple[int]`): + The shape of the input to the model. + device: (:obj:`torch.device`): + The device of the input to the model. + + Returns: + :obj:`torch.Tensor` The extended attention mask, with a the same dtype as :obj:`attention_mask.dtype`. + """ + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
+ if attention_mask.dim() == 3: + extended_attention_mask = attention_mask[:, None, :, :] + elif attention_mask.dim() == 2: + # Provided a padding mask of dimensions [batch_size, seq_length] + # - if the model is a decoder, apply a causal mask in addition to the padding mask + # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] + if is_decoder: + batch_size, seq_length = input_shape + + seq_ids = torch.arange(seq_length, device=device) + causal_mask = ( + seq_ids[None, None, :].repeat(batch_size, seq_length, 1) + <= seq_ids[None, :, None] + ) + + # add a prefix ones mask to the causal mask + # causal and attention masks must have same type with pytorch version < 1.3 + causal_mask = causal_mask.to(attention_mask.dtype) + + if causal_mask.shape[1] < attention_mask.shape[1]: + prefix_seq_len = attention_mask.shape[1] - causal_mask.shape[1] + if has_query: # UniLM style attention mask + causal_mask = torch.cat( + [ + torch.zeros( + (batch_size, prefix_seq_len, seq_length), + device=device, + dtype=causal_mask.dtype, + ), + causal_mask, + ], + axis=1, + ) + causal_mask = torch.cat( + [ + torch.ones( + (batch_size, causal_mask.shape[1], prefix_seq_len), + device=device, + dtype=causal_mask.dtype, + ), + causal_mask, + ], + axis=-1, + ) + extended_attention_mask = ( + causal_mask[:, None, :, :] * attention_mask[:, None, None, :] + ) + else: + extended_attention_mask = attention_mask[:, None, None, :] + else: + raise ValueError( + "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format( + input_shape, attention_mask.shape + ) + ) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. 
+ extended_attention_mask = extended_attention_mask.to( + dtype=self.dtype + ) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + return extended_attention_mask + + def forward( + self, + query_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + ): + r""" + encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` + (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` + instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. + use_cache (:obj:`bool`, `optional`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`). 
+ """ + output_attentions = self.config.output_attentions + + output_hidden_states = self.config.output_hidden_states + + return_dict = True + + use_cache = False + + input_ids = None + + position_ids = None + + # past_key_values_length + past_key_values_length = 0 + + query_length = query_embeds.shape[1] + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + query_embeds=query_embeds, + past_key_values_length=past_key_values_length, + ) + + input_shape = embedding_output.size()[:-1] + batch_size = input_shape[-2] + seq_length = input_shape[-1] + device = embedding_output.device + + attention_mask = None + attention_mask = torch.ones( + ((batch_size, seq_length + past_key_values_length)), device=device + ) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + is_decoder = False + extended_attention_mask = self.get_extended_attention_mask( + attention_mask, input_shape, device, is_decoder + ) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + print("encoder_hidden_states.size(): ", encoder_hidden_states.size()) + encoder_batch_size = encoder_hidden_states.size()[-3] + encoder_sequence_length = encoder_hidden_states.size()[-2] + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + print("encoder_hidden_shape: ", encoder_hidden_shape) + + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = None + head_mask = 
self.get_head_mask(head_mask, self.config.num_hidden_layers) + + past_key_values = None + + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + query_length=query_length, + ) + sequence_output = encoder_outputs[0] + pooled_output = None + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + +class BertLMHeadModel(BertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r"pooler"] + _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] + + def __init__(self, config): + super().__init__(config) + + self.bert = BertModel(config, add_pooling_layer=False) + self.cls = BertOnlyMLMHead(config) + + self.init_weights() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + def forward( + self, + input_ids=None, + attention_mask=None, + position_ids=None, + head_mask=None, + query_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + past_key_values=None, + use_cache=True, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + return_logits=False, + is_decoder=True, + reduction="mean", + ): + r""" + encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of 
the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in + ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are + ignored (masked), the loss is only computed for the tokens with labels n ``[0, ..., config.vocab_size]`` + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` + (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` + instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. + use_cache (:obj:`bool`, `optional`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`). 
+ Returns: + Example:: + >>> from transformers import BertTokenizer, BertLMHeadModel, BertConfig + >>> import torch + >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased') + >>> config = BertConfig.from_pretrained("bert-base-cased") + >>> model = BertLMHeadModel.from_pretrained('bert-base-cased', config=config) + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + >>> prediction_logits = outputs.logits + """ + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + if labels is not None: + use_cache = False + if past_key_values is not None: + query_embeds = None + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + head_mask=head_mask, + query_embeds=query_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + is_decoder=is_decoder, + ) + + sequence_output = outputs[0] + if query_embeds is not None: + sequence_output = outputs[0][:, query_embeds.shape[1] :, :] + + prediction_scores = self.cls(sequence_output) + + if return_logits: + return prediction_scores[:, :-1, :].contiguous() + + lm_loss = None + if labels is not None: + # we are doing next-token prediction; shift prediction scores and input ids by one + shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous() + labels = labels[:, 1:].contiguous() + loss_fct = CrossEntropyLoss(reduction=reduction, label_smoothing=0.1) + lm_loss = loss_fct( + shifted_prediction_scores.view(-1, self.config.vocab_size), + labels.view(-1), + ) + if reduction == "none": + lm_loss = lm_loss.view(prediction_scores.size(0), -1).sum(1) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((lm_loss,) + output) if lm_loss is not None 
else output + + return CausalLMOutputWithCrossAttentions( + loss=lm_loss, + logits=prediction_scores, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + def prepare_inputs_for_generation( + self, input_ids, query_embeds, past=None, attention_mask=None, **model_kwargs + ): + # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly + if attention_mask is None: + attention_mask = input_ids.new_ones(input_ids.shape) + query_mask = input_ids.new_ones(query_embeds.shape[:-1]) + attention_mask = torch.cat([query_mask, attention_mask], dim=-1) + + # cut decoder_input_ids if past is used + if past is not None: + input_ids = input_ids[:, -1:] + + return { + "input_ids": input_ids, + "query_embeds": query_embeds, + "attention_mask": attention_mask, + "past_key_values": past, + "encoder_hidden_states": model_kwargs.get("encoder_hidden_states", None), + "encoder_attention_mask": model_kwargs.get("encoder_attention_mask", None), + "is_decoder": True, + } + + def _reorder_cache(self, past, beam_idx): + reordered_past = () + for layer_past in past: + reordered_past += ( + tuple( + past_state.index_select(0, beam_idx) for past_state in layer_past + ), + ) + return reordered_past + + +class BertForMaskedLM(BertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r"pooler"] + _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] + + def __init__(self, config): + super().__init__(config) + + self.bert = BertModel(config, add_pooling_layer=False) + self.cls = BertOnlyMLMHead(config) + + self.init_weights() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + def forward( + self, + input_ids=None, + attention_mask=None, + position_ids=None, + head_mask=None, + 
query_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + return_logits=False, + is_decoder=False, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., + config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored + (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` + """ + + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + head_mask=head_mask, + query_embeds=query_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + is_decoder=is_decoder, + ) + + if query_embeds is not None: + sequence_output = outputs[0][:, query_embeds.shape[1] :, :] + prediction_scores = self.cls(sequence_output) + + if return_logits: + return prediction_scores + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() # -100 index = padding token + masked_lm_loss = loss_fct( + prediction_scores.view(-1, self.config.vocab_size), labels.view(-1) + ) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ( + ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + ) + + return MaskedLMOutput( + loss=masked_lm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/client/client.py b/client/client.py new file mode 100644 index 0000000000000000000000000000000000000000..78c0e317da5714afa9eefcc5411d316ceb21b29e --- /dev/null 
+++ b/client/client.py @@ -0,0 +1,12 @@ +import requests + +url = 'http://127.0.0.1:8000/generate' + + + +payload = {'inputs': 'Who are you?', "parameters":{"max_new_tokens": 4}} +headers = {'Content-Type': 'application/json'} + +response = requests.post(url, json=payload, headers=headers) + +print(response.text) diff --git a/client/concurrent_client.py b/client/concurrent_client.py new file mode 100644 index 0000000000000000000000000000000000000000..28e148e64ace0224f03c3f0d6072a9ce617ac95a --- /dev/null +++ b/client/concurrent_client.py @@ -0,0 +1,70 @@ +from pprint import pprint +from PIL import Image +import os +import time +import requests +import base64 +import numpy as np +from io import BytesIO +import concurrent.futures + +NUM_SEGMENTS = 10 + +tic = time.time() +video_dir = '/home/ubuntu/shared_storage/images' +frames = [(os.path.splitext(item)[0], os.path.join(video_dir, item)) for item in os.listdir(video_dir)] +frames = [item[1] for item in sorted(frames, key=lambda x: x[0])] +indices = np.linspace(start=0, stop=len(frames)-1, num=NUM_SEGMENTS).astype(int) +image_paths = [frames[ind] for ind in indices] + +request = {} +byte_images = [] + +def image_path_handler(image_path): + img = Image.open(image_path) + byte_io = BytesIO() + img.save(byte_io, format='PNG') + encoded_image = base64.b64encode(byte_io.getvalue()).decode('utf-8') + return encoded_image + +with concurrent.futures.ThreadPoolExecutor(16) as executor: + byte_images = list(executor.map(image_path_handler, image_paths)) + +payload = { + "images": byte_images, + "parameters": { + "max_new_tokens": 90, + "top_k": 4, + "top_p": None, + "temperature": 0.01, + "no_repeat_ngram_size": None, + } + } + +IP = '127.0.0.1' +headers = {'Content-Type': 'application/json'} +tic1 = time.time() + +def loop_request(port): + url = f'http://{IP}:{port}/generate' + response = requests.post(url, json=payload, headers=headers) + return response.text + +num = 12 +ports = [] +for port in range(num): + 
ports.append(str(8000 + port)) + +# with concurrent.futures.ProcessPoolExecutor(num) as executor: +# results = list(executor.map(loop_request, ports)) +# toc = time.time() +# pprint(results) +with concurrent.futures.ThreadPoolExecutor(num) as executor: + results = [executor.submit(loop_request, port) for port in ports] + +toc = time.time() +for result in results: + print(result.result()) + +print('stage 1', tic1 - tic) +print('stage 2', toc - tic1) diff --git a/client/concurrent_client_batch.py b/client/concurrent_client_batch.py new file mode 100644 index 0000000000000000000000000000000000000000..d6f41e8de476231a9fea02464a1e10dd65503bd5 --- /dev/null +++ b/client/concurrent_client_batch.py @@ -0,0 +1,70 @@ +from pprint import pprint +from PIL import Image +import os +import time +import requests +import base64 +import numpy as np +from io import BytesIO +import concurrent.futures + +NUM_SEGMENTS = 10 + +tic = time.time() +video_dir = '/home/ubuntu/shared_storage/images' +frames = [(os.path.splitext(item)[0], os.path.join(video_dir, item)) for item in os.listdir(video_dir)] +frames = [item[1] for item in sorted(frames, key=lambda x: x[0])] +indices = np.linspace(start=0, stop=len(frames)-1, num=NUM_SEGMENTS).astype(int) +image_paths = [frames[ind] for ind in indices] + +request = {} +byte_images = [] + +def image_path_handler(image_path): + img = Image.open(image_path) + byte_io = BytesIO() + img.save(byte_io, format='PNG') + encoded_image = base64.b64encode(byte_io.getvalue()).decode('utf-8') + return encoded_image + +with concurrent.futures.ThreadPoolExecutor(16) as executor: + byte_images = list(executor.map(image_path_handler, image_paths)) + +payload = { + "images": byte_images * 16, + "parameters": { + "max_new_tokens": 90, + "top_k": 4, + "top_p": None, + "temperature": 0.01, + "no_repeat_ngram_size": None, + } + } + +IP = '127.0.0.1' +headers = {'Content-Type': 'application/json'} +tic1 = time.time() + +def loop_request(port): + url = 
f'http://{IP}:{port}/generate' + response = requests.post(url, json=payload, headers=headers) + return response.text + +num = 6 +ports = [] +for port in range(num): + ports.append(str(8000 + port)) + +# with concurrent.futures.ProcessPoolExecutor(num) as executor: +# results = list(executor.map(loop_request, ports)) +# toc = time.time() +# pprint(results) +with concurrent.futures.ThreadPoolExecutor(num) as executor: + results = [executor.submit(loop_request, port) for port in ports] + +toc = time.time() +for result in results: + print(result.result()) + +print('stage 1', tic1 - tic) +print('stage 2', toc - tic1) diff --git a/client/embeds_client.py b/client/embeds_client.py new file mode 100644 index 0000000000000000000000000000000000000000..0e24da324a8a2fa05cdac31396fd75ebf516d4c5 --- /dev/null +++ b/client/embeds_client.py @@ -0,0 +1,14 @@ +import json +import requests +import torch +import numpy as np + +image_embeds_saved = np.load('input_embeds.npy') +payload = {'inputs': image_embeds_saved.tolist()} + +url = 'http://127.0.0.1:8000/generate' +headers = {'Content-Type': 'application/json'} + +response = requests.post(url, json=payload, headers=headers) + +print(response.text) diff --git a/client/embeds_save.py b/client/embeds_save.py new file mode 100644 index 0000000000000000000000000000000000000000..a91b8fd4ee51a1607ea7f549c4734cf4a7d61a3d --- /dev/null +++ b/client/embeds_save.py @@ -0,0 +1,34 @@ +import numpy as np +import torch.nn.functional as F +from transformers import AutoTokenizer +from transformers_neuronx import MistralForSampling, GQA, NeuronConfig + +# Set sharding strategy for GQA to be shard over heads +neuron_config = NeuronConfig( + group_query_attention=GQA.SHARD_OVER_HEADS +) + +# Create and compile the Neuron model +model_neuron = MistralForSampling.from_pretrained('mistralai/Mistral-7B-Instruct-v0.2', amp='bf16', batch_size=1, tp_degree=2, n_positions=2048, neuron_config=neuron_config) +model_neuron.to_neuron() + +tokenizer = 
AutoTokenizer.from_pretrained('mistralai/Mistral-7B-Instruct-v0.2') +tokenizer.pad_token_id = tokenizer.eos_token_id + +input_prompt = 'Who are you?' + +input_prompt = "[INST] " + input_prompt + " [/INST]" +encoded_input = tokenizer(input_prompt, return_tensors='pt') +original_input_ids = encoded_input.input_ids +input_ids_length = original_input_ids.shape[1] +power_of_length = 64 +while power_of_length < input_ids_length: + power_of_length *= 2 +padding_size = ((input_ids_length - 1) // 64 + 1) * power_of_length +padding_gap = padding_size - input_ids_length +padded_input_ids = F.pad(original_input_ids, (padding_gap, 0), value=tokenizer.pad_token_id) + +input_embeds = model_neuron.chkpt_model.model.embed_tokens(padded_input_ids) + +input_embeds_np = input_embeds.detach().numpy() +np.save('./input_embeds.npy', input_embeds_np) diff --git a/client/images_client.py b/client/images_client.py new file mode 100644 index 0000000000000000000000000000000000000000..4e48d3f27e3009b31707a11000b4c02308ce3780 --- /dev/null +++ b/client/images_client.py @@ -0,0 +1,54 @@ +from PIL import Image +import os +import time +import requests +import base64 +import numpy as np +from io import BytesIO +import concurrent.futures + +NUM_SEGMENTS = 10 + +tic = time.time() +video_dir = '/home/ubuntu/shared_storage/images' +frames = [(os.path.splitext(item)[0], os.path.join(video_dir, item)) for item in os.listdir(video_dir)] +frames = [item[1] for item in sorted(frames, key=lambda x: x[0])] +indices = np.linspace(start=0, stop=len(frames)-1, num=NUM_SEGMENTS).astype(int) +image_paths = [frames[ind] for ind in indices] + +request = {} +byte_images = [] + +def image_path_handler(image_path): + img = Image.open(image_path) + byte_io = BytesIO() + img.save(byte_io, format='PNG') + encoded_image = base64.b64encode(byte_io.getvalue()).decode('utf-8') + return encoded_image + +with concurrent.futures.ThreadPoolExecutor() as executor: + byte_images = list(executor.map(image_path_handler, image_paths)) 
+ +payload = { + "images": byte_images, + "parameters": { + "max_new_tokens": 90, + "top_k": 4, + "top_p": None, + "temperature": 0.01, + "no_repeat_ngram_size": None, + } + } + +IP = '127.0.0.1' +url = f'http://{IP}:8000/generate' +headers = {'Content-Type': 'application/json'} + +tic1 = time.time() + +response = requests.post(url, json=payload, headers=headers) +toc = time.time() +print(response.text) + +print('stage 1', tic1 - tic) +print('stage 2', toc - tic1) diff --git a/client/images_client_batch.py b/client/images_client_batch.py new file mode 100644 index 0000000000000000000000000000000000000000..3bdbfd7a90e01b2701053365bfaf51e8ea2093a3 --- /dev/null +++ b/client/images_client_batch.py @@ -0,0 +1,54 @@ +from PIL import Image +import os +import time +import requests +import base64 +import numpy as np +from io import BytesIO +import concurrent.futures + +NUM_SEGMENTS = 10 + +tic = time.time() +video_dir = '/home/ubuntu/shared_storage/images' +frames = [(os.path.splitext(item)[0], os.path.join(video_dir, item)) for item in os.listdir(video_dir)] +frames = [item[1] for item in sorted(frames, key=lambda x: x[0])] +indices = np.linspace(start=0, stop=len(frames)-1, num=NUM_SEGMENTS).astype(int) +image_paths = [frames[ind] for ind in indices] + +request = {} +byte_images = [] + +def image_path_handler(image_path): + img = Image.open(image_path) + byte_io = BytesIO() + img.save(byte_io, format='PNG') + encoded_image = base64.b64encode(byte_io.getvalue()).decode('utf-8') + return encoded_image + +with concurrent.futures.ThreadPoolExecutor() as executor: + byte_images = list(executor.map(image_path_handler, image_paths)) + +payload = { + "images": byte_images * 16, + "parameters": { + "max_new_tokens": 90, + "top_k": 4, + "top_p": None, + "temperature": 0.01, + "no_repeat_ngram_size": None, + } + } + +IP = '127.0.0.1' +url = f'http://{IP}:8000/generate' +headers = {'Content-Type': 'application/json'} + +tic1 = time.time() + +response = requests.post(url, 
json=payload, headers=headers) +toc = time.time() +print(response.text) + +print('stage 1', tic1 - tic) +print('stage 2', toc - tic1) diff --git a/client/images_client_laplace.py b/client/images_client_laplace.py new file mode 100644 index 0000000000000000000000000000000000000000..a9cb63d01e3aba65570bbf11923cd5a6a7ab101d --- /dev/null +++ b/client/images_client_laplace.py @@ -0,0 +1,52 @@ +import os +import random +import requests +import base64 +import time +from io import BytesIO +import numpy as np + +from PIL import Image +import msgpack +from laplace import Client +from io import BytesIO + + +def select_frames(input_frames, num_segments=10): + indices = np.linspace(start=0, stop=len(input_frames) - 1, num=num_segments).astype(int) + frames = [input_frames[ind] for ind in indices] + return frames + +client = Client("sd://data.tns.masp_inf2?cluster=default", timeout=100) + +video_dir = './v12044gd0000cl5c6rfog65i2eoqcqig' +frames = [(os.path.splitext(item)[0], os.path.join(video_dir, item)) for item in os.listdir(video_dir)] +frames = [item[1] for item in sorted(frames, key=lambda x: int(x[0]))] + +out_frames = select_frames(frames) +request = {} +byte_images = [] +for image_path in out_frames: + img = Image.open(image_path) + byte_io = BytesIO() + img.save(byte_io, format='PNG') + byte_images.append(byte_io.getvalue()) + +# Step 4: Pack the byte data with msgpack +packed_data = msgpack.packb(byte_images) +request['images'] = [packed_data] + +# adjust it if you hope to tune those paramters: +temp = 0.01 +# prompt = 'Did the image contains a lion? 
answer yes or no' +request['temperature'] = [str(temp).encode()] +# request['prompt'] = [prompt.encode()] + +tic = time.time() +results = client.matx_inference(model_name="data-tns-masp-inf2", input_lists=request) +toc = time.time() +print('time to do the inference') +print(toc - tic) +# results = client.matx_inference(model_name="data-tns-masp-model-b",input_lists=request) +outputs = results.output_bytes_lists +print(outputs['output'][0]) diff --git a/conversions/convert2inf2.py b/conversions/convert2inf2.py new file mode 100644 index 0000000000000000000000000000000000000000..b3ead030d2b7adf4e0d280027c9a2cceeb723a99 --- /dev/null +++ b/conversions/convert2inf2.py @@ -0,0 +1,184 @@ +import sys +import torch +import os +import random +from io import BytesIO +import numpy as np +import time +from llava.constants import MM_TOKEN_INDEX, DEFAULT_VIDEO_START_TOKEN, DEFAULT_VIDEO_END_TOKEN, DEFAULT_VIDEO_TOKEN, \ + DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN +from llava.conversation import conv_templates, SeparatorStyle +from llava.utils import disable_torch_init +from llava.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria, process_images_v2 +from llava.model.builder import load_pretrained_model +from llava.model.multimodal_encoder.processor import Blip2ImageTrainProcessor +from llava.model import LlavaMistralForCausalLM +from llava.model.multimodal_encoder.eva_vit import create_eva_vit_g +import torch_neuronx +import torch +import torch_neuronx +from llava.model import LlavaMistralForCausalLM +from transformers import AutoTokenizer +from llava.constants import MM_TOKEN_INDEX, DEFAULT_VIDEO_START_TOKEN, DEFAULT_VIDEO_END_TOKEN, DEFAULT_VIDEO_TOKEN, \ + DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN + +from transformers import CLIPImageProcessor +from PIL import Image +import logging +from qformer_tian import BertConfig, BertModel + + +def select_frames(input_frames, num_segments=10): + 
indices = np.linspace(start=0, stop=len(input_frames) - 1, num=num_segments).astype(int) + + frames = [input_frames[ind] for ind in indices] + + return frames + + +def generate_input_ids(tokenizer): + conv = conv_templates['v1'].copy() + qs = "Describe the following video in detail." + qs = DEFAULT_VIDEO_START_TOKEN + DEFAULT_VIDEO_TOKEN + DEFAULT_VIDEO_END_TOKEN + '\n' + qs + conv.append_message(conv.roles[0], qs) + conv.append_message(conv.roles[1], None) + prompt = conv.get_prompt() + input_ids = tokenizer_image_token(prompt, tokenizer, MM_TOKEN_INDEX, return_tensors='pt').unsqueeze(0) + return input_ids, conv + + +def uniform_sample(frames, num_segments): + indices = np.linspace(start=0, stop=len(frames) - 1, num=num_segments).astype(int) + frames = [frames[ind] for ind in indices] + return frames + + +save_root = '/root/inf2_dir_0531/inf2_weights' +if not os.path.isdir(save_root): + os.makedirs(save_root) + +EVITG_SAVE_PATH = os.path.join(save_root, 'neuron_eva_vit.pth') +LAYERNORM_SAVE_PATH = os.path.join(save_root, 'ln_state_dict.pth') +QUERYTOKEN_SAVE_PATH = os.path.join(save_root, 'query_tokens.pth') +BERT_SAVE_PATH = os.path.join(save_root, 'neuron_bert.pth') +POSITION_ENCODING_SAVE_PATH = os.path.join(save_root, 'frame_position_encoding.pth') +PROJECTOR_SAVE_PATH = os.path.join(save_root, 'projector.pth') +EMBED_TOKENS_SAVE_PATH = os.path.join(save_root, 'embed_tokens.pth') + +model_path = './llava-mistral_videollava_092/' +disable_torch_init() +# print(model_path) +device_map = {"": 'cpu'} +kwargs = {"device_map": device_map} +kwargs['torch_dtype'] = torch.float32 +tokenizer = AutoTokenizer.from_pretrained(model_path) +model = LlavaMistralForCausalLM.from_pretrained( + model_path, + low_cpu_mem_usage=True, + **kwargs +) +tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, DEFAULT_VIDEO_START_TOKEN, DEFAULT_VIDEO_END_TOKEN], + special_tokens=True) +model.resize_token_embeddings(len(tokenizer)) + +model.config.vit_precision == 'fp32' 
+vision_tower = model.get_vision_tower() +vision_tower.is_loaded = False +vision_tower.load_model(device_map=device_map) +vision_tower = vision_tower.to(torch.float32) + +vision_tower = vision_tower.eval() +print('vision tower hiidden size') +print(vision_tower.hidden_size) + +batch_size = 7 +img_size = 224 +input_shape = (batch_size, 3, img_size, img_size) +input_data = torch.zeros(input_shape, dtype=torch.float32) +model_neuronx = torch_neuronx.trace(vision_tower, input_data, compiler_args=["--model-type=transformer"]) +model_neuronx.save(EVITG_SAVE_PATH) + +image_processor = Blip2ImageTrainProcessor( + image_size=model.config.img_size, + is_training=False) + +input_ids, conv = generate_input_ids(tokenizer) +device = torch.device('cpu') +model = model.to(device) +conv_mode = 'v1' +NUM_SEGMENTS = 10 + +video_dir = './v12044gd0000cl5c6rfog65i2eoqcqig' +frames = [(int(os.path.splitext(item)[0]), os.path.join(video_dir, item)) for item in os.listdir(video_dir)] +frames = [item[1] for item in sorted(frames, key=lambda x: x[0])] +images = [Image.open(frame).convert('RGB') for frame in frames] +images = uniform_sample(images, NUM_SEGMENTS) +images = process_images_v2(images, image_processor, model.config) + +# save layer norm +ln_vision = model.get_ln_vision() +ln_vision = ln_vision.eval() +ln_state_dict = ln_vision.state_dict() +torch.save(ln_state_dict, LAYERNORM_SAVE_PATH) + +query_tokens = model.get_query_tokens() +# save query tokens +# attn_mask = torch.ones(image_features.size()[:-1], dtype=torch.long).to(image_features.device) +# query_tokens_inputs = query_tokens.expand(image_features.shape[0], -1, -1) + +query_tokens_state_dict = {'query_tokens': query_tokens.data} +torch.save(query_tokens_state_dict, QUERYTOKEN_SAVE_PATH) +# print('shape of query tokens', query_tokens_inputs.shape) + +# save qformer +qformer = model.get_qformer() +bert_torch = qformer.bert +bert_torch = bert_torch.eval() +bert_torch = bert_torch.to(torch.float32) + +vision_width = 1408 
+cross_attention_freq = 2 +num_query_token = 32 +encoder_config = BertConfig.from_pretrained("bert-base-uncased") +encoder_config.encoder_width = vision_width +# insert cross-attention layer every other block +encoder_config.add_cross_attention = True +encoder_config.cross_attention_freq = cross_attention_freq +encoder_config.query_length = num_query_token +bert = BertModel(encoder_config, add_pooling_layer=False) +bert.embeddings.word_embeddings = None +bert.embeddings.position_embeddings = None + +for layer in bert.encoder.layer: + layer.output = None + layer.intermediate = None + +bert.load_state_dict(bert_torch.state_dict()) +bert = bert.eval() + +input_example = ( + torch.zeros(70, 32, 768, dtype=torch.float32), + torch.zeros(70, 256, 1408, dtype=torch.float32), + torch.zeros(70, 256, dtype=torch.int64) +) +neuron_bert = torch_neuronx.trace(bert, input_example) +neuron_bert.save(BERT_SAVE_PATH) + +# save projector and frame position encoding +frame_position_encoding = model.get_frame_position_encoding() +projector = model.get_model().mm_projector + +frame_position_encoding = frame_position_encoding.eval() +frame_position_encoding = frame_position_encoding.to(torch.float32) + +projector = projector.eval() +projector = projector.to(torch.float32) + +torch.save(frame_position_encoding.state_dict(), POSITION_ENCODING_SAVE_PATH) +torch.save(projector.state_dict(), PROJECTOR_SAVE_PATH) + +# save embed_tokenss +embed_tokens = model.get_model().embed_tokens +embed_tokens = embed_tokens.eval() +embed_tokens = embed_tokens.to(torch.float32) +torch.save(embed_tokens.state_dict(), EMBED_TOKENS_SAVE_PATH) \ No newline at end of file diff --git a/conversions/inference_inf2.py b/conversions/inference_inf2.py new file mode 100644 index 0000000000000000000000000000000000000000..3bc2e0732fb4e17216969768b62b0a001c55ca7e --- /dev/null +++ b/conversions/inference_inf2.py @@ -0,0 +1,217 @@ +import time +import os +import base64 +from io import BytesIO +import concurrent.futures 
+import logging +import numpy as np +from PIL import Image +import torch +import torch.nn as nn +import torch_neuronx +import transformers +from transformers import AutoConfig, AutoTokenizer +from llava.constants import MM_TOKEN_INDEX, DEFAULT_VIDEO_TOKEN, DEFAULT_VIDEO_START_TOKEN, DEFAULT_VIDEO_END_TOKEN +from llava.conversation import conv_templates +from llava.model.utils import LayerNorm +from llava.mm_utils import tokenizer_image_token +from llava.model.multimodal_encoder.processor import Blip2ImageTrainProcessor +from transformers_neuronx import MistralForSampling, GQA, NeuronConfig, QuantizationConfig +from typing import Dict, Optional, Any +from fastapi import FastAPI, Request, HTTPException +# Suppress transformers logging +transformers.logging.set_verbosity_error() +NUM_SEGMENTS = 10 # Number of frame segments to use +WEIGHT_ROOT = '/home/ubuntu/' # Root directory for model weights +CONFIG_DIR = os.path.join(WEIGHT_ROOT, "llava-mistral_videollava_ptv12_250k_samep_only_sopv2_mistralv2_scratch") # Tokenizer directory +NEURON_VISION_PATH = os.path.join(WEIGHT_ROOT, "inf2_weights", "neuron_eva_vit_batch7.pth") # Vision model weights (Neuron format) +NEURON_BERT_PATH = os.path.join(WEIGHT_ROOT, "inf2_weights", "neuron_bert.pth") # BERT model weights (Neuron format) +PROJECTOR_PATH = os.path.join(WEIGHT_ROOT, "inf2_weights", 'projector.pth') # Projector weights +EMBED_TOKEN_PATH = os.path.join(WEIGHT_ROOT, "inf2_weights", 'embed_tokens.pth') # Embedding weights +QUERY_TOKEN_PATH = os.path.join(WEIGHT_ROOT, "inf2_weights", 'query_tokens.pth') +LAYERNORM_SAVE_PATH = os.path.join(WEIGHT_ROOT, "inf2_weights", 'ln_state_dict.pth') +POSITION_ENCODING_SAVE_PATH = os.path.join(WEIGHT_ROOT, "inf2_weights", 'frame_position_encoding.pth') +COMPILED_MODEL_PATH = os.path.join(WEIGHT_ROOT, 'mistral-compiled') +class MistralModel: + def __init__(self, model_name): + self.neuron_config = NeuronConfig(group_query_attention=GQA.SHARD_OVER_HEADS, + 
quant=QuantizationConfig(quant_dtype='s8', dequant_dtype='bf16')) + self.model_name = model_name + self.amp = 'bf16' + self.batch_size = 1 + self.tp_degree = 2 + self.n_positions = 4096 + self.context_length_estimate_start = 2289 + self.context_length_estimate = [self.context_length_estimate_start, 4096] + self.model = MistralForSampling.from_pretrained( + self.model_name, + amp=self.amp, + batch_size=self.batch_size, + tp_degree=self.tp_degree, + n_positions=self.n_positions, + neuron_config=self.neuron_config, + context_length_estimate=self.context_length_estimate + ) + self.model.load(COMPILED_MODEL_PATH) + self.model.to_neuron() + # self.model.save(COMPILED_MODEL_PATH) + self.tokenizer = AutoTokenizer.from_pretrained(model_name) + def generate(self, inputs: torch.tensor, parameters: Optional[Dict[str, Any]] = None) -> str: + try: + max_new_tokens = parameters.get("max_new_tokens", 256) + top_k = parameters.get("top_k", 100) + top_p = parameters.get("top_p", 0.1) + temperature = parameters.get("temperature", 0.1) + no_repeat_ngram_size = parameters.get("no_repeat_ngram_size", 3) + with torch.inference_mode(): + generated_sequence = self.model.sample(inputs, + sequence_length=min(self.n_positions, self.context_length_estimate_start + max_new_tokens), + start_ids=None, top_k=top_k, top_p=top_p, temperature=temperature, + no_repeat_ngram_size=no_repeat_ngram_size) + with concurrent.futures.ThreadPoolExecutor(16) as executor: + decoded_output = list(executor.map(self.tokenizer.decode, generated_sequence)) + generated_text = decoded_output[0].strip("").strip() + return generated_text + except Exception as e: + logging.error(f"Error generating text: {e}") + raise +# Create FastAPI app +app = FastAPI() +mistral_model = MistralModel(model_name=CONFIG_DIR) # Load Mistral model +processor = Blip2ImageTrainProcessor(image_size=224, is_training=False) +def generate_input_ids(tokenizer): + conv = conv_templates['thoth'].copy() # Copy the conversation template + qs = "Please 
describe this video in detail." + qs = DEFAULT_VIDEO_START_TOKEN + DEFAULT_VIDEO_TOKEN + DEFAULT_VIDEO_END_TOKEN + '\n' + qs # Prepend video tokens + conv.append_message(conv.roles[0], qs) # Add the question to the conversation + conv.append_message(conv.roles[1], None) # Add a placeholder for the response + prompt = conv.get_prompt() # Get the conversation prompt + input_ids = tokenizer_image_token(prompt, tokenizer, MM_TOKEN_INDEX, return_tensors='pt').unsqueeze(0) # Tokenize and convert to tensor + return input_ids +def uniform_sample(frames, num_segments): + indices = np.linspace(start=0, stop=len(frames) - 1, num=num_segments).astype( + int) # Calculate indices for uniform sampling + frames = [frames[ind] for ind in indices] # Sample frames based on indices + return frames +def image_open_byteio(byte_data): + output = Image.open(BytesIO(byte_data)).convert('RGB') + return output +def process_anyres_image(image): + new_image = Image.new('RGB', (224, 224), (0, 0, 0)) + new_image.paste(image.resize((224, 224)), (0, 0)) + torch_stack = processor.preprocess(new_image).repeat(7,1,1,1) + return torch_stack +# Load model configuration and tokenizer +config = AutoConfig.from_pretrained(CONFIG_DIR, trust_remote_code=True) +tokenizer = mistral_model.tokenizer +input_ids = generate_input_ids(tokenizer) # Generate input IDs and conversation template +input_ids = input_ids[0].to('cpu') # [token_len] +with torch_neuronx.experimental.neuron_cores_context(start_nc=0, nc_count=2): # Use Neuron cores for inference + vision_module_neuron = torch.jit.load(NEURON_VISION_PATH) +vision_module_neuron = vision_module_neuron.eval() +# Load embedding weights and set up embedding module +padding_idx = config.pad_token_id +embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx) +embed_weight = torch.load(EMBED_TOKEN_PATH) +embed_tokens.load_state_dict(embed_weight) +embed_tokens = embed_tokens.eval() +embed_tokens.to(torch.float16).to('cpu') +#layer norm 
+vision_width = 1408 +ln_vision = LayerNorm(vision_width) +ln_vision_weight = torch.load(LAYERNORM_SAVE_PATH) +ln_vision.load_state_dict(ln_vision_weight) +ln_vision = ln_vision.eval() +ln_vision = ln_vision.to(torch.float32) +num_query_token = 32 +query_tokens = nn.Parameter( + torch.zeros(1, num_query_token, 768) +) +query_tokens.data.normal_(mean=0.0, std=0.02) +query_tokens_weight = torch.load(QUERY_TOKEN_PATH)['query_tokens'] +query_tokens.data = query_tokens_weight +frame_position_encoding = nn.Embedding(10, 768) +frame_position_encoding_weight = torch.load(POSITION_ENCODING_SAVE_PATH) +frame_position_encoding.load_state_dict(frame_position_encoding_weight) +projector = nn.Linear(config.mm_hidden_size, config.hidden_size) +projector_weight = torch.load(PROJECTOR_PATH) +projector.load_state_dict(projector_weight) +neuron_bert = torch.jit.load(NEURON_BERT_PATH) +neuron_bert = neuron_bert.eval() +@app.post("/generate") +async def generate(request: Request) -> Dict[str, str]: + """ + Generate text using the Mistral model. + Args: + request (Request): The incoming request object. + Returns: + Dict[str, str]: A dictionary containing the generated text or an error message. 
+ """ + try: + s1 = time.time() + request_payload = await request.json() + request_payload_keys = request_payload.keys() + s11 = time.time() + print("request_payload_keys time: ", s11-s1) + if "images" in request_payload_keys: # If input is a list of images + packed_data = request_payload.get("images") + s12 = time.time() + print("packed_data time: ", s12-s11) + with concurrent.futures.ThreadPoolExecutor(10) as executor: + unpacked_data = list(executor.map(base64.b64decode, packed_data)) + s13 = time.time() + print("unpacked_data time: ", s13-s12) + with concurrent.futures.ThreadPoolExecutor(10) as executor: + input_images = list(executor.map(image_open_byteio, unpacked_data)) + s14 = time.time() + print("image_open_byteio time: ", s14-s13) + input_images = uniform_sample(input_images, NUM_SEGMENTS) # Sample frames + s15 = time.time() + print("uniform_sample time: ", s15-s14) + with concurrent.futures.ThreadPoolExecutor(10) as executor: + new_images = list(executor.map(process_anyres_image, input_images)) + input_images = torch.stack(new_images, dim=0) + s16 = time.time() + print("process_images_v2 time: ", s16-s15) + print("s1 - input_images time: ", time.time() - s1) + si = time.time() + with torch.inference_mode(): # Enable inference mode + with concurrent.futures.ThreadPoolExecutor(2) as executor: # Use thread pool for parallel processing + image_features_list = list(executor.map(vision_module_neuron, input_images)) + image_features = torch.cat(image_features_list, dim=0) # Concatenate image features + print("si - image_features neuron time: ", time.time() - si) + s2 = time.time() + image_features = ln_vision(image_features) + attn_mask = torch.ones(image_features.size()[:-1], dtype=torch.long).to(image_features.device) + query_tokens_inputs = query_tokens.expand(image_features.shape[0], -1, -1) + image_features = neuron_bert( + query_tokens_inputs.to(torch.float32), + image_features.to(torch.float32), + attn_mask.to(torch.int64) + 
)["last_hidden_state"].to(torch.float32) + frame_ids = torch.arange(input_images.shape[0], dtype=torch.long, device=image_features.device).unsqueeze(1) + frame_ids = frame_ids.repeat(1, input_images.shape[1]).flatten(0, 1) # [num_frames * num_patches] + image_features += frame_position_encoding(frame_ids).unsqueeze(-2) # [num_frames, 1, 768] + projected_features = projector(image_features) + image_features = projected_features.flatten(0, 1) + print(image_features.shape) + image_features.to(device='cpu', dtype=torch.float16) # Convert to float16 and move to CPU + print("s2 - image_features prepare time: ", time.time() - s2) + s3 = time.time() + vision_token_indice = torch.where(input_ids == MM_TOKEN_INDEX)[0][0] # Get index of vision token + pre_text_token = embed_tokens(input_ids[:vision_token_indice]) # Embed tokens before vision token + post_text_token = embed_tokens(input_ids[vision_token_indice + 1:]) # Embed tokens after vision token + print("s3 - text_token time: ", time.time() - s3) + s4 = time.time() + inputs_embeds = torch.cat([pre_text_token, image_features, post_text_token]).unsqueeze(0) # Concatenate input embeddings + print("s4 - inputs time: ", time.time() - s4) + else: + raise HTTPException(status_code=400, detail="Please provide correct input") + s5 = time.time() + parameters = request_payload.get("parameters", {}) # Get additional parameters + generated_text = mistral_model.generate(inputs_embeds, parameters) # Generate text using Mistral model + print("s5 - generated_text time: ", time.time() - s5) + print("total inference time: ", time.time() - si) + return {"generated_text": generated_text} + except Exception as e: + raise HTTPException(status_code=500, detail=f"Error generating text: {str(e)}") \ No newline at end of file diff --git a/conversions/neuron_projector_compile.py b/conversions/neuron_projector_compile.py new file mode 100644 index 0000000000000000000000000000000000000000..d36f49791ee976474fdb84d5bf5a6596935462d5 --- /dev/null +++ 
b/conversions/neuron_projector_compile.py @@ -0,0 +1,18 @@ +import os +import torch +import torch_neuronx + +projector = torch.nn.Linear(768, 4096) +WEIGHT_ROOT = '/root/inf2_dir_0531/' +PROJECTOR_PATH = os.path.join(WEIGHT_ROOT, "inf2_weights", 'projector.pth') + +projector_weight = torch.load(PROJECTOR_PATH) +projector.load_state_dict(projector_weight) + +projector.eval() +example=torch.zeros((70, 32, 768), dtype=torch.float32) +neuron_projector = torch_neuronx.trace(projector, example) + +filename = 'neuron_projector.pt' +filepath = os.path.join(WEIGHT_ROOT, filename, "inf2_weights") +torch.jit.save(neuron_projector, filepath) \ No newline at end of file diff --git a/conversions/qformer_tian.py b/conversions/qformer_tian.py new file mode 100644 index 0000000000000000000000000000000000000000..ffd6fa5cc9321859aef735970adbeb211c95866e --- /dev/null +++ b/conversions/qformer_tian.py @@ -0,0 +1,1151 @@ +""" +Adapted from salesforce@LAVIS. Below is the original copyright: + * Copyright (c) 2023, salesforce.com, inc. + * All rights reserved. 
+ * SPDX-License-Identifier: BSD-3-Clause + * For full license text, see LICENSE.txt file in the repo root or https://opensource.org/licenses/BSD-3-Clause + * By Junnan Li + * Based on huggingface code base + * https://github.com/huggingface/transformers/blob/v4.15.0/src/transformers/models/bert +""" + +import math +import os +import warnings +from dataclasses import dataclass +from typing import Optional, Tuple, Dict, Any + +import torch +from torch import Tensor, device, dtype, nn +import torch.utils.checkpoint +from torch import nn +from torch.nn import CrossEntropyLoss +import torch.nn.functional as F + +from transformers.activations import ACT2FN +from transformers.file_utils import ( + ModelOutput, +) +from transformers.modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPoolingAndCrossAttentions, + CausalLMOutputWithCrossAttentions, + MaskedLMOutput, + MultipleChoiceModelOutput, + NextSentencePredictorOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +) +from transformers.modeling_utils import ( + PreTrainedModel, + apply_chunking_to_forward, + find_pruneable_heads_and_indices, + prune_linear_layer, +) +from transformers.utils import logging +from transformers.models.bert.configuration_bert import BertConfig +from llava.model.utils import LayerNorm + +logger = logging.get_logger(__name__) + + +class BertEmbeddings(nn.Module): + """Construct the embeddings from word and position embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = None + self.position_embeddings = None + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + # self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.LayerNorm = LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + # position_ids (1, 
len position emb) is contiguous in memory and exported when serialized + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)) + ) + self.position_embedding_type = getattr( + config, "position_embedding_type", "absolute" + ) + + self.config = config + + def forward( + self, + input_ids=None, + position_ids=None, + query_embeds=None, + past_key_values_length=0, + ): + seq_length = 0 + + position_ids = self.position_ids[ + :, past_key_values_length : seq_length + past_key_values_length + ].clone() + + embeddings = query_embeds + + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class BertSelfAttention(nn.Module): + def __init__(self, config, is_cross_attention): + super().__init__() + self.config = config + if config.hidden_size % config.num_attention_heads != 0 and not hasattr( + config, "embedding_size" + ): + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention " + "heads (%d)" % (config.hidden_size, config.num_attention_heads) + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + if is_cross_attention: + self.key = nn.Linear(config.encoder_width, self.all_head_size) + self.value = nn.Linear(config.encoder_width, self.all_head_size) + else: + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = getattr( + config, "position_embedding_type", "absolute" + ) + if ( + self.position_embedding_type == "relative_key" + or self.position_embedding_type == "relative_key_query" + ): + self.max_position_embeddings = 
config.max_position_embeddings + self.distance_embedding = nn.Embedding( + 2 * config.max_position_embeddings - 1, self.attention_head_size + ) + self.save_attention = False + + def save_attn_gradients(self, attn_gradients): + self.attn_gradients = attn_gradients + + def get_attn_gradients(self): + return self.attn_gradients + + def save_attention_map(self, attention_map): + self.attention_map = attention_map + + def get_attention_map(self): + return self.attention_map + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + ( + self.num_attention_heads, + self.attention_head_size, + ) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. 
+ is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + mixed_query_layer = self.query(hidden_states) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + if ( + self.position_embedding_type == "relative_key" + or self.position_embedding_type == "relative_key_query" + ): + seq_length = hidden_states.size()[1] + position_ids_l = torch.arange( + seq_length, dtype=torch.long, device=hidden_states.device + ).view(-1, 1) + position_ids_r = torch.arange( + seq_length, dtype=torch.long, device=hidden_states.device + ).view(1, -1) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding( + distance + self.max_position_embeddings - 1 + ) + positional_embedding = positional_embedding.to( + dtype=query_layer.dtype + ) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum( + "bhld,lrd->bhlr", query_layer, positional_embedding + ) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum( + 
"bhld,lrd->bhlr", query_layer, positional_embedding + ) + relative_position_scores_key = torch.einsum( + "bhrd,lrd->bhlr", key_layer, positional_embedding + ) + attention_scores = ( + attention_scores + + relative_position_scores_query + + relative_position_scores_key + ) + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(dim=-1)(attention_scores.to(torch.float32)).to(attention_scores.dtype) + + if is_cross_attention and self.save_attention: + self.save_attention_map(attention_probs) + attention_probs.register_hook(self.save_attn_gradients) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs_dropped = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs_dropped = attention_probs_dropped * head_mask + + context_layer = torch.matmul(attention_probs_dropped, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(*new_context_layer_shape) + + outputs = ( + (context_layer, attention_probs) if output_attentions else (context_layer,) + ) + + outputs = outputs + (past_key_value,) + return outputs + + +class BertSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + # self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.LayerNorm = LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, 
hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertAttention(nn.Module): + def __init__(self, config, is_cross_attention=False): + super().__init__() + self.self = BertSelfAttention(config, is_cross_attention) + self.output = BertSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, + self.self.num_attention_heads, + self.self.attention_head_size, + self.pruned_heads, + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = ( + self.self.attention_head_size * self.self.num_attention_heads + ) + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + + outputs = (attention_output,) + self_outputs[ + 1: + ] # add attentions if we output them + return outputs + + +class BertIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn 
= ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class BertOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + # self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.LayerNorm = LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertLayer(nn.Module): + def __init__(self, config, layer_num): + super().__init__() + self.config = config + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = BertAttention(config) + self.layer_num = layer_num + if ( + self.config.add_cross_attention + and layer_num % self.config.cross_attention_freq == 0 + ): + self.crossattention = BertAttention( + config, is_cross_attention=self.config.add_cross_attention + ) + self.has_cross_attention = True + else: + self.has_cross_attention = False + self.intermediate = BertIntermediate(config) + self.output = BertOutput(config) + + self.intermediate_query = BertIntermediate(config) + self.output_query = BertOutput(config) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + query_length=0, + ): + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = ( + past_key_value[:2] if past_key_value is not None else None + ) + 
self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + outputs = self_attention_outputs[1:-1] + + present_key_value = self_attention_outputs[-1] + + if query_length > 0: + query_attention_output = attention_output[:, :query_length, :] + + if self.has_cross_attention: + assert ( + encoder_hidden_states is not None + ), "encoder_hidden_states must be given for cross-attention layers" + cross_attention_outputs = self.crossattention( + query_attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + output_attentions=output_attentions, + ) + query_attention_output = cross_attention_outputs[0] + outputs = ( + outputs + cross_attention_outputs[1:-1] + ) # add cross attentions if we output attention weights + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk_query, + self.chunk_size_feed_forward, + self.seq_len_dim, + query_attention_output, + ) + if attention_output.shape[1] > query_length: + layer_output_text = apply_chunking_to_forward( + self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output[:, query_length:, :], + ) + layer_output = torch.cat([layer_output, layer_output_text], dim=1) + else: + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output, + ) + outputs = (layer_output,) + outputs + + outputs = outputs + (present_key_value,) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + def feed_forward_chunk_query(self, attention_output): + intermediate_output = self.intermediate_query(attention_output) + layer_output = 
self.output_query(intermediate_output, attention_output) + return layer_output + + +class BertEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList( + [BertLayer(config, i) for i in range(config.num_hidden_layers)] + ) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + query_length=0, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = ( + () if output_attentions and self.config.add_cross_attention else None + ) + + next_decoder_cache = () if use_cache else None + + for i in range(self.config.num_hidden_layers): + layer_module = self.layer[i] + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None + + if getattr(self.config, "gradient_checkpointing", False) and self.training: + + if use_cache: + logger.warn( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
+ ) + use_cache = False + + def create_custom_forward(module): + def custom_forward(*inputs): + return module( + *inputs, past_key_value, output_attentions, query_length + ) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer_module), + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + query_length, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +class BertPooler(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. 
+ first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class BertPredictionHeadTransform(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + # self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.LayerNorm = LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class BertLMPredictionHead(nn.Module): + def __init__(self, config): + super().__init__() + self.transform = BertPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + + # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` + self.decoder.bias = self.bias + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +class BertOnlyMLMHead(nn.Module): + def __init__(self, config): + super().__init__() + self.predictions = BertLMPredictionHead(config) + + def forward(self, sequence_output): + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +class BertPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
+ """ + + config_class = BertConfig + base_model_prefix = "bert" + _keys_to_ignore_on_load_missing = [r"position_ids"] + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Embedding)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + +class BertModel(BertPreTrainedModel): + """ + The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of + cross-attention is added between the self-attention layers, following the architecture described in `Attention is + all you need `__ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, + Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. + argument and :obj:`add_cross_attention` set to :obj:`True`; an :obj:`encoder_hidden_states` is then expected as an + input to the forward pass. + """ + + def __init__(self, config, add_pooling_layer=False): + super().__init__(config) + self.config = config + + self.embeddings = BertEmbeddings(config) + + self.encoder = BertEncoder(config) + + self.pooler = BertPooler(config) if add_pooling_layer else None + + self.init_weights() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. 
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + def get_extended_attention_mask( + self, + attention_mask: Tensor, + input_shape: Tuple[int], + device: device, + is_decoder: bool, + has_query: bool = False, + ) -> Tensor: + """ + Makes broadcastable attention and causal masks so that future and masked tokens are ignored. + + Arguments: + attention_mask (:obj:`torch.Tensor`): + Mask with ones indicating tokens to attend to, zeros for tokens to ignore. + input_shape (:obj:`Tuple[int]`): + The shape of the input to the model. + device: (:obj:`torch.device`): + The device of the input to the model. + + Returns: + :obj:`torch.Tensor` The extended attention mask, with a the same dtype as :obj:`attention_mask.dtype`. + """ + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
+ if attention_mask.dim() == 3: + extended_attention_mask = attention_mask[:, None, :, :] + elif attention_mask.dim() == 2: + # Provided a padding mask of dimensions [batch_size, seq_length] + # - if the model is a decoder, apply a causal mask in addition to the padding mask + # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] + if is_decoder: + batch_size, seq_length = input_shape + + seq_ids = torch.arange(seq_length, device=device) + causal_mask = ( + seq_ids[None, None, :].repeat(batch_size, seq_length, 1) + <= seq_ids[None, :, None] + ) + + # add a prefix ones mask to the causal mask + # causal and attention masks must have same type with pytorch version < 1.3 + causal_mask = causal_mask.to(attention_mask.dtype) + + if causal_mask.shape[1] < attention_mask.shape[1]: + prefix_seq_len = attention_mask.shape[1] - causal_mask.shape[1] + if has_query: # UniLM style attention mask + causal_mask = torch.cat( + [ + torch.zeros( + (batch_size, prefix_seq_len, seq_length), + device=device, + dtype=causal_mask.dtype, + ), + causal_mask, + ], + axis=1, + ) + causal_mask = torch.cat( + [ + torch.ones( + (batch_size, causal_mask.shape[1], prefix_seq_len), + device=device, + dtype=causal_mask.dtype, + ), + causal_mask, + ], + axis=-1, + ) + extended_attention_mask = ( + causal_mask[:, None, :, :] * attention_mask[:, None, None, :] + ) + else: + extended_attention_mask = attention_mask[:, None, None, :] + else: + raise ValueError( + "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format( + input_shape, attention_mask.shape + ) + ) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. 
+ extended_attention_mask = extended_attention_mask.to( + dtype=self.dtype + ) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + return extended_attention_mask + + def forward( + self, + query_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + ): + r""" + encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` + (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` + instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. + use_cache (:obj:`bool`, `optional`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`). 
+ """ + output_attentions = self.config.output_attentions + + output_hidden_states = self.config.output_hidden_states + + return_dict = True + + use_cache = False + + input_ids = None + + position_ids = None + + # past_key_values_length + past_key_values_length = 0 + + query_length = query_embeds.shape[1] + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + query_embeds=query_embeds, + past_key_values_length=past_key_values_length, + ) + + input_shape = embedding_output.size()[:-1] + batch_size = input_shape[-2] + seq_length = input_shape[-1] + device = embedding_output.device + + attention_mask = None + attention_mask = torch.ones( + ((batch_size, seq_length + past_key_values_length)), device=device + ) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + is_decoder = False + extended_attention_mask = self.get_extended_attention_mask( + attention_mask, input_shape, device, is_decoder + ) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + print("encoder_hidden_states.size(): ", encoder_hidden_states.size()) + encoder_batch_size = encoder_hidden_states.size()[-3] + encoder_sequence_length = encoder_hidden_states.size()[-2] + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + print("encoder_hidden_shape: ", encoder_hidden_shape) + + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = None + head_mask = 
self.get_head_mask(head_mask, self.config.num_hidden_layers) + + past_key_values = None + + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + query_length=query_length, + ) + sequence_output = encoder_outputs[0] + pooled_output = None + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + +class BertLMHeadModel(BertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r"pooler"] + _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] + + def __init__(self, config): + super().__init__(config) + + self.bert = BertModel(config, add_pooling_layer=False) + self.cls = BertOnlyMLMHead(config) + + self.init_weights() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + def forward( + self, + input_ids=None, + attention_mask=None, + position_ids=None, + head_mask=None, + query_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + past_key_values=None, + use_cache=True, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + return_logits=False, + is_decoder=True, + reduction="mean", + ): + r""" + encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of 
the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in + ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are + ignored (masked), the loss is only computed for the tokens with labels n ``[0, ..., config.vocab_size]`` + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` + (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` + instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. + use_cache (:obj:`bool`, `optional`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`). 
+ Returns: + Example:: + >>> from transformers import BertTokenizer, BertLMHeadModel, BertConfig + >>> import torch + >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased') + >>> config = BertConfig.from_pretrained("bert-base-cased") + >>> model = BertLMHeadModel.from_pretrained('bert-base-cased', config=config) + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + >>> prediction_logits = outputs.logits + """ + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + if labels is not None: + use_cache = False + if past_key_values is not None: + query_embeds = None + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + head_mask=head_mask, + query_embeds=query_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + is_decoder=is_decoder, + ) + + sequence_output = outputs[0] + if query_embeds is not None: + sequence_output = outputs[0][:, query_embeds.shape[1] :, :] + + prediction_scores = self.cls(sequence_output) + + if return_logits: + return prediction_scores[:, :-1, :].contiguous() + + lm_loss = None + if labels is not None: + # we are doing next-token prediction; shift prediction scores and input ids by one + shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous() + labels = labels[:, 1:].contiguous() + loss_fct = CrossEntropyLoss(reduction=reduction, label_smoothing=0.1) + lm_loss = loss_fct( + shifted_prediction_scores.view(-1, self.config.vocab_size), + labels.view(-1), + ) + if reduction == "none": + lm_loss = lm_loss.view(prediction_scores.size(0), -1).sum(1) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((lm_loss,) + output) if lm_loss is not None 
else output + + return CausalLMOutputWithCrossAttentions( + loss=lm_loss, + logits=prediction_scores, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + def prepare_inputs_for_generation( + self, input_ids, query_embeds, past=None, attention_mask=None, **model_kwargs + ): + # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly + if attention_mask is None: + attention_mask = input_ids.new_ones(input_ids.shape) + query_mask = input_ids.new_ones(query_embeds.shape[:-1]) + attention_mask = torch.cat([query_mask, attention_mask], dim=-1) + + # cut decoder_input_ids if past is used + if past is not None: + input_ids = input_ids[:, -1:] + + return { + "input_ids": input_ids, + "query_embeds": query_embeds, + "attention_mask": attention_mask, + "past_key_values": past, + "encoder_hidden_states": model_kwargs.get("encoder_hidden_states", None), + "encoder_attention_mask": model_kwargs.get("encoder_attention_mask", None), + "is_decoder": True, + } + + def _reorder_cache(self, past, beam_idx): + reordered_past = () + for layer_past in past: + reordered_past += ( + tuple( + past_state.index_select(0, beam_idx) for past_state in layer_past + ), + ) + return reordered_past + + +class BertForMaskedLM(BertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r"pooler"] + _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] + + def __init__(self, config): + super().__init__(config) + + self.bert = BertModel(config, add_pooling_layer=False) + self.cls = BertOnlyMLMHead(config) + + self.init_weights() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + def forward( + self, + input_ids=None, + attention_mask=None, + position_ids=None, + head_mask=None, + 
query_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + return_logits=False, + is_decoder=False, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., + config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored + (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` + """ + + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + head_mask=head_mask, + query_embeds=query_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + is_decoder=is_decoder, + ) + + if query_embeds is not None: + sequence_output = outputs[0][:, query_embeds.shape[1] :, :] + prediction_scores = self.cls(sequence_output) + + if return_logits: + return prediction_scores + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() # -100 index = padding token + masked_lm_loss = loss_fct( + prediction_scores.view(-1, self.config.vocab_size), labels.view(-1) + ) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ( + ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + ) + + return MaskedLMOutput( + loss=masked_lm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/doas-install.sh b/doas-install.sh new file mode 100644 index 0000000000000000000000000000000000000000..fe5db62548166a843b8fabe416e5c0db5c97ab54 --- /dev/null 
+++ b/doas-install.sh @@ -0,0 +1,693 @@ +#!/bin/bash + +set -u + +# If DOAS_UPDATE_ROOT is unset or empty, default it. +DOAS_SCM_REPO_NAME=${DOAS_SCM_REPO_NAME:-security/zti/doas} +DOAS_UPDATE_ROOT="${DOAS_UPDATE_ROOT:-https://tosv.byted.org/obj}" + +DOAS_VALID_ENV_VARS=( "cn" "sg" "us" "i18n" "ttp" ) +DOAS_VALID_TOS_ENV_VARS=( "cn" "sg" "us" "i18n" ) + +usage() { + cat 1>&2 <... Runtime Environment (cn/us/i18n/ttp) +EOF +} + + +main() { + downloader --check + need_cmd uname + need_cmd mktemp + need_cmd chmod + need_cmd mkdir + need_cmd rm + need_cmd rmdir + need_cmd mv + need_cmd cp + + get_architecture || return 1 + local _arch="$RETVAL" + assert_nz "$_arch" "arch" + local _result=${_env:-} + local _version + local _dir + _dir="$(ensure mktemp -d)" + local _file="${_dir}/doas" + local _version_file="${_dir}/doas_version" + local _doas_path + + local _ansi_escapes_are_valid=false + if [ -t 2 ]; then + if [ "${TERM+set}" = 'set' ]; then + case "$TERM" in + xterm*|rxvt*|urxvt*|linux*|vt*) + _ansi_escapes_are_valid=true + ;; + esac + fi + fi + + # check if we have to use /dev/tty to prompt the user + local need_tty=yes + # check if we only to use SCM download doas + local only_scm=no + local _env=${_env:-} + for arg in "$@"; do + case "$arg" in + -h|--help) + usage + exit 0 + ;; + -y) + shift + # user wants to skip the prompt -- we don't need /dev/tty + need_tty=no + ;; + --scm) + shift + # user wants to use SCM to download doas + only_scm=yes + ;; + -e|--env) + shift + _env="$1" + if [[ ! " ${DOAS_VALID_ENV_VARS[*]} " =~ " ${_env} " ]]; then + echo "Invalid environment: ${_env}" >&2 + exit 1 + fi + ;; + *) + ;; + esac + done + + if $_ansi_escapes_are_valid; then + printf "\33[1minfo:\33[0m detecting doas lastest version\n" 1>&2 + else + printf '%s\n' 'info: detecting doas lastest version' 1>&2 + fi + + ensure mkdir -p "$_dir" + + if [ ! 
"$only_scm" = "yes" ]; then + if $_ansi_escapes_are_valid; then + printf "\33[1minfo:\33[0m downloading doas binary from TOS bucket\n" 1>&2 + else + printf '%s\n' 'info: downloading doas binary from TOS bucket' 1>&2 + fi + if tos_downloader "$_env" "$_version_file" "$_file" "$_arch"; then + _result=1 + fi + ignore rm "${_version_file}" + fi + + if [[ -z "$_result" && $_arch = "linux-amd64" ]]; then + if $_ansi_escapes_are_valid; then + printf "\33[1minfo:\33[0m downloading doas binary from SCM\n" 1>&2 + else + printf '%s\n' 'info: downloading doas binary from SCM' 1>&2 + fi + if scm_downloader "$_env" "$_dir" "$_file" "$_arch"; then + _result=1 + fi + fi + + if [ -z "$_result" ]; then + if $_ansi_escapes_are_valid; then + printf "\33[1merror:\33[0m failed to download doas\n" 1>&2 + else + printf '%s\n' 'error: failed to download doas' 1>&2 + fi + exit 1 + fi + + ensure chmod u+x "$_file" + case "$_arch" in + + *darwin*) + if check_cmd xattr; then + xattr -p com.apple.quarantine "${_file}" >/dev/null 2>&1 + status=$? + if [ -z $status ]; then + xattr -d com.apple.quarantine "${_file}" + fi + else + echo "warning: xattr not found, skipping com.apple.quarantine" + fi + ;; + esac + + if [ ! -x "$_file" ]; then + printf '%s\n' "Cannot execute $_file (likely because of mounting /tmp as noexec)." 1>&2 + printf '%s\n' "Please copy the file to a location where you can execute binaries and run ./doas install." 1>&2 + exit 1 + fi + + if [ "$need_tty" = "yes" ]; then + # The installer is going to want to ask for confirmation by + # reading stdin. This script was piped into `sh` though and + # doesn't have stdin to pass to its children. Instead we're going + # to explicitly connect /dev/tty to the installer's stdin. + if [ ! -t 1 ]; then + err "Unable to run interactively. Run with -y to accept defaults, --help for additional options" + fi + + ignore "$_file" install < /dev/tty + else + ignore "$_file" install + fi + + local _retval=$? 
+ + _doas_path=$(command -v doas) + if [[ -z $_doas_path ]]; then + cp -rf "$_file" "${PWD}/doas" + if $_ansi_escapes_are_valid; then + printf "\33[1mError:\33[0m No installation permission, please manually move doas from the current directory to \$PATH\n" 1>&2 + else + printf '%s\n' "Error: No installation permission, please manually move doas from the current directory to \$PATH" 1>&2 + fi + else + if ! cmp_binary "$_file" "$_doas_path"; then + cp -rf "$_file" "${PWD}/doas" + if $_ansi_escapes_are_valid; then + printf "\33[1mError:\33[0m No installation permission, please manually move doas from the current directory to \$PATH\n" 1>&2 + else + printf '%s\n' "Error: No installation permission, please manually move doas from the current directory to \$PATH" 1>&2 + fi + else + if $_ansi_escapes_are_valid; then + printf "\33[1minfo:\33[0m doas installed successfully, current path: %s\n" "$_doas_path" 1>&2 + else + printf '%s\n' "info: doas installed successfully, current path: $_doas_path" 1>&2 + fi + fi + fi + + ignore rm "${_file}" + ignore rmdir "$_dir" + + return "$_retval" +} + +cmp_binary() { + local _status + _status=$(cmp --silent "$1" "$2"; echo $?) + return "$_status" +} +# This is just for indicating that commands' results are being +# intentionally ignored. Usually, because it's being executed +# as part of error handling. +ignore() { + "$@" +} + +assert_nz() { + if [ -z "$1" ]; then err "assert_nz $2"; fi +} + +check_proc() { + # Check for /proc by looking for the /proc/self/exe link + # This is only run on Linux + if ! test -L /proc/self/exe ; then + err "fatal: Unable to find /proc/self/exe. Is /proc mounted? Installation cannot proceed without /proc." + fi +} + +get_bitness() { + need_cmd head + # Architecture detection without dependencies beyond coreutils. + # ELF files start out "\x7fELF", and the following byte is + # 0x01 for 32-bit and + # 0x02 for 64-bit. 
+ # The printf builtin on some shells like dash only supports octal + # escape sequences, so we use those. + local _current_exe_head + _current_exe_head=$(head -c 5 /proc/self/exe ) + if [ "$_current_exe_head" = "$(printf '\177ELF\001')" ]; then + echo 32 + elif [ "$_current_exe_head" = "$(printf '\177ELF\002')" ]; then + echo 64 + else + err "unknown platform bitness" + fi +} + +is_host_amd64_elf() { + need_cmd head + need_cmd tail + # ELF e_machine detection without dependencies beyond coreutils. + # Two-byte field at offset 0x12 indicates the CPU, + # but we're interested in it being 0x3E to indicate amd64, or not that. + local _current_exe_machine + _current_exe_machine=$(head -c 19 /proc/self/exe | tail -c 1) + [ "$_current_exe_machine" = "$(printf '\076')" ] +} + +say() { + printf 'doas-install: %s\n' "$1" +} + +err() { + say "$1" >&2 + exit 1 +} + +need_cmd() { + if ! check_cmd "$1"; then + err "need '$1' (command not found)" + fi +} + +check_cmd() { + command -v "$1" > /dev/null 2>&1 +} + + +# This wraps curl or wget. Try curl first, if not installed, +# use wget instead. +downloader() { + local _dld + local _ciphersuites + local _err + local _status + if check_cmd curl; then + _dld=curl + elif check_cmd wget; then + _dld=wget + else + _dld='curl or wget' # to be used in error message of need_cmd + fi + + if [ "$1" = --check ]; then + need_cmd "$_dld" + elif [ "$_dld" = curl ]; then + get_ciphersuites_for_curl + _ciphersuites="$RETVAL" + if [ -n "$_ciphersuites" ]; then + _err=$(curl --proto '=https' --tlsv1.2 --ciphers "$_ciphersuites" --silent --show-error --fail --location "$1" --output "$2" 2>&1) + _status=$? + else + echo "Warning: Not enforcing strong cipher suites for TLS, this is potentially less secure" + if ! check_help_for "$3" curl --proto --tlsv1.2; then + echo "Warning: Not enforcing TLS v1.2, this is potentially less secure" + _err=$(curl --silent --show-error --fail --location "$1" --output "$2" 2>&1) + _status=$? 
+ else + _err=$(curl --proto '=https' --tlsv1.2 --silent --show-error --fail --location "$1" --output "$2" 2>&1) + _status=$? + fi + fi + if [ -n "$_err" ]; then + echo "$_err" >&2 + if echo "$_err" | grep -q 404$; then + if [[ ! "$1" =~ "CURRENT_VERSION" ]]; then + err "installer for platform '$3' not found, this may be unsupported" + fi + fi + fi + return $_status + elif [ "$_dld" = wget ]; then + get_ciphersuites_for_wget + _ciphersuites="$RETVAL" + if [ -n "$_ciphersuites" ]; then + _err=$(wget --https-only --secure-protocol=TLSv1_2 --ciphers "$_ciphersuites" "$1" -O "$2" 2>&1) + _status=$? + else + echo "Warning: Not enforcing strong cipher suites for TLS, this is potentially less secure" + if ! check_help_for "$3" wget --https-only --secure-protocol; then + echo "Warning: Not enforcing TLS v1.2, this is potentially less secure" + _err=$(wget "$1" -O "$2" 2>&1) + _status=$? + else + _err=$(wget --https-only --secure-protocol=TLSv1_2 "$1" -O "$2" 2>&1) + _status=$? + fi + fi + if [ -n "$_err" ]; then + echo "$_err" >&2 + if echo "$_err" | grep -q ' 404 Not Found$'; then + if [[ ! "$1" =~ "CURRENT_VERSION" ]]; then + err "installer for platform '$3' not found, this may be unsupported" + fi + fi + fi + return $_status + else + err "Unknown downloader" # should not reach here + fi +} + +check_help_for() { + local _arch + local _cmd + local _arg + _arch="$1" + shift + _cmd="$1" + shift + + local _category + if "$_cmd" --help | grep -q 'For all options use the manual or "--help all".'; then + _category="all" + else + _category="" + fi + + case "$_arch" in + + *darwin*) + if check_cmd sw_vers; then + case $(sw_vers -productVersion) in + 10.*) + # If we're running on macOS, older than 10.13, then we always + # fail to find these options to force fallback + if [ "$(sw_vers -productVersion | cut -d. 
-f2)" -lt 13 ]; then + # Older than 10.13 + echo "Warning: Detected macOS platform older than 10.13" + return 1 + fi + ;; + 11.*) + # We assume Big Sur will be OK for now + ;; + *) + # Unknown product version, warn and continue + echo "Warning: Detected unknown macOS major version: $(sw_vers -productVersion)" + echo "Warning TLS capabilities detection may fail" + ;; + esac + fi + ;; + + esac + + for _arg in "$@"; do + if ! "$_cmd" --help "$_category" | grep -q -- "$_arg"; then + return 1 + fi + done + + true # not strictly needed +} + +get_architecture() { + local _ostype _cputype _bitness _arch + _ostype="$(uname -s)" + _cputype="$(uname -m)" + + if [ "$_ostype" = Darwin ] && [ "$_cputype" = i386 ]; then + # Darwin `uname -m` lies + if sysctl hw.optional.x86_64 | grep -q ': 1'; then + _cputype=x86_64 + fi + fi + + case "$_ostype" in + + Linux) + check_proc + _ostype=linux + _bitness=$(get_bitness) + ;; + + FreeBSD) + _ostype=freebsd + ;; + + Darwin) + _ostype=darwin + ;; + + MINGW* | MSYS* | CYGWIN*) + _ostype=windows + ;; + + *) + err "unrecognized OS type: $_ostype" + ;; + + esac + + case "$_cputype" in + + i386 | i486 | i686 | i786 | x86) + _cputype=386 + ;; + + xscale | arm | armv6l) + _cputype=arm + ;; + + aarch64 | arm64) + _cputype=arm64 + ;; + + x86_64 | x86-64 | x64 | amd64) + _cputype=amd64 + ;; + + *) + err "unsupported CPU type: $_cputype" + + esac + + _arch="${_ostype}-${_cputype}" + + RETVAL="$_arch" +} + +# Return cipher suite string specified by user, otherwise return strong TLS 1.2-1.3 cipher suites +# if support by local tools is detected. Detection currently supports these wget backends: +# GnuTLS and OpenSSL (possibly also LibreSSL and BoringSSL). Return value can be empty. 
+get_ciphersuites_for_wget() { + if [ -n "${RUSTUP_TLS_CIPHERSUITES-}" ]; then + # user specified custom cipher suites, assume they know what they're doing + RETVAL="$RUSTUP_TLS_CIPHERSUITES" + return + fi + + local _cs="" + if wget -V | grep -q '\-DHAVE_LIBSSL'; then + # "unspecified" is for arch, allows for possibility old OS using macports, homebrew, etc. + if check_help_for "notspecified" "wget" "TLSv1_2" "--ciphers" "--https-only" "--secure-protocol"; then + _cs=$(get_strong_ciphersuites_for "openssl") + fi + elif wget -V | grep -q '\-DHAVE_LIBGNUTLS'; then + # "unspecified" is for arch, allows for possibility old OS using macports, homebrew, etc. + if check_help_for "notspecified" "wget" "TLSv1_2" "--ciphers" "--https-only" "--secure-protocol"; then + _cs=$(get_strong_ciphersuites_for "gnutls") + fi + fi + + RETVAL="$_cs" +} + + +# Return cipher suite string specified by user, otherwise return strong TLS 1.2-1.3 cipher suites +# if support by local tools is detected. Detection currently supports these curl backends: +# GnuTLS and OpenSSL (possibly also LibreSSL and BoringSSL). Return value can be empty. +get_ciphersuites_for_curl() { + if [ -n "${RUSTUP_TLS_CIPHERSUITES-}" ]; then + # user specified custom cipher suites, assume they know what they're doing + RETVAL="$RUSTUP_TLS_CIPHERSUITES" + return + fi + + local _openssl_syntax="no" + local _gnutls_syntax="no" + local _backend_supported="yes" + if curl -V | grep -q ' OpenSSL/'; then + _openssl_syntax="yes" + elif curl -V | grep -iq ' LibreSSL/'; then + _openssl_syntax="yes" + elif curl -V | grep -iq ' BoringSSL/'; then + _openssl_syntax="yes" + elif curl -V | grep -iq ' GnuTLS/'; then + _gnutls_syntax="yes" + else + _backend_supported="no" + fi + + local _args_supported="no" + if [ "$_backend_supported" = "yes" ]; then + # "unspecified" is for arch, allows for possibility old OS using macports, homebrew, etc. 
+ if check_help_for "notspecified" "curl" "--tlsv1.2" "--ciphers" "--proto"; then + _args_supported="yes" + fi + fi + + local _cs="" + if [ "$_args_supported" = "yes" ]; then + if [ "$_openssl_syntax" = "yes" ]; then + _cs=$(get_strong_ciphersuites_for "openssl") + elif [ "$_gnutls_syntax" = "yes" ]; then + _cs=$(get_strong_ciphersuites_for "gnutls") + fi + fi + + RETVAL="$_cs" +} + +# Run a command that should never fail. If the command fails execution +# will immediately terminate with an error showing the failing +# command. +ensure() { + if ! "$@"; then err "command failed: $*"; fi +} + + +# Return strong TLS 1.2-1.3 cipher suites in OpenSSL or GnuTLS syntax. TLS 1.2 +# excludes non-ECDHE and non-AEAD cipher suites. DHE is excluded due to bad +# DH params often found on servers (see RFC 7919). Sequence matches or is +# similar to Firefox 68 ESR with weak cipher suites disabled via about:config. +# $1 must be openssl or gnutls. +get_strong_ciphersuites_for() { + if [ "$1" = "openssl" ]; then + # OpenSSL is forgiving of unknown values, no problems with TLS 1.3 values on versions that don't support it yet. + echo "TLS_AES_128_GCM_SHA256:TLS_CHACHA20_POLY1305_SHA256:TLS_AES_256_GCM_SHA384:ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384" + elif [ "$1" = "gnutls" ]; then + # GnuTLS isn't forgiving of unknown values, so this may require a GnuTLS version that supports TLS 1.3 even if wget doesn't. + # Begin with SECURE128 (and higher) then remove/add to build cipher suites. Produces same 9 cipher suites as OpenSSL but in slightly different order. 
+ echo "SECURE128:-VERS-SSL3.0:-VERS-TLS1.0:-VERS-TLS1.1:-VERS-DTLS-ALL:-CIPHER-ALL:-MAC-ALL:-KX-ALL:+AEAD:+ECDHE-ECDSA:+ECDHE-RSA:+AES-128-GCM:+CHACHA20-POLY1305:+AES-256-GCM" + fi +} + +get_tos_env_bucket() { + if [ "$1" = "cn" ]; then + echo "doas-user-binary" + elif [ "$1" = "sg" ]; then + echo "doas-user-binary-sg" + elif [ "$1" = "us" ]; then + echo "doas-user-binary-us" + elif [ "$1" = "i18n" ]; then + echo "doas-user-binary-aiso" + elif [ "$1" = "ttp" ]; then + err "TTP is not supported to use TOS bucket downloader" + fi +} + +get_scm_env_url() { + if [ "$1" = "cn" ]; then + echo "https://luban-source.byted.org" + elif [ "$1" = "sg" ]; then + echo "https://luban-source.byted.org" + elif [ "$1" = "us" ]; then + echo "https://luban-source-us.byted.org" + elif [ "$1" = "i18n" ]; then + echo "https://luban-source-us.byted.org" + elif [ "$1" = "ttp" ]; then + echo "https://luban-source.tiktokd.org" + fi +} + +tos_downloader() { + need_cmd gzip + local _retval=${_retval:-} + local _version + local _version_url + local _bucket_name + local _tos_env=$1 + local _version_file=$2 + local _file=$3 + local _arch=$4 + if [[ -z "$1" ]]; then + for _tos_env in "${DOAS_VALID_TOS_ENV_VARS[@]}"; do + _bucket_name=$(get_tos_env_bucket "$_tos_env") + _version_url="${DOAS_UPDATE_ROOT}/${_bucket_name}/CURRENT_VERSION" + if downloader "$_version_url" "$_version_file" "$_arch"; then + _retval="$_tos_env" + break + fi + done + elif [[ "$_tos_env" = "ttp" ]]; then + err "TOS bucket downloader not supports TTP" + else + _bucket_name=$(get_tos_env_bucket "$_tos_env") + _version_url="${DOAS_UPDATE_ROOT}/${_bucket_name}/CURRENT_VERSION" + if downloader "$_version_url" "$_version_file" "$_arch"; then + _retval="$_tos_env" + fi + fi + + if [[ -z "$_retval" ]]; then + echo "Unable to download doas from TOS bucket" + return 1 + fi + + _version="$(cat "$_version_file")" + if [[ -z "$_version" ]]; then + echo "Unable to read version from $_version_file" + return 1 + fi + local 
_url="${DOAS_UPDATE_ROOT}/${_bucket_name}/${_version}/doas-${_arch}.gz" + ensure downloader "$_url" "${_file}.gz" "$_arch" + ensure gzip -d "${_file}.gz" + + true +} + +scm_downloader() { + need_cmd tar + local _retval=${_retval:-} + local _scm_host + local _scm_url + local _compress_ext=".tar.gz" + local _dir=$2 + local _file=$3 + local _arch=$4 + if [[ -z "$1" ]]; then + for _env in "${DOAS_VALID_ENV_VARS[@]}"; do + _scm_host=$(get_scm_env_url "$_env") + _scm_url="${_scm_host}/repository/scm/api/v1/download_latest/?name=${DOAS_SCM_REPO_NAME}" + if downloader "$_scm_url" "${_file}${_compress_ext}" "$_arch"; then + _retval="$_env" + break + fi + done + else + _scm_host=$(get_scm_env_url "$1") + _scm_url="${_scm_host}/repository/scm/api/v1/download_latest/?name=${DOAS_SCM_REPO_NAME}" + if downloader "$_scm_url" "${_file}${_compress_ext}" "$_arch"; then + _retval="$1" + fi + fi + + if [[ -z "$_retval" ]]; then + echo "Unable to download doas from SCM Repo ${DOAS_SCM_REPO_NAME}" + return 1 + fi + + ensure tar zxf "${_file}${_compress_ext}" -C "$_dir" + ensure mv "$_dir/bin/doas" "$_file" + + ignore rm "${_file}${_compress_ext}" + ignore rmdir "$_dir/bin" + ignore rm "$_dir/current_revision" + + true +} + +SUDO_USER=${SUDO_USER:-} +if [ -n "$SUDO_USER" ]; then + SUDO_PATH=$(sudo -Hiu $SUDO_USER printenv PATH) +fi +export SUDO_PATH=${SUDO_PATH:-} + +main "$@" || exit 1 \ No newline at end of file diff --git a/docker-compose-batch.yml b/docker-compose-batch.yml new file mode 100644 index 0000000000000000000000000000000000000000..ab92a3588b48cd8cf369310ade2e3572e984fdaf --- /dev/null +++ b/docker-compose-batch.yml @@ -0,0 +1,96 @@ +services: + masp: + image: masp_inf2_b16 + deploy: + cpuset: "0-31" + container_name: masp + ports: + - 8000:8000 + devices: + - /dev/neuron0 + - /dev/neuron1 + volumes: + - ~/shared_storage/inf2_weights:/home/ubuntu/inf2_weights + - 
~/shared_storage/llava-mistral_videollava_ptv12_250k_samep_only_sopv2_mistralv2_scratch:/home/ubuntu/llava-mistral_videollava_ptv12_250k_samep_only_sopv2_mistralv2_scratch + - ~/shared_storage/llava:/app/llava + - ~/shared_storage/mistral-compiled-bf16-b16:/home/ubuntu/mistral-compiled-bf16-b16 + + masp2: + image: masp_inf2_b16 + deploy: + cpuset: "32-63" + container_name: masp2 + ports: + - 8001:8000 + devices: + - /dev/neuron2 + - /dev/neuron3 + volumes: + - ~/shared_storage/inf2_weights:/home/ubuntu/inf2_weights + - ~/shared_storage/llava-mistral_videollava_ptv12_250k_samep_only_sopv2_mistralv2_scratch:/home/ubuntu/llava-mistral_videollava_ptv12_250k_samep_only_sopv2_mistralv2_scratch + - ~/shared_storage/llava:/app/llava + - ~/shared_storage/mistral-compiled-bf16-b16:/home/ubuntu/mistral-compiled-bf16-b16 + + masp3: + image: masp_inf2_b16 + deploy: + cpuset: "64-95" + container_name: masp3 + ports: + - 8002:8000 + devices: + - /dev/neuron4 + - /dev/neuron5 + volumes: + - ~/shared_storage/inf2_weights:/home/ubuntu/inf2_weights + - ~/shared_storage/llava-mistral_videollava_ptv12_250k_samep_only_sopv2_mistralv2_scratch:/home/ubuntu/llava-mistral_videollava_ptv12_250k_samep_only_sopv2_mistralv2_scratch + - ~/shared_storage/llava:/app/llava + - ~/shared_storage/mistral-compiled-bf16-b16:/home/ubuntu/mistral-compiled-bf16-b16 + + masp4: + image: masp_inf2_b16 + deploy: + cpuset: "96-127" + container_name: masp4 + ports: + - 8003:8000 + devices: + - /dev/neuron6 + - /dev/neuron7 + volumes: + - ~/shared_storage/inf2_weights:/home/ubuntu/inf2_weights + - ~/shared_storage/llava-mistral_videollava_ptv12_250k_samep_only_sopv2_mistralv2_scratch:/home/ubuntu/llava-mistral_videollava_ptv12_250k_samep_only_sopv2_mistralv2_scratch + - ~/shared_storage/llava:/app/llava + - ~/shared_storage/mistral-compiled-bf16-b16:/home/ubuntu/mistral-compiled-bf16-b16 + + masp5: + image: masp_inf2_b16 + deploy: + cpuset: "128-159" + container_name: masp5 + ports: + - 8004:8000 + devices: + - 
/dev/neuron8 + - /dev/neuron9 + volumes: + - ~/shared_storage/inf2_weights:/home/ubuntu/inf2_weights + - ~/shared_storage/llava-mistral_videollava_ptv12_250k_samep_only_sopv2_mistralv2_scratch:/home/ubuntu/llava-mistral_videollava_ptv12_250k_samep_only_sopv2_mistralv2_scratch + - ~/shared_storage/llava:/app/llava + - ~/shared_storage/mistral-compiled-bf16-b16:/home/ubuntu/mistral-compiled-bf16-b16 + + masp6: + image: masp_inf2_b16 + deploy: + cpuset: "160-191" + container_name: masp6 + ports: + - 8005:8000 + devices: + - /dev/neuron10 + - /dev/neuron11 + volumes: + - ~/shared_storage/inf2_weights:/home/ubuntu/inf2_weights + - ~/shared_storage/llava-mistral_videollava_ptv12_250k_samep_only_sopv2_mistralv2_scratch:/home/ubuntu/llava-mistral_videollava_ptv12_250k_samep_only_sopv2_mistralv2_scratch + - ~/shared_storage/llava:/app/llava + - ~/shared_storage/mistral-compiled-bf16-b16:/home/ubuntu/mistral-compiled-bf16-b16 diff --git a/docker-compose-inf2-48xl.yaml b/docker-compose-inf2-48xl.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9ff7a377a8a2f3e8de3d04b39f886e27b0ea1ab2 --- /dev/null +++ b/docker-compose-inf2-48xl.yaml @@ -0,0 +1,172 @@ +version: '3.7' + +services: + masp-0: + image: masp_image:latest + deploy: + resources: + reservations: + cpus: '16' + ports: + - "8000:8000" + volumes: + - ./app:/app + - ./inf2_dir/:/root/inf2_dir/ + devices: + - "/dev/neuron0" + + masp-1: + image: masp_image:latest + deploy: + resources: + reservations: + cpus: '16' + ports: + - "8001:8000" + volumes: + - ./app:/app + - ./inf2_dir/:/root/inf2_dir/ + devices: + - "/dev/neuron1" + + + masp-2: + image: masp_image:latest + deploy: + resources: + reservations: + cpus: '16' + ports: + - "8002:8000" + volumes: + - ./app:/app + - ./inf2_dir/:/root/inf2_dir/ + devices: + - "/dev/neuron2" + + masp-3: + image: masp_image:latest + deploy: + resources: + reservations: + cpus: '16' + ports: + - "8003:8000" + volumes: + - ./app:/app + - ./inf2_dir/:/root/inf2_dir/ + 
devices: + - "/dev/neuron3" + + masp-4: + image: masp_image:latest + deploy: + resources: + reservations: + cpus: '16' + ports: + - "8004:8000" + volumes: + - ./app:/app + - ./inf2_dir/:/root/inf2_dir/ + devices: + - "/dev/neuron4" + + masp-5: + image: masp_image:latest + deploy: + resources: + reservations: + cpus: '16' + ports: + - "8005:8000" + volumes: + - ./app:/app + - ./inf2_dir/:/root/inf2_dir/ + devices: + - "/dev/neuron5" + + masp-6: + image: masp_image:latest + deploy: + resources: + reservations: + cpus: '16' + ports: + - "8006:8000" + volumes: + - ./app:/app + - ./inf2_dir/:/root/inf2_dir/ + devices: + - "/dev/neuron6" + + masp-7: + image: masp_image:latest + deploy: + resources: + reservations: + cpus: '16' + ports: + - "8007:8000" + volumes: + - ./app:/app + - ./inf2_dir/:/root/inf2_dir/ + devices: + - "/dev/neuron7" + + masp-8: + image: masp_image:latest + deploy: + resources: + reservations: + cpus: '16' + ports: + - "8008:8000" + volumes: + - ./app:/app + - ./inf2_dir/:/root/inf2_dir/ + devices: + - "/dev/neuron8" + + masp-9: + image: masp_image:latest + deploy: + resources: + reservations: + cpus: '16' + ports: + - "8009:8000" + volumes: + - ./app:/app + - ./inf2_dir/:/root/inf2_dir/ + devices: + - "/dev/neuron9" + + masp-10: + image: masp_image:latest + deploy: + resources: + reservations: + cpus: '16' + ports: + - "8010:8000" + volumes: + - ./app:/app + - ./inf2_dir/:/root/inf2_dir/ + devices: + - "/dev/neuron10" + + masp-11: + image: masp_image:latest + deploy: + resources: + reservations: + cpus: '16' + ports: + - "8011:8000" + volumes: + - ./app:/app + - ./inf2_dir/:/root/inf2_dir/ + devices: + - "/dev/neuron11" + diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000000000000000000000000000000000000..f2abb5aad1e8303144672ae6e22f1f4bb4bc6531 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,180 @@ +services: + masp: + image: masp_inf2 + deploy: + cpuset: "0-15" + container_name: masp + ports: + - 8000:8000 + 
devices: + - /dev/neuron0 + volumes: + - ~/shared_storage/inf2_weights:/home/ubuntu/inf2_weights + - ~/shared_storage/llava-mistral_videollava_ptv12_250k_samep_only_sopv2_mistralv2_scratch:/home/ubuntu/llava-mistral_videollava_ptv12_250k_samep_only_sopv2_mistralv2_scratch + - ~/shared_storage/llava:/app/llava + - ~/shared_storage/mistral-compiled-bf16:/home/ubuntu/mistral-compiled-bf16 + + masp2: + image: masp_inf2 + deploy: + cpuset: "16-31" + container_name: masp2 + ports: + - 8001:8000 + devices: + - /dev/neuron1 + volumes: + - ~/shared_storage/inf2_weights:/home/ubuntu/inf2_weights + - ~/shared_storage/llava-mistral_videollava_ptv12_250k_samep_only_sopv2_mistralv2_scratch:/home/ubuntu/llava-mistral_videollava_ptv12_250k_samep_only_sopv2_mistralv2_scratch + - ~/shared_storage/llava:/app/llava + - ~/shared_storage/mistral-compiled-bf16:/home/ubuntu/mistral-compiled-bf16 + + masp3: + image: masp_inf2 + deploy: + cpuset: "32-47" + container_name: masp3 + ports: + - 8002:8000 + devices: + - /dev/neuron2 + volumes: + - ~/shared_storage/inf2_weights:/home/ubuntu/inf2_weights + - ~/shared_storage/llava-mistral_videollava_ptv12_250k_samep_only_sopv2_mistralv2_scratch:/home/ubuntu/llava-mistral_videollava_ptv12_250k_samep_only_sopv2_mistralv2_scratch + - ~/shared_storage/llava:/app/llava + - ~/shared_storage/mistral-compiled-bf16:/home/ubuntu/mistral-compiled-bf16 + + masp4: + image: masp_inf2 + deploy: + cpuset: "48-63" + container_name: masp4 + ports: + - 8003:8000 + devices: + - /dev/neuron3 + volumes: + - ~/shared_storage/inf2_weights:/home/ubuntu/inf2_weights + - ~/shared_storage/llava-mistral_videollava_ptv12_250k_samep_only_sopv2_mistralv2_scratch:/home/ubuntu/llava-mistral_videollava_ptv12_250k_samep_only_sopv2_mistralv2_scratch + - ~/shared_storage/llava:/app/llava + - ~/shared_storage/mistral-compiled-bf16:/home/ubuntu/mistral-compiled-bf16 + + masp5: + image: masp_inf2 + deploy: + cpuset: "64-79" + container_name: masp5 + ports: + - 8004:8000 + devices: + - 
/dev/neuron4 + volumes: + - ~/shared_storage/inf2_weights:/home/ubuntu/inf2_weights + - ~/shared_storage/llava-mistral_videollava_ptv12_250k_samep_only_sopv2_mistralv2_scratch:/home/ubuntu/llava-mistral_videollava_ptv12_250k_samep_only_sopv2_mistralv2_scratch + - ~/shared_storage/llava:/app/llava + - ~/shared_storage/mistral-compiled-bf16:/home/ubuntu/mistral-compiled-bf16 + + masp6: + image: masp_inf2 + deploy: + cpuset: "80-95" + container_name: masp6 + ports: + - 8005:8000 + devices: + - /dev/neuron5 + volumes: + - ~/shared_storage/inf2_weights:/home/ubuntu/inf2_weights + - ~/shared_storage/llava-mistral_videollava_ptv12_250k_samep_only_sopv2_mistralv2_scratch:/home/ubuntu/llava-mistral_videollava_ptv12_250k_samep_only_sopv2_mistralv2_scratch + - ~/shared_storage/llava:/app/llava + - ~/shared_storage/mistral-compiled-bf16:/home/ubuntu/mistral-compiled-bf16 + + masp7: + image: masp_inf2 + deploy: + cpuset: "96-111" + container_name: masp7 + ports: + - 8006:8000 + devices: + - /dev/neuron6 + volumes: + - ~/shared_storage/inf2_weights:/home/ubuntu/inf2_weights + - ~/shared_storage/llava-mistral_videollava_ptv12_250k_samep_only_sopv2_mistralv2_scratch:/home/ubuntu/llava-mistral_videollava_ptv12_250k_samep_only_sopv2_mistralv2_scratch + - ~/shared_storage/llava:/app/llava + - ~/shared_storage/mistral-compiled-bf16:/home/ubuntu/mistral-compiled-bf16 + + masp8: + image: masp_inf2 + deploy: + cpuset: "112-127" + container_name: masp8 + ports: + - 8007:8000 + devices: + - /dev/neuron7 + volumes: + - ~/shared_storage/inf2_weights:/home/ubuntu/inf2_weights + - ~/shared_storage/llava-mistral_videollava_ptv12_250k_samep_only_sopv2_mistralv2_scratch:/home/ubuntu/llava-mistral_videollava_ptv12_250k_samep_only_sopv2_mistralv2_scratch + - ~/shared_storage/llava:/app/llava + - ~/shared_storage/mistral-compiled-bf16:/home/ubuntu/mistral-compiled-bf16 + + masp9: + image: masp_inf2 + deploy: + cpuset: "128-143" + container_name: masp9 + ports: + - 8008:8000 + devices: + - 
/dev/neuron8 + volumes: + - ~/shared_storage/inf2_weights:/home/ubuntu/inf2_weights + - ~/shared_storage/llava-mistral_videollava_ptv12_250k_samep_only_sopv2_mistralv2_scratch:/home/ubuntu/llava-mistral_videollava_ptv12_250k_samep_only_sopv2_mistralv2_scratch + - ~/shared_storage/llava:/app/llava + - ~/shared_storage/mistral-compiled-bf16:/home/ubuntu/mistral-compiled-bf16 + + masp10: + image: masp_inf2 + deploy: + cpuset: "144-159" + container_name: masp10 + ports: + - 8009:8000 + devices: + - /dev/neuron9 + volumes: + - ~/shared_storage/inf2_weights:/home/ubuntu/inf2_weights + - ~/shared_storage/llava-mistral_videollava_ptv12_250k_samep_only_sopv2_mistralv2_scratch:/home/ubuntu/llava-mistral_videollava_ptv12_250k_samep_only_sopv2_mistralv2_scratch + - ~/shared_storage/llava:/app/llava + - ~/shared_storage/mistral-compiled-bf16:/home/ubuntu/mistral-compiled-bf16 + + masp11: + image: masp_inf2 + deploy: + cpuset: "160-175" + container_name: masp11 + ports: + - 8010:8000 + devices: + - /dev/neuron10 + volumes: + - ~/shared_storage/inf2_weights:/home/ubuntu/inf2_weights + - ~/shared_storage/llava-mistral_videollava_ptv12_250k_samep_only_sopv2_mistralv2_scratch:/home/ubuntu/llava-mistral_videollava_ptv12_250k_samep_only_sopv2_mistralv2_scratch + - ~/shared_storage/llava:/app/llava + - ~/shared_storage/mistral-compiled-bf16:/home/ubuntu/mistral-compiled-bf16 + + masp12: + image: masp_inf2 + deploy: + cpuset: "176-191" + container_name: masp12 + ports: + - 8011:8000 + devices: + - /dev/neuron11 + volumes: + - ~/shared_storage/inf2_weights:/home/ubuntu/inf2_weights + - ~/shared_storage/llava-mistral_videollava_ptv12_250k_samep_only_sopv2_mistralv2_scratch:/home/ubuntu/llava-mistral_videollava_ptv12_250k_samep_only_sopv2_mistralv2_scratch + - ~/shared_storage/llava:/app/llava + - ~/shared_storage/mistral-compiled-bf16:/home/ubuntu/mistral-compiled-bf16 diff --git a/images/0.jpg b/images/0.jpg new file mode 100644 index 
0000000000000000000000000000000000000000..7277d3ed29d3d30f1b97f22536c20a4cef099490 Binary files /dev/null and b/images/0.jpg differ diff --git a/images/1028.jpg b/images/1028.jpg new file mode 100644 index 0000000000000000000000000000000000000000..3980dfd352b0348458b427affbe5a1972bfb33d0 Binary files /dev/null and b/images/1028.jpg differ diff --git a/images/104.jpg b/images/104.jpg new file mode 100644 index 0000000000000000000000000000000000000000..b82f862811143d8b11d8d75e828e3368228d8b93 Binary files /dev/null and b/images/104.jpg differ diff --git a/images/1058.jpg b/images/1058.jpg new file mode 100644 index 0000000000000000000000000000000000000000..354cc1d011f289913ff6d9d4d20eaf832281b1bc Binary files /dev/null and b/images/1058.jpg differ diff --git a/images/1087.jpg b/images/1087.jpg new file mode 100644 index 0000000000000000000000000000000000000000..fe87e8cdebef4c7fea47911d8e4be7bf377bf68b Binary files /dev/null and b/images/1087.jpg differ diff --git a/images/1117.jpg b/images/1117.jpg new file mode 100644 index 0000000000000000000000000000000000000000..2d39dd1e615b45a2c302ea6433c6e08ccaa242e3 Binary files /dev/null and b/images/1117.jpg differ diff --git a/images/1147.jpg b/images/1147.jpg new file mode 100644 index 0000000000000000000000000000000000000000..d77038917deb509aeac860f13b438ca6127c6a87 Binary files /dev/null and b/images/1147.jpg differ diff --git a/images/1177.jpg b/images/1177.jpg new file mode 100644 index 0000000000000000000000000000000000000000..def465137f20774bc83dc32172eb80ae054f9849 Binary files /dev/null and b/images/1177.jpg differ diff --git a/images/1207.jpg b/images/1207.jpg new file mode 100644 index 0000000000000000000000000000000000000000..ab3122352d9a4a08bb1aeb5a6d54be7e29391046 Binary files /dev/null and b/images/1207.jpg differ diff --git a/images/1236.jpg b/images/1236.jpg new file mode 100644 index 0000000000000000000000000000000000000000..046bebc7d324b39a9741647537a9b4f4502a23f4 Binary files /dev/null and 
b/images/1236.jpg differ diff --git a/images/1266.jpg b/images/1266.jpg new file mode 100644 index 0000000000000000000000000000000000000000..46652e06e91b1bf48a23bbaecf3deebb4b17fcca Binary files /dev/null and b/images/1266.jpg differ diff --git a/images/1281.jpg b/images/1281.jpg new file mode 100644 index 0000000000000000000000000000000000000000..73e079989235b771f90dbc8d9100362952089522 Binary files /dev/null and b/images/1281.jpg differ diff --git a/images/134.jpg b/images/134.jpg new file mode 100644 index 0000000000000000000000000000000000000000..68fb3892bb605dbaebefa3d10e58e8fe7a2a9cb3 Binary files /dev/null and b/images/134.jpg differ diff --git a/images/15.jpg b/images/15.jpg new file mode 100644 index 0000000000000000000000000000000000000000..ffbe6d21af5d4bfec992027b6f6e2c268eefa0aa Binary files /dev/null and b/images/15.jpg differ diff --git a/images/164.jpg b/images/164.jpg new file mode 100644 index 0000000000000000000000000000000000000000..cb79d624b81831afb5006db4f141b7374b3156ec Binary files /dev/null and b/images/164.jpg differ diff --git a/images/194.jpg b/images/194.jpg new file mode 100644 index 0000000000000000000000000000000000000000..d186a42f8cd18958d71c4e38c50f1f791ffa65b9 Binary files /dev/null and b/images/194.jpg differ diff --git a/images/223.jpg b/images/223.jpg new file mode 100644 index 0000000000000000000000000000000000000000..35147da2a1be90df438a527e169c9691dcb741ba Binary files /dev/null and b/images/223.jpg differ diff --git a/images/253.jpg b/images/253.jpg new file mode 100644 index 0000000000000000000000000000000000000000..1032d88a414215fd16b17c8a648ed68dcb2fd7ae Binary files /dev/null and b/images/253.jpg differ diff --git a/images/283.jpg b/images/283.jpg new file mode 100644 index 0000000000000000000000000000000000000000..b411c7840647948f413abaa3153934d79fbe8eee Binary files /dev/null and b/images/283.jpg differ diff --git a/images/313.jpg b/images/313.jpg new file mode 100644 index 
0000000000000000000000000000000000000000..29312a09cb911fc6929a2789d1534ea4ae82d6d6 Binary files /dev/null and b/images/313.jpg differ diff --git a/images/343.jpg b/images/343.jpg new file mode 100644 index 0000000000000000000000000000000000000000..d6a265c5e0e629f60861291e79ffdb4002684629 Binary files /dev/null and b/images/343.jpg differ diff --git a/images/372.jpg b/images/372.jpg new file mode 100644 index 0000000000000000000000000000000000000000..09e59f16742ef66a7b3a6aed6d7d519007bcbb7d Binary files /dev/null and b/images/372.jpg differ diff --git a/images/402.jpg b/images/402.jpg new file mode 100644 index 0000000000000000000000000000000000000000..53bd089a0bad195d3b5e5a0b9ab67129dbd52341 Binary files /dev/null and b/images/402.jpg differ diff --git a/images/432.jpg b/images/432.jpg new file mode 100644 index 0000000000000000000000000000000000000000..9e7ae3193a9036e19061dd810a6f819b7feb8407 Binary files /dev/null and b/images/432.jpg differ diff --git a/images/45.jpg b/images/45.jpg new file mode 100644 index 0000000000000000000000000000000000000000..79124bb8bff256932a6119e4410b7dc6a0cbf5d8 Binary files /dev/null and b/images/45.jpg differ diff --git a/images/462.jpg b/images/462.jpg new file mode 100644 index 0000000000000000000000000000000000000000..1a37a0255319353d00f38d141484d77b5d74146d Binary files /dev/null and b/images/462.jpg differ diff --git a/images/492.jpg b/images/492.jpg new file mode 100644 index 0000000000000000000000000000000000000000..72c3f4c615c1926f91bfa283a64a4cb4d6c5c1c8 Binary files /dev/null and b/images/492.jpg differ diff --git a/images/521.jpg b/images/521.jpg new file mode 100644 index 0000000000000000000000000000000000000000..2393208ba3cd83dab58254e92d932fc37e109f37 Binary files /dev/null and b/images/521.jpg differ diff --git a/images/551.jpg b/images/551.jpg new file mode 100644 index 0000000000000000000000000000000000000000..677451ade8ebad4768ea013cce34b1cfbaca74fb Binary files /dev/null and b/images/551.jpg differ diff --git 
a/images/581.jpg b/images/581.jpg new file mode 100644 index 0000000000000000000000000000000000000000..efc54fe368248f03dc85087ea16f2af888f596a3 Binary files /dev/null and b/images/581.jpg differ diff --git a/images/611.jpg b/images/611.jpg new file mode 100644 index 0000000000000000000000000000000000000000..1b6488e58a7f1ec37508908c9bb87bca73a76db2 Binary files /dev/null and b/images/611.jpg differ diff --git a/images/641.jpg b/images/641.jpg new file mode 100644 index 0000000000000000000000000000000000000000..c4205ef8169182c823f58275d7ddb74765944083 Binary files /dev/null and b/images/641.jpg differ diff --git a/images/670.jpg b/images/670.jpg new file mode 100644 index 0000000000000000000000000000000000000000..6ca31722f112dc535d472ea6ef3576ddec3982cb Binary files /dev/null and b/images/670.jpg differ diff --git a/images/700.jpg b/images/700.jpg new file mode 100644 index 0000000000000000000000000000000000000000..011dce508845d53b61adcb18a106122a418f2a94 Binary files /dev/null and b/images/700.jpg differ diff --git a/images/730.jpg b/images/730.jpg new file mode 100644 index 0000000000000000000000000000000000000000..e70f79fba8f3536ee93f064db0f34f5165186fff Binary files /dev/null and b/images/730.jpg differ diff --git a/images/74.jpg b/images/74.jpg new file mode 100644 index 0000000000000000000000000000000000000000..97c0d98a25b656a6ac4443482a0b3331851ece26 Binary files /dev/null and b/images/74.jpg differ diff --git a/images/760.jpg b/images/760.jpg new file mode 100644 index 0000000000000000000000000000000000000000..41855e43e1449266337c5327aa9ec6d39e3a0334 Binary files /dev/null and b/images/760.jpg differ diff --git a/images/789.jpg b/images/789.jpg new file mode 100644 index 0000000000000000000000000000000000000000..aad54cd54150688cbcc669d4edfddbe1fd512224 Binary files /dev/null and b/images/789.jpg differ diff --git a/images/819.jpg b/images/819.jpg new file mode 100644 index 0000000000000000000000000000000000000000..9d9b01f24a50a7f95a678406f083341839044f46 
Binary files /dev/null and b/images/819.jpg differ diff --git a/images/849.jpg b/images/849.jpg new file mode 100644 index 0000000000000000000000000000000000000000..8d233a8f2d39fb32ad26d7c60ec587d5adb41b9c Binary files /dev/null and b/images/849.jpg differ diff --git a/images/879.jpg b/images/879.jpg new file mode 100644 index 0000000000000000000000000000000000000000..5adf2eeef0f944120581fbdf11e8fba9ab4eb547 Binary files /dev/null and b/images/879.jpg differ diff --git a/images/909.jpg b/images/909.jpg new file mode 100644 index 0000000000000000000000000000000000000000..d139a095df3a831faee813e861b520a4367a5edf Binary files /dev/null and b/images/909.jpg differ diff --git a/images/938.jpg b/images/938.jpg new file mode 100644 index 0000000000000000000000000000000000000000..3d7b98b73d240a3422ef73ebbaecaa98f0075978 Binary files /dev/null and b/images/938.jpg differ diff --git a/images/968.jpg b/images/968.jpg new file mode 100644 index 0000000000000000000000000000000000000000..7bc1e605822b977a3dbceef0b00dbc51d108b6b9 Binary files /dev/null and b/images/968.jpg differ diff --git a/images/998.jpg b/images/998.jpg new file mode 100644 index 0000000000000000000000000000000000000000..7fad4fce23ebeca2bd980722b3ad033754ae7066 Binary files /dev/null and b/images/998.jpg differ diff --git a/inf2_dir_0531/inf2_weights/embed_tokens.pth b/inf2_dir_0531/inf2_weights/embed_tokens.pth new file mode 100644 index 0000000000000000000000000000000000000000..de12e5f076c86eccd80a63b770b69416fcfa990a --- /dev/null +++ b/inf2_dir_0531/inf2_weights/embed_tokens.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e540be5a36a0c42513bc779c842bae8f90d013271a51e901866ec3e5a6a37499 +size 524354362 diff --git a/inf2_dir_0531/inf2_weights/frame_position_encoding.pth b/inf2_dir_0531/inf2_weights/frame_position_encoding.pth new file mode 100644 index 0000000000000000000000000000000000000000..50a2bfa4a91688c5ec68235385f5ee7388578cac --- /dev/null +++ 
b/inf2_dir_0531/inf2_weights/frame_position_encoding.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b374401a5cd0b9d31d990833c6a6b87608d402268e614774d9c15cf78fbbab09 +size 31643 diff --git a/inf2_dir_0531/inf2_weights/ln_state_dict.pth b/inf2_dir_0531/inf2_weights/ln_state_dict.pth new file mode 100644 index 0000000000000000000000000000000000000000..7e49bbb2c26e3b63767e9d5c9b852700715cbe35 --- /dev/null +++ b/inf2_dir_0531/inf2_weights/ln_state_dict.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:511d03bc3b7dfb5c512088dd89aabcbff65ddc2eaf26fe2a16874662c455d81d +size 12351 diff --git a/inf2_dir_0531/inf2_weights/neuron_bert.pth b/inf2_dir_0531/inf2_weights/neuron_bert.pth new file mode 100644 index 0000000000000000000000000000000000000000..c92b711415220339ea538927de826d9eed269c05 --- /dev/null +++ b/inf2_dir_0531/inf2_weights/neuron_bert.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a375f5a1423c8e351f87be42dae4d10318ae1729b31a0dd1af7071cb4e3996b +size 175611386 diff --git a/inf2_dir_0531/inf2_weights/neuron_eva_vit_batch1.pth b/inf2_dir_0531/inf2_weights/neuron_eva_vit_batch1.pth new file mode 100644 index 0000000000000000000000000000000000000000..e116b4982ca50cde13e7470540981d73e27a1f62 --- /dev/null +++ b/inf2_dir_0531/inf2_weights/neuron_eva_vit_batch1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8dcc0cc7c9b0fcb615667050d8f2914c7f1982c6d1d0772ce9fd9e37f41e8a4d +size 1613470163 diff --git a/inf2_dir_0531/inf2_weights/neuron_projector.pt b/inf2_dir_0531/inf2_weights/neuron_projector.pt new file mode 100644 index 0000000000000000000000000000000000000000..b91c5e77351bd9a907db1a58608ce6590b5ebbfd --- /dev/null +++ b/inf2_dir_0531/inf2_weights/neuron_projector.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc050956172c0c8cf2acb1d901b0e51fef80d91c9ee7d4ac3c97b685de1e61dd +size 5155301 diff --git 
a/inf2_dir_0531/inf2_weights/projector.pth b/inf2_dir_0531/inf2_weights/projector.pth new file mode 100644 index 0000000000000000000000000000000000000000..635ff0f888e8e4e8c923503b2107e3bf6b9485e5 --- /dev/null +++ b/inf2_dir_0531/inf2_weights/projector.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b631a715b0d9846c14d5c0afa59fbd787bd4558775035be34c49641a39c79a60 +size 12600367 diff --git a/inf2_dir_0531/inf2_weights/query_tokens.pth b/inf2_dir_0531/inf2_weights/query_tokens.pth new file mode 100644 index 0000000000000000000000000000000000000000..8bd4cdac7e03df1d575ec90ce3845565e4de57c9 --- /dev/null +++ b/inf2_dir_0531/inf2_weights/query_tokens.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ac363c2f57f08fa3e0a58608ef62f494abbac60997e370fdc7eff45847f1bfb +size 99066 diff --git a/inf2_dir_0531/llava-mistral_videollava_092/added_tokens.json b/inf2_dir_0531/llava-mistral_videollava_092/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..6026716cf93a162a66d98c4e111c6cfe6bbe2162 --- /dev/null +++ b/inf2_dir_0531/llava-mistral_videollava_092/added_tokens.json @@ -0,0 +1,6 @@ +{ + "": 32001, + "": 32000, + "": 32003, + "": 32002 +} diff --git a/inf2_dir_0531/llava-mistral_videollava_092/config.json b/inf2_dir_0531/llava-mistral_videollava_092/config.json new file mode 100644 index 0000000000000000000000000000000000000000..f39ec3b0a2c5f12824fb8e42f35a0f152c8dfc93 --- /dev/null +++ b/inf2_dir_0531/llava-mistral_videollava_092/config.json @@ -0,0 +1,52 @@ +{ + "_name_or_path": "mistralai/Mistral-7B-Instruct-v0.2", + "adapter_module_name": null, + "adapter_module_path": null, + "architectures": [ + "LlavaMistralForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": 1, + "drop_path_rate": 0.0, + "eos_token_id": 2, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "anyres", + "image_grid_pinpoints": "[(448, 672), (672, 
448)]", + "img_size": 224, + "initializer_range": 0.02, + "intermediate_size": 14336, + "lora_lr": null, + "max_num_segments": 10, + "max_position_embeddings": 32768, + "mm_hidden_size": 768, + "mm_patch_merge_type": "flat", + "mm_projector_lr": null, + "mm_projector_type": "linear", + "mm_use_patch_token": false, + "mm_use_start_end": true, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "eva-vit-g", + "model_type": "mistral", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "num_query_token": 32, + "qformer_model_path": "./model/blip2_pretrained_flant5xxl.pth", + "rms_norm_eps": 1e-05, + "rope_theta": 1000000.0, + "sliding_window": null, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 4096, + "tokenizer_padding_side": "right", + "torch_dtype": "bfloat16", + "transformers_version": "4.38.1", + "tune_mm_mlp_adapter": false, + "use_cache": true, + "use_mm_proj": true, + "vit_model_path": "./model/eva_vit_g.pth", + "vit_precision": "fp16", + "vocab_size": 32004 +} diff --git a/inf2_dir_0531/llava-mistral_videollava_092/generation_config.json b/inf2_dir_0531/llava-mistral_videollava_092/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0b1a8731b0741d4ed33026951c009b0c8d4ed399 --- /dev/null +++ b/inf2_dir_0531/llava-mistral_videollava_092/generation_config.json @@ -0,0 +1,6 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "eos_token_id": 2, + "transformers_version": "4.38.1" +} diff --git a/inf2_dir_0531/llava-mistral_videollava_092/model-00001-of-00004.safetensors b/inf2_dir_0531/llava-mistral_videollava_092/model-00001-of-00004.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c283dd8a91a0da206ef6f71bc0bb1bfd6db1e716 --- /dev/null +++ b/inf2_dir_0531/llava-mistral_videollava_092/model-00001-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:8becb0a18e25f80c7d433e18b6522c5f654b15004d7a9e14ec8ce6684632f366 +size 4943244352 diff --git a/inf2_dir_0531/llava-mistral_videollava_092/model-00002-of-00004.safetensors b/inf2_dir_0531/llava-mistral_videollava_092/model-00002-of-00004.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e3ce38a66fa00fcab17f73dfddcb77e3da6b5122 --- /dev/null +++ b/inf2_dir_0531/llava-mistral_videollava_092/model-00002-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a34c59963dfda05577778c87df73338c31ec4153faf1cf2947f74928c857a01 +size 4999819336 diff --git a/inf2_dir_0531/llava-mistral_videollava_092/model-00003-of-00004.safetensors b/inf2_dir_0531/llava-mistral_videollava_092/model-00003-of-00004.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..61b1732d0b0edd3258b68b56317d3bac32091aa5 --- /dev/null +++ b/inf2_dir_0531/llava-mistral_videollava_092/model-00003-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:643a25a8859581262fbdc9c896c3565dda2c6081850090c7293a9aa33942e332 +size 4999653632 diff --git a/inf2_dir_0531/llava-mistral_videollava_092/model-00004-of-00004.safetensors b/inf2_dir_0531/llava-mistral_videollava_092/model-00004-of-00004.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3cf8d78cd6798cc70f0a65e2a26d78ee95b1e30d --- /dev/null +++ b/inf2_dir_0531/llava-mistral_videollava_092/model-00004-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3537e14b71851870b8d34a632ac62b08174f7ae91803c4d99377b044bde627a3 +size 1729385856 diff --git a/inf2_dir_0531/llava-mistral_videollava_092/model.safetensors.index.json b/inf2_dir_0531/llava-mistral_videollava_092/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..fc1bf2f90d7e35272b8d1c7af74979df82b28f57 --- /dev/null +++ 
b/inf2_dir_0531/llava-mistral_videollava_092/model.safetensors.index.json @@ -0,0 +1,1070 @@ +{ + "metadata": { + "total_size": 16671968000 + }, + "weight_map": { + "lm_head.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.embeddings.LayerNorm.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.embeddings.LayerNorm.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.embeddings.position_ids": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.0.attention.output.LayerNorm.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.0.attention.output.LayerNorm.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.0.attention.output.dense.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.0.attention.output.dense.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.0.attention.self.key.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.0.attention.self.key.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.0.attention.self.query.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.0.attention.self.query.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.0.attention.self.value.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.0.attention.self.value.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.0.crossattention.output.LayerNorm.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.0.crossattention.output.LayerNorm.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.0.crossattention.output.dense.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.0.crossattention.output.dense.weight": "model-00004-of-00004.safetensors", + 
"model.Qformer.bert.encoder.layer.0.crossattention.self.key.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.0.crossattention.self.key.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.0.crossattention.self.query.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.0.crossattention.self.query.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.0.crossattention.self.value.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.0.crossattention.self.value.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.0.intermediate_query.dense.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.0.intermediate_query.dense.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.0.output_query.LayerNorm.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.0.output_query.LayerNorm.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.0.output_query.dense.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.0.output_query.dense.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.1.attention.output.LayerNorm.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.1.attention.output.LayerNorm.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.1.attention.output.dense.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.1.attention.output.dense.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.1.attention.self.key.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.1.attention.self.key.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.1.attention.self.query.bias": "model-00004-of-00004.safetensors", 
+ "model.Qformer.bert.encoder.layer.1.attention.self.query.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.1.attention.self.value.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.1.attention.self.value.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.1.intermediate_query.dense.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.1.intermediate_query.dense.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.1.output_query.LayerNorm.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.1.output_query.LayerNorm.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.1.output_query.dense.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.1.output_query.dense.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.10.attention.output.LayerNorm.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.10.attention.output.LayerNorm.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.10.attention.output.dense.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.10.attention.output.dense.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.10.attention.self.key.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.10.attention.self.key.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.10.attention.self.query.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.10.attention.self.query.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.10.attention.self.value.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.10.attention.self.value.weight": "model-00004-of-00004.safetensors", + 
"model.Qformer.bert.encoder.layer.10.crossattention.output.LayerNorm.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.10.crossattention.output.LayerNorm.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.10.crossattention.output.dense.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.10.crossattention.output.dense.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.10.crossattention.self.key.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.10.crossattention.self.key.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.10.crossattention.self.query.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.10.crossattention.self.query.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.10.crossattention.self.value.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.10.crossattention.self.value.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.10.intermediate_query.dense.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.10.intermediate_query.dense.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.10.output_query.LayerNorm.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.10.output_query.LayerNorm.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.10.output_query.dense.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.10.output_query.dense.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.11.attention.output.LayerNorm.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.11.attention.output.LayerNorm.weight": "model-00004-of-00004.safetensors", + 
"model.Qformer.bert.encoder.layer.11.attention.output.dense.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.11.attention.output.dense.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.11.attention.self.key.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.11.attention.self.key.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.11.attention.self.query.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.11.attention.self.query.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.11.attention.self.value.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.11.attention.self.value.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.11.intermediate_query.dense.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.11.intermediate_query.dense.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.11.output_query.LayerNorm.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.11.output_query.LayerNorm.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.11.output_query.dense.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.11.output_query.dense.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.2.attention.output.LayerNorm.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.2.attention.output.LayerNorm.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.2.attention.output.dense.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.2.attention.output.dense.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.2.attention.self.key.bias": "model-00004-of-00004.safetensors", + 
"model.Qformer.bert.encoder.layer.2.attention.self.key.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.2.attention.self.query.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.2.attention.self.query.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.2.attention.self.value.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.2.attention.self.value.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.2.crossattention.output.LayerNorm.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.2.crossattention.output.LayerNorm.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.2.crossattention.output.dense.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.2.crossattention.output.dense.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.2.crossattention.self.key.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.2.crossattention.self.key.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.2.crossattention.self.query.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.2.crossattention.self.query.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.2.crossattention.self.value.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.2.crossattention.self.value.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.2.intermediate_query.dense.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.2.intermediate_query.dense.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.2.output_query.LayerNorm.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.2.output_query.LayerNorm.weight": 
"model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.2.output_query.dense.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.2.output_query.dense.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.3.attention.output.LayerNorm.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.3.attention.output.LayerNorm.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.3.attention.output.dense.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.3.attention.output.dense.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.3.attention.self.key.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.3.attention.self.key.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.3.attention.self.query.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.3.attention.self.query.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.3.attention.self.value.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.3.attention.self.value.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.3.intermediate_query.dense.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.3.intermediate_query.dense.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.3.output_query.LayerNorm.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.3.output_query.LayerNorm.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.3.output_query.dense.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.3.output_query.dense.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.4.attention.output.LayerNorm.bias": 
"model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.4.attention.output.LayerNorm.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.4.attention.output.dense.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.4.attention.output.dense.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.4.attention.self.key.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.4.attention.self.key.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.4.attention.self.query.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.4.attention.self.query.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.4.attention.self.value.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.4.attention.self.value.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.4.crossattention.output.LayerNorm.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.4.crossattention.output.LayerNorm.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.4.crossattention.output.dense.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.4.crossattention.output.dense.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.4.crossattention.self.key.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.4.crossattention.self.key.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.4.crossattention.self.query.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.4.crossattention.self.query.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.4.crossattention.self.value.bias": "model-00004-of-00004.safetensors", + 
"model.Qformer.bert.encoder.layer.4.crossattention.self.value.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.4.intermediate_query.dense.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.4.intermediate_query.dense.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.4.output_query.LayerNorm.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.4.output_query.LayerNorm.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.4.output_query.dense.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.4.output_query.dense.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.5.attention.output.LayerNorm.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.5.attention.output.LayerNorm.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.5.attention.output.dense.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.5.attention.output.dense.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.5.attention.self.key.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.5.attention.self.key.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.5.attention.self.query.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.5.attention.self.query.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.5.attention.self.value.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.5.attention.self.value.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.5.intermediate_query.dense.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.5.intermediate_query.dense.weight": "model-00004-of-00004.safetensors", + 
"model.Qformer.bert.encoder.layer.5.output_query.LayerNorm.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.5.output_query.LayerNorm.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.5.output_query.dense.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.5.output_query.dense.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.6.attention.output.LayerNorm.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.6.attention.output.LayerNorm.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.6.attention.output.dense.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.6.attention.output.dense.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.6.attention.self.key.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.6.attention.self.key.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.6.attention.self.query.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.6.attention.self.query.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.6.attention.self.value.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.6.attention.self.value.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.6.crossattention.output.LayerNorm.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.6.crossattention.output.LayerNorm.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.6.crossattention.output.dense.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.6.crossattention.output.dense.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.6.crossattention.self.key.bias": 
"model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.6.crossattention.self.key.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.6.crossattention.self.query.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.6.crossattention.self.query.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.6.crossattention.self.value.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.6.crossattention.self.value.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.6.intermediate_query.dense.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.6.intermediate_query.dense.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.6.output_query.LayerNorm.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.6.output_query.LayerNorm.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.6.output_query.dense.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.6.output_query.dense.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.7.attention.output.LayerNorm.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.7.attention.output.LayerNorm.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.7.attention.output.dense.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.7.attention.output.dense.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.7.attention.self.key.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.7.attention.self.key.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.7.attention.self.query.bias": "model-00004-of-00004.safetensors", + 
"model.Qformer.bert.encoder.layer.7.attention.self.query.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.7.attention.self.value.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.7.attention.self.value.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.7.intermediate_query.dense.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.7.intermediate_query.dense.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.7.output_query.LayerNorm.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.7.output_query.LayerNorm.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.7.output_query.dense.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.7.output_query.dense.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.8.attention.output.LayerNorm.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.8.attention.output.LayerNorm.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.8.attention.output.dense.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.8.attention.output.dense.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.8.attention.self.key.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.8.attention.self.key.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.8.attention.self.query.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.8.attention.self.query.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.8.attention.self.value.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.8.attention.self.value.weight": "model-00004-of-00004.safetensors", + 
"model.Qformer.bert.encoder.layer.8.crossattention.output.LayerNorm.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.8.crossattention.output.LayerNorm.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.8.crossattention.output.dense.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.8.crossattention.output.dense.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.8.crossattention.self.key.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.8.crossattention.self.key.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.8.crossattention.self.query.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.8.crossattention.self.query.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.8.crossattention.self.value.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.8.crossattention.self.value.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.8.intermediate_query.dense.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.8.intermediate_query.dense.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.8.output_query.LayerNorm.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.8.output_query.LayerNorm.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.8.output_query.dense.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.8.output_query.dense.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.9.attention.output.LayerNorm.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.9.attention.output.LayerNorm.weight": "model-00004-of-00004.safetensors", + 
"model.Qformer.bert.encoder.layer.9.attention.output.dense.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.9.attention.output.dense.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.9.attention.self.key.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.9.attention.self.key.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.9.attention.self.query.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.9.attention.self.query.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.9.attention.self.value.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.9.attention.self.value.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.9.intermediate_query.dense.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.9.intermediate_query.dense.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.9.output_query.LayerNorm.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.9.output_query.LayerNorm.weight": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.9.output_query.dense.bias": "model-00004-of-00004.safetensors", + "model.Qformer.bert.encoder.layer.9.output_query.dense.weight": "model-00004-of-00004.safetensors", + "model.embed_tokens.weight": "model-00001-of-00004.safetensors", + "model.frame_position_encoding.weight": "model-00004-of-00004.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + 
"model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.input_layernorm.weight": 
"model-00002-of-00004.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.post_attention_layernorm.weight": 
"model-00002-of-00004.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.20.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.21.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.v_proj.weight": 
"model-00003-of-00004.safetensors", + "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.up_proj.weight": 
"model-00003-of-00004.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.input_layernorm.weight": "model-00003-of-00004.safetensors", + 
"model.layers.31.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + 
"model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + 
"model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.ln_vision.bias": "model-00004-of-00004.safetensors", + "model.ln_vision.weight": "model-00004-of-00004.safetensors", + "model.mm_projector.bias": "model-00004-of-00004.safetensors", + "model.mm_projector.weight": "model-00004-of-00004.safetensors", + "model.norm.weight": "model-00003-of-00004.safetensors", + "model.query_tokens": "model-00001-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.0.attn.proj.bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.0.attn.proj.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.0.attn.q_bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.0.attn.qkv.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.0.attn.v_bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.0.mlp.fc1.bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.0.mlp.fc1.weight": 
"model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.0.mlp.fc2.bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.0.mlp.fc2.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.0.norm1.bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.0.norm1.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.0.norm2.bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.0.norm2.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.1.attn.proj.bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.1.attn.proj.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.1.attn.q_bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.1.attn.qkv.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.1.attn.v_bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.1.mlp.fc1.bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.1.mlp.fc1.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.1.mlp.fc2.bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.1.mlp.fc2.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.1.norm1.bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.1.norm1.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.1.norm2.bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.1.norm2.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.10.attn.proj.bias": "model-00003-of-00004.safetensors", + 
"model.vision_tower.vision_tower.blocks.10.attn.proj.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.10.attn.q_bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.10.attn.qkv.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.10.attn.v_bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.10.mlp.fc1.bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.10.mlp.fc1.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.10.mlp.fc2.bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.10.mlp.fc2.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.10.norm1.bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.10.norm1.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.10.norm2.bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.10.norm2.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.11.attn.proj.bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.11.attn.proj.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.11.attn.q_bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.11.attn.qkv.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.11.attn.v_bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.11.mlp.fc1.bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.11.mlp.fc1.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.11.mlp.fc2.bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.11.mlp.fc2.weight": 
"model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.11.norm1.bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.11.norm1.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.11.norm2.bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.11.norm2.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.12.attn.proj.bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.12.attn.proj.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.12.attn.q_bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.12.attn.qkv.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.12.attn.v_bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.12.mlp.fc1.bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.12.mlp.fc1.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.12.mlp.fc2.bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.12.mlp.fc2.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.12.norm1.bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.12.norm1.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.12.norm2.bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.12.norm2.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.13.attn.proj.bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.13.attn.proj.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.13.attn.q_bias": "model-00003-of-00004.safetensors", + 
"model.vision_tower.vision_tower.blocks.13.attn.qkv.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.13.attn.v_bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.13.mlp.fc1.bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.13.mlp.fc1.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.13.mlp.fc2.bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.13.mlp.fc2.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.13.norm1.bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.13.norm1.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.13.norm2.bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.13.norm2.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.14.attn.proj.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.14.attn.proj.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.14.attn.q_bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.14.attn.qkv.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.14.attn.v_bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.14.mlp.fc1.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.14.mlp.fc1.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.14.mlp.fc2.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.14.mlp.fc2.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.14.norm1.bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.14.norm1.weight": 
"model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.14.norm2.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.14.norm2.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.15.attn.proj.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.15.attn.proj.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.15.attn.q_bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.15.attn.qkv.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.15.attn.v_bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.15.mlp.fc1.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.15.mlp.fc1.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.15.mlp.fc2.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.15.mlp.fc2.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.15.norm1.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.15.norm1.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.15.norm2.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.15.norm2.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.16.attn.proj.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.16.attn.proj.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.16.attn.q_bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.16.attn.qkv.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.16.attn.v_bias": "model-00004-of-00004.safetensors", + 
"model.vision_tower.vision_tower.blocks.16.mlp.fc1.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.16.mlp.fc1.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.16.mlp.fc2.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.16.mlp.fc2.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.16.norm1.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.16.norm1.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.16.norm2.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.16.norm2.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.17.attn.proj.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.17.attn.proj.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.17.attn.q_bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.17.attn.qkv.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.17.attn.v_bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.17.mlp.fc1.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.17.mlp.fc1.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.17.mlp.fc2.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.17.mlp.fc2.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.17.norm1.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.17.norm1.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.17.norm2.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.17.norm2.weight": 
"model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.18.attn.proj.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.18.attn.proj.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.18.attn.q_bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.18.attn.qkv.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.18.attn.v_bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.18.mlp.fc1.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.18.mlp.fc1.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.18.mlp.fc2.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.18.mlp.fc2.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.18.norm1.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.18.norm1.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.18.norm2.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.18.norm2.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.19.attn.proj.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.19.attn.proj.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.19.attn.q_bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.19.attn.qkv.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.19.attn.v_bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.19.mlp.fc1.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.19.mlp.fc1.weight": "model-00004-of-00004.safetensors", + 
"model.vision_tower.vision_tower.blocks.19.mlp.fc2.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.19.mlp.fc2.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.19.norm1.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.19.norm1.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.19.norm2.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.19.norm2.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.2.attn.proj.bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.2.attn.proj.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.2.attn.q_bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.2.attn.qkv.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.2.attn.v_bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.2.mlp.fc1.bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.2.mlp.fc1.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.2.mlp.fc2.bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.2.mlp.fc2.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.2.norm1.bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.2.norm1.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.2.norm2.bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.2.norm2.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.20.attn.proj.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.20.attn.proj.weight": 
"model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.20.attn.q_bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.20.attn.qkv.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.20.attn.v_bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.20.mlp.fc1.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.20.mlp.fc1.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.20.mlp.fc2.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.20.mlp.fc2.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.20.norm1.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.20.norm1.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.20.norm2.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.20.norm2.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.21.attn.proj.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.21.attn.proj.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.21.attn.q_bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.21.attn.qkv.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.21.attn.v_bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.21.mlp.fc1.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.21.mlp.fc1.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.21.mlp.fc2.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.21.mlp.fc2.weight": "model-00004-of-00004.safetensors", + 
"model.vision_tower.vision_tower.blocks.21.norm1.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.21.norm1.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.21.norm2.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.21.norm2.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.22.attn.proj.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.22.attn.proj.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.22.attn.q_bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.22.attn.qkv.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.22.attn.v_bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.22.mlp.fc1.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.22.mlp.fc1.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.22.mlp.fc2.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.22.mlp.fc2.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.22.norm1.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.22.norm1.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.22.norm2.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.22.norm2.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.23.attn.proj.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.23.attn.proj.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.23.attn.q_bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.23.attn.qkv.weight": 
"model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.23.attn.v_bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.23.mlp.fc1.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.23.mlp.fc1.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.23.mlp.fc2.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.23.mlp.fc2.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.23.norm1.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.23.norm1.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.23.norm2.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.23.norm2.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.24.attn.proj.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.24.attn.proj.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.24.attn.q_bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.24.attn.qkv.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.24.attn.v_bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.24.mlp.fc1.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.24.mlp.fc1.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.24.mlp.fc2.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.24.mlp.fc2.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.24.norm1.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.24.norm1.weight": "model-00004-of-00004.safetensors", + 
"model.vision_tower.vision_tower.blocks.24.norm2.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.24.norm2.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.25.attn.proj.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.25.attn.proj.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.25.attn.q_bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.25.attn.qkv.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.25.attn.v_bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.25.mlp.fc1.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.25.mlp.fc1.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.25.mlp.fc2.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.25.mlp.fc2.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.25.norm1.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.25.norm1.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.25.norm2.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.25.norm2.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.26.attn.proj.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.26.attn.proj.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.26.attn.q_bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.26.attn.qkv.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.26.attn.v_bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.26.mlp.fc1.bias": 
"model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.26.mlp.fc1.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.26.mlp.fc2.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.26.mlp.fc2.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.26.norm1.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.26.norm1.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.26.norm2.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.26.norm2.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.27.attn.proj.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.27.attn.proj.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.27.attn.q_bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.27.attn.qkv.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.27.attn.v_bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.27.mlp.fc1.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.27.mlp.fc1.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.27.mlp.fc2.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.27.mlp.fc2.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.27.norm1.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.27.norm1.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.27.norm2.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.27.norm2.weight": "model-00004-of-00004.safetensors", + 
"model.vision_tower.vision_tower.blocks.28.attn.proj.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.28.attn.proj.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.28.attn.q_bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.28.attn.qkv.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.28.attn.v_bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.28.mlp.fc1.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.28.mlp.fc1.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.28.mlp.fc2.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.28.mlp.fc2.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.28.norm1.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.28.norm1.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.28.norm2.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.28.norm2.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.29.attn.proj.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.29.attn.proj.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.29.attn.q_bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.29.attn.qkv.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.29.attn.v_bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.29.mlp.fc1.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.29.mlp.fc1.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.29.mlp.fc2.bias": 
"model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.29.mlp.fc2.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.29.norm1.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.29.norm1.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.29.norm2.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.29.norm2.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.3.attn.proj.bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.3.attn.proj.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.3.attn.q_bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.3.attn.qkv.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.3.attn.v_bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.3.mlp.fc1.bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.3.mlp.fc1.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.3.mlp.fc2.bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.3.mlp.fc2.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.3.norm1.bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.3.norm1.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.3.norm2.bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.3.norm2.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.30.attn.proj.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.30.attn.proj.weight": "model-00004-of-00004.safetensors", + 
"model.vision_tower.vision_tower.blocks.30.attn.q_bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.30.attn.qkv.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.30.attn.v_bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.30.mlp.fc1.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.30.mlp.fc1.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.30.mlp.fc2.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.30.mlp.fc2.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.30.norm1.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.30.norm1.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.30.norm2.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.30.norm2.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.31.attn.proj.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.31.attn.proj.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.31.attn.q_bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.31.attn.qkv.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.31.attn.v_bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.31.mlp.fc1.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.31.mlp.fc1.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.31.mlp.fc2.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.31.mlp.fc2.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.31.norm1.bias": 
"model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.31.norm1.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.31.norm2.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.31.norm2.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.32.attn.proj.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.32.attn.proj.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.32.attn.q_bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.32.attn.qkv.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.32.attn.v_bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.32.mlp.fc1.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.32.mlp.fc1.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.32.mlp.fc2.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.32.mlp.fc2.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.32.norm1.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.32.norm1.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.32.norm2.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.32.norm2.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.33.attn.proj.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.33.attn.proj.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.33.attn.q_bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.33.attn.qkv.weight": "model-00004-of-00004.safetensors", + 
"model.vision_tower.vision_tower.blocks.33.attn.v_bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.33.mlp.fc1.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.33.mlp.fc1.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.33.mlp.fc2.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.33.mlp.fc2.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.33.norm1.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.33.norm1.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.33.norm2.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.33.norm2.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.34.attn.proj.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.34.attn.proj.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.34.attn.q_bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.34.attn.qkv.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.34.attn.v_bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.34.mlp.fc1.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.34.mlp.fc1.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.34.mlp.fc2.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.34.mlp.fc2.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.34.norm1.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.34.norm1.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.34.norm2.bias": 
"model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.34.norm2.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.35.attn.proj.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.35.attn.proj.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.35.attn.q_bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.35.attn.qkv.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.35.attn.v_bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.35.mlp.fc1.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.35.mlp.fc1.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.35.mlp.fc2.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.35.mlp.fc2.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.35.norm1.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.35.norm1.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.35.norm2.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.35.norm2.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.36.attn.proj.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.36.attn.proj.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.36.attn.q_bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.36.attn.qkv.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.36.attn.v_bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.36.mlp.fc1.bias": "model-00004-of-00004.safetensors", + 
"model.vision_tower.vision_tower.blocks.36.mlp.fc1.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.36.mlp.fc2.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.36.mlp.fc2.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.36.norm1.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.36.norm1.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.36.norm2.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.36.norm2.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.37.attn.proj.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.37.attn.proj.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.37.attn.q_bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.37.attn.qkv.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.37.attn.v_bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.37.mlp.fc1.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.37.mlp.fc1.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.37.mlp.fc2.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.37.mlp.fc2.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.37.norm1.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.37.norm1.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.37.norm2.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.37.norm2.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.38.attn.proj.bias": 
"model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.38.attn.proj.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.38.attn.q_bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.38.attn.qkv.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.38.attn.v_bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.38.mlp.fc1.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.38.mlp.fc1.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.38.mlp.fc2.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.38.mlp.fc2.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.38.norm1.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.38.norm1.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.38.norm2.bias": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.38.norm2.weight": "model-00004-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.4.attn.proj.bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.4.attn.proj.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.4.attn.q_bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.4.attn.qkv.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.4.attn.v_bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.4.mlp.fc1.bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.4.mlp.fc1.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.4.mlp.fc2.bias": "model-00003-of-00004.safetensors", + 
"model.vision_tower.vision_tower.blocks.4.mlp.fc2.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.4.norm1.bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.4.norm1.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.4.norm2.bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.4.norm2.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.5.attn.proj.bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.5.attn.proj.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.5.attn.q_bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.5.attn.qkv.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.5.attn.v_bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.5.mlp.fc1.bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.5.mlp.fc1.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.5.mlp.fc2.bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.5.mlp.fc2.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.5.norm1.bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.5.norm1.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.5.norm2.bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.5.norm2.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.6.attn.proj.bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.6.attn.proj.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.6.attn.q_bias": 
"model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.6.attn.qkv.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.6.attn.v_bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.6.mlp.fc1.bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.6.mlp.fc1.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.6.mlp.fc2.bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.6.mlp.fc2.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.6.norm1.bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.6.norm1.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.6.norm2.bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.6.norm2.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.7.attn.proj.bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.7.attn.proj.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.7.attn.q_bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.7.attn.qkv.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.7.attn.v_bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.7.mlp.fc1.bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.7.mlp.fc1.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.7.mlp.fc2.bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.7.mlp.fc2.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.7.norm1.bias": "model-00003-of-00004.safetensors", + 
"model.vision_tower.vision_tower.blocks.7.norm1.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.7.norm2.bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.7.norm2.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.8.attn.proj.bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.8.attn.proj.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.8.attn.q_bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.8.attn.qkv.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.8.attn.v_bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.8.mlp.fc1.bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.8.mlp.fc1.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.8.mlp.fc2.bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.8.mlp.fc2.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.8.norm1.bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.8.norm1.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.8.norm2.bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.8.norm2.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.9.attn.proj.bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.9.attn.proj.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.9.attn.q_bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.9.attn.qkv.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.9.attn.v_bias": 
"model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.9.mlp.fc1.bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.9.mlp.fc1.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.9.mlp.fc2.bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.9.mlp.fc2.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.9.norm1.bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.9.norm1.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.9.norm2.bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.blocks.9.norm2.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.cls_token": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.patch_embed.proj.bias": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.patch_embed.proj.weight": "model-00003-of-00004.safetensors", + "model.vision_tower.vision_tower.pos_embed": "model-00003-of-00004.safetensors" + } +} diff --git a/inf2_dir_0531/llava-mistral_videollava_092/special_tokens_map.json b/inf2_dir_0531/llava-mistral_videollava_092/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..dbca2649c4283d1a3d00918c51afc73ec26deb89 --- /dev/null +++ b/inf2_dir_0531/llava-mistral_videollava_092/special_tokens_map.json @@ -0,0 +1,54 @@ +{ + "additional_special_tokens": [ + { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } + 
], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/inf2_dir_0531/llava-mistral_videollava_092/tokenizer.model b/inf2_dir_0531/llava-mistral_videollava_092/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..8b443ef19c2a19acc3ac64fb9c3db4a72921dff6 --- /dev/null +++ b/inf2_dir_0531/llava-mistral_videollava_092/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055 +size 493443 diff --git a/inf2_dir_0531/llava-mistral_videollava_092/tokenizer_config.json b/inf2_dir_0531/llava-mistral_videollava_092/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ad096b8664e37c6d47ae5587e626caf5aaa5f8c2 --- /dev/null +++ b/inf2_dir_0531/llava-mistral_videollava_092/tokenizer_config.json @@ -0,0 +1,82 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32000": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32001": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": 
true + }, + "32002": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32003": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "", + "", + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": true, + "model_max_length": 4096, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/inf2_dir_0531/llava-mistral_videollava_092/trainer_state.json b/inf2_dir_0531/llava-mistral_videollava_092/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d56ce3488afc6302dcdb557abf2e0b747b639510 --- /dev/null +++ b/inf2_dir_0531/llava-mistral_videollava_092/trainer_state.json @@ -0,0 +1,98198 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 500, + "global_step": 14024, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 530.0854355732855, + "learning_rate": 2.3752969121140145e-08, + "loss": 3.2181, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 599.6073854621411, + "learning_rate": 4.750593824228029e-08, + 
"loss": 3.2965, + "step": 2 + }, + { + "epoch": 0.0, + "grad_norm": 387.01571907973465, + "learning_rate": 7.125890736342044e-08, + "loss": 3.1962, + "step": 3 + }, + { + "epoch": 0.0, + "grad_norm": 412.66858200718514, + "learning_rate": 9.501187648456058e-08, + "loss": 3.3318, + "step": 4 + }, + { + "epoch": 0.0, + "grad_norm": 429.02978828687424, + "learning_rate": 1.1876484560570071e-07, + "loss": 3.2708, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 329.8604400076927, + "learning_rate": 1.4251781472684087e-07, + "loss": 3.2421, + "step": 6 + }, + { + "epoch": 0.0, + "grad_norm": 368.74430316727154, + "learning_rate": 1.6627078384798102e-07, + "loss": 3.18, + "step": 7 + }, + { + "epoch": 0.0, + "grad_norm": 398.02496592833785, + "learning_rate": 1.9002375296912116e-07, + "loss": 3.2381, + "step": 8 + }, + { + "epoch": 0.0, + "grad_norm": 296.5029384546233, + "learning_rate": 2.1377672209026128e-07, + "loss": 3.0465, + "step": 9 + }, + { + "epoch": 0.0, + "grad_norm": 262.9215693506895, + "learning_rate": 2.3752969121140143e-07, + "loss": 3.014, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 160.50159977995017, + "learning_rate": 2.6128266033254157e-07, + "loss": 2.821, + "step": 11 + }, + { + "epoch": 0.0, + "grad_norm": 131.57975728664968, + "learning_rate": 2.8503562945368174e-07, + "loss": 2.6855, + "step": 12 + }, + { + "epoch": 0.0, + "grad_norm": 218.5974243446953, + "learning_rate": 3.0878859857482186e-07, + "loss": 2.619, + "step": 13 + }, + { + "epoch": 0.0, + "grad_norm": 247.59203293395262, + "learning_rate": 3.3254156769596203e-07, + "loss": 2.6762, + "step": 14 + }, + { + "epoch": 0.0, + "grad_norm": 160.55829640380375, + "learning_rate": 3.5629453681710215e-07, + "loss": 2.3108, + "step": 15 + }, + { + "epoch": 0.0, + "grad_norm": 264.5845119250823, + "learning_rate": 3.800475059382423e-07, + "loss": 2.264, + "step": 16 + }, + { + "epoch": 0.0, + "grad_norm": 172.26289486734973, + "learning_rate": 4.038004750593825e-07, + "loss": 
2.221, + "step": 17 + }, + { + "epoch": 0.0, + "grad_norm": 202.246484008442, + "learning_rate": 4.2755344418052256e-07, + "loss": 2.221, + "step": 18 + }, + { + "epoch": 0.0, + "grad_norm": 185.84225538556257, + "learning_rate": 4.5130641330166273e-07, + "loss": 2.2023, + "step": 19 + }, + { + "epoch": 0.0, + "grad_norm": 130.16216121601747, + "learning_rate": 4.7505938242280285e-07, + "loss": 2.1272, + "step": 20 + }, + { + "epoch": 0.0, + "grad_norm": 33.23561809178372, + "learning_rate": 4.988123515439431e-07, + "loss": 2.0068, + "step": 21 + }, + { + "epoch": 0.0, + "grad_norm": 56.5148573735759, + "learning_rate": 5.225653206650831e-07, + "loss": 2.113, + "step": 22 + }, + { + "epoch": 0.0, + "grad_norm": 31.854499067642994, + "learning_rate": 5.463182897862233e-07, + "loss": 2.0364, + "step": 23 + }, + { + "epoch": 0.0, + "grad_norm": 42.21277645732634, + "learning_rate": 5.700712589073635e-07, + "loss": 2.0448, + "step": 24 + }, + { + "epoch": 0.0, + "grad_norm": 51.57809246603758, + "learning_rate": 5.938242280285035e-07, + "loss": 1.9949, + "step": 25 + }, + { + "epoch": 0.0, + "grad_norm": 24.15041103689964, + "learning_rate": 6.175771971496437e-07, + "loss": 2.0048, + "step": 26 + }, + { + "epoch": 0.0, + "grad_norm": 22.554239060544138, + "learning_rate": 6.41330166270784e-07, + "loss": 1.9157, + "step": 27 + }, + { + "epoch": 0.0, + "grad_norm": 25.660162842459783, + "learning_rate": 6.650831353919241e-07, + "loss": 1.8632, + "step": 28 + }, + { + "epoch": 0.0, + "grad_norm": 21.788662996088387, + "learning_rate": 6.888361045130641e-07, + "loss": 1.9002, + "step": 29 + }, + { + "epoch": 0.0, + "grad_norm": 20.083999235406825, + "learning_rate": 7.125890736342043e-07, + "loss": 1.8298, + "step": 30 + }, + { + "epoch": 0.0, + "grad_norm": 24.627973373035815, + "learning_rate": 7.363420427553445e-07, + "loss": 1.7903, + "step": 31 + }, + { + "epoch": 0.0, + "grad_norm": 22.34264760928385, + "learning_rate": 7.600950118764846e-07, + "loss": 1.8808, + 
"step": 32 + }, + { + "epoch": 0.0, + "grad_norm": 17.897708691807352, + "learning_rate": 7.838479809976247e-07, + "loss": 1.8769, + "step": 33 + }, + { + "epoch": 0.0, + "grad_norm": 16.433259761790413, + "learning_rate": 8.07600950118765e-07, + "loss": 1.824, + "step": 34 + }, + { + "epoch": 0.0, + "grad_norm": 43.020403190795406, + "learning_rate": 8.313539192399051e-07, + "loss": 1.8217, + "step": 35 + }, + { + "epoch": 0.01, + "grad_norm": 23.22575054818917, + "learning_rate": 8.551068883610451e-07, + "loss": 1.8691, + "step": 36 + }, + { + "epoch": 0.01, + "grad_norm": 15.516146949935374, + "learning_rate": 8.788598574821854e-07, + "loss": 1.7311, + "step": 37 + }, + { + "epoch": 0.01, + "grad_norm": 18.882182297034287, + "learning_rate": 9.026128266033255e-07, + "loss": 1.7054, + "step": 38 + }, + { + "epoch": 0.01, + "grad_norm": 20.315116280645555, + "learning_rate": 9.263657957244656e-07, + "loss": 1.7509, + "step": 39 + }, + { + "epoch": 0.01, + "grad_norm": 16.331023676859406, + "learning_rate": 9.501187648456057e-07, + "loss": 1.7503, + "step": 40 + }, + { + "epoch": 0.01, + "grad_norm": 16.672871399184892, + "learning_rate": 9.73871733966746e-07, + "loss": 1.8014, + "step": 41 + }, + { + "epoch": 0.01, + "grad_norm": 12.20436768473636, + "learning_rate": 9.976247030878861e-07, + "loss": 1.6841, + "step": 42 + }, + { + "epoch": 0.01, + "grad_norm": 21.04376033025822, + "learning_rate": 1.0213776722090261e-06, + "loss": 1.7445, + "step": 43 + }, + { + "epoch": 0.01, + "grad_norm": 12.58258245209557, + "learning_rate": 1.0451306413301663e-06, + "loss": 1.6915, + "step": 44 + }, + { + "epoch": 0.01, + "grad_norm": 18.37331189694571, + "learning_rate": 1.0688836104513065e-06, + "loss": 1.6992, + "step": 45 + }, + { + "epoch": 0.01, + "grad_norm": 12.58122890942581, + "learning_rate": 1.0926365795724466e-06, + "loss": 1.7427, + "step": 46 + }, + { + "epoch": 0.01, + "grad_norm": 16.3966658866462, + "learning_rate": 1.1163895486935868e-06, + "loss": 1.7191, 
+ "step": 47 + }, + { + "epoch": 0.01, + "grad_norm": 12.432587824750017, + "learning_rate": 1.140142517814727e-06, + "loss": 1.6839, + "step": 48 + }, + { + "epoch": 0.01, + "grad_norm": 17.038638413796573, + "learning_rate": 1.1638954869358671e-06, + "loss": 1.7258, + "step": 49 + }, + { + "epoch": 0.01, + "grad_norm": 14.570769147294332, + "learning_rate": 1.187648456057007e-06, + "loss": 1.695, + "step": 50 + }, + { + "epoch": 0.01, + "grad_norm": 14.543048404785539, + "learning_rate": 1.2114014251781475e-06, + "loss": 1.6561, + "step": 51 + }, + { + "epoch": 0.01, + "grad_norm": 15.12716444065227, + "learning_rate": 1.2351543942992874e-06, + "loss": 1.704, + "step": 52 + }, + { + "epoch": 0.01, + "grad_norm": 13.505028083974226, + "learning_rate": 1.2589073634204276e-06, + "loss": 1.6887, + "step": 53 + }, + { + "epoch": 0.01, + "grad_norm": 13.402004933155297, + "learning_rate": 1.282660332541568e-06, + "loss": 1.6591, + "step": 54 + }, + { + "epoch": 0.01, + "grad_norm": 13.243091504412499, + "learning_rate": 1.306413301662708e-06, + "loss": 1.6386, + "step": 55 + }, + { + "epoch": 0.01, + "grad_norm": 13.838321319820112, + "learning_rate": 1.3301662707838481e-06, + "loss": 1.5528, + "step": 56 + }, + { + "epoch": 0.01, + "grad_norm": 10.314404987652852, + "learning_rate": 1.3539192399049883e-06, + "loss": 1.6187, + "step": 57 + }, + { + "epoch": 0.01, + "grad_norm": 13.605357477513168, + "learning_rate": 1.3776722090261283e-06, + "loss": 1.5805, + "step": 58 + }, + { + "epoch": 0.01, + "grad_norm": 13.28024035508829, + "learning_rate": 1.4014251781472684e-06, + "loss": 1.5683, + "step": 59 + }, + { + "epoch": 0.01, + "grad_norm": 17.101657571404175, + "learning_rate": 1.4251781472684086e-06, + "loss": 1.6212, + "step": 60 + }, + { + "epoch": 0.01, + "grad_norm": 12.29780763063185, + "learning_rate": 1.448931116389549e-06, + "loss": 1.6288, + "step": 61 + }, + { + "epoch": 0.01, + "grad_norm": 13.777323095327398, + "learning_rate": 1.472684085510689e-06, + 
"loss": 1.5845, + "step": 62 + }, + { + "epoch": 0.01, + "grad_norm": 16.43905577170044, + "learning_rate": 1.4964370546318291e-06, + "loss": 1.5967, + "step": 63 + }, + { + "epoch": 0.01, + "grad_norm": 13.823792442872604, + "learning_rate": 1.5201900237529693e-06, + "loss": 1.6033, + "step": 64 + }, + { + "epoch": 0.01, + "grad_norm": 10.64420172042377, + "learning_rate": 1.5439429928741092e-06, + "loss": 1.6381, + "step": 65 + }, + { + "epoch": 0.01, + "grad_norm": 12.395277874234006, + "learning_rate": 1.5676959619952494e-06, + "loss": 1.5933, + "step": 66 + }, + { + "epoch": 0.01, + "grad_norm": 13.56110465662864, + "learning_rate": 1.5914489311163898e-06, + "loss": 1.5605, + "step": 67 + }, + { + "epoch": 0.01, + "grad_norm": 10.472448174220766, + "learning_rate": 1.61520190023753e-06, + "loss": 1.5918, + "step": 68 + }, + { + "epoch": 0.01, + "grad_norm": 14.418209133302405, + "learning_rate": 1.63895486935867e-06, + "loss": 1.5496, + "step": 69 + }, + { + "epoch": 0.01, + "grad_norm": 13.097079920418556, + "learning_rate": 1.6627078384798101e-06, + "loss": 1.5521, + "step": 70 + }, + { + "epoch": 0.01, + "grad_norm": 12.478526758002642, + "learning_rate": 1.6864608076009503e-06, + "loss": 1.5163, + "step": 71 + }, + { + "epoch": 0.01, + "grad_norm": 14.292596234498868, + "learning_rate": 1.7102137767220902e-06, + "loss": 1.5028, + "step": 72 + }, + { + "epoch": 0.01, + "grad_norm": 12.197056626991, + "learning_rate": 1.7339667458432304e-06, + "loss": 1.5215, + "step": 73 + }, + { + "epoch": 0.01, + "grad_norm": 13.315312300629058, + "learning_rate": 1.7577197149643708e-06, + "loss": 1.5724, + "step": 74 + }, + { + "epoch": 0.01, + "grad_norm": 12.627600146731822, + "learning_rate": 1.781472684085511e-06, + "loss": 1.5322, + "step": 75 + }, + { + "epoch": 0.01, + "grad_norm": 13.946305382919165, + "learning_rate": 1.805225653206651e-06, + "loss": 1.5503, + "step": 76 + }, + { + "epoch": 0.01, + "grad_norm": 11.22376338836648, + "learning_rate": 
1.828978622327791e-06, + "loss": 1.5483, + "step": 77 + }, + { + "epoch": 0.01, + "grad_norm": 16.190872459117, + "learning_rate": 1.8527315914489313e-06, + "loss": 1.5508, + "step": 78 + }, + { + "epoch": 0.01, + "grad_norm": 13.067927805258293, + "learning_rate": 1.8764845605700712e-06, + "loss": 1.4911, + "step": 79 + }, + { + "epoch": 0.01, + "grad_norm": 16.07840937980365, + "learning_rate": 1.9002375296912114e-06, + "loss": 1.4907, + "step": 80 + }, + { + "epoch": 0.01, + "grad_norm": 14.371558290708276, + "learning_rate": 1.9239904988123518e-06, + "loss": 1.5575, + "step": 81 + }, + { + "epoch": 0.01, + "grad_norm": 13.929745085397512, + "learning_rate": 1.947743467933492e-06, + "loss": 1.5081, + "step": 82 + }, + { + "epoch": 0.01, + "grad_norm": 14.959073196131998, + "learning_rate": 1.9714964370546317e-06, + "loss": 1.458, + "step": 83 + }, + { + "epoch": 0.01, + "grad_norm": 15.577904488231164, + "learning_rate": 1.9952494061757723e-06, + "loss": 1.4728, + "step": 84 + }, + { + "epoch": 0.01, + "grad_norm": 11.034354460961133, + "learning_rate": 2.0190023752969125e-06, + "loss": 1.5772, + "step": 85 + }, + { + "epoch": 0.01, + "grad_norm": 15.83563491626924, + "learning_rate": 2.0427553444180522e-06, + "loss": 1.4863, + "step": 86 + }, + { + "epoch": 0.01, + "grad_norm": 12.405225428167244, + "learning_rate": 2.0665083135391924e-06, + "loss": 1.5085, + "step": 87 + }, + { + "epoch": 0.01, + "grad_norm": 13.724278435131065, + "learning_rate": 2.0902612826603326e-06, + "loss": 1.4589, + "step": 88 + }, + { + "epoch": 0.01, + "grad_norm": 13.221565293034788, + "learning_rate": 2.114014251781473e-06, + "loss": 1.5672, + "step": 89 + }, + { + "epoch": 0.01, + "grad_norm": 16.2645270792205, + "learning_rate": 2.137767220902613e-06, + "loss": 1.4825, + "step": 90 + }, + { + "epoch": 0.01, + "grad_norm": 14.774052718508544, + "learning_rate": 2.161520190023753e-06, + "loss": 1.5066, + "step": 91 + }, + { + "epoch": 0.01, + "grad_norm": 13.47955134662321, + 
"learning_rate": 2.1852731591448932e-06, + "loss": 1.4962, + "step": 92 + }, + { + "epoch": 0.01, + "grad_norm": 18.41677953583139, + "learning_rate": 2.2090261282660334e-06, + "loss": 1.4333, + "step": 93 + }, + { + "epoch": 0.01, + "grad_norm": 13.332919291201579, + "learning_rate": 2.2327790973871736e-06, + "loss": 1.5046, + "step": 94 + }, + { + "epoch": 0.01, + "grad_norm": 12.91269605246235, + "learning_rate": 2.2565320665083138e-06, + "loss": 1.5232, + "step": 95 + }, + { + "epoch": 0.01, + "grad_norm": 14.90231401440046, + "learning_rate": 2.280285035629454e-06, + "loss": 1.5142, + "step": 96 + }, + { + "epoch": 0.01, + "grad_norm": 19.04836499841221, + "learning_rate": 2.304038004750594e-06, + "loss": 1.4893, + "step": 97 + }, + { + "epoch": 0.01, + "grad_norm": 14.13387036315814, + "learning_rate": 2.3277909738717343e-06, + "loss": 1.3987, + "step": 98 + }, + { + "epoch": 0.01, + "grad_norm": 12.54432076742004, + "learning_rate": 2.3515439429928744e-06, + "loss": 1.4782, + "step": 99 + }, + { + "epoch": 0.01, + "grad_norm": 13.786741221839991, + "learning_rate": 2.375296912114014e-06, + "loss": 1.5005, + "step": 100 + }, + { + "epoch": 0.01, + "grad_norm": 14.110787996940493, + "learning_rate": 2.3990498812351544e-06, + "loss": 1.437, + "step": 101 + }, + { + "epoch": 0.01, + "grad_norm": 11.823652878159539, + "learning_rate": 2.422802850356295e-06, + "loss": 1.4165, + "step": 102 + }, + { + "epoch": 0.01, + "grad_norm": 17.032901570198774, + "learning_rate": 2.446555819477435e-06, + "loss": 1.4262, + "step": 103 + }, + { + "epoch": 0.01, + "grad_norm": 14.615821294875762, + "learning_rate": 2.470308788598575e-06, + "loss": 1.4257, + "step": 104 + }, + { + "epoch": 0.01, + "grad_norm": 15.711166555504922, + "learning_rate": 2.494061757719715e-06, + "loss": 1.4113, + "step": 105 + }, + { + "epoch": 0.02, + "grad_norm": 17.068849724913157, + "learning_rate": 2.5178147268408552e-06, + "loss": 1.5082, + "step": 106 + }, + { + "epoch": 0.02, + "grad_norm": 
13.458112507908659, + "learning_rate": 2.5415676959619954e-06, + "loss": 1.3941, + "step": 107 + }, + { + "epoch": 0.02, + "grad_norm": 16.887439152521853, + "learning_rate": 2.565320665083136e-06, + "loss": 1.3999, + "step": 108 + }, + { + "epoch": 0.02, + "grad_norm": 17.170226336894228, + "learning_rate": 2.5890736342042753e-06, + "loss": 1.3894, + "step": 109 + }, + { + "epoch": 0.02, + "grad_norm": 11.228221517606254, + "learning_rate": 2.612826603325416e-06, + "loss": 1.4792, + "step": 110 + }, + { + "epoch": 0.02, + "grad_norm": 15.947433235752897, + "learning_rate": 2.636579572446556e-06, + "loss": 1.4528, + "step": 111 + }, + { + "epoch": 0.02, + "grad_norm": 15.827420186390578, + "learning_rate": 2.6603325415676963e-06, + "loss": 1.4682, + "step": 112 + }, + { + "epoch": 0.02, + "grad_norm": 12.943085107882512, + "learning_rate": 2.6840855106888364e-06, + "loss": 1.3842, + "step": 113 + }, + { + "epoch": 0.02, + "grad_norm": 14.27561178993794, + "learning_rate": 2.7078384798099766e-06, + "loss": 1.3961, + "step": 114 + }, + { + "epoch": 0.02, + "grad_norm": 21.28328307733994, + "learning_rate": 2.7315914489311168e-06, + "loss": 1.4175, + "step": 115 + }, + { + "epoch": 0.02, + "grad_norm": 16.57880616533679, + "learning_rate": 2.7553444180522565e-06, + "loss": 1.4992, + "step": 116 + }, + { + "epoch": 0.02, + "grad_norm": 13.750027042719273, + "learning_rate": 2.7790973871733967e-06, + "loss": 1.4177, + "step": 117 + }, + { + "epoch": 0.02, + "grad_norm": 11.962662649678006, + "learning_rate": 2.802850356294537e-06, + "loss": 1.4596, + "step": 118 + }, + { + "epoch": 0.02, + "grad_norm": 12.701059642605978, + "learning_rate": 2.826603325415677e-06, + "loss": 1.4695, + "step": 119 + }, + { + "epoch": 0.02, + "grad_norm": 14.748276235102303, + "learning_rate": 2.850356294536817e-06, + "loss": 1.4654, + "step": 120 + }, + { + "epoch": 0.02, + "grad_norm": 12.575629941262651, + "learning_rate": 2.874109263657958e-06, + "loss": 1.4524, + "step": 121 + }, + { + 
"epoch": 0.02, + "grad_norm": 14.903303020847021, + "learning_rate": 2.897862232779098e-06, + "loss": 1.3872, + "step": 122 + }, + { + "epoch": 0.02, + "grad_norm": 19.19788951081469, + "learning_rate": 2.9216152019002373e-06, + "loss": 1.4811, + "step": 123 + }, + { + "epoch": 0.02, + "grad_norm": 12.236430981720163, + "learning_rate": 2.945368171021378e-06, + "loss": 1.385, + "step": 124 + }, + { + "epoch": 0.02, + "grad_norm": 14.379821387290914, + "learning_rate": 2.969121140142518e-06, + "loss": 1.4058, + "step": 125 + }, + { + "epoch": 0.02, + "grad_norm": 15.63022737868073, + "learning_rate": 2.9928741092636582e-06, + "loss": 1.4635, + "step": 126 + }, + { + "epoch": 0.02, + "grad_norm": 12.949701045172189, + "learning_rate": 3.0166270783847984e-06, + "loss": 1.4709, + "step": 127 + }, + { + "epoch": 0.02, + "grad_norm": 12.31020198615718, + "learning_rate": 3.0403800475059386e-06, + "loss": 1.4455, + "step": 128 + }, + { + "epoch": 0.02, + "grad_norm": 14.651057736923647, + "learning_rate": 3.0641330166270788e-06, + "loss": 1.3665, + "step": 129 + }, + { + "epoch": 0.02, + "grad_norm": 18.546372417490083, + "learning_rate": 3.0878859857482185e-06, + "loss": 1.3606, + "step": 130 + }, + { + "epoch": 0.02, + "grad_norm": 14.669488324675042, + "learning_rate": 3.1116389548693587e-06, + "loss": 1.3843, + "step": 131 + }, + { + "epoch": 0.02, + "grad_norm": 14.476695395158888, + "learning_rate": 3.135391923990499e-06, + "loss": 1.4445, + "step": 132 + }, + { + "epoch": 0.02, + "grad_norm": 14.93448069106449, + "learning_rate": 3.159144893111639e-06, + "loss": 1.3786, + "step": 133 + }, + { + "epoch": 0.02, + "grad_norm": 14.49604886162594, + "learning_rate": 3.1828978622327796e-06, + "loss": 1.3605, + "step": 134 + }, + { + "epoch": 0.02, + "grad_norm": 19.451009223658264, + "learning_rate": 3.2066508313539198e-06, + "loss": 1.3508, + "step": 135 + }, + { + "epoch": 0.02, + "grad_norm": 13.230750721998495, + "learning_rate": 3.23040380047506e-06, + "loss": 
1.4595, + "step": 136 + }, + { + "epoch": 0.02, + "grad_norm": 13.80996417704827, + "learning_rate": 3.2541567695961997e-06, + "loss": 1.4402, + "step": 137 + }, + { + "epoch": 0.02, + "grad_norm": 16.16332121580094, + "learning_rate": 3.27790973871734e-06, + "loss": 1.3555, + "step": 138 + }, + { + "epoch": 0.02, + "grad_norm": 12.498100662715567, + "learning_rate": 3.30166270783848e-06, + "loss": 1.4045, + "step": 139 + }, + { + "epoch": 0.02, + "grad_norm": 14.581709871514358, + "learning_rate": 3.3254156769596202e-06, + "loss": 1.4258, + "step": 140 + }, + { + "epoch": 0.02, + "grad_norm": 14.14085509562696, + "learning_rate": 3.3491686460807604e-06, + "loss": 1.355, + "step": 141 + }, + { + "epoch": 0.02, + "grad_norm": 16.537959566307876, + "learning_rate": 3.3729216152019006e-06, + "loss": 1.4221, + "step": 142 + }, + { + "epoch": 0.02, + "grad_norm": 19.771567998302707, + "learning_rate": 3.3966745843230407e-06, + "loss": 1.4195, + "step": 143 + }, + { + "epoch": 0.02, + "grad_norm": 17.597445808860723, + "learning_rate": 3.4204275534441805e-06, + "loss": 1.4044, + "step": 144 + }, + { + "epoch": 0.02, + "grad_norm": 14.785425696136617, + "learning_rate": 3.4441805225653207e-06, + "loss": 1.3899, + "step": 145 + }, + { + "epoch": 0.02, + "grad_norm": 10.36721864852308, + "learning_rate": 3.467933491686461e-06, + "loss": 1.4042, + "step": 146 + }, + { + "epoch": 0.02, + "grad_norm": 10.816762180943188, + "learning_rate": 3.4916864608076014e-06, + "loss": 1.4292, + "step": 147 + }, + { + "epoch": 0.02, + "grad_norm": 16.682406462378978, + "learning_rate": 3.5154394299287416e-06, + "loss": 1.3404, + "step": 148 + }, + { + "epoch": 0.02, + "grad_norm": 19.779128886549874, + "learning_rate": 3.5391923990498818e-06, + "loss": 1.395, + "step": 149 + }, + { + "epoch": 0.02, + "grad_norm": 9.824879433457413, + "learning_rate": 3.562945368171022e-06, + "loss": 1.35, + "step": 150 + }, + { + "epoch": 0.02, + "grad_norm": 14.194304563712716, + "learning_rate": 
3.5866983372921617e-06, + "loss": 1.341, + "step": 151 + }, + { + "epoch": 0.02, + "grad_norm": 15.685343684774876, + "learning_rate": 3.610451306413302e-06, + "loss": 1.4149, + "step": 152 + }, + { + "epoch": 0.02, + "grad_norm": 13.836910340631347, + "learning_rate": 3.634204275534442e-06, + "loss": 1.3857, + "step": 153 + }, + { + "epoch": 0.02, + "grad_norm": 15.575921540976669, + "learning_rate": 3.657957244655582e-06, + "loss": 1.3569, + "step": 154 + }, + { + "epoch": 0.02, + "grad_norm": 17.365839416555033, + "learning_rate": 3.6817102137767224e-06, + "loss": 1.3503, + "step": 155 + }, + { + "epoch": 0.02, + "grad_norm": 16.61474639141763, + "learning_rate": 3.7054631828978625e-06, + "loss": 1.3694, + "step": 156 + }, + { + "epoch": 0.02, + "grad_norm": 18.839122743923824, + "learning_rate": 3.7292161520190027e-06, + "loss": 1.3854, + "step": 157 + }, + { + "epoch": 0.02, + "grad_norm": 14.323993664172106, + "learning_rate": 3.7529691211401425e-06, + "loss": 1.3752, + "step": 158 + }, + { + "epoch": 0.02, + "grad_norm": 13.123668193071858, + "learning_rate": 3.7767220902612826e-06, + "loss": 1.4114, + "step": 159 + }, + { + "epoch": 0.02, + "grad_norm": 11.305287505242283, + "learning_rate": 3.800475059382423e-06, + "loss": 1.3274, + "step": 160 + }, + { + "epoch": 0.02, + "grad_norm": 10.2896891189189, + "learning_rate": 3.824228028503563e-06, + "loss": 1.4218, + "step": 161 + }, + { + "epoch": 0.02, + "grad_norm": 11.871724038019433, + "learning_rate": 3.8479809976247036e-06, + "loss": 1.3426, + "step": 162 + }, + { + "epoch": 0.02, + "grad_norm": 13.567569130818292, + "learning_rate": 3.871733966745843e-06, + "loss": 1.4118, + "step": 163 + }, + { + "epoch": 0.02, + "grad_norm": 11.08572750268809, + "learning_rate": 3.895486935866984e-06, + "loss": 1.3242, + "step": 164 + }, + { + "epoch": 0.02, + "grad_norm": 15.192117688535117, + "learning_rate": 3.919239904988124e-06, + "loss": 1.3392, + "step": 165 + }, + { + "epoch": 0.02, + "grad_norm": 
11.164438574844604, + "learning_rate": 3.942992874109263e-06, + "loss": 1.3629, + "step": 166 + }, + { + "epoch": 0.02, + "grad_norm": 11.412544395436907, + "learning_rate": 3.966745843230404e-06, + "loss": 1.3615, + "step": 167 + }, + { + "epoch": 0.02, + "grad_norm": 16.2040633898272, + "learning_rate": 3.990498812351545e-06, + "loss": 1.3115, + "step": 168 + }, + { + "epoch": 0.02, + "grad_norm": 16.915821931863885, + "learning_rate": 4.014251781472684e-06, + "loss": 1.2765, + "step": 169 + }, + { + "epoch": 0.02, + "grad_norm": 11.397962620944677, + "learning_rate": 4.038004750593825e-06, + "loss": 1.2828, + "step": 170 + }, + { + "epoch": 0.02, + "grad_norm": 15.209026461753123, + "learning_rate": 4.061757719714965e-06, + "loss": 1.3906, + "step": 171 + }, + { + "epoch": 0.02, + "grad_norm": 13.879949837717186, + "learning_rate": 4.0855106888361044e-06, + "loss": 1.3608, + "step": 172 + }, + { + "epoch": 0.02, + "grad_norm": 16.686599500949725, + "learning_rate": 4.109263657957245e-06, + "loss": 1.3546, + "step": 173 + }, + { + "epoch": 0.02, + "grad_norm": 16.134655852445704, + "learning_rate": 4.133016627078385e-06, + "loss": 1.3929, + "step": 174 + }, + { + "epoch": 0.02, + "grad_norm": 10.748787173506718, + "learning_rate": 4.156769596199525e-06, + "loss": 1.3703, + "step": 175 + }, + { + "epoch": 0.03, + "grad_norm": 14.706526767882387, + "learning_rate": 4.180522565320665e-06, + "loss": 1.3108, + "step": 176 + }, + { + "epoch": 0.03, + "grad_norm": 15.349209880901988, + "learning_rate": 4.204275534441806e-06, + "loss": 1.3542, + "step": 177 + }, + { + "epoch": 0.03, + "grad_norm": 12.859830924385317, + "learning_rate": 4.228028503562946e-06, + "loss": 1.3206, + "step": 178 + }, + { + "epoch": 0.03, + "grad_norm": 16.098189081568144, + "learning_rate": 4.251781472684085e-06, + "loss": 1.3665, + "step": 179 + }, + { + "epoch": 0.03, + "grad_norm": 17.13773177076354, + "learning_rate": 4.275534441805226e-06, + "loss": 1.2937, + "step": 180 + }, + { + 
"epoch": 0.03, + "grad_norm": 14.980400596130082, + "learning_rate": 4.299287410926366e-06, + "loss": 1.3238, + "step": 181 + }, + { + "epoch": 0.03, + "grad_norm": 11.77511815199208, + "learning_rate": 4.323040380047506e-06, + "loss": 1.3244, + "step": 182 + }, + { + "epoch": 0.03, + "grad_norm": 14.714689374505186, + "learning_rate": 4.346793349168647e-06, + "loss": 1.3603, + "step": 183 + }, + { + "epoch": 0.03, + "grad_norm": 12.677360044554293, + "learning_rate": 4.3705463182897865e-06, + "loss": 1.3671, + "step": 184 + }, + { + "epoch": 0.03, + "grad_norm": 17.815652992642846, + "learning_rate": 4.394299287410927e-06, + "loss": 1.4173, + "step": 185 + }, + { + "epoch": 0.03, + "grad_norm": 15.617509190371722, + "learning_rate": 4.418052256532067e-06, + "loss": 1.3581, + "step": 186 + }, + { + "epoch": 0.03, + "grad_norm": 15.799591804147944, + "learning_rate": 4.441805225653207e-06, + "loss": 1.3352, + "step": 187 + }, + { + "epoch": 0.03, + "grad_norm": 13.38970690632497, + "learning_rate": 4.465558194774347e-06, + "loss": 1.3696, + "step": 188 + }, + { + "epoch": 0.03, + "grad_norm": 12.984012130441354, + "learning_rate": 4.489311163895487e-06, + "loss": 1.3108, + "step": 189 + }, + { + "epoch": 0.03, + "grad_norm": 9.404673598617968, + "learning_rate": 4.5130641330166275e-06, + "loss": 1.3488, + "step": 190 + }, + { + "epoch": 0.03, + "grad_norm": 14.526772019603488, + "learning_rate": 4.536817102137768e-06, + "loss": 1.2899, + "step": 191 + }, + { + "epoch": 0.03, + "grad_norm": 15.817075841633667, + "learning_rate": 4.560570071258908e-06, + "loss": 1.2843, + "step": 192 + }, + { + "epoch": 0.03, + "grad_norm": 10.759334489502052, + "learning_rate": 4.584323040380048e-06, + "loss": 1.4034, + "step": 193 + }, + { + "epoch": 0.03, + "grad_norm": 11.647003031714911, + "learning_rate": 4.608076009501188e-06, + "loss": 1.353, + "step": 194 + }, + { + "epoch": 0.03, + "grad_norm": 10.372509633470697, + "learning_rate": 4.631828978622328e-06, + "loss": 1.3323, + 
"step": 195 + }, + { + "epoch": 0.03, + "grad_norm": 15.175225284315404, + "learning_rate": 4.6555819477434686e-06, + "loss": 1.2774, + "step": 196 + }, + { + "epoch": 0.03, + "grad_norm": 15.728466724638357, + "learning_rate": 4.679334916864608e-06, + "loss": 1.3423, + "step": 197 + }, + { + "epoch": 0.03, + "grad_norm": 11.751296952173632, + "learning_rate": 4.703087885985749e-06, + "loss": 1.3292, + "step": 198 + }, + { + "epoch": 0.03, + "grad_norm": 15.35626095035699, + "learning_rate": 4.726840855106889e-06, + "loss": 1.2804, + "step": 199 + }, + { + "epoch": 0.03, + "grad_norm": 13.827461042828567, + "learning_rate": 4.750593824228028e-06, + "loss": 1.2861, + "step": 200 + }, + { + "epoch": 0.03, + "grad_norm": 13.297494366978508, + "learning_rate": 4.774346793349169e-06, + "loss": 1.2627, + "step": 201 + }, + { + "epoch": 0.03, + "grad_norm": 12.701928156830075, + "learning_rate": 4.798099762470309e-06, + "loss": 1.2925, + "step": 202 + }, + { + "epoch": 0.03, + "grad_norm": 12.225747231156424, + "learning_rate": 4.821852731591449e-06, + "loss": 1.348, + "step": 203 + }, + { + "epoch": 0.03, + "grad_norm": 11.045242016330286, + "learning_rate": 4.84560570071259e-06, + "loss": 1.3221, + "step": 204 + }, + { + "epoch": 0.03, + "grad_norm": 14.544701707974687, + "learning_rate": 4.86935866983373e-06, + "loss": 1.3262, + "step": 205 + }, + { + "epoch": 0.03, + "grad_norm": 13.844514954925778, + "learning_rate": 4.89311163895487e-06, + "loss": 1.3013, + "step": 206 + }, + { + "epoch": 0.03, + "grad_norm": 11.716608064474384, + "learning_rate": 4.91686460807601e-06, + "loss": 1.2729, + "step": 207 + }, + { + "epoch": 0.03, + "grad_norm": 16.059150863076916, + "learning_rate": 4.94061757719715e-06, + "loss": 1.2808, + "step": 208 + }, + { + "epoch": 0.03, + "grad_norm": 13.13897217593007, + "learning_rate": 4.96437054631829e-06, + "loss": 1.3248, + "step": 209 + }, + { + "epoch": 0.03, + "grad_norm": 14.074381107583106, + "learning_rate": 4.98812351543943e-06, + 
"loss": 1.3242, + "step": 210 + }, + { + "epoch": 0.03, + "grad_norm": 13.21269411008212, + "learning_rate": 5.011876484560571e-06, + "loss": 1.3191, + "step": 211 + }, + { + "epoch": 0.03, + "grad_norm": 11.817062456588673, + "learning_rate": 5.0356294536817105e-06, + "loss": 1.3661, + "step": 212 + }, + { + "epoch": 0.03, + "grad_norm": 10.846496955979292, + "learning_rate": 5.059382422802851e-06, + "loss": 1.3008, + "step": 213 + }, + { + "epoch": 0.03, + "grad_norm": 9.999385539988142, + "learning_rate": 5.083135391923991e-06, + "loss": 1.3334, + "step": 214 + }, + { + "epoch": 0.03, + "grad_norm": 14.517297863929363, + "learning_rate": 5.106888361045131e-06, + "loss": 1.4214, + "step": 215 + }, + { + "epoch": 0.03, + "grad_norm": 17.425671619350425, + "learning_rate": 5.130641330166272e-06, + "loss": 1.418, + "step": 216 + }, + { + "epoch": 0.03, + "grad_norm": 11.090391448584423, + "learning_rate": 5.154394299287412e-06, + "loss": 1.3458, + "step": 217 + }, + { + "epoch": 0.03, + "grad_norm": 14.342017085238616, + "learning_rate": 5.178147268408551e-06, + "loss": 1.2716, + "step": 218 + }, + { + "epoch": 0.03, + "grad_norm": 14.205024585471532, + "learning_rate": 5.201900237529691e-06, + "loss": 1.3385, + "step": 219 + }, + { + "epoch": 0.03, + "grad_norm": 14.087472475675987, + "learning_rate": 5.225653206650832e-06, + "loss": 1.3117, + "step": 220 + }, + { + "epoch": 0.03, + "grad_norm": 14.59238665308349, + "learning_rate": 5.2494061757719716e-06, + "loss": 1.345, + "step": 221 + }, + { + "epoch": 0.03, + "grad_norm": 11.54559797223761, + "learning_rate": 5.273159144893112e-06, + "loss": 1.3273, + "step": 222 + }, + { + "epoch": 0.03, + "grad_norm": 12.63290942003362, + "learning_rate": 5.296912114014252e-06, + "loss": 1.2813, + "step": 223 + }, + { + "epoch": 0.03, + "grad_norm": 13.839180624491329, + "learning_rate": 5.3206650831353925e-06, + "loss": 1.2583, + "step": 224 + }, + { + "epoch": 0.03, + "grad_norm": 9.341343493997352, + "learning_rate": 
5.344418052256532e-06, + "loss": 1.3136, + "step": 225 + }, + { + "epoch": 0.03, + "grad_norm": 16.969484491585646, + "learning_rate": 5.368171021377673e-06, + "loss": 1.2943, + "step": 226 + }, + { + "epoch": 0.03, + "grad_norm": 11.330019337215834, + "learning_rate": 5.391923990498813e-06, + "loss": 1.2712, + "step": 227 + }, + { + "epoch": 0.03, + "grad_norm": 15.952858907712564, + "learning_rate": 5.415676959619953e-06, + "loss": 1.3026, + "step": 228 + }, + { + "epoch": 0.03, + "grad_norm": 14.247863216062932, + "learning_rate": 5.439429928741094e-06, + "loss": 1.2503, + "step": 229 + }, + { + "epoch": 0.03, + "grad_norm": 14.4826062288306, + "learning_rate": 5.4631828978622335e-06, + "loss": 1.3077, + "step": 230 + }, + { + "epoch": 0.03, + "grad_norm": 11.371425792543864, + "learning_rate": 5.486935866983374e-06, + "loss": 1.297, + "step": 231 + }, + { + "epoch": 0.03, + "grad_norm": 13.934435552183617, + "learning_rate": 5.510688836104513e-06, + "loss": 1.3433, + "step": 232 + }, + { + "epoch": 0.03, + "grad_norm": 10.428076033508786, + "learning_rate": 5.534441805225654e-06, + "loss": 1.3271, + "step": 233 + }, + { + "epoch": 0.03, + "grad_norm": 12.614446453380744, + "learning_rate": 5.558194774346793e-06, + "loss": 1.2134, + "step": 234 + }, + { + "epoch": 0.03, + "grad_norm": 13.037871171566291, + "learning_rate": 5.581947743467934e-06, + "loss": 1.2446, + "step": 235 + }, + { + "epoch": 0.03, + "grad_norm": 16.976219561534784, + "learning_rate": 5.605700712589074e-06, + "loss": 1.2347, + "step": 236 + }, + { + "epoch": 0.03, + "grad_norm": 9.940883764349541, + "learning_rate": 5.629453681710214e-06, + "loss": 1.3278, + "step": 237 + }, + { + "epoch": 0.03, + "grad_norm": 14.737084656284852, + "learning_rate": 5.653206650831354e-06, + "loss": 1.3595, + "step": 238 + }, + { + "epoch": 0.03, + "grad_norm": 12.096920162370825, + "learning_rate": 5.676959619952495e-06, + "loss": 1.3663, + "step": 239 + }, + { + "epoch": 0.03, + "grad_norm": 
12.702301643957222, + "learning_rate": 5.700712589073634e-06, + "loss": 1.3656, + "step": 240 + }, + { + "epoch": 0.03, + "grad_norm": 15.247481294917272, + "learning_rate": 5.724465558194775e-06, + "loss": 1.2771, + "step": 241 + }, + { + "epoch": 0.03, + "grad_norm": 11.468101869068748, + "learning_rate": 5.748218527315916e-06, + "loss": 1.2909, + "step": 242 + }, + { + "epoch": 0.03, + "grad_norm": 15.579053466749297, + "learning_rate": 5.771971496437055e-06, + "loss": 1.3901, + "step": 243 + }, + { + "epoch": 0.03, + "grad_norm": 11.632068217839242, + "learning_rate": 5.795724465558196e-06, + "loss": 1.3297, + "step": 244 + }, + { + "epoch": 0.03, + "grad_norm": 9.625589872632604, + "learning_rate": 5.819477434679336e-06, + "loss": 1.2887, + "step": 245 + }, + { + "epoch": 0.04, + "grad_norm": 14.413396636872955, + "learning_rate": 5.843230403800475e-06, + "loss": 1.3406, + "step": 246 + }, + { + "epoch": 0.04, + "grad_norm": 14.21368105925228, + "learning_rate": 5.866983372921615e-06, + "loss": 1.3133, + "step": 247 + }, + { + "epoch": 0.04, + "grad_norm": 14.344265152208765, + "learning_rate": 5.890736342042756e-06, + "loss": 1.2695, + "step": 248 + }, + { + "epoch": 0.04, + "grad_norm": 12.153346940387292, + "learning_rate": 5.9144893111638955e-06, + "loss": 1.3095, + "step": 249 + }, + { + "epoch": 0.04, + "grad_norm": 13.307384103820155, + "learning_rate": 5.938242280285036e-06, + "loss": 1.2882, + "step": 250 + }, + { + "epoch": 0.04, + "grad_norm": 11.432167957839038, + "learning_rate": 5.961995249406176e-06, + "loss": 1.2133, + "step": 251 + }, + { + "epoch": 0.04, + "grad_norm": 14.129432791892816, + "learning_rate": 5.9857482185273165e-06, + "loss": 1.3615, + "step": 252 + }, + { + "epoch": 0.04, + "grad_norm": 12.773149623638119, + "learning_rate": 6.009501187648456e-06, + "loss": 1.3892, + "step": 253 + }, + { + "epoch": 0.04, + "grad_norm": 13.81922462562399, + "learning_rate": 6.033254156769597e-06, + "loss": 1.3372, + "step": 254 + }, + { + 
"epoch": 0.04, + "grad_norm": 10.246074057769903, + "learning_rate": 6.057007125890737e-06, + "loss": 1.2695, + "step": 255 + }, + { + "epoch": 0.04, + "grad_norm": 12.857027196873013, + "learning_rate": 6.080760095011877e-06, + "loss": 1.3326, + "step": 256 + }, + { + "epoch": 0.04, + "grad_norm": 12.63893217027068, + "learning_rate": 6.104513064133018e-06, + "loss": 1.2596, + "step": 257 + }, + { + "epoch": 0.04, + "grad_norm": 17.621377836197706, + "learning_rate": 6.1282660332541575e-06, + "loss": 1.1978, + "step": 258 + }, + { + "epoch": 0.04, + "grad_norm": 12.02424762453877, + "learning_rate": 6.152019002375298e-06, + "loss": 1.2653, + "step": 259 + }, + { + "epoch": 0.04, + "grad_norm": 19.98245888007629, + "learning_rate": 6.175771971496437e-06, + "loss": 1.3205, + "step": 260 + }, + { + "epoch": 0.04, + "grad_norm": 11.634372394131027, + "learning_rate": 6.199524940617578e-06, + "loss": 1.3197, + "step": 261 + }, + { + "epoch": 0.04, + "grad_norm": 18.583623915522026, + "learning_rate": 6.223277909738717e-06, + "loss": 1.2456, + "step": 262 + }, + { + "epoch": 0.04, + "grad_norm": 12.744517387762187, + "learning_rate": 6.247030878859858e-06, + "loss": 1.288, + "step": 263 + }, + { + "epoch": 0.04, + "grad_norm": 20.06189973710654, + "learning_rate": 6.270783847980998e-06, + "loss": 1.323, + "step": 264 + }, + { + "epoch": 0.04, + "grad_norm": 11.18281784073105, + "learning_rate": 6.294536817102138e-06, + "loss": 1.3144, + "step": 265 + }, + { + "epoch": 0.04, + "grad_norm": 9.922608875334808, + "learning_rate": 6.318289786223278e-06, + "loss": 1.3175, + "step": 266 + }, + { + "epoch": 0.04, + "grad_norm": 12.34358885445237, + "learning_rate": 6.342042755344419e-06, + "loss": 1.246, + "step": 267 + }, + { + "epoch": 0.04, + "grad_norm": 10.878260374334934, + "learning_rate": 6.365795724465559e-06, + "loss": 1.3058, + "step": 268 + }, + { + "epoch": 0.04, + "grad_norm": 15.994822039978114, + "learning_rate": 6.389548693586699e-06, + "loss": 1.2612, + 
"step": 269 + }, + { + "epoch": 0.04, + "grad_norm": 14.700837513657016, + "learning_rate": 6.4133016627078396e-06, + "loss": 1.2543, + "step": 270 + }, + { + "epoch": 0.04, + "grad_norm": 14.056818099653656, + "learning_rate": 6.437054631828979e-06, + "loss": 1.2424, + "step": 271 + }, + { + "epoch": 0.04, + "grad_norm": 11.817866054071915, + "learning_rate": 6.46080760095012e-06, + "loss": 1.2931, + "step": 272 + }, + { + "epoch": 0.04, + "grad_norm": 11.655012894280503, + "learning_rate": 6.48456057007126e-06, + "loss": 1.3046, + "step": 273 + }, + { + "epoch": 0.04, + "grad_norm": 12.221080710285706, + "learning_rate": 6.508313539192399e-06, + "loss": 1.2996, + "step": 274 + }, + { + "epoch": 0.04, + "grad_norm": 12.0402947672448, + "learning_rate": 6.532066508313539e-06, + "loss": 1.3063, + "step": 275 + }, + { + "epoch": 0.04, + "grad_norm": 14.769611718274001, + "learning_rate": 6.55581947743468e-06, + "loss": 1.1979, + "step": 276 + }, + { + "epoch": 0.04, + "grad_norm": 12.78909191869516, + "learning_rate": 6.5795724465558195e-06, + "loss": 1.3416, + "step": 277 + }, + { + "epoch": 0.04, + "grad_norm": 10.751999806184543, + "learning_rate": 6.60332541567696e-06, + "loss": 1.29, + "step": 278 + }, + { + "epoch": 0.04, + "grad_norm": 15.649019373046672, + "learning_rate": 6.6270783847981e-06, + "loss": 1.2484, + "step": 279 + }, + { + "epoch": 0.04, + "grad_norm": 14.112762254690388, + "learning_rate": 6.6508313539192404e-06, + "loss": 1.271, + "step": 280 + }, + { + "epoch": 0.04, + "grad_norm": 11.070356855683432, + "learning_rate": 6.674584323040381e-06, + "loss": 1.2493, + "step": 281 + }, + { + "epoch": 0.04, + "grad_norm": 11.440278064510233, + "learning_rate": 6.698337292161521e-06, + "loss": 1.3406, + "step": 282 + }, + { + "epoch": 0.04, + "grad_norm": 12.377424611738187, + "learning_rate": 6.722090261282661e-06, + "loss": 1.2542, + "step": 283 + }, + { + "epoch": 0.04, + "grad_norm": 13.265508995839673, + "learning_rate": 6.745843230403801e-06, + 
"loss": 1.2239, + "step": 284 + }, + { + "epoch": 0.04, + "grad_norm": 14.206386472944192, + "learning_rate": 6.769596199524942e-06, + "loss": 1.2778, + "step": 285 + }, + { + "epoch": 0.04, + "grad_norm": 11.536370688506485, + "learning_rate": 6.7933491686460815e-06, + "loss": 1.3003, + "step": 286 + }, + { + "epoch": 0.04, + "grad_norm": 15.789283669754393, + "learning_rate": 6.817102137767222e-06, + "loss": 1.3159, + "step": 287 + }, + { + "epoch": 0.04, + "grad_norm": 13.990434287417122, + "learning_rate": 6.840855106888361e-06, + "loss": 1.3094, + "step": 288 + }, + { + "epoch": 0.04, + "grad_norm": 12.362797550333303, + "learning_rate": 6.8646080760095016e-06, + "loss": 1.268, + "step": 289 + }, + { + "epoch": 0.04, + "grad_norm": 13.132335220080723, + "learning_rate": 6.888361045130641e-06, + "loss": 1.2466, + "step": 290 + }, + { + "epoch": 0.04, + "grad_norm": 10.794504122711373, + "learning_rate": 6.912114014251782e-06, + "loss": 1.2221, + "step": 291 + }, + { + "epoch": 0.04, + "grad_norm": 12.399883239680305, + "learning_rate": 6.935866983372922e-06, + "loss": 1.2689, + "step": 292 + }, + { + "epoch": 0.04, + "grad_norm": 11.842466212832909, + "learning_rate": 6.959619952494062e-06, + "loss": 1.256, + "step": 293 + }, + { + "epoch": 0.04, + "grad_norm": 12.24340549285756, + "learning_rate": 6.983372921615203e-06, + "loss": 1.2189, + "step": 294 + }, + { + "epoch": 0.04, + "grad_norm": 12.665677397139586, + "learning_rate": 7.007125890736343e-06, + "loss": 1.2669, + "step": 295 + }, + { + "epoch": 0.04, + "grad_norm": 11.172906278867272, + "learning_rate": 7.030878859857483e-06, + "loss": 1.2343, + "step": 296 + }, + { + "epoch": 0.04, + "grad_norm": 10.673728223232366, + "learning_rate": 7.054631828978623e-06, + "loss": 1.2219, + "step": 297 + }, + { + "epoch": 0.04, + "grad_norm": 10.191274664746453, + "learning_rate": 7.0783847980997635e-06, + "loss": 1.2897, + "step": 298 + }, + { + "epoch": 0.04, + "grad_norm": 9.52622074978708, + "learning_rate": 
7.102137767220903e-06, + "loss": 1.2516, + "step": 299 + }, + { + "epoch": 0.04, + "grad_norm": 15.350962619896226, + "learning_rate": 7.125890736342044e-06, + "loss": 1.2426, + "step": 300 + }, + { + "epoch": 0.04, + "grad_norm": 10.80642398234627, + "learning_rate": 7.149643705463184e-06, + "loss": 1.3228, + "step": 301 + }, + { + "epoch": 0.04, + "grad_norm": 10.80797104252134, + "learning_rate": 7.173396674584323e-06, + "loss": 1.2841, + "step": 302 + }, + { + "epoch": 0.04, + "grad_norm": 10.60655056347957, + "learning_rate": 7.197149643705463e-06, + "loss": 1.2313, + "step": 303 + }, + { + "epoch": 0.04, + "grad_norm": 10.582221106395394, + "learning_rate": 7.220902612826604e-06, + "loss": 1.2289, + "step": 304 + }, + { + "epoch": 0.04, + "grad_norm": 14.374136658706993, + "learning_rate": 7.2446555819477435e-06, + "loss": 1.2483, + "step": 305 + }, + { + "epoch": 0.04, + "grad_norm": 14.013729605780126, + "learning_rate": 7.268408551068884e-06, + "loss": 1.2466, + "step": 306 + }, + { + "epoch": 0.04, + "grad_norm": 11.301058957628316, + "learning_rate": 7.292161520190024e-06, + "loss": 1.3625, + "step": 307 + }, + { + "epoch": 0.04, + "grad_norm": 19.385277340786583, + "learning_rate": 7.315914489311164e-06, + "loss": 1.2909, + "step": 308 + }, + { + "epoch": 0.04, + "grad_norm": 10.683260926961042, + "learning_rate": 7.339667458432305e-06, + "loss": 1.2063, + "step": 309 + }, + { + "epoch": 0.04, + "grad_norm": 12.988952878382923, + "learning_rate": 7.363420427553445e-06, + "loss": 1.2349, + "step": 310 + }, + { + "epoch": 0.04, + "grad_norm": 19.881384215594874, + "learning_rate": 7.387173396674585e-06, + "loss": 1.2717, + "step": 311 + }, + { + "epoch": 0.04, + "grad_norm": 15.32908188145667, + "learning_rate": 7.410926365795725e-06, + "loss": 1.2085, + "step": 312 + }, + { + "epoch": 0.04, + "grad_norm": 12.28838571938078, + "learning_rate": 7.434679334916866e-06, + "loss": 1.2703, + "step": 313 + }, + { + "epoch": 0.04, + "grad_norm": 11.3697968377905, 
+ "learning_rate": 7.458432304038005e-06, + "loss": 1.2454, + "step": 314 + }, + { + "epoch": 0.04, + "grad_norm": 11.730745528861931, + "learning_rate": 7.482185273159146e-06, + "loss": 1.2437, + "step": 315 + }, + { + "epoch": 0.05, + "grad_norm": 9.871105016598209, + "learning_rate": 7.505938242280285e-06, + "loss": 1.2999, + "step": 316 + }, + { + "epoch": 0.05, + "grad_norm": 8.113564550156173, + "learning_rate": 7.5296912114014255e-06, + "loss": 1.2728, + "step": 317 + }, + { + "epoch": 0.05, + "grad_norm": 11.71401108479556, + "learning_rate": 7.553444180522565e-06, + "loss": 1.2474, + "step": 318 + }, + { + "epoch": 0.05, + "grad_norm": 13.942586211292607, + "learning_rate": 7.577197149643706e-06, + "loss": 1.2682, + "step": 319 + }, + { + "epoch": 0.05, + "grad_norm": 13.90546025947131, + "learning_rate": 7.600950118764846e-06, + "loss": 1.2876, + "step": 320 + }, + { + "epoch": 0.05, + "grad_norm": 11.981725419551113, + "learning_rate": 7.624703087885986e-06, + "loss": 1.324, + "step": 321 + }, + { + "epoch": 0.05, + "grad_norm": 11.02920008877058, + "learning_rate": 7.648456057007126e-06, + "loss": 1.2124, + "step": 322 + }, + { + "epoch": 0.05, + "grad_norm": 12.768614728086236, + "learning_rate": 7.672209026128267e-06, + "loss": 1.268, + "step": 323 + }, + { + "epoch": 0.05, + "grad_norm": 11.690467728445844, + "learning_rate": 7.695961995249407e-06, + "loss": 1.2524, + "step": 324 + }, + { + "epoch": 0.05, + "grad_norm": 14.534257451989795, + "learning_rate": 7.719714964370548e-06, + "loss": 1.2578, + "step": 325 + }, + { + "epoch": 0.05, + "grad_norm": 10.747380824149603, + "learning_rate": 7.743467933491687e-06, + "loss": 1.2847, + "step": 326 + }, + { + "epoch": 0.05, + "grad_norm": 10.71267156970865, + "learning_rate": 7.767220902612827e-06, + "loss": 1.2685, + "step": 327 + }, + { + "epoch": 0.05, + "grad_norm": 12.298053016555006, + "learning_rate": 7.790973871733968e-06, + "loss": 1.2783, + "step": 328 + }, + { + "epoch": 0.05, + "grad_norm": 
12.42554058953156, + "learning_rate": 7.814726840855108e-06, + "loss": 1.2273, + "step": 329 + }, + { + "epoch": 0.05, + "grad_norm": 11.539915085645754, + "learning_rate": 7.838479809976247e-06, + "loss": 1.2477, + "step": 330 + }, + { + "epoch": 0.05, + "grad_norm": 16.448975733242104, + "learning_rate": 7.862232779097388e-06, + "loss": 1.2045, + "step": 331 + }, + { + "epoch": 0.05, + "grad_norm": 12.27246223362942, + "learning_rate": 7.885985748218527e-06, + "loss": 1.2804, + "step": 332 + }, + { + "epoch": 0.05, + "grad_norm": 9.841802609369697, + "learning_rate": 7.909738717339667e-06, + "loss": 1.2262, + "step": 333 + }, + { + "epoch": 0.05, + "grad_norm": 12.516649413261044, + "learning_rate": 7.933491686460808e-06, + "loss": 1.2578, + "step": 334 + }, + { + "epoch": 0.05, + "grad_norm": 14.748381926809019, + "learning_rate": 7.957244655581949e-06, + "loss": 1.256, + "step": 335 + }, + { + "epoch": 0.05, + "grad_norm": 10.805956755738672, + "learning_rate": 7.98099762470309e-06, + "loss": 1.2774, + "step": 336 + }, + { + "epoch": 0.05, + "grad_norm": 8.242439060304449, + "learning_rate": 8.004750593824228e-06, + "loss": 1.2752, + "step": 337 + }, + { + "epoch": 0.05, + "grad_norm": 14.205110393506795, + "learning_rate": 8.028503562945369e-06, + "loss": 1.2103, + "step": 338 + }, + { + "epoch": 0.05, + "grad_norm": 13.62765196283523, + "learning_rate": 8.05225653206651e-06, + "loss": 1.2397, + "step": 339 + }, + { + "epoch": 0.05, + "grad_norm": 12.580255075037757, + "learning_rate": 8.07600950118765e-06, + "loss": 1.2859, + "step": 340 + }, + { + "epoch": 0.05, + "grad_norm": 12.133634458416568, + "learning_rate": 8.099762470308789e-06, + "loss": 1.2513, + "step": 341 + }, + { + "epoch": 0.05, + "grad_norm": 12.219703275367518, + "learning_rate": 8.12351543942993e-06, + "loss": 1.2764, + "step": 342 + }, + { + "epoch": 0.05, + "grad_norm": 8.683875202747483, + "learning_rate": 8.14726840855107e-06, + "loss": 1.2641, + "step": 343 + }, + { + "epoch": 0.05, + 
"grad_norm": 10.410931000141426, + "learning_rate": 8.171021377672209e-06, + "loss": 1.2481, + "step": 344 + }, + { + "epoch": 0.05, + "grad_norm": 14.490655369107824, + "learning_rate": 8.19477434679335e-06, + "loss": 1.1834, + "step": 345 + }, + { + "epoch": 0.05, + "grad_norm": 13.86505041031124, + "learning_rate": 8.21852731591449e-06, + "loss": 1.2157, + "step": 346 + }, + { + "epoch": 0.05, + "grad_norm": 14.469839893226338, + "learning_rate": 8.24228028503563e-06, + "loss": 1.2741, + "step": 347 + }, + { + "epoch": 0.05, + "grad_norm": 13.56076541281493, + "learning_rate": 8.26603325415677e-06, + "loss": 1.239, + "step": 348 + }, + { + "epoch": 0.05, + "grad_norm": 13.372151565236994, + "learning_rate": 8.28978622327791e-06, + "loss": 1.1661, + "step": 349 + }, + { + "epoch": 0.05, + "grad_norm": 9.029430727190377, + "learning_rate": 8.31353919239905e-06, + "loss": 1.235, + "step": 350 + }, + { + "epoch": 0.05, + "grad_norm": 12.339832136230447, + "learning_rate": 8.337292161520191e-06, + "loss": 1.239, + "step": 351 + }, + { + "epoch": 0.05, + "grad_norm": 12.270986559336942, + "learning_rate": 8.36104513064133e-06, + "loss": 1.1842, + "step": 352 + }, + { + "epoch": 0.05, + "grad_norm": 8.932245405021622, + "learning_rate": 8.38479809976247e-06, + "loss": 1.2599, + "step": 353 + }, + { + "epoch": 0.05, + "grad_norm": 11.18683529664822, + "learning_rate": 8.408551068883611e-06, + "loss": 1.2608, + "step": 354 + }, + { + "epoch": 0.05, + "grad_norm": 15.15717725564581, + "learning_rate": 8.432304038004752e-06, + "loss": 1.3545, + "step": 355 + }, + { + "epoch": 0.05, + "grad_norm": 9.282904301702956, + "learning_rate": 8.456057007125893e-06, + "loss": 1.2535, + "step": 356 + }, + { + "epoch": 0.05, + "grad_norm": 13.19743343684533, + "learning_rate": 8.479809976247032e-06, + "loss": 1.2585, + "step": 357 + }, + { + "epoch": 0.05, + "grad_norm": 16.707829742958754, + "learning_rate": 8.50356294536817e-06, + "loss": 1.2273, + "step": 358 + }, + { + "epoch": 
0.05, + "grad_norm": 9.761697183798084, + "learning_rate": 8.527315914489311e-06, + "loss": 1.2443, + "step": 359 + }, + { + "epoch": 0.05, + "grad_norm": 11.452813348087119, + "learning_rate": 8.551068883610452e-06, + "loss": 1.2727, + "step": 360 + }, + { + "epoch": 0.05, + "grad_norm": 11.387851449407833, + "learning_rate": 8.574821852731592e-06, + "loss": 1.2354, + "step": 361 + }, + { + "epoch": 0.05, + "grad_norm": 8.952825524957088, + "learning_rate": 8.598574821852733e-06, + "loss": 1.2668, + "step": 362 + }, + { + "epoch": 0.05, + "grad_norm": 11.683069362593356, + "learning_rate": 8.622327790973872e-06, + "loss": 1.2463, + "step": 363 + }, + { + "epoch": 0.05, + "grad_norm": 12.993880650949125, + "learning_rate": 8.646080760095012e-06, + "loss": 1.2598, + "step": 364 + }, + { + "epoch": 0.05, + "grad_norm": 14.48473155971066, + "learning_rate": 8.669833729216153e-06, + "loss": 1.2539, + "step": 365 + }, + { + "epoch": 0.05, + "grad_norm": 14.41228578667357, + "learning_rate": 8.693586698337293e-06, + "loss": 1.217, + "step": 366 + }, + { + "epoch": 0.05, + "grad_norm": 12.13507200324324, + "learning_rate": 8.717339667458432e-06, + "loss": 1.1923, + "step": 367 + }, + { + "epoch": 0.05, + "grad_norm": 12.964526093659954, + "learning_rate": 8.741092636579573e-06, + "loss": 1.2878, + "step": 368 + }, + { + "epoch": 0.05, + "grad_norm": 9.345145332466922, + "learning_rate": 8.764845605700714e-06, + "loss": 1.1721, + "step": 369 + }, + { + "epoch": 0.05, + "grad_norm": 11.53962341250306, + "learning_rate": 8.788598574821854e-06, + "loss": 1.2885, + "step": 370 + }, + { + "epoch": 0.05, + "grad_norm": 9.330812373139056, + "learning_rate": 8.812351543942995e-06, + "loss": 1.2687, + "step": 371 + }, + { + "epoch": 0.05, + "grad_norm": 14.026276902942238, + "learning_rate": 8.836104513064134e-06, + "loss": 1.2983, + "step": 372 + }, + { + "epoch": 0.05, + "grad_norm": 14.167614918055556, + "learning_rate": 8.859857482185273e-06, + "loss": 1.3124, + "step": 373 + 
}, + { + "epoch": 0.05, + "grad_norm": 12.229603890525366, + "learning_rate": 8.883610451306413e-06, + "loss": 1.2933, + "step": 374 + }, + { + "epoch": 0.05, + "grad_norm": 11.568771404307208, + "learning_rate": 8.907363420427554e-06, + "loss": 1.2979, + "step": 375 + }, + { + "epoch": 0.05, + "grad_norm": 12.69014457547116, + "learning_rate": 8.931116389548694e-06, + "loss": 1.2517, + "step": 376 + }, + { + "epoch": 0.05, + "grad_norm": 13.274799170736658, + "learning_rate": 8.954869358669835e-06, + "loss": 1.2718, + "step": 377 + }, + { + "epoch": 0.05, + "grad_norm": 11.019375679035845, + "learning_rate": 8.978622327790974e-06, + "loss": 1.2081, + "step": 378 + }, + { + "epoch": 0.05, + "grad_norm": 11.825033307698435, + "learning_rate": 9.002375296912114e-06, + "loss": 1.2247, + "step": 379 + }, + { + "epoch": 0.05, + "grad_norm": 11.196887227298971, + "learning_rate": 9.026128266033255e-06, + "loss": 1.242, + "step": 380 + }, + { + "epoch": 0.05, + "grad_norm": 10.118989521501287, + "learning_rate": 9.049881235154396e-06, + "loss": 1.165, + "step": 381 + }, + { + "epoch": 0.05, + "grad_norm": 11.764099193748807, + "learning_rate": 9.073634204275536e-06, + "loss": 1.3368, + "step": 382 + }, + { + "epoch": 0.05, + "grad_norm": 12.877089807253851, + "learning_rate": 9.097387173396675e-06, + "loss": 1.1037, + "step": 383 + }, + { + "epoch": 0.05, + "grad_norm": 12.28136409966607, + "learning_rate": 9.121140142517816e-06, + "loss": 1.2547, + "step": 384 + }, + { + "epoch": 0.05, + "grad_norm": 9.954551221736406, + "learning_rate": 9.144893111638956e-06, + "loss": 1.2655, + "step": 385 + }, + { + "epoch": 0.06, + "grad_norm": 13.97306746530814, + "learning_rate": 9.168646080760095e-06, + "loss": 1.2563, + "step": 386 + }, + { + "epoch": 0.06, + "grad_norm": 9.174211115653934, + "learning_rate": 9.192399049881236e-06, + "loss": 1.2534, + "step": 387 + }, + { + "epoch": 0.06, + "grad_norm": 11.691112910753533, + "learning_rate": 9.216152019002376e-06, + "loss": 
1.2476, + "step": 388 + }, + { + "epoch": 0.06, + "grad_norm": 11.635778944373254, + "learning_rate": 9.239904988123515e-06, + "loss": 1.1632, + "step": 389 + }, + { + "epoch": 0.06, + "grad_norm": 14.04208880187382, + "learning_rate": 9.263657957244656e-06, + "loss": 1.1914, + "step": 390 + }, + { + "epoch": 0.06, + "grad_norm": 11.975198393212326, + "learning_rate": 9.287410926365797e-06, + "loss": 1.2132, + "step": 391 + }, + { + "epoch": 0.06, + "grad_norm": 13.626541259129457, + "learning_rate": 9.311163895486937e-06, + "loss": 1.2329, + "step": 392 + }, + { + "epoch": 0.06, + "grad_norm": 11.719531933123607, + "learning_rate": 9.334916864608076e-06, + "loss": 1.2956, + "step": 393 + }, + { + "epoch": 0.06, + "grad_norm": 10.546548777977915, + "learning_rate": 9.358669833729217e-06, + "loss": 1.2513, + "step": 394 + }, + { + "epoch": 0.06, + "grad_norm": 13.320614579727005, + "learning_rate": 9.382422802850357e-06, + "loss": 1.2728, + "step": 395 + }, + { + "epoch": 0.06, + "grad_norm": 13.494031241013868, + "learning_rate": 9.406175771971498e-06, + "loss": 1.2289, + "step": 396 + }, + { + "epoch": 0.06, + "grad_norm": 12.762509457205624, + "learning_rate": 9.429928741092638e-06, + "loss": 1.2595, + "step": 397 + }, + { + "epoch": 0.06, + "grad_norm": 8.846466714586445, + "learning_rate": 9.453681710213777e-06, + "loss": 1.2658, + "step": 398 + }, + { + "epoch": 0.06, + "grad_norm": 11.668267979154715, + "learning_rate": 9.477434679334918e-06, + "loss": 1.2533, + "step": 399 + }, + { + "epoch": 0.06, + "grad_norm": 11.836584647533602, + "learning_rate": 9.501187648456057e-06, + "loss": 1.1438, + "step": 400 + }, + { + "epoch": 0.06, + "grad_norm": 13.230093029765259, + "learning_rate": 9.524940617577197e-06, + "loss": 1.2362, + "step": 401 + }, + { + "epoch": 0.06, + "grad_norm": 10.583808346836703, + "learning_rate": 9.548693586698338e-06, + "loss": 1.201, + "step": 402 + }, + { + "epoch": 0.06, + "grad_norm": 12.881683772373117, + "learning_rate": 
9.572446555819479e-06, + "loss": 1.2007, + "step": 403 + }, + { + "epoch": 0.06, + "grad_norm": 10.893958864030148, + "learning_rate": 9.596199524940617e-06, + "loss": 1.1827, + "step": 404 + }, + { + "epoch": 0.06, + "grad_norm": 12.783620335728795, + "learning_rate": 9.619952494061758e-06, + "loss": 1.2406, + "step": 405 + }, + { + "epoch": 0.06, + "grad_norm": 13.037640681155969, + "learning_rate": 9.643705463182899e-06, + "loss": 1.2114, + "step": 406 + }, + { + "epoch": 0.06, + "grad_norm": 12.482377886862523, + "learning_rate": 9.66745843230404e-06, + "loss": 1.2387, + "step": 407 + }, + { + "epoch": 0.06, + "grad_norm": 9.746547023609418, + "learning_rate": 9.69121140142518e-06, + "loss": 1.2245, + "step": 408 + }, + { + "epoch": 0.06, + "grad_norm": 12.88630422905531, + "learning_rate": 9.714964370546319e-06, + "loss": 1.1651, + "step": 409 + }, + { + "epoch": 0.06, + "grad_norm": 10.485237409385615, + "learning_rate": 9.73871733966746e-06, + "loss": 1.2372, + "step": 410 + }, + { + "epoch": 0.06, + "grad_norm": 11.365828307499992, + "learning_rate": 9.7624703087886e-06, + "loss": 1.2173, + "step": 411 + }, + { + "epoch": 0.06, + "grad_norm": 11.717431439894106, + "learning_rate": 9.78622327790974e-06, + "loss": 1.1952, + "step": 412 + }, + { + "epoch": 0.06, + "grad_norm": 13.648062839507999, + "learning_rate": 9.80997624703088e-06, + "loss": 1.281, + "step": 413 + }, + { + "epoch": 0.06, + "grad_norm": 12.61268687486968, + "learning_rate": 9.83372921615202e-06, + "loss": 1.2267, + "step": 414 + }, + { + "epoch": 0.06, + "grad_norm": 8.110563513244465, + "learning_rate": 9.857482185273159e-06, + "loss": 1.184, + "step": 415 + }, + { + "epoch": 0.06, + "grad_norm": 9.22231639787149, + "learning_rate": 9.8812351543943e-06, + "loss": 1.2458, + "step": 416 + }, + { + "epoch": 0.06, + "grad_norm": 8.896743151246348, + "learning_rate": 9.90498812351544e-06, + "loss": 1.2519, + "step": 417 + }, + { + "epoch": 0.06, + "grad_norm": 10.397454759177506, + 
"learning_rate": 9.92874109263658e-06, + "loss": 1.206, + "step": 418 + }, + { + "epoch": 0.06, + "grad_norm": 11.60806474657245, + "learning_rate": 9.95249406175772e-06, + "loss": 1.3512, + "step": 419 + }, + { + "epoch": 0.06, + "grad_norm": 9.692407495799932, + "learning_rate": 9.97624703087886e-06, + "loss": 1.1953, + "step": 420 + }, + { + "epoch": 0.06, + "grad_norm": 12.340094380608619, + "learning_rate": 1e-05, + "loss": 1.2162, + "step": 421 + }, + { + "epoch": 0.06, + "grad_norm": 12.560068332023205, + "learning_rate": 9.99999986665696e-06, + "loss": 1.262, + "step": 422 + }, + { + "epoch": 0.06, + "grad_norm": 11.239695950225409, + "learning_rate": 9.999999466627843e-06, + "loss": 1.2542, + "step": 423 + }, + { + "epoch": 0.06, + "grad_norm": 12.067203838128746, + "learning_rate": 9.999998799912672e-06, + "loss": 1.2593, + "step": 424 + }, + { + "epoch": 0.06, + "grad_norm": 11.566335291439099, + "learning_rate": 9.999997866511484e-06, + "loss": 1.2391, + "step": 425 + }, + { + "epoch": 0.06, + "grad_norm": 11.249581856275908, + "learning_rate": 9.999996666424327e-06, + "loss": 1.2905, + "step": 426 + }, + { + "epoch": 0.06, + "grad_norm": 10.690797464667694, + "learning_rate": 9.999995199651265e-06, + "loss": 1.2224, + "step": 427 + }, + { + "epoch": 0.06, + "grad_norm": 10.080102926079906, + "learning_rate": 9.999993466192375e-06, + "loss": 1.1935, + "step": 428 + }, + { + "epoch": 0.06, + "grad_norm": 11.182710292591764, + "learning_rate": 9.999991466047754e-06, + "loss": 1.1766, + "step": 429 + }, + { + "epoch": 0.06, + "grad_norm": 6.934032335538793, + "learning_rate": 9.999989199217503e-06, + "loss": 1.2062, + "step": 430 + }, + { + "epoch": 0.06, + "grad_norm": 10.438368689725275, + "learning_rate": 9.99998666570175e-06, + "loss": 1.2179, + "step": 431 + }, + { + "epoch": 0.06, + "grad_norm": 12.66745533630242, + "learning_rate": 9.999983865500621e-06, + "loss": 1.2006, + "step": 432 + }, + { + "epoch": 0.06, + "grad_norm": 11.323383527602953, + 
"learning_rate": 9.999980798614273e-06, + "loss": 1.2669, + "step": 433 + }, + { + "epoch": 0.06, + "grad_norm": 12.345513483445227, + "learning_rate": 9.999977465042865e-06, + "loss": 1.2708, + "step": 434 + }, + { + "epoch": 0.06, + "grad_norm": 12.640985263827483, + "learning_rate": 9.999973864786577e-06, + "loss": 1.2446, + "step": 435 + }, + { + "epoch": 0.06, + "grad_norm": 11.125567957942506, + "learning_rate": 9.999969997845601e-06, + "loss": 1.1986, + "step": 436 + }, + { + "epoch": 0.06, + "grad_norm": 10.39004461203683, + "learning_rate": 9.999965864220142e-06, + "loss": 1.1984, + "step": 437 + }, + { + "epoch": 0.06, + "grad_norm": 8.029145915169035, + "learning_rate": 9.999961463910424e-06, + "loss": 1.2123, + "step": 438 + }, + { + "epoch": 0.06, + "grad_norm": 14.21540260584871, + "learning_rate": 9.999956796916676e-06, + "loss": 1.2276, + "step": 439 + }, + { + "epoch": 0.06, + "grad_norm": 10.704628437093861, + "learning_rate": 9.99995186323915e-06, + "loss": 1.2366, + "step": 440 + }, + { + "epoch": 0.06, + "grad_norm": 13.096452193772913, + "learning_rate": 9.999946662878111e-06, + "loss": 1.2047, + "step": 441 + }, + { + "epoch": 0.06, + "grad_norm": 12.794620405913037, + "learning_rate": 9.999941195833834e-06, + "loss": 1.2627, + "step": 442 + }, + { + "epoch": 0.06, + "grad_norm": 13.740469096048157, + "learning_rate": 9.99993546210661e-06, + "loss": 1.1257, + "step": 443 + }, + { + "epoch": 0.06, + "grad_norm": 8.990374737967382, + "learning_rate": 9.999929461696746e-06, + "loss": 1.1889, + "step": 444 + }, + { + "epoch": 0.06, + "grad_norm": 9.044924962175136, + "learning_rate": 9.999923194604565e-06, + "loss": 1.1662, + "step": 445 + }, + { + "epoch": 0.06, + "grad_norm": 12.431321241961342, + "learning_rate": 9.999916660830393e-06, + "loss": 1.1727, + "step": 446 + }, + { + "epoch": 0.06, + "grad_norm": 9.833846888603903, + "learning_rate": 9.999909860374587e-06, + "loss": 1.2905, + "step": 447 + }, + { + "epoch": 0.06, + "grad_norm": 
10.966469441162129, + "learning_rate": 9.999902793237506e-06, + "loss": 1.286, + "step": 448 + }, + { + "epoch": 0.06, + "grad_norm": 14.294478823116261, + "learning_rate": 9.999895459419526e-06, + "loss": 1.3041, + "step": 449 + }, + { + "epoch": 0.06, + "grad_norm": 10.49999088031908, + "learning_rate": 9.99988785892104e-06, + "loss": 1.1985, + "step": 450 + }, + { + "epoch": 0.06, + "grad_norm": 10.233325571458206, + "learning_rate": 9.999879991742455e-06, + "loss": 1.1881, + "step": 451 + }, + { + "epoch": 0.06, + "grad_norm": 7.897636019475168, + "learning_rate": 9.999871857884188e-06, + "loss": 1.2384, + "step": 452 + }, + { + "epoch": 0.06, + "grad_norm": 12.720308774855038, + "learning_rate": 9.99986345734667e-06, + "loss": 1.2229, + "step": 453 + }, + { + "epoch": 0.06, + "grad_norm": 9.349986383256443, + "learning_rate": 9.999854790130354e-06, + "loss": 1.2485, + "step": 454 + }, + { + "epoch": 0.06, + "grad_norm": 12.670476709212592, + "learning_rate": 9.999845856235701e-06, + "loss": 1.1991, + "step": 455 + }, + { + "epoch": 0.07, + "grad_norm": 9.771203307060166, + "learning_rate": 9.999836655663189e-06, + "loss": 1.2834, + "step": 456 + }, + { + "epoch": 0.07, + "grad_norm": 12.414883929567807, + "learning_rate": 9.999827188413305e-06, + "loss": 1.2109, + "step": 457 + }, + { + "epoch": 0.07, + "grad_norm": 12.030551743085837, + "learning_rate": 9.999817454486556e-06, + "loss": 1.2084, + "step": 458 + }, + { + "epoch": 0.07, + "grad_norm": 9.671766381786938, + "learning_rate": 9.999807453883461e-06, + "loss": 1.1999, + "step": 459 + }, + { + "epoch": 0.07, + "grad_norm": 9.19138423772248, + "learning_rate": 9.999797186604554e-06, + "loss": 1.2488, + "step": 460 + }, + { + "epoch": 0.07, + "grad_norm": 9.784278338555643, + "learning_rate": 9.999786652650382e-06, + "loss": 1.23, + "step": 461 + }, + { + "epoch": 0.07, + "grad_norm": 10.683052006723328, + "learning_rate": 9.999775852021507e-06, + "loss": 1.1885, + "step": 462 + }, + { + "epoch": 0.07, + 
"grad_norm": 9.251798341326873, + "learning_rate": 9.999764784718504e-06, + "loss": 1.2298, + "step": 463 + }, + { + "epoch": 0.07, + "grad_norm": 9.725009928411191, + "learning_rate": 9.999753450741968e-06, + "loss": 1.2488, + "step": 464 + }, + { + "epoch": 0.07, + "grad_norm": 8.63269075622982, + "learning_rate": 9.999741850092495e-06, + "loss": 1.2244, + "step": 465 + }, + { + "epoch": 0.07, + "grad_norm": 10.582247215193442, + "learning_rate": 9.99972998277071e-06, + "loss": 1.2136, + "step": 466 + }, + { + "epoch": 0.07, + "grad_norm": 9.746492034217015, + "learning_rate": 9.999717848777246e-06, + "loss": 1.2551, + "step": 467 + }, + { + "epoch": 0.07, + "grad_norm": 8.66711527438808, + "learning_rate": 9.999705448112749e-06, + "loss": 1.1735, + "step": 468 + }, + { + "epoch": 0.07, + "grad_norm": 9.81931106319056, + "learning_rate": 9.99969278077788e-06, + "loss": 1.208, + "step": 469 + }, + { + "epoch": 0.07, + "grad_norm": 13.38026076139341, + "learning_rate": 9.99967984677331e-06, + "loss": 1.243, + "step": 470 + }, + { + "epoch": 0.07, + "grad_norm": 13.318229242119266, + "learning_rate": 9.99966664609974e-06, + "loss": 1.186, + "step": 471 + }, + { + "epoch": 0.07, + "grad_norm": 11.93356495154702, + "learning_rate": 9.999653178757864e-06, + "loss": 1.2466, + "step": 472 + }, + { + "epoch": 0.07, + "grad_norm": 9.843254858663684, + "learning_rate": 9.999639444748406e-06, + "loss": 1.1422, + "step": 473 + }, + { + "epoch": 0.07, + "grad_norm": 11.356925450464619, + "learning_rate": 9.999625444072096e-06, + "loss": 1.3257, + "step": 474 + }, + { + "epoch": 0.07, + "grad_norm": 10.821884643691597, + "learning_rate": 9.999611176729682e-06, + "loss": 1.2651, + "step": 475 + }, + { + "epoch": 0.07, + "grad_norm": 10.846952706200888, + "learning_rate": 9.999596642721923e-06, + "loss": 1.1359, + "step": 476 + }, + { + "epoch": 0.07, + "grad_norm": 9.261835406980833, + "learning_rate": 9.999581842049598e-06, + "loss": 1.2314, + "step": 477 + }, + { + "epoch": 
0.07, + "grad_norm": 10.960528098010887, + "learning_rate": 9.999566774713491e-06, + "loss": 1.1945, + "step": 478 + }, + { + "epoch": 0.07, + "grad_norm": 10.306474990200089, + "learning_rate": 9.99955144071441e-06, + "loss": 1.2179, + "step": 479 + }, + { + "epoch": 0.07, + "grad_norm": 8.88300622050581, + "learning_rate": 9.999535840053171e-06, + "loss": 1.2267, + "step": 480 + }, + { + "epoch": 0.07, + "grad_norm": 6.9322347150520285, + "learning_rate": 9.999519972730607e-06, + "loss": 1.2858, + "step": 481 + }, + { + "epoch": 0.07, + "grad_norm": 12.079722823382646, + "learning_rate": 9.999503838747564e-06, + "loss": 1.2158, + "step": 482 + }, + { + "epoch": 0.07, + "grad_norm": 7.697266448416722, + "learning_rate": 9.999487438104903e-06, + "loss": 1.1827, + "step": 483 + }, + { + "epoch": 0.07, + "grad_norm": 12.05588982728182, + "learning_rate": 9.999470770803498e-06, + "loss": 1.274, + "step": 484 + }, + { + "epoch": 0.07, + "grad_norm": 7.6218298989676665, + "learning_rate": 9.999453836844238e-06, + "loss": 1.2428, + "step": 485 + }, + { + "epoch": 0.07, + "grad_norm": 9.184917104269173, + "learning_rate": 9.999436636228025e-06, + "loss": 1.2186, + "step": 486 + }, + { + "epoch": 0.07, + "grad_norm": 9.028937770019645, + "learning_rate": 9.99941916895578e-06, + "loss": 1.2157, + "step": 487 + }, + { + "epoch": 0.07, + "grad_norm": 12.063904469697915, + "learning_rate": 9.999401435028431e-06, + "loss": 1.1714, + "step": 488 + }, + { + "epoch": 0.07, + "grad_norm": 10.416229292769001, + "learning_rate": 9.999383434446927e-06, + "loss": 1.1522, + "step": 489 + }, + { + "epoch": 0.07, + "grad_norm": 8.47518364164538, + "learning_rate": 9.999365167212226e-06, + "loss": 1.1819, + "step": 490 + }, + { + "epoch": 0.07, + "grad_norm": 11.187413174280465, + "learning_rate": 9.999346633325302e-06, + "loss": 1.1836, + "step": 491 + }, + { + "epoch": 0.07, + "grad_norm": 12.658554556363475, + "learning_rate": 9.999327832787144e-06, + "loss": 1.2323, + "step": 492 + }, 
+ { + "epoch": 0.07, + "grad_norm": 9.886137707890898, + "learning_rate": 9.999308765598755e-06, + "loss": 1.1927, + "step": 493 + }, + { + "epoch": 0.07, + "grad_norm": 8.70071512412514, + "learning_rate": 9.999289431761153e-06, + "loss": 1.2115, + "step": 494 + }, + { + "epoch": 0.07, + "grad_norm": 10.637994504135056, + "learning_rate": 9.99926983127537e-06, + "loss": 1.2695, + "step": 495 + }, + { + "epoch": 0.07, + "grad_norm": 7.743927628179929, + "learning_rate": 9.999249964142445e-06, + "loss": 1.1889, + "step": 496 + }, + { + "epoch": 0.07, + "grad_norm": 14.048968061511802, + "learning_rate": 9.999229830363446e-06, + "loss": 1.2636, + "step": 497 + }, + { + "epoch": 0.07, + "grad_norm": 10.54162064549555, + "learning_rate": 9.999209429939442e-06, + "loss": 1.1638, + "step": 498 + }, + { + "epoch": 0.07, + "grad_norm": 13.12088313722267, + "learning_rate": 9.999188762871524e-06, + "loss": 1.2386, + "step": 499 + }, + { + "epoch": 0.07, + "grad_norm": 12.71466565531587, + "learning_rate": 9.999167829160791e-06, + "loss": 1.1987, + "step": 500 + }, + { + "epoch": 0.07, + "grad_norm": 9.610019652564965, + "learning_rate": 9.999146628808362e-06, + "loss": 1.175, + "step": 501 + }, + { + "epoch": 0.07, + "grad_norm": 10.344666755399041, + "learning_rate": 9.999125161815369e-06, + "loss": 1.2404, + "step": 502 + }, + { + "epoch": 0.07, + "grad_norm": 14.970838347032394, + "learning_rate": 9.999103428182953e-06, + "loss": 1.1806, + "step": 503 + }, + { + "epoch": 0.07, + "grad_norm": 10.424702481767163, + "learning_rate": 9.999081427912275e-06, + "loss": 1.1838, + "step": 504 + }, + { + "epoch": 0.07, + "grad_norm": 9.645063059380105, + "learning_rate": 9.999059161004509e-06, + "loss": 1.2418, + "step": 505 + }, + { + "epoch": 0.07, + "grad_norm": 9.375009155908476, + "learning_rate": 9.999036627460844e-06, + "loss": 1.2006, + "step": 506 + }, + { + "epoch": 0.07, + "grad_norm": 9.195530639125787, + "learning_rate": 9.999013827282478e-06, + "loss": 1.1784, + 
"step": 507 + }, + { + "epoch": 0.07, + "grad_norm": 13.13218021968538, + "learning_rate": 9.998990760470628e-06, + "loss": 1.2012, + "step": 508 + }, + { + "epoch": 0.07, + "grad_norm": 15.600143803915214, + "learning_rate": 9.998967427026527e-06, + "loss": 1.1471, + "step": 509 + }, + { + "epoch": 0.07, + "grad_norm": 10.633992523934245, + "learning_rate": 9.99894382695142e-06, + "loss": 1.176, + "step": 510 + }, + { + "epoch": 0.07, + "grad_norm": 9.88537686063166, + "learning_rate": 9.998919960246563e-06, + "loss": 1.2505, + "step": 511 + }, + { + "epoch": 0.07, + "grad_norm": 14.253374310779085, + "learning_rate": 9.99889582691323e-06, + "loss": 1.297, + "step": 512 + }, + { + "epoch": 0.07, + "grad_norm": 10.031116349683902, + "learning_rate": 9.998871426952706e-06, + "loss": 1.2743, + "step": 513 + }, + { + "epoch": 0.07, + "grad_norm": 11.258025612135967, + "learning_rate": 9.998846760366297e-06, + "loss": 1.2517, + "step": 514 + }, + { + "epoch": 0.07, + "grad_norm": 12.827395900433766, + "learning_rate": 9.998821827155316e-06, + "loss": 1.1098, + "step": 515 + }, + { + "epoch": 0.07, + "grad_norm": 14.72128945147374, + "learning_rate": 9.998796627321092e-06, + "loss": 1.1813, + "step": 516 + }, + { + "epoch": 0.07, + "grad_norm": 10.768508816998478, + "learning_rate": 9.998771160864971e-06, + "loss": 1.243, + "step": 517 + }, + { + "epoch": 0.07, + "grad_norm": 10.439191663135716, + "learning_rate": 9.99874542778831e-06, + "loss": 1.1847, + "step": 518 + }, + { + "epoch": 0.07, + "grad_norm": 8.192006611423949, + "learning_rate": 9.998719428092481e-06, + "loss": 1.2118, + "step": 519 + }, + { + "epoch": 0.07, + "grad_norm": 11.47252293622274, + "learning_rate": 9.998693161778874e-06, + "loss": 1.2865, + "step": 520 + }, + { + "epoch": 0.07, + "grad_norm": 9.980237092751466, + "learning_rate": 9.998666628848885e-06, + "loss": 1.2082, + "step": 521 + }, + { + "epoch": 0.07, + "grad_norm": 9.686362862770345, + "learning_rate": 9.998639829303934e-06, + 
"loss": 1.1773, + "step": 522 + }, + { + "epoch": 0.07, + "grad_norm": 10.790322417048934, + "learning_rate": 9.998612763145448e-06, + "loss": 1.2417, + "step": 523 + }, + { + "epoch": 0.07, + "grad_norm": 14.477207781429353, + "learning_rate": 9.998585430374869e-06, + "loss": 1.2694, + "step": 524 + }, + { + "epoch": 0.07, + "grad_norm": 12.076540332476485, + "learning_rate": 9.998557830993659e-06, + "loss": 1.2366, + "step": 525 + }, + { + "epoch": 0.08, + "grad_norm": 10.34205277098085, + "learning_rate": 9.998529965003287e-06, + "loss": 1.2199, + "step": 526 + }, + { + "epoch": 0.08, + "grad_norm": 11.996242909118816, + "learning_rate": 9.99850183240524e-06, + "loss": 1.2483, + "step": 527 + }, + { + "epoch": 0.08, + "grad_norm": 8.579689235681705, + "learning_rate": 9.998473433201018e-06, + "loss": 1.2548, + "step": 528 + }, + { + "epoch": 0.08, + "grad_norm": 11.14492966875259, + "learning_rate": 9.998444767392139e-06, + "loss": 1.2583, + "step": 529 + }, + { + "epoch": 0.08, + "grad_norm": 15.897319743106433, + "learning_rate": 9.99841583498013e-06, + "loss": 1.2147, + "step": 530 + }, + { + "epoch": 0.08, + "grad_norm": 11.809591668904266, + "learning_rate": 9.99838663596653e-06, + "loss": 1.1784, + "step": 531 + }, + { + "epoch": 0.08, + "grad_norm": 8.883436933550614, + "learning_rate": 9.998357170352901e-06, + "loss": 1.2125, + "step": 532 + }, + { + "epoch": 0.08, + "grad_norm": 10.323763719473009, + "learning_rate": 9.998327438140814e-06, + "loss": 1.1833, + "step": 533 + }, + { + "epoch": 0.08, + "grad_norm": 8.092151339267021, + "learning_rate": 9.998297439331855e-06, + "loss": 1.1814, + "step": 534 + }, + { + "epoch": 0.08, + "grad_norm": 9.757081788486584, + "learning_rate": 9.998267173927623e-06, + "loss": 1.2142, + "step": 535 + }, + { + "epoch": 0.08, + "grad_norm": 8.18174992073368, + "learning_rate": 9.998236641929732e-06, + "loss": 1.2467, + "step": 536 + }, + { + "epoch": 0.08, + "grad_norm": 8.748245206841284, + "learning_rate": 
9.998205843339811e-06, + "loss": 1.1944, + "step": 537 + }, + { + "epoch": 0.08, + "grad_norm": 9.165777271582032, + "learning_rate": 9.998174778159505e-06, + "loss": 1.2978, + "step": 538 + }, + { + "epoch": 0.08, + "grad_norm": 14.685530095724042, + "learning_rate": 9.998143446390467e-06, + "loss": 1.2532, + "step": 539 + }, + { + "epoch": 0.08, + "grad_norm": 15.049258518260823, + "learning_rate": 9.99811184803437e-06, + "loss": 1.2387, + "step": 540 + }, + { + "epoch": 0.08, + "grad_norm": 9.893613630458242, + "learning_rate": 9.998079983092899e-06, + "loss": 1.2168, + "step": 541 + }, + { + "epoch": 0.08, + "grad_norm": 7.096495334320435, + "learning_rate": 9.998047851567757e-06, + "loss": 1.2341, + "step": 542 + }, + { + "epoch": 0.08, + "grad_norm": 12.783504312414417, + "learning_rate": 9.998015453460651e-06, + "loss": 1.1231, + "step": 543 + }, + { + "epoch": 0.08, + "grad_norm": 10.811476538610647, + "learning_rate": 9.997982788773315e-06, + "loss": 1.1472, + "step": 544 + }, + { + "epoch": 0.08, + "grad_norm": 13.94778684203721, + "learning_rate": 9.997949857507487e-06, + "loss": 1.1417, + "step": 545 + }, + { + "epoch": 0.08, + "grad_norm": 12.68379366696013, + "learning_rate": 9.997916659664928e-06, + "loss": 1.2673, + "step": 546 + }, + { + "epoch": 0.08, + "grad_norm": 9.992715590840627, + "learning_rate": 9.997883195247405e-06, + "loss": 1.1709, + "step": 547 + }, + { + "epoch": 0.08, + "grad_norm": 9.132309691687782, + "learning_rate": 9.997849464256704e-06, + "loss": 1.1765, + "step": 548 + }, + { + "epoch": 0.08, + "grad_norm": 10.121938903376991, + "learning_rate": 9.997815466694624e-06, + "loss": 1.2115, + "step": 549 + }, + { + "epoch": 0.08, + "grad_norm": 7.1230597856207565, + "learning_rate": 9.99778120256298e-06, + "loss": 1.1928, + "step": 550 + }, + { + "epoch": 0.08, + "grad_norm": 10.744954053971872, + "learning_rate": 9.997746671863595e-06, + "loss": 1.1884, + "step": 551 + }, + { + "epoch": 0.08, + "grad_norm": 10.898291623150179, + 
"learning_rate": 9.997711874598317e-06, + "loss": 1.1834, + "step": 552 + }, + { + "epoch": 0.08, + "grad_norm": 7.2381569948650695, + "learning_rate": 9.997676810768997e-06, + "loss": 1.1586, + "step": 553 + }, + { + "epoch": 0.08, + "grad_norm": 9.238133565168845, + "learning_rate": 9.997641480377509e-06, + "loss": 1.2207, + "step": 554 + }, + { + "epoch": 0.08, + "grad_norm": 8.864240432406815, + "learning_rate": 9.997605883425732e-06, + "loss": 1.1847, + "step": 555 + }, + { + "epoch": 0.08, + "grad_norm": 13.932210964597877, + "learning_rate": 9.997570019915571e-06, + "loss": 1.1512, + "step": 556 + }, + { + "epoch": 0.08, + "grad_norm": 10.320825335965706, + "learning_rate": 9.997533889848935e-06, + "loss": 1.1833, + "step": 557 + }, + { + "epoch": 0.08, + "grad_norm": 11.999000344289522, + "learning_rate": 9.997497493227752e-06, + "loss": 1.179, + "step": 558 + }, + { + "epoch": 0.08, + "grad_norm": 10.652670181660264, + "learning_rate": 9.997460830053964e-06, + "loss": 1.1987, + "step": 559 + }, + { + "epoch": 0.08, + "grad_norm": 11.10617959594484, + "learning_rate": 9.997423900329526e-06, + "loss": 1.1645, + "step": 560 + }, + { + "epoch": 0.08, + "grad_norm": 9.745010355859252, + "learning_rate": 9.997386704056407e-06, + "loss": 1.2193, + "step": 561 + }, + { + "epoch": 0.08, + "grad_norm": 13.387707400654765, + "learning_rate": 9.997349241236592e-06, + "loss": 1.2016, + "step": 562 + }, + { + "epoch": 0.08, + "grad_norm": 10.283859035468684, + "learning_rate": 9.997311511872076e-06, + "loss": 1.1692, + "step": 563 + }, + { + "epoch": 0.08, + "grad_norm": 9.537417535659758, + "learning_rate": 9.997273515964878e-06, + "loss": 1.1341, + "step": 564 + }, + { + "epoch": 0.08, + "grad_norm": 10.123965916847826, + "learning_rate": 9.997235253517018e-06, + "loss": 1.2224, + "step": 565 + }, + { + "epoch": 0.08, + "grad_norm": 9.668149058510146, + "learning_rate": 9.99719672453054e-06, + "loss": 1.211, + "step": 566 + }, + { + "epoch": 0.08, + "grad_norm": 
10.849534685734481, + "learning_rate": 9.997157929007499e-06, + "loss": 1.1609, + "step": 567 + }, + { + "epoch": 0.08, + "grad_norm": 9.134937669408764, + "learning_rate": 9.997118866949964e-06, + "loss": 1.1779, + "step": 568 + }, + { + "epoch": 0.08, + "grad_norm": 15.478798080926, + "learning_rate": 9.997079538360017e-06, + "loss": 1.175, + "step": 569 + }, + { + "epoch": 0.08, + "grad_norm": 8.986183791795971, + "learning_rate": 9.997039943239759e-06, + "loss": 1.1192, + "step": 570 + }, + { + "epoch": 0.08, + "grad_norm": 10.462050371314469, + "learning_rate": 9.997000081591297e-06, + "loss": 1.1192, + "step": 571 + }, + { + "epoch": 0.08, + "grad_norm": 9.096961659522766, + "learning_rate": 9.996959953416762e-06, + "loss": 1.216, + "step": 572 + }, + { + "epoch": 0.08, + "grad_norm": 10.876945900232876, + "learning_rate": 9.996919558718292e-06, + "loss": 1.209, + "step": 573 + }, + { + "epoch": 0.08, + "grad_norm": 14.309026752463426, + "learning_rate": 9.996878897498041e-06, + "loss": 1.1051, + "step": 574 + }, + { + "epoch": 0.08, + "grad_norm": 10.397672916327592, + "learning_rate": 9.996837969758179e-06, + "loss": 1.2799, + "step": 575 + }, + { + "epoch": 0.08, + "grad_norm": 13.54788812734169, + "learning_rate": 9.996796775500889e-06, + "loss": 1.1058, + "step": 576 + }, + { + "epoch": 0.08, + "grad_norm": 10.537943003112165, + "learning_rate": 9.996755314728367e-06, + "loss": 1.206, + "step": 577 + }, + { + "epoch": 0.08, + "grad_norm": 13.479383093527462, + "learning_rate": 9.996713587442825e-06, + "loss": 1.1019, + "step": 578 + }, + { + "epoch": 0.08, + "grad_norm": 9.72080877915568, + "learning_rate": 9.996671593646488e-06, + "loss": 1.1512, + "step": 579 + }, + { + "epoch": 0.08, + "grad_norm": 8.307070473158431, + "learning_rate": 9.996629333341598e-06, + "loss": 1.2113, + "step": 580 + }, + { + "epoch": 0.08, + "grad_norm": 12.488668301356412, + "learning_rate": 9.996586806530407e-06, + "loss": 1.2099, + "step": 581 + }, + { + "epoch": 0.08, + 
"grad_norm": 11.418252650714141, + "learning_rate": 9.996544013215183e-06, + "loss": 1.1892, + "step": 582 + }, + { + "epoch": 0.08, + "grad_norm": 10.913263321550723, + "learning_rate": 9.996500953398212e-06, + "loss": 1.1631, + "step": 583 + }, + { + "epoch": 0.08, + "grad_norm": 9.210144151178955, + "learning_rate": 9.996457627081785e-06, + "loss": 1.1944, + "step": 584 + }, + { + "epoch": 0.08, + "grad_norm": 12.20362796899612, + "learning_rate": 9.996414034268217e-06, + "loss": 1.1355, + "step": 585 + }, + { + "epoch": 0.08, + "grad_norm": 8.919988530995854, + "learning_rate": 9.996370174959832e-06, + "loss": 1.207, + "step": 586 + }, + { + "epoch": 0.08, + "grad_norm": 11.808245815108055, + "learning_rate": 9.996326049158969e-06, + "loss": 1.1752, + "step": 587 + }, + { + "epoch": 0.08, + "grad_norm": 7.073677585494579, + "learning_rate": 9.996281656867983e-06, + "loss": 1.2358, + "step": 588 + }, + { + "epoch": 0.08, + "grad_norm": 10.657584970524626, + "learning_rate": 9.99623699808924e-06, + "loss": 1.2246, + "step": 589 + }, + { + "epoch": 0.08, + "grad_norm": 8.923469550139831, + "learning_rate": 9.996192072825122e-06, + "loss": 1.1247, + "step": 590 + }, + { + "epoch": 0.08, + "grad_norm": 10.26805911180553, + "learning_rate": 9.996146881078025e-06, + "loss": 1.1832, + "step": 591 + }, + { + "epoch": 0.08, + "grad_norm": 9.088318878804197, + "learning_rate": 9.996101422850364e-06, + "loss": 1.2314, + "step": 592 + }, + { + "epoch": 0.08, + "grad_norm": 9.942000782359365, + "learning_rate": 9.996055698144556e-06, + "loss": 1.1411, + "step": 593 + }, + { + "epoch": 0.08, + "grad_norm": 11.423230853588064, + "learning_rate": 9.996009706963046e-06, + "loss": 1.1866, + "step": 594 + }, + { + "epoch": 0.08, + "grad_norm": 9.297001619275642, + "learning_rate": 9.995963449308284e-06, + "loss": 1.2262, + "step": 595 + }, + { + "epoch": 0.08, + "grad_norm": 7.134310908645247, + "learning_rate": 9.995916925182739e-06, + "loss": 1.2422, + "step": 596 + }, + { + 
"epoch": 0.09, + "grad_norm": 9.927973467223898, + "learning_rate": 9.995870134588888e-06, + "loss": 1.2297, + "step": 597 + }, + { + "epoch": 0.09, + "grad_norm": 10.49729509251155, + "learning_rate": 9.995823077529234e-06, + "loss": 1.1819, + "step": 598 + }, + { + "epoch": 0.09, + "grad_norm": 10.378520757035595, + "learning_rate": 9.99577575400628e-06, + "loss": 1.2262, + "step": 599 + }, + { + "epoch": 0.09, + "grad_norm": 9.592879800534263, + "learning_rate": 9.995728164022556e-06, + "loss": 1.2099, + "step": 600 + }, + { + "epoch": 0.09, + "grad_norm": 7.768433293918745, + "learning_rate": 9.995680307580596e-06, + "loss": 1.1436, + "step": 601 + }, + { + "epoch": 0.09, + "grad_norm": 10.813406774677345, + "learning_rate": 9.995632184682953e-06, + "loss": 1.1475, + "step": 602 + }, + { + "epoch": 0.09, + "grad_norm": 11.512327239364824, + "learning_rate": 9.995583795332196e-06, + "loss": 1.2246, + "step": 603 + }, + { + "epoch": 0.09, + "grad_norm": 11.650767399232441, + "learning_rate": 9.995535139530904e-06, + "loss": 1.1783, + "step": 604 + }, + { + "epoch": 0.09, + "grad_norm": 12.400673040842356, + "learning_rate": 9.995486217281674e-06, + "loss": 1.1578, + "step": 605 + }, + { + "epoch": 0.09, + "grad_norm": 7.195500728003077, + "learning_rate": 9.995437028587113e-06, + "loss": 1.1915, + "step": 606 + }, + { + "epoch": 0.09, + "grad_norm": 10.184395343388324, + "learning_rate": 9.995387573449845e-06, + "loss": 1.1656, + "step": 607 + }, + { + "epoch": 0.09, + "grad_norm": 7.546654648522888, + "learning_rate": 9.995337851872511e-06, + "loss": 1.1177, + "step": 608 + }, + { + "epoch": 0.09, + "grad_norm": 11.484138559199241, + "learning_rate": 9.99528786385776e-06, + "loss": 1.2144, + "step": 609 + }, + { + "epoch": 0.09, + "grad_norm": 8.74945045755422, + "learning_rate": 9.995237609408257e-06, + "loss": 1.173, + "step": 610 + }, + { + "epoch": 0.09, + "grad_norm": 10.64880391369783, + "learning_rate": 9.995187088526686e-06, + "loss": 1.141, + "step": 
611 + }, + { + "epoch": 0.09, + "grad_norm": 10.391470662193274, + "learning_rate": 9.995136301215738e-06, + "loss": 1.0992, + "step": 612 + }, + { + "epoch": 0.09, + "grad_norm": 12.200505643163382, + "learning_rate": 9.995085247478125e-06, + "loss": 1.1619, + "step": 613 + }, + { + "epoch": 0.09, + "grad_norm": 10.553495744889252, + "learning_rate": 9.99503392731657e-06, + "loss": 1.2152, + "step": 614 + }, + { + "epoch": 0.09, + "grad_norm": 12.447519792795141, + "learning_rate": 9.994982340733808e-06, + "loss": 1.2051, + "step": 615 + }, + { + "epoch": 0.09, + "grad_norm": 14.603808172133196, + "learning_rate": 9.994930487732591e-06, + "loss": 1.1308, + "step": 616 + }, + { + "epoch": 0.09, + "grad_norm": 10.737450536516459, + "learning_rate": 9.994878368315685e-06, + "loss": 1.3214, + "step": 617 + }, + { + "epoch": 0.09, + "grad_norm": 11.99804939590362, + "learning_rate": 9.994825982485872e-06, + "loss": 1.1228, + "step": 618 + }, + { + "epoch": 0.09, + "grad_norm": 9.949505786102648, + "learning_rate": 9.994773330245945e-06, + "loss": 1.1638, + "step": 619 + }, + { + "epoch": 0.09, + "grad_norm": 12.796238485440728, + "learning_rate": 9.994720411598709e-06, + "loss": 1.1747, + "step": 620 + }, + { + "epoch": 0.09, + "grad_norm": 9.487021077023234, + "learning_rate": 9.994667226546989e-06, + "loss": 1.1555, + "step": 621 + }, + { + "epoch": 0.09, + "grad_norm": 9.779619408443107, + "learning_rate": 9.994613775093622e-06, + "loss": 1.2045, + "step": 622 + }, + { + "epoch": 0.09, + "grad_norm": 9.510472823324871, + "learning_rate": 9.99456005724146e-06, + "loss": 1.1865, + "step": 623 + }, + { + "epoch": 0.09, + "grad_norm": 10.28063434972837, + "learning_rate": 9.994506072993368e-06, + "loss": 1.2187, + "step": 624 + }, + { + "epoch": 0.09, + "grad_norm": 10.44001485670533, + "learning_rate": 9.994451822352223e-06, + "loss": 1.1241, + "step": 625 + }, + { + "epoch": 0.09, + "grad_norm": 9.614466424652951, + "learning_rate": 9.994397305320919e-06, + "loss": 
1.2286, + "step": 626 + }, + { + "epoch": 0.09, + "grad_norm": 9.624391640602136, + "learning_rate": 9.994342521902368e-06, + "loss": 1.1528, + "step": 627 + }, + { + "epoch": 0.09, + "grad_norm": 8.280528861754396, + "learning_rate": 9.994287472099484e-06, + "loss": 1.2773, + "step": 628 + }, + { + "epoch": 0.09, + "grad_norm": 13.279169360358251, + "learning_rate": 9.99423215591521e-06, + "loss": 1.2141, + "step": 629 + }, + { + "epoch": 0.09, + "grad_norm": 10.52492890827474, + "learning_rate": 9.994176573352497e-06, + "loss": 1.2044, + "step": 630 + }, + { + "epoch": 0.09, + "grad_norm": 11.181049405599115, + "learning_rate": 9.994120724414304e-06, + "loss": 1.154, + "step": 631 + }, + { + "epoch": 0.09, + "grad_norm": 10.742835160324967, + "learning_rate": 9.994064609103613e-06, + "loss": 1.1641, + "step": 632 + }, + { + "epoch": 0.09, + "grad_norm": 7.7822206199542885, + "learning_rate": 9.994008227423418e-06, + "loss": 1.1605, + "step": 633 + }, + { + "epoch": 0.09, + "grad_norm": 9.39132822650865, + "learning_rate": 9.993951579376723e-06, + "loss": 1.1574, + "step": 634 + }, + { + "epoch": 0.09, + "grad_norm": 11.939915664069888, + "learning_rate": 9.993894664966555e-06, + "loss": 1.1566, + "step": 635 + }, + { + "epoch": 0.09, + "grad_norm": 9.845005204648166, + "learning_rate": 9.993837484195942e-06, + "loss": 1.094, + "step": 636 + }, + { + "epoch": 0.09, + "grad_norm": 6.91910742848748, + "learning_rate": 9.993780037067942e-06, + "loss": 1.2838, + "step": 637 + }, + { + "epoch": 0.09, + "grad_norm": 12.04963637387935, + "learning_rate": 9.993722323585613e-06, + "loss": 1.207, + "step": 638 + }, + { + "epoch": 0.09, + "grad_norm": 11.939837018226589, + "learning_rate": 9.993664343752035e-06, + "loss": 1.1713, + "step": 639 + }, + { + "epoch": 0.09, + "grad_norm": 8.804973565967911, + "learning_rate": 9.993606097570301e-06, + "loss": 1.1448, + "step": 640 + }, + { + "epoch": 0.09, + "grad_norm": 10.458410894909422, + "learning_rate": 
9.993547585043519e-06, + "loss": 1.1423, + "step": 641 + }, + { + "epoch": 0.09, + "grad_norm": 12.899874142097017, + "learning_rate": 9.993488806174808e-06, + "loss": 1.1771, + "step": 642 + }, + { + "epoch": 0.09, + "grad_norm": 9.203869418649408, + "learning_rate": 9.993429760967303e-06, + "loss": 1.1968, + "step": 643 + }, + { + "epoch": 0.09, + "grad_norm": 14.032610066071259, + "learning_rate": 9.993370449424153e-06, + "loss": 1.1727, + "step": 644 + }, + { + "epoch": 0.09, + "grad_norm": 8.90574074951128, + "learning_rate": 9.993310871548524e-06, + "loss": 1.1857, + "step": 645 + }, + { + "epoch": 0.09, + "grad_norm": 14.99054875675555, + "learning_rate": 9.993251027343591e-06, + "loss": 1.1932, + "step": 646 + }, + { + "epoch": 0.09, + "grad_norm": 10.904801218426211, + "learning_rate": 9.993190916812547e-06, + "loss": 1.095, + "step": 647 + }, + { + "epoch": 0.09, + "grad_norm": 12.037214516176727, + "learning_rate": 9.993130539958599e-06, + "loss": 1.2196, + "step": 648 + }, + { + "epoch": 0.09, + "grad_norm": 10.737651092029482, + "learning_rate": 9.993069896784967e-06, + "loss": 1.2211, + "step": 649 + }, + { + "epoch": 0.09, + "grad_norm": 10.7225290080776, + "learning_rate": 9.993008987294884e-06, + "loss": 1.1426, + "step": 650 + }, + { + "epoch": 0.09, + "grad_norm": 11.385124160869497, + "learning_rate": 9.992947811491598e-06, + "loss": 1.098, + "step": 651 + }, + { + "epoch": 0.09, + "grad_norm": 9.684889018911187, + "learning_rate": 9.992886369378377e-06, + "loss": 1.0502, + "step": 652 + }, + { + "epoch": 0.09, + "grad_norm": 9.508374751345716, + "learning_rate": 9.992824660958492e-06, + "loss": 1.172, + "step": 653 + }, + { + "epoch": 0.09, + "grad_norm": 10.99107211716205, + "learning_rate": 9.99276268623524e-06, + "loss": 1.184, + "step": 654 + }, + { + "epoch": 0.09, + "grad_norm": 12.222298714252494, + "learning_rate": 9.992700445211922e-06, + "loss": 1.314, + "step": 655 + }, + { + "epoch": 0.09, + "grad_norm": 10.980324645262623, + 
"learning_rate": 9.992637937891858e-06, + "loss": 1.1327, + "step": 656 + }, + { + "epoch": 0.09, + "grad_norm": 11.076841261016707, + "learning_rate": 9.992575164278386e-06, + "loss": 1.2181, + "step": 657 + }, + { + "epoch": 0.09, + "grad_norm": 11.564544232409354, + "learning_rate": 9.992512124374851e-06, + "loss": 1.1913, + "step": 658 + }, + { + "epoch": 0.09, + "grad_norm": 11.065661360301775, + "learning_rate": 9.992448818184617e-06, + "loss": 1.1732, + "step": 659 + }, + { + "epoch": 0.09, + "grad_norm": 9.908563402954986, + "learning_rate": 9.992385245711058e-06, + "loss": 1.1305, + "step": 660 + }, + { + "epoch": 0.09, + "grad_norm": 14.050453102204328, + "learning_rate": 9.992321406957567e-06, + "loss": 1.2426, + "step": 661 + }, + { + "epoch": 0.09, + "grad_norm": 9.723464318496807, + "learning_rate": 9.992257301927549e-06, + "loss": 1.1629, + "step": 662 + }, + { + "epoch": 0.09, + "grad_norm": 12.973096178456021, + "learning_rate": 9.992192930624422e-06, + "loss": 1.1289, + "step": 663 + }, + { + "epoch": 0.09, + "grad_norm": 11.268105175082662, + "learning_rate": 9.99212829305162e-06, + "loss": 1.2266, + "step": 664 + }, + { + "epoch": 0.09, + "grad_norm": 9.625955932280544, + "learning_rate": 9.992063389212589e-06, + "loss": 1.2018, + "step": 665 + }, + { + "epoch": 0.09, + "grad_norm": 13.316635001045418, + "learning_rate": 9.991998219110794e-06, + "loss": 1.1331, + "step": 666 + }, + { + "epoch": 0.1, + "grad_norm": 8.725615980243889, + "learning_rate": 9.991932782749709e-06, + "loss": 1.1646, + "step": 667 + }, + { + "epoch": 0.1, + "grad_norm": 10.660949988570708, + "learning_rate": 9.991867080132825e-06, + "loss": 1.1692, + "step": 668 + }, + { + "epoch": 0.1, + "grad_norm": 10.357800704297247, + "learning_rate": 9.991801111263647e-06, + "loss": 1.2388, + "step": 669 + }, + { + "epoch": 0.1, + "grad_norm": 11.195015067878483, + "learning_rate": 9.991734876145689e-06, + "loss": 1.2017, + "step": 670 + }, + { + "epoch": 0.1, + "grad_norm": 
7.665725390466289, + "learning_rate": 9.991668374782491e-06, + "loss": 1.203, + "step": 671 + }, + { + "epoch": 0.1, + "grad_norm": 7.685516493733012, + "learning_rate": 9.991601607177593e-06, + "loss": 1.1794, + "step": 672 + }, + { + "epoch": 0.1, + "grad_norm": 9.650514764647543, + "learning_rate": 9.991534573334561e-06, + "loss": 1.1237, + "step": 673 + }, + { + "epoch": 0.1, + "grad_norm": 9.114784874505075, + "learning_rate": 9.99146727325697e-06, + "loss": 1.1412, + "step": 674 + }, + { + "epoch": 0.1, + "grad_norm": 9.896846960933422, + "learning_rate": 9.991399706948405e-06, + "loss": 1.1777, + "step": 675 + }, + { + "epoch": 0.1, + "grad_norm": 8.991288979166422, + "learning_rate": 9.991331874412478e-06, + "loss": 1.091, + "step": 676 + }, + { + "epoch": 0.1, + "grad_norm": 9.709791386054693, + "learning_rate": 9.991263775652799e-06, + "loss": 1.1662, + "step": 677 + }, + { + "epoch": 0.1, + "grad_norm": 8.912361785281098, + "learning_rate": 9.991195410673004e-06, + "loss": 1.1282, + "step": 678 + }, + { + "epoch": 0.1, + "grad_norm": 8.38670380850144, + "learning_rate": 9.991126779476741e-06, + "loss": 1.1074, + "step": 679 + }, + { + "epoch": 0.1, + "grad_norm": 10.27803404182293, + "learning_rate": 9.991057882067665e-06, + "loss": 1.1193, + "step": 680 + }, + { + "epoch": 0.1, + "grad_norm": 8.689564244187798, + "learning_rate": 9.990988718449457e-06, + "loss": 1.1903, + "step": 681 + }, + { + "epoch": 0.1, + "grad_norm": 8.229842204990936, + "learning_rate": 9.990919288625802e-06, + "loss": 1.1801, + "step": 682 + }, + { + "epoch": 0.1, + "grad_norm": 7.540583680778348, + "learning_rate": 9.990849592600406e-06, + "loss": 1.1862, + "step": 683 + }, + { + "epoch": 0.1, + "grad_norm": 12.707978090432553, + "learning_rate": 9.990779630376983e-06, + "loss": 1.1539, + "step": 684 + }, + { + "epoch": 0.1, + "grad_norm": 14.96106997165472, + "learning_rate": 9.990709401959267e-06, + "loss": 1.1936, + "step": 685 + }, + { + "epoch": 0.1, + "grad_norm": 
10.669871824541621, + "learning_rate": 9.990638907351003e-06, + "loss": 1.088, + "step": 686 + }, + { + "epoch": 0.1, + "grad_norm": 12.500804354173297, + "learning_rate": 9.990568146555953e-06, + "loss": 1.1072, + "step": 687 + }, + { + "epoch": 0.1, + "grad_norm": 7.221593977704549, + "learning_rate": 9.990497119577886e-06, + "loss": 1.1454, + "step": 688 + }, + { + "epoch": 0.1, + "grad_norm": 8.308896694366732, + "learning_rate": 9.990425826420597e-06, + "loss": 1.1325, + "step": 689 + }, + { + "epoch": 0.1, + "grad_norm": 9.642515955495039, + "learning_rate": 9.990354267087885e-06, + "loss": 1.2245, + "step": 690 + }, + { + "epoch": 0.1, + "grad_norm": 6.6132220969135975, + "learning_rate": 9.990282441583565e-06, + "loss": 1.1096, + "step": 691 + }, + { + "epoch": 0.1, + "grad_norm": 10.449193380449818, + "learning_rate": 9.990210349911472e-06, + "loss": 1.2307, + "step": 692 + }, + { + "epoch": 0.1, + "grad_norm": 9.731469709975404, + "learning_rate": 9.990137992075448e-06, + "loss": 1.1548, + "step": 693 + }, + { + "epoch": 0.1, + "grad_norm": 7.593475943667629, + "learning_rate": 9.990065368079355e-06, + "loss": 1.1605, + "step": 694 + }, + { + "epoch": 0.1, + "grad_norm": 15.3196974025508, + "learning_rate": 9.989992477927064e-06, + "loss": 1.1631, + "step": 695 + }, + { + "epoch": 0.1, + "grad_norm": 9.868849043534754, + "learning_rate": 9.989919321622465e-06, + "loss": 1.1666, + "step": 696 + }, + { + "epoch": 0.1, + "grad_norm": 8.965328657251852, + "learning_rate": 9.989845899169458e-06, + "loss": 1.2465, + "step": 697 + }, + { + "epoch": 0.1, + "grad_norm": 8.562747283293042, + "learning_rate": 9.989772210571963e-06, + "loss": 1.2337, + "step": 698 + }, + { + "epoch": 0.1, + "grad_norm": 9.0462453304089, + "learning_rate": 9.989698255833903e-06, + "loss": 1.2148, + "step": 699 + }, + { + "epoch": 0.1, + "grad_norm": 12.250736067166674, + "learning_rate": 9.989624034959229e-06, + "loss": 1.1536, + "step": 700 + }, + { + "epoch": 0.1, + "grad_norm": 
9.480678080908262, + "learning_rate": 9.989549547951897e-06, + "loss": 1.1688, + "step": 701 + }, + { + "epoch": 0.1, + "grad_norm": 7.414656391497006, + "learning_rate": 9.98947479481588e-06, + "loss": 1.1692, + "step": 702 + }, + { + "epoch": 0.1, + "grad_norm": 12.132195384868012, + "learning_rate": 9.98939977555517e-06, + "loss": 1.1328, + "step": 703 + }, + { + "epoch": 0.1, + "grad_norm": 10.51327267019025, + "learning_rate": 9.98932449017376e-06, + "loss": 1.1986, + "step": 704 + }, + { + "epoch": 0.1, + "grad_norm": 10.864620384804471, + "learning_rate": 9.98924893867567e-06, + "loss": 1.1855, + "step": 705 + }, + { + "epoch": 0.1, + "grad_norm": 12.078490450129037, + "learning_rate": 9.98917312106493e-06, + "loss": 1.1552, + "step": 706 + }, + { + "epoch": 0.1, + "grad_norm": 10.89251691098606, + "learning_rate": 9.989097037345583e-06, + "loss": 1.1594, + "step": 707 + }, + { + "epoch": 0.1, + "grad_norm": 8.324998997019504, + "learning_rate": 9.989020687521686e-06, + "loss": 1.1296, + "step": 708 + }, + { + "epoch": 0.1, + "grad_norm": 8.902301128542724, + "learning_rate": 9.988944071597315e-06, + "loss": 1.1751, + "step": 709 + }, + { + "epoch": 0.1, + "grad_norm": 11.049041829681638, + "learning_rate": 9.988867189576553e-06, + "loss": 1.178, + "step": 710 + }, + { + "epoch": 0.1, + "grad_norm": 9.461983703796381, + "learning_rate": 9.988790041463502e-06, + "loss": 1.2231, + "step": 711 + }, + { + "epoch": 0.1, + "grad_norm": 10.17485265625425, + "learning_rate": 9.988712627262276e-06, + "loss": 1.1555, + "step": 712 + }, + { + "epoch": 0.1, + "grad_norm": 10.156336175118557, + "learning_rate": 9.988634946977005e-06, + "loss": 1.1764, + "step": 713 + }, + { + "epoch": 0.1, + "grad_norm": 9.771961684092881, + "learning_rate": 9.98855700061183e-06, + "loss": 1.1898, + "step": 714 + }, + { + "epoch": 0.1, + "grad_norm": 12.32104402185865, + "learning_rate": 9.988478788170912e-06, + "loss": 1.0701, + "step": 715 + }, + { + "epoch": 0.1, + "grad_norm": 
12.6159306262863, + "learning_rate": 9.988400309658423e-06, + "loss": 1.1711, + "step": 716 + }, + { + "epoch": 0.1, + "grad_norm": 12.481621950386716, + "learning_rate": 9.988321565078545e-06, + "loss": 1.2382, + "step": 717 + }, + { + "epoch": 0.1, + "grad_norm": 15.162145644275473, + "learning_rate": 9.988242554435479e-06, + "loss": 1.1311, + "step": 718 + }, + { + "epoch": 0.1, + "grad_norm": 10.567466030270415, + "learning_rate": 9.98816327773344e-06, + "loss": 1.2364, + "step": 719 + }, + { + "epoch": 0.1, + "grad_norm": 9.704739723289759, + "learning_rate": 9.988083734976657e-06, + "loss": 1.1606, + "step": 720 + }, + { + "epoch": 0.1, + "grad_norm": 9.435793870108572, + "learning_rate": 9.988003926169374e-06, + "loss": 1.1346, + "step": 721 + }, + { + "epoch": 0.1, + "grad_norm": 10.145319704555805, + "learning_rate": 9.987923851315843e-06, + "loss": 1.1394, + "step": 722 + }, + { + "epoch": 0.1, + "grad_norm": 7.9049656702851845, + "learning_rate": 9.987843510420338e-06, + "loss": 1.1881, + "step": 723 + }, + { + "epoch": 0.1, + "grad_norm": 9.144894345325051, + "learning_rate": 9.987762903487145e-06, + "loss": 1.1386, + "step": 724 + }, + { + "epoch": 0.1, + "grad_norm": 10.270829825487052, + "learning_rate": 9.98768203052056e-06, + "loss": 1.2803, + "step": 725 + }, + { + "epoch": 0.1, + "grad_norm": 8.771089294625202, + "learning_rate": 9.987600891524902e-06, + "loss": 1.1552, + "step": 726 + }, + { + "epoch": 0.1, + "grad_norm": 10.964201637858874, + "learning_rate": 9.987519486504494e-06, + "loss": 1.1343, + "step": 727 + }, + { + "epoch": 0.1, + "grad_norm": 10.130270202490514, + "learning_rate": 9.98743781546368e-06, + "loss": 1.149, + "step": 728 + }, + { + "epoch": 0.1, + "grad_norm": 12.324842404823224, + "learning_rate": 9.987355878406814e-06, + "loss": 1.086, + "step": 729 + }, + { + "epoch": 0.1, + "grad_norm": 6.776930385684051, + "learning_rate": 9.987273675338269e-06, + "loss": 1.1478, + "step": 730 + }, + { + "epoch": 0.1, + "grad_norm": 
11.601411918082745, + "learning_rate": 9.987191206262428e-06, + "loss": 1.2157, + "step": 731 + }, + { + "epoch": 0.1, + "grad_norm": 11.459702424353637, + "learning_rate": 9.987108471183691e-06, + "loss": 1.1646, + "step": 732 + }, + { + "epoch": 0.1, + "grad_norm": 8.554459338283614, + "learning_rate": 9.987025470106468e-06, + "loss": 1.1273, + "step": 733 + }, + { + "epoch": 0.1, + "grad_norm": 9.394372154221319, + "learning_rate": 9.98694220303519e-06, + "loss": 1.1106, + "step": 734 + }, + { + "epoch": 0.1, + "grad_norm": 7.380248486178393, + "learning_rate": 9.986858669974295e-06, + "loss": 1.1518, + "step": 735 + }, + { + "epoch": 0.1, + "grad_norm": 8.082462016599694, + "learning_rate": 9.98677487092824e-06, + "loss": 1.1374, + "step": 736 + }, + { + "epoch": 0.11, + "grad_norm": 7.961844279681643, + "learning_rate": 9.986690805901493e-06, + "loss": 1.205, + "step": 737 + }, + { + "epoch": 0.11, + "grad_norm": 12.73003522525729, + "learning_rate": 9.98660647489854e-06, + "loss": 1.1832, + "step": 738 + }, + { + "epoch": 0.11, + "grad_norm": 11.386711369460137, + "learning_rate": 9.986521877923879e-06, + "loss": 1.1631, + "step": 739 + }, + { + "epoch": 0.11, + "grad_norm": 9.34870161860221, + "learning_rate": 9.98643701498202e-06, + "loss": 1.1785, + "step": 740 + }, + { + "epoch": 0.11, + "grad_norm": 8.276976135703304, + "learning_rate": 9.98635188607749e-06, + "loss": 1.1289, + "step": 741 + }, + { + "epoch": 0.11, + "grad_norm": 10.412931896219577, + "learning_rate": 9.98626649121483e-06, + "loss": 1.1216, + "step": 742 + }, + { + "epoch": 0.11, + "grad_norm": 10.035427898581107, + "learning_rate": 9.986180830398595e-06, + "loss": 1.0702, + "step": 743 + }, + { + "epoch": 0.11, + "grad_norm": 10.59364500710674, + "learning_rate": 9.986094903633355e-06, + "loss": 1.1819, + "step": 744 + }, + { + "epoch": 0.11, + "grad_norm": 9.537118560775655, + "learning_rate": 9.98600871092369e-06, + "loss": 1.2184, + "step": 745 + }, + { + "epoch": 0.11, + 
"grad_norm": 11.207352367776009, + "learning_rate": 9.985922252274198e-06, + "loss": 1.1293, + "step": 746 + }, + { + "epoch": 0.11, + "grad_norm": 12.329689815410417, + "learning_rate": 9.985835527689494e-06, + "loss": 1.1518, + "step": 747 + }, + { + "epoch": 0.11, + "grad_norm": 8.85332182569387, + "learning_rate": 9.9857485371742e-06, + "loss": 1.2202, + "step": 748 + }, + { + "epoch": 0.11, + "grad_norm": 9.383155702652582, + "learning_rate": 9.985661280732956e-06, + "loss": 1.2236, + "step": 749 + }, + { + "epoch": 0.11, + "grad_norm": 10.148162105683651, + "learning_rate": 9.98557375837042e-06, + "loss": 1.0761, + "step": 750 + }, + { + "epoch": 0.11, + "grad_norm": 9.255382135089567, + "learning_rate": 9.985485970091253e-06, + "loss": 1.2112, + "step": 751 + }, + { + "epoch": 0.11, + "grad_norm": 10.178005320691621, + "learning_rate": 9.985397915900144e-06, + "loss": 1.144, + "step": 752 + }, + { + "epoch": 0.11, + "grad_norm": 9.982631405261909, + "learning_rate": 9.985309595801788e-06, + "loss": 1.1694, + "step": 753 + }, + { + "epoch": 0.11, + "grad_norm": 8.916621518269412, + "learning_rate": 9.985221009800894e-06, + "loss": 1.146, + "step": 754 + }, + { + "epoch": 0.11, + "grad_norm": 10.163941504565434, + "learning_rate": 9.985132157902186e-06, + "loss": 1.145, + "step": 755 + }, + { + "epoch": 0.11, + "grad_norm": 9.529991168980182, + "learning_rate": 9.985043040110408e-06, + "loss": 1.1571, + "step": 756 + }, + { + "epoch": 0.11, + "grad_norm": 8.392645665557373, + "learning_rate": 9.984953656430307e-06, + "loss": 1.2357, + "step": 757 + }, + { + "epoch": 0.11, + "grad_norm": 8.213995298212957, + "learning_rate": 9.984864006866655e-06, + "loss": 1.1935, + "step": 758 + }, + { + "epoch": 0.11, + "grad_norm": 8.030707818763284, + "learning_rate": 9.984774091424234e-06, + "loss": 1.1562, + "step": 759 + }, + { + "epoch": 0.11, + "grad_norm": 8.725870800691753, + "learning_rate": 9.984683910107836e-06, + "loss": 1.2525, + "step": 760 + }, + { + "epoch": 
0.11, + "grad_norm": 10.759673323452208, + "learning_rate": 9.984593462922274e-06, + "loss": 1.1174, + "step": 761 + }, + { + "epoch": 0.11, + "grad_norm": 9.823817374509218, + "learning_rate": 9.98450274987237e-06, + "loss": 1.1764, + "step": 762 + }, + { + "epoch": 0.11, + "grad_norm": 9.165188893075076, + "learning_rate": 9.984411770962967e-06, + "loss": 1.0274, + "step": 763 + }, + { + "epoch": 0.11, + "grad_norm": 7.937169831389869, + "learning_rate": 9.984320526198912e-06, + "loss": 1.2211, + "step": 764 + }, + { + "epoch": 0.11, + "grad_norm": 14.376914978001027, + "learning_rate": 9.984229015585074e-06, + "loss": 1.1667, + "step": 765 + }, + { + "epoch": 0.11, + "grad_norm": 8.680373520124277, + "learning_rate": 9.984137239126335e-06, + "loss": 1.1644, + "step": 766 + }, + { + "epoch": 0.11, + "grad_norm": 11.602960198360268, + "learning_rate": 9.98404519682759e-06, + "loss": 1.087, + "step": 767 + }, + { + "epoch": 0.11, + "grad_norm": 8.905967976739671, + "learning_rate": 9.983952888693744e-06, + "loss": 1.1574, + "step": 768 + }, + { + "epoch": 0.11, + "grad_norm": 12.438843541508476, + "learning_rate": 9.983860314729728e-06, + "loss": 1.1093, + "step": 769 + }, + { + "epoch": 0.11, + "grad_norm": 9.642007734997385, + "learning_rate": 9.983767474940473e-06, + "loss": 1.159, + "step": 770 + }, + { + "epoch": 0.11, + "grad_norm": 8.0478117660995, + "learning_rate": 9.983674369330934e-06, + "loss": 1.2436, + "step": 771 + }, + { + "epoch": 0.11, + "grad_norm": 8.914670815042081, + "learning_rate": 9.983580997906075e-06, + "loss": 1.1004, + "step": 772 + }, + { + "epoch": 0.11, + "grad_norm": 9.934646089653034, + "learning_rate": 9.983487360670879e-06, + "loss": 1.0736, + "step": 773 + }, + { + "epoch": 0.11, + "grad_norm": 9.206062697900476, + "learning_rate": 9.983393457630337e-06, + "loss": 1.1148, + "step": 774 + }, + { + "epoch": 0.11, + "grad_norm": 7.222605151617985, + "learning_rate": 9.983299288789462e-06, + "loss": 1.119, + "step": 775 + }, + { + 
"epoch": 0.11, + "grad_norm": 9.324754640448496, + "learning_rate": 9.983204854153273e-06, + "loss": 1.1653, + "step": 776 + }, + { + "epoch": 0.11, + "grad_norm": 10.676296210353804, + "learning_rate": 9.983110153726807e-06, + "loss": 1.1075, + "step": 777 + }, + { + "epoch": 0.11, + "grad_norm": 8.650074538768019, + "learning_rate": 9.983015187515116e-06, + "loss": 1.2313, + "step": 778 + }, + { + "epoch": 0.11, + "grad_norm": 11.21292751189824, + "learning_rate": 9.982919955523266e-06, + "loss": 1.1668, + "step": 779 + }, + { + "epoch": 0.11, + "grad_norm": 7.939125987830315, + "learning_rate": 9.982824457756335e-06, + "loss": 1.1509, + "step": 780 + }, + { + "epoch": 0.11, + "grad_norm": 7.719155687222313, + "learning_rate": 9.982728694219418e-06, + "loss": 1.1537, + "step": 781 + }, + { + "epoch": 0.11, + "grad_norm": 9.982967699588944, + "learning_rate": 9.982632664917622e-06, + "loss": 1.1428, + "step": 782 + }, + { + "epoch": 0.11, + "grad_norm": 10.043153445922687, + "learning_rate": 9.982536369856069e-06, + "loss": 1.2309, + "step": 783 + }, + { + "epoch": 0.11, + "grad_norm": 9.576748174027196, + "learning_rate": 9.982439809039894e-06, + "loss": 1.0782, + "step": 784 + }, + { + "epoch": 0.11, + "grad_norm": 11.413340519598375, + "learning_rate": 9.982342982474248e-06, + "loss": 1.1281, + "step": 785 + }, + { + "epoch": 0.11, + "grad_norm": 8.073938640985448, + "learning_rate": 9.982245890164297e-06, + "loss": 1.1285, + "step": 786 + }, + { + "epoch": 0.11, + "grad_norm": 9.015529262726885, + "learning_rate": 9.982148532115218e-06, + "loss": 1.2302, + "step": 787 + }, + { + "epoch": 0.11, + "grad_norm": 9.773912295208172, + "learning_rate": 9.982050908332204e-06, + "loss": 1.1422, + "step": 788 + }, + { + "epoch": 0.11, + "grad_norm": 9.553642727335077, + "learning_rate": 9.981953018820462e-06, + "loss": 1.1417, + "step": 789 + }, + { + "epoch": 0.11, + "grad_norm": 10.116006205919206, + "learning_rate": 9.981854863585214e-06, + "loss": 1.1695, + "step": 
790 + }, + { + "epoch": 0.11, + "grad_norm": 8.195818551731474, + "learning_rate": 9.981756442631694e-06, + "loss": 1.1078, + "step": 791 + }, + { + "epoch": 0.11, + "grad_norm": 9.549920035560003, + "learning_rate": 9.981657755965152e-06, + "loss": 1.121, + "step": 792 + }, + { + "epoch": 0.11, + "grad_norm": 7.894072999892381, + "learning_rate": 9.981558803590855e-06, + "loss": 1.0891, + "step": 793 + }, + { + "epoch": 0.11, + "grad_norm": 13.613688090602672, + "learning_rate": 9.981459585514073e-06, + "loss": 1.15, + "step": 794 + }, + { + "epoch": 0.11, + "grad_norm": 9.64058708414418, + "learning_rate": 9.981360101740105e-06, + "loss": 1.2024, + "step": 795 + }, + { + "epoch": 0.11, + "grad_norm": 8.43763513563086, + "learning_rate": 9.981260352274255e-06, + "loss": 1.0459, + "step": 796 + }, + { + "epoch": 0.11, + "grad_norm": 9.544501997608947, + "learning_rate": 9.981160337121843e-06, + "loss": 1.2033, + "step": 797 + }, + { + "epoch": 0.11, + "grad_norm": 8.08545698639881, + "learning_rate": 9.981060056288203e-06, + "loss": 1.1497, + "step": 798 + }, + { + "epoch": 0.11, + "grad_norm": 12.674917307273349, + "learning_rate": 9.980959509778684e-06, + "loss": 1.1659, + "step": 799 + }, + { + "epoch": 0.11, + "grad_norm": 11.373016275937408, + "learning_rate": 9.980858697598652e-06, + "loss": 1.1583, + "step": 800 + }, + { + "epoch": 0.11, + "grad_norm": 8.246265548547193, + "learning_rate": 9.980757619753479e-06, + "loss": 1.1037, + "step": 801 + }, + { + "epoch": 0.11, + "grad_norm": 9.507534813271349, + "learning_rate": 9.98065627624856e-06, + "loss": 1.1907, + "step": 802 + }, + { + "epoch": 0.11, + "grad_norm": 10.591363853077237, + "learning_rate": 9.980554667089298e-06, + "loss": 1.0711, + "step": 803 + }, + { + "epoch": 0.11, + "grad_norm": 11.21359476443993, + "learning_rate": 9.980452792281114e-06, + "loss": 1.143, + "step": 804 + }, + { + "epoch": 0.11, + "grad_norm": 8.119461207636915, + "learning_rate": 9.980350651829441e-06, + "loss": 1.1583, + 
"step": 805 + }, + { + "epoch": 0.11, + "grad_norm": 9.796668173288055, + "learning_rate": 9.980248245739728e-06, + "loss": 1.1795, + "step": 806 + }, + { + "epoch": 0.12, + "grad_norm": 11.283693511733778, + "learning_rate": 9.980145574017434e-06, + "loss": 1.215, + "step": 807 + }, + { + "epoch": 0.12, + "grad_norm": 7.8224139961403205, + "learning_rate": 9.98004263666804e-06, + "loss": 1.1274, + "step": 808 + }, + { + "epoch": 0.12, + "grad_norm": 8.153166945979198, + "learning_rate": 9.979939433697033e-06, + "loss": 1.0804, + "step": 809 + }, + { + "epoch": 0.12, + "grad_norm": 9.457241762608476, + "learning_rate": 9.979835965109919e-06, + "loss": 1.1677, + "step": 810 + }, + { + "epoch": 0.12, + "grad_norm": 9.816496594867994, + "learning_rate": 9.979732230912213e-06, + "loss": 1.1433, + "step": 811 + }, + { + "epoch": 0.12, + "grad_norm": 7.009364621563408, + "learning_rate": 9.979628231109453e-06, + "loss": 1.125, + "step": 812 + }, + { + "epoch": 0.12, + "grad_norm": 9.39403678801367, + "learning_rate": 9.979523965707186e-06, + "loss": 1.1296, + "step": 813 + }, + { + "epoch": 0.12, + "grad_norm": 10.304699725751561, + "learning_rate": 9.979419434710968e-06, + "loss": 1.1401, + "step": 814 + }, + { + "epoch": 0.12, + "grad_norm": 11.193184048389552, + "learning_rate": 9.97931463812638e-06, + "loss": 1.1119, + "step": 815 + }, + { + "epoch": 0.12, + "grad_norm": 8.556432122092557, + "learning_rate": 9.979209575959008e-06, + "loss": 1.221, + "step": 816 + }, + { + "epoch": 0.12, + "grad_norm": 10.533330207342924, + "learning_rate": 9.979104248214457e-06, + "loss": 1.2047, + "step": 817 + }, + { + "epoch": 0.12, + "grad_norm": 8.253706521375046, + "learning_rate": 9.978998654898345e-06, + "loss": 1.1448, + "step": 818 + }, + { + "epoch": 0.12, + "grad_norm": 9.61965690651147, + "learning_rate": 9.978892796016304e-06, + "loss": 1.1255, + "step": 819 + }, + { + "epoch": 0.12, + "grad_norm": 8.915591515820628, + "learning_rate": 9.97878667157398e-06, + "loss": 
1.1353, + "step": 820 + }, + { + "epoch": 0.12, + "grad_norm": 10.843960449365577, + "learning_rate": 9.978680281577033e-06, + "loss": 1.2048, + "step": 821 + }, + { + "epoch": 0.12, + "grad_norm": 11.520501705707709, + "learning_rate": 9.97857362603114e-06, + "loss": 1.1218, + "step": 822 + }, + { + "epoch": 0.12, + "grad_norm": 10.862242152862839, + "learning_rate": 9.978466704941986e-06, + "loss": 1.1, + "step": 823 + }, + { + "epoch": 0.12, + "grad_norm": 10.667995616339669, + "learning_rate": 9.978359518315277e-06, + "loss": 1.183, + "step": 824 + }, + { + "epoch": 0.12, + "grad_norm": 9.570416139557253, + "learning_rate": 9.978252066156727e-06, + "loss": 1.163, + "step": 825 + }, + { + "epoch": 0.12, + "grad_norm": 11.184653752701314, + "learning_rate": 9.978144348472069e-06, + "loss": 1.1619, + "step": 826 + }, + { + "epoch": 0.12, + "grad_norm": 14.16734516755293, + "learning_rate": 9.978036365267048e-06, + "loss": 1.2009, + "step": 827 + }, + { + "epoch": 0.12, + "grad_norm": 9.080982622814082, + "learning_rate": 9.977928116547425e-06, + "loss": 1.2026, + "step": 828 + }, + { + "epoch": 0.12, + "grad_norm": 8.950622811847744, + "learning_rate": 9.977819602318972e-06, + "loss": 1.2114, + "step": 829 + }, + { + "epoch": 0.12, + "grad_norm": 14.091117757460996, + "learning_rate": 9.977710822587477e-06, + "loss": 1.1954, + "step": 830 + }, + { + "epoch": 0.12, + "grad_norm": 12.529558009384216, + "learning_rate": 9.977601777358743e-06, + "loss": 1.1519, + "step": 831 + }, + { + "epoch": 0.12, + "grad_norm": 11.769092423146349, + "learning_rate": 9.977492466638584e-06, + "loss": 1.1063, + "step": 832 + }, + { + "epoch": 0.12, + "grad_norm": 6.531664297649536, + "learning_rate": 9.977382890432833e-06, + "loss": 1.1609, + "step": 833 + }, + { + "epoch": 0.12, + "grad_norm": 7.4726128364663245, + "learning_rate": 9.977273048747334e-06, + "loss": 1.0883, + "step": 834 + }, + { + "epoch": 0.12, + "grad_norm": 12.209355191403908, + "learning_rate": 
9.977162941587943e-06, + "loss": 1.0669, + "step": 835 + }, + { + "epoch": 0.12, + "grad_norm": 10.003550948036848, + "learning_rate": 9.977052568960535e-06, + "loss": 1.0737, + "step": 836 + }, + { + "epoch": 0.12, + "grad_norm": 9.505461279286244, + "learning_rate": 9.976941930870998e-06, + "loss": 1.2226, + "step": 837 + }, + { + "epoch": 0.12, + "grad_norm": 10.01390890292844, + "learning_rate": 9.976831027325231e-06, + "loss": 1.0473, + "step": 838 + }, + { + "epoch": 0.12, + "grad_norm": 10.886902648745723, + "learning_rate": 9.976719858329149e-06, + "loss": 1.1789, + "step": 839 + }, + { + "epoch": 0.12, + "grad_norm": 8.055514920572405, + "learning_rate": 9.976608423888684e-06, + "loss": 1.1234, + "step": 840 + }, + { + "epoch": 0.12, + "grad_norm": 10.070362713030955, + "learning_rate": 9.97649672400978e-06, + "loss": 1.2005, + "step": 841 + }, + { + "epoch": 0.12, + "grad_norm": 13.220323401865729, + "learning_rate": 9.976384758698388e-06, + "loss": 1.1112, + "step": 842 + }, + { + "epoch": 0.12, + "grad_norm": 11.234743927310623, + "learning_rate": 9.97627252796049e-06, + "loss": 1.12, + "step": 843 + }, + { + "epoch": 0.12, + "grad_norm": 9.606588981172049, + "learning_rate": 9.976160031802062e-06, + "loss": 1.072, + "step": 844 + }, + { + "epoch": 0.12, + "grad_norm": 10.252036478832839, + "learning_rate": 9.976047270229112e-06, + "loss": 1.1096, + "step": 845 + }, + { + "epoch": 0.12, + "grad_norm": 9.76883296792795, + "learning_rate": 9.97593424324765e-06, + "loss": 1.1135, + "step": 846 + }, + { + "epoch": 0.12, + "grad_norm": 8.951855634683003, + "learning_rate": 9.975820950863708e-06, + "loss": 1.1541, + "step": 847 + }, + { + "epoch": 0.12, + "grad_norm": 9.471771192994817, + "learning_rate": 9.975707393083328e-06, + "loss": 1.2348, + "step": 848 + }, + { + "epoch": 0.12, + "grad_norm": 8.61341812168712, + "learning_rate": 9.975593569912563e-06, + "loss": 1.1073, + "step": 849 + }, + { + "epoch": 0.12, + "grad_norm": 11.989729468025931, + 
"learning_rate": 9.975479481357486e-06, + "loss": 1.1816, + "step": 850 + }, + { + "epoch": 0.12, + "grad_norm": 7.3792979640892, + "learning_rate": 9.975365127424184e-06, + "loss": 1.0775, + "step": 851 + }, + { + "epoch": 0.12, + "grad_norm": 11.485223629679236, + "learning_rate": 9.975250508118757e-06, + "loss": 1.1095, + "step": 852 + }, + { + "epoch": 0.12, + "grad_norm": 10.160555539782248, + "learning_rate": 9.975135623447314e-06, + "loss": 1.1913, + "step": 853 + }, + { + "epoch": 0.12, + "grad_norm": 14.727594849069348, + "learning_rate": 9.975020473415986e-06, + "loss": 1.1802, + "step": 854 + }, + { + "epoch": 0.12, + "grad_norm": 8.083656157679098, + "learning_rate": 9.974905058030914e-06, + "loss": 1.2098, + "step": 855 + }, + { + "epoch": 0.12, + "grad_norm": 8.604370181190173, + "learning_rate": 9.974789377298254e-06, + "loss": 1.0494, + "step": 856 + }, + { + "epoch": 0.12, + "grad_norm": 9.850763434816187, + "learning_rate": 9.974673431224176e-06, + "loss": 1.1513, + "step": 857 + }, + { + "epoch": 0.12, + "grad_norm": 9.807475153606191, + "learning_rate": 9.974557219814864e-06, + "loss": 1.1034, + "step": 858 + }, + { + "epoch": 0.12, + "grad_norm": 11.123749356064922, + "learning_rate": 9.974440743076517e-06, + "loss": 1.0466, + "step": 859 + }, + { + "epoch": 0.12, + "grad_norm": 10.5226315539616, + "learning_rate": 9.974324001015348e-06, + "loss": 1.127, + "step": 860 + }, + { + "epoch": 0.12, + "grad_norm": 10.639352432919736, + "learning_rate": 9.97420699363758e-06, + "loss": 1.2097, + "step": 861 + }, + { + "epoch": 0.12, + "grad_norm": 11.01121271714511, + "learning_rate": 9.97408972094946e-06, + "loss": 1.1137, + "step": 862 + }, + { + "epoch": 0.12, + "grad_norm": 9.648839726665642, + "learning_rate": 9.97397218295724e-06, + "loss": 1.2076, + "step": 863 + }, + { + "epoch": 0.12, + "grad_norm": 10.657345337542527, + "learning_rate": 9.973854379667186e-06, + "loss": 1.1359, + "step": 864 + }, + { + "epoch": 0.12, + "grad_norm": 
8.96571537064073, + "learning_rate": 9.973736311085587e-06, + "loss": 1.0476, + "step": 865 + }, + { + "epoch": 0.12, + "grad_norm": 10.210911245397138, + "learning_rate": 9.973617977218735e-06, + "loss": 1.1215, + "step": 866 + }, + { + "epoch": 0.12, + "grad_norm": 6.925548953289173, + "learning_rate": 9.973499378072947e-06, + "loss": 1.1011, + "step": 867 + }, + { + "epoch": 0.12, + "grad_norm": 9.481382104894, + "learning_rate": 9.973380513654544e-06, + "loss": 1.1901, + "step": 868 + }, + { + "epoch": 0.12, + "grad_norm": 10.464948860244109, + "learning_rate": 9.973261383969867e-06, + "loss": 1.1328, + "step": 869 + }, + { + "epoch": 0.12, + "grad_norm": 10.97925281400497, + "learning_rate": 9.973141989025275e-06, + "loss": 1.1522, + "step": 870 + }, + { + "epoch": 0.12, + "grad_norm": 10.896865099315301, + "learning_rate": 9.973022328827129e-06, + "loss": 1.1105, + "step": 871 + }, + { + "epoch": 0.12, + "grad_norm": 10.355042497513455, + "learning_rate": 9.972902403381814e-06, + "loss": 1.1862, + "step": 872 + }, + { + "epoch": 0.12, + "grad_norm": 10.695364982871196, + "learning_rate": 9.972782212695729e-06, + "loss": 1.1073, + "step": 873 + }, + { + "epoch": 0.12, + "grad_norm": 7.200405553355677, + "learning_rate": 9.972661756775282e-06, + "loss": 1.0801, + "step": 874 + }, + { + "epoch": 0.12, + "grad_norm": 12.980257912717619, + "learning_rate": 9.972541035626897e-06, + "loss": 1.1383, + "step": 875 + }, + { + "epoch": 0.12, + "grad_norm": 10.24424671219532, + "learning_rate": 9.972420049257015e-06, + "loss": 1.171, + "step": 876 + }, + { + "epoch": 0.13, + "grad_norm": 13.188554814454145, + "learning_rate": 9.97229879767209e-06, + "loss": 1.2133, + "step": 877 + }, + { + "epoch": 0.13, + "grad_norm": 8.326744611391558, + "learning_rate": 9.972177280878585e-06, + "loss": 1.1388, + "step": 878 + }, + { + "epoch": 0.13, + "grad_norm": 8.70827223733619, + "learning_rate": 9.972055498882986e-06, + "loss": 1.1044, + "step": 879 + }, + { + "epoch": 0.13, + 
"grad_norm": 9.376756231465961, + "learning_rate": 9.971933451691787e-06, + "loss": 1.1546, + "step": 880 + }, + { + "epoch": 0.13, + "grad_norm": 8.096124503442642, + "learning_rate": 9.971811139311495e-06, + "loss": 1.1635, + "step": 881 + }, + { + "epoch": 0.13, + "grad_norm": 8.54988525896353, + "learning_rate": 9.971688561748637e-06, + "loss": 1.1655, + "step": 882 + }, + { + "epoch": 0.13, + "grad_norm": 11.306158057732189, + "learning_rate": 9.97156571900975e-06, + "loss": 1.1449, + "step": 883 + }, + { + "epoch": 0.13, + "grad_norm": 10.117517441947749, + "learning_rate": 9.971442611101387e-06, + "loss": 1.1956, + "step": 884 + }, + { + "epoch": 0.13, + "grad_norm": 8.945616759690367, + "learning_rate": 9.971319238030111e-06, + "loss": 1.1064, + "step": 885 + }, + { + "epoch": 0.13, + "grad_norm": 8.303806058376594, + "learning_rate": 9.971195599802505e-06, + "loss": 1.1853, + "step": 886 + }, + { + "epoch": 0.13, + "grad_norm": 12.633885199353609, + "learning_rate": 9.971071696425166e-06, + "loss": 1.2208, + "step": 887 + }, + { + "epoch": 0.13, + "grad_norm": 7.86321458740007, + "learning_rate": 9.970947527904697e-06, + "loss": 1.2484, + "step": 888 + }, + { + "epoch": 0.13, + "grad_norm": 9.92471525157383, + "learning_rate": 9.970823094247724e-06, + "loss": 1.0074, + "step": 889 + }, + { + "epoch": 0.13, + "grad_norm": 8.840538170560782, + "learning_rate": 9.970698395460884e-06, + "loss": 1.0756, + "step": 890 + }, + { + "epoch": 0.13, + "grad_norm": 9.124040773471815, + "learning_rate": 9.970573431550827e-06, + "loss": 1.1932, + "step": 891 + }, + { + "epoch": 0.13, + "grad_norm": 9.045581997952114, + "learning_rate": 9.970448202524219e-06, + "loss": 1.0988, + "step": 892 + }, + { + "epoch": 0.13, + "grad_norm": 10.735210507737222, + "learning_rate": 9.97032270838774e-06, + "loss": 1.0917, + "step": 893 + }, + { + "epoch": 0.13, + "grad_norm": 7.458090952922053, + "learning_rate": 9.970196949148082e-06, + "loss": 1.2159, + "step": 894 + }, + { + 
"epoch": 0.13, + "grad_norm": 8.142170180749396, + "learning_rate": 9.970070924811954e-06, + "loss": 1.1332, + "step": 895 + }, + { + "epoch": 0.13, + "grad_norm": 9.138525642897706, + "learning_rate": 9.969944635386078e-06, + "loss": 1.1728, + "step": 896 + }, + { + "epoch": 0.13, + "grad_norm": 8.920793158796572, + "learning_rate": 9.969818080877188e-06, + "loss": 1.1195, + "step": 897 + }, + { + "epoch": 0.13, + "grad_norm": 8.530683034401656, + "learning_rate": 9.969691261292033e-06, + "loss": 1.1432, + "step": 898 + }, + { + "epoch": 0.13, + "grad_norm": 10.754608315005663, + "learning_rate": 9.969564176637381e-06, + "loss": 1.1675, + "step": 899 + }, + { + "epoch": 0.13, + "grad_norm": 8.867112891081831, + "learning_rate": 9.96943682692001e-06, + "loss": 1.1426, + "step": 900 + }, + { + "epoch": 0.13, + "grad_norm": 9.114823763427992, + "learning_rate": 9.969309212146709e-06, + "loss": 1.1407, + "step": 901 + }, + { + "epoch": 0.13, + "grad_norm": 7.803703536689614, + "learning_rate": 9.969181332324289e-06, + "loss": 1.2019, + "step": 902 + }, + { + "epoch": 0.13, + "grad_norm": 10.267811423013272, + "learning_rate": 9.969053187459568e-06, + "loss": 1.1879, + "step": 903 + }, + { + "epoch": 0.13, + "grad_norm": 12.181832072864417, + "learning_rate": 9.96892477755938e-06, + "loss": 1.1837, + "step": 904 + }, + { + "epoch": 0.13, + "grad_norm": 9.747110384427423, + "learning_rate": 9.968796102630577e-06, + "loss": 1.1695, + "step": 905 + }, + { + "epoch": 0.13, + "grad_norm": 8.767005081571696, + "learning_rate": 9.96866716268002e-06, + "loss": 1.1179, + "step": 906 + }, + { + "epoch": 0.13, + "grad_norm": 9.565370549237734, + "learning_rate": 9.968537957714587e-06, + "loss": 1.1571, + "step": 907 + }, + { + "epoch": 0.13, + "grad_norm": 7.454569148662221, + "learning_rate": 9.96840848774117e-06, + "loss": 1.1308, + "step": 908 + }, + { + "epoch": 0.13, + "grad_norm": 10.594655488351421, + "learning_rate": 9.968278752766673e-06, + "loss": 1.0854, + "step": 909 
+ }, + { + "epoch": 0.13, + "grad_norm": 10.362707335369436, + "learning_rate": 9.968148752798016e-06, + "loss": 1.1674, + "step": 910 + }, + { + "epoch": 0.13, + "grad_norm": 9.533006266880571, + "learning_rate": 9.968018487842135e-06, + "loss": 1.0844, + "step": 911 + }, + { + "epoch": 0.13, + "grad_norm": 7.677965253880486, + "learning_rate": 9.967887957905975e-06, + "loss": 1.1518, + "step": 912 + }, + { + "epoch": 0.13, + "grad_norm": 8.864221511095703, + "learning_rate": 9.9677571629965e-06, + "loss": 1.1374, + "step": 913 + }, + { + "epoch": 0.13, + "grad_norm": 8.346355477076848, + "learning_rate": 9.967626103120687e-06, + "loss": 1.1976, + "step": 914 + }, + { + "epoch": 0.13, + "grad_norm": 11.26340349342993, + "learning_rate": 9.967494778285525e-06, + "loss": 1.0675, + "step": 915 + }, + { + "epoch": 0.13, + "grad_norm": 10.927259319021614, + "learning_rate": 9.967363188498017e-06, + "loss": 1.1179, + "step": 916 + }, + { + "epoch": 0.13, + "grad_norm": 13.137587862953993, + "learning_rate": 9.967231333765184e-06, + "loss": 1.1723, + "step": 917 + }, + { + "epoch": 0.13, + "grad_norm": 7.499568435706075, + "learning_rate": 9.967099214094058e-06, + "loss": 1.1275, + "step": 918 + }, + { + "epoch": 0.13, + "grad_norm": 10.855615430886866, + "learning_rate": 9.966966829491686e-06, + "loss": 1.1501, + "step": 919 + }, + { + "epoch": 0.13, + "grad_norm": 10.145255467621872, + "learning_rate": 9.966834179965128e-06, + "loss": 1.0439, + "step": 920 + }, + { + "epoch": 0.13, + "grad_norm": 8.97137408379902, + "learning_rate": 9.966701265521463e-06, + "loss": 1.1042, + "step": 921 + }, + { + "epoch": 0.13, + "grad_norm": 10.973059465859606, + "learning_rate": 9.966568086167775e-06, + "loss": 1.149, + "step": 922 + }, + { + "epoch": 0.13, + "grad_norm": 7.992653169759609, + "learning_rate": 9.966434641911171e-06, + "loss": 1.0854, + "step": 923 + }, + { + "epoch": 0.13, + "grad_norm": 9.489845703350293, + "learning_rate": 9.966300932758766e-06, + "loss": 1.1142, + 
"step": 924 + }, + { + "epoch": 0.13, + "grad_norm": 8.621333712192047, + "learning_rate": 9.966166958717694e-06, + "loss": 1.1472, + "step": 925 + }, + { + "epoch": 0.13, + "grad_norm": 8.215572145645885, + "learning_rate": 9.9660327197951e-06, + "loss": 1.1584, + "step": 926 + }, + { + "epoch": 0.13, + "grad_norm": 11.13459428125909, + "learning_rate": 9.965898215998143e-06, + "loss": 1.1199, + "step": 927 + }, + { + "epoch": 0.13, + "grad_norm": 7.666033123309216, + "learning_rate": 9.965763447333997e-06, + "loss": 1.1667, + "step": 928 + }, + { + "epoch": 0.13, + "grad_norm": 9.501488150059513, + "learning_rate": 9.965628413809854e-06, + "loss": 1.2249, + "step": 929 + }, + { + "epoch": 0.13, + "grad_norm": 8.10437324367321, + "learning_rate": 9.965493115432912e-06, + "loss": 1.187, + "step": 930 + }, + { + "epoch": 0.13, + "grad_norm": 9.458389094299497, + "learning_rate": 9.965357552210387e-06, + "loss": 1.2363, + "step": 931 + }, + { + "epoch": 0.13, + "grad_norm": 9.900917954766951, + "learning_rate": 9.965221724149514e-06, + "loss": 1.0691, + "step": 932 + }, + { + "epoch": 0.13, + "grad_norm": 8.152518119995538, + "learning_rate": 9.965085631257532e-06, + "loss": 1.1968, + "step": 933 + }, + { + "epoch": 0.13, + "grad_norm": 7.552103169091629, + "learning_rate": 9.964949273541707e-06, + "loss": 1.1941, + "step": 934 + }, + { + "epoch": 0.13, + "grad_norm": 8.352197586077919, + "learning_rate": 9.964812651009304e-06, + "loss": 1.1125, + "step": 935 + }, + { + "epoch": 0.13, + "grad_norm": 11.660715470732297, + "learning_rate": 9.964675763667615e-06, + "loss": 1.1278, + "step": 936 + }, + { + "epoch": 0.13, + "grad_norm": 11.8455521705924, + "learning_rate": 9.96453861152394e-06, + "loss": 1.0677, + "step": 937 + }, + { + "epoch": 0.13, + "grad_norm": 11.283927343486008, + "learning_rate": 9.964401194585596e-06, + "loss": 1.1101, + "step": 938 + }, + { + "epoch": 0.13, + "grad_norm": 10.261289840329528, + "learning_rate": 9.964263512859909e-06, + "loss": 
1.1552, + "step": 939 + }, + { + "epoch": 0.13, + "grad_norm": 10.848794979403246, + "learning_rate": 9.964125566354223e-06, + "loss": 1.1091, + "step": 940 + }, + { + "epoch": 0.13, + "grad_norm": 7.728662287645392, + "learning_rate": 9.963987355075899e-06, + "loss": 1.1835, + "step": 941 + }, + { + "epoch": 0.13, + "grad_norm": 9.008367482635803, + "learning_rate": 9.963848879032308e-06, + "loss": 1.1244, + "step": 942 + }, + { + "epoch": 0.13, + "grad_norm": 10.302556055619586, + "learning_rate": 9.963710138230834e-06, + "loss": 1.1875, + "step": 943 + }, + { + "epoch": 0.13, + "grad_norm": 10.347610570043583, + "learning_rate": 9.963571132678875e-06, + "loss": 1.0992, + "step": 944 + }, + { + "epoch": 0.13, + "grad_norm": 9.01534862625795, + "learning_rate": 9.96343186238385e-06, + "loss": 1.1952, + "step": 945 + }, + { + "epoch": 0.13, + "grad_norm": 8.79872692075884, + "learning_rate": 9.963292327353187e-06, + "loss": 1.1613, + "step": 946 + }, + { + "epoch": 0.14, + "grad_norm": 9.029820761969845, + "learning_rate": 9.963152527594324e-06, + "loss": 1.103, + "step": 947 + }, + { + "epoch": 0.14, + "grad_norm": 11.915189580185112, + "learning_rate": 9.963012463114722e-06, + "loss": 1.1213, + "step": 948 + }, + { + "epoch": 0.14, + "grad_norm": 7.8152354522206355, + "learning_rate": 9.962872133921848e-06, + "loss": 1.1555, + "step": 949 + }, + { + "epoch": 0.14, + "grad_norm": 7.805067393121889, + "learning_rate": 9.962731540023191e-06, + "loss": 1.0564, + "step": 950 + }, + { + "epoch": 0.14, + "grad_norm": 9.489854584881686, + "learning_rate": 9.962590681426245e-06, + "loss": 1.1669, + "step": 951 + }, + { + "epoch": 0.14, + "grad_norm": 7.237768679368146, + "learning_rate": 9.962449558138527e-06, + "loss": 1.2134, + "step": 952 + }, + { + "epoch": 0.14, + "grad_norm": 10.473886848374987, + "learning_rate": 9.962308170167564e-06, + "loss": 1.0853, + "step": 953 + }, + { + "epoch": 0.14, + "grad_norm": 7.5619044444690555, + "learning_rate": 
9.962166517520893e-06, + "loss": 1.1445, + "step": 954 + }, + { + "epoch": 0.14, + "grad_norm": 6.426832081514213, + "learning_rate": 9.962024600206074e-06, + "loss": 1.1326, + "step": 955 + }, + { + "epoch": 0.14, + "grad_norm": 7.1692178329826435, + "learning_rate": 9.961882418230674e-06, + "loss": 1.1224, + "step": 956 + }, + { + "epoch": 0.14, + "grad_norm": 9.359241162174161, + "learning_rate": 9.961739971602278e-06, + "loss": 1.1126, + "step": 957 + }, + { + "epoch": 0.14, + "grad_norm": 9.257485534393336, + "learning_rate": 9.961597260328481e-06, + "loss": 1.1176, + "step": 958 + }, + { + "epoch": 0.14, + "grad_norm": 11.918379293556043, + "learning_rate": 9.961454284416901e-06, + "loss": 1.0985, + "step": 959 + }, + { + "epoch": 0.14, + "grad_norm": 9.560881040999721, + "learning_rate": 9.961311043875155e-06, + "loss": 1.0494, + "step": 960 + }, + { + "epoch": 0.14, + "grad_norm": 10.413686757904502, + "learning_rate": 9.961167538710891e-06, + "loss": 1.1934, + "step": 961 + }, + { + "epoch": 0.14, + "grad_norm": 11.036469976268151, + "learning_rate": 9.961023768931759e-06, + "loss": 1.0554, + "step": 962 + }, + { + "epoch": 0.14, + "grad_norm": 10.110806004345832, + "learning_rate": 9.960879734545428e-06, + "loss": 1.1496, + "step": 963 + }, + { + "epoch": 0.14, + "grad_norm": 9.615545027029308, + "learning_rate": 9.960735435559581e-06, + "loss": 1.1258, + "step": 964 + }, + { + "epoch": 0.14, + "grad_norm": 6.900403406886219, + "learning_rate": 9.960590871981915e-06, + "loss": 1.1327, + "step": 965 + }, + { + "epoch": 0.14, + "grad_norm": 8.603547899758588, + "learning_rate": 9.960446043820138e-06, + "loss": 1.1064, + "step": 966 + }, + { + "epoch": 0.14, + "grad_norm": 9.844422146176743, + "learning_rate": 9.960300951081978e-06, + "loss": 1.092, + "step": 967 + }, + { + "epoch": 0.14, + "grad_norm": 9.532758640817526, + "learning_rate": 9.960155593775172e-06, + "loss": 1.1509, + "step": 968 + }, + { + "epoch": 0.14, + "grad_norm": 10.308950563834777, + 
"learning_rate": 9.960009971907471e-06, + "loss": 1.2016, + "step": 969 + }, + { + "epoch": 0.14, + "grad_norm": 12.30793982199318, + "learning_rate": 9.959864085486649e-06, + "loss": 1.1193, + "step": 970 + }, + { + "epoch": 0.14, + "grad_norm": 9.526124547758911, + "learning_rate": 9.959717934520479e-06, + "loss": 1.0818, + "step": 971 + }, + { + "epoch": 0.14, + "grad_norm": 9.777446757600824, + "learning_rate": 9.959571519016761e-06, + "loss": 1.0783, + "step": 972 + }, + { + "epoch": 0.14, + "grad_norm": 9.364232031438574, + "learning_rate": 9.959424838983302e-06, + "loss": 1.1183, + "step": 973 + }, + { + "epoch": 0.14, + "grad_norm": 9.812925991532216, + "learning_rate": 9.959277894427929e-06, + "loss": 1.1034, + "step": 974 + }, + { + "epoch": 0.14, + "grad_norm": 8.680502608531338, + "learning_rate": 9.959130685358476e-06, + "loss": 1.0616, + "step": 975 + }, + { + "epoch": 0.14, + "grad_norm": 12.9231256525455, + "learning_rate": 9.958983211782795e-06, + "loss": 1.1652, + "step": 976 + }, + { + "epoch": 0.14, + "grad_norm": 10.998920404078122, + "learning_rate": 9.958835473708755e-06, + "loss": 1.191, + "step": 977 + }, + { + "epoch": 0.14, + "grad_norm": 9.04576930562172, + "learning_rate": 9.958687471144233e-06, + "loss": 1.1689, + "step": 978 + }, + { + "epoch": 0.14, + "grad_norm": 7.522234986834103, + "learning_rate": 9.958539204097123e-06, + "loss": 1.1197, + "step": 979 + }, + { + "epoch": 0.14, + "grad_norm": 11.543996011567051, + "learning_rate": 9.958390672575336e-06, + "loss": 1.1295, + "step": 980 + }, + { + "epoch": 0.14, + "grad_norm": 8.875835752652991, + "learning_rate": 9.958241876586792e-06, + "loss": 1.133, + "step": 981 + }, + { + "epoch": 0.14, + "grad_norm": 9.751579509572803, + "learning_rate": 9.958092816139426e-06, + "loss": 1.2074, + "step": 982 + }, + { + "epoch": 0.14, + "grad_norm": 6.657174251205237, + "learning_rate": 9.95794349124119e-06, + "loss": 1.1976, + "step": 983 + }, + { + "epoch": 0.14, + "grad_norm": 
9.671446552750915, + "learning_rate": 9.95779390190005e-06, + "loss": 1.1023, + "step": 984 + }, + { + "epoch": 0.14, + "grad_norm": 10.85329032988452, + "learning_rate": 9.957644048123985e-06, + "loss": 1.1701, + "step": 985 + }, + { + "epoch": 0.14, + "grad_norm": 9.497951291092575, + "learning_rate": 9.957493929920984e-06, + "loss": 1.1706, + "step": 986 + }, + { + "epoch": 0.14, + "grad_norm": 11.450803311535141, + "learning_rate": 9.957343547299056e-06, + "loss": 1.0752, + "step": 987 + }, + { + "epoch": 0.14, + "grad_norm": 11.193781844016353, + "learning_rate": 9.957192900266222e-06, + "loss": 1.0851, + "step": 988 + }, + { + "epoch": 0.14, + "grad_norm": 8.952853327132763, + "learning_rate": 9.957041988830518e-06, + "loss": 1.0671, + "step": 989 + }, + { + "epoch": 0.14, + "grad_norm": 12.453579759543315, + "learning_rate": 9.956890812999992e-06, + "loss": 1.1163, + "step": 990 + }, + { + "epoch": 0.14, + "grad_norm": 9.627331849030737, + "learning_rate": 9.956739372782708e-06, + "loss": 1.0881, + "step": 991 + }, + { + "epoch": 0.14, + "grad_norm": 7.834929508345354, + "learning_rate": 9.956587668186743e-06, + "loss": 1.1066, + "step": 992 + }, + { + "epoch": 0.14, + "grad_norm": 6.268084022733908, + "learning_rate": 9.956435699220188e-06, + "loss": 1.077, + "step": 993 + }, + { + "epoch": 0.14, + "grad_norm": 9.06748825627173, + "learning_rate": 9.956283465891148e-06, + "loss": 1.1122, + "step": 994 + }, + { + "epoch": 0.14, + "grad_norm": 12.159454170823379, + "learning_rate": 9.956130968207746e-06, + "loss": 1.1609, + "step": 995 + }, + { + "epoch": 0.14, + "grad_norm": 10.431617236193752, + "learning_rate": 9.955978206178112e-06, + "loss": 1.136, + "step": 996 + }, + { + "epoch": 0.14, + "grad_norm": 8.411542432059553, + "learning_rate": 9.955825179810396e-06, + "loss": 1.1985, + "step": 997 + }, + { + "epoch": 0.14, + "grad_norm": 10.497170448302485, + "learning_rate": 9.95567188911276e-06, + "loss": 1.1242, + "step": 998 + }, + { + "epoch": 0.14, + 
"grad_norm": 10.968543669576675, + "learning_rate": 9.95551833409338e-06, + "loss": 1.1216, + "step": 999 + }, + { + "epoch": 0.14, + "grad_norm": 9.67351934046599, + "learning_rate": 9.955364514760444e-06, + "loss": 1.1233, + "step": 1000 + }, + { + "epoch": 0.14, + "grad_norm": 9.62171688581224, + "learning_rate": 9.95521043112216e-06, + "loss": 1.0857, + "step": 1001 + }, + { + "epoch": 0.14, + "grad_norm": 9.191714979662338, + "learning_rate": 9.955056083186745e-06, + "loss": 1.1502, + "step": 1002 + }, + { + "epoch": 0.14, + "grad_norm": 7.771592830594204, + "learning_rate": 9.954901470962429e-06, + "loss": 1.1005, + "step": 1003 + }, + { + "epoch": 0.14, + "grad_norm": 9.694325971648707, + "learning_rate": 9.954746594457462e-06, + "loss": 1.065, + "step": 1004 + }, + { + "epoch": 0.14, + "grad_norm": 8.309163760161134, + "learning_rate": 9.954591453680103e-06, + "loss": 1.159, + "step": 1005 + }, + { + "epoch": 0.14, + "grad_norm": 9.515853344012365, + "learning_rate": 9.954436048638627e-06, + "loss": 1.0296, + "step": 1006 + }, + { + "epoch": 0.14, + "grad_norm": 6.393161274580403, + "learning_rate": 9.954280379341323e-06, + "loss": 1.07, + "step": 1007 + }, + { + "epoch": 0.14, + "grad_norm": 10.149425107521994, + "learning_rate": 9.954124445796494e-06, + "loss": 1.0787, + "step": 1008 + }, + { + "epoch": 0.14, + "grad_norm": 11.551595746389426, + "learning_rate": 9.953968248012456e-06, + "loss": 1.1611, + "step": 1009 + }, + { + "epoch": 0.14, + "grad_norm": 8.995184946375717, + "learning_rate": 9.953811785997543e-06, + "loss": 1.1598, + "step": 1010 + }, + { + "epoch": 0.14, + "grad_norm": 9.516468337412416, + "learning_rate": 9.953655059760097e-06, + "loss": 1.1094, + "step": 1011 + }, + { + "epoch": 0.14, + "grad_norm": 10.740718465465765, + "learning_rate": 9.953498069308478e-06, + "loss": 1.1462, + "step": 1012 + }, + { + "epoch": 0.14, + "grad_norm": 9.944972131104844, + "learning_rate": 9.95334081465106e-06, + "loss": 1.1702, + "step": 1013 + }, + { 
+ "epoch": 0.14, + "grad_norm": 8.066990053049928, + "learning_rate": 9.953183295796233e-06, + "loss": 1.1742, + "step": 1014 + }, + { + "epoch": 0.14, + "grad_norm": 10.058088734481043, + "learning_rate": 9.953025512752395e-06, + "loss": 1.2111, + "step": 1015 + }, + { + "epoch": 0.14, + "grad_norm": 8.635609555436492, + "learning_rate": 9.952867465527962e-06, + "loss": 1.1395, + "step": 1016 + }, + { + "epoch": 0.15, + "grad_norm": 9.296115008175377, + "learning_rate": 9.952709154131365e-06, + "loss": 1.1598, + "step": 1017 + }, + { + "epoch": 0.15, + "grad_norm": 8.10048011648295, + "learning_rate": 9.95255057857105e-06, + "loss": 1.1349, + "step": 1018 + }, + { + "epoch": 0.15, + "grad_norm": 10.407072769288062, + "learning_rate": 9.952391738855471e-06, + "loss": 1.1126, + "step": 1019 + }, + { + "epoch": 0.15, + "grad_norm": 9.536915625668392, + "learning_rate": 9.952232634993103e-06, + "loss": 1.1001, + "step": 1020 + }, + { + "epoch": 0.15, + "grad_norm": 10.90312177488641, + "learning_rate": 9.952073266992429e-06, + "loss": 1.1112, + "step": 1021 + }, + { + "epoch": 0.15, + "grad_norm": 11.893821013829845, + "learning_rate": 9.951913634861952e-06, + "loss": 1.1671, + "step": 1022 + }, + { + "epoch": 0.15, + "grad_norm": 10.264940691397525, + "learning_rate": 9.951753738610184e-06, + "loss": 1.0815, + "step": 1023 + }, + { + "epoch": 0.15, + "grad_norm": 9.20662758118657, + "learning_rate": 9.951593578245658e-06, + "loss": 1.0839, + "step": 1024 + }, + { + "epoch": 0.15, + "grad_norm": 10.577203275270978, + "learning_rate": 9.95143315377691e-06, + "loss": 1.1408, + "step": 1025 + }, + { + "epoch": 0.15, + "grad_norm": 7.446380060523219, + "learning_rate": 9.951272465212503e-06, + "loss": 1.0677, + "step": 1026 + }, + { + "epoch": 0.15, + "grad_norm": 7.185308443060191, + "learning_rate": 9.951111512561002e-06, + "loss": 1.1022, + "step": 1027 + }, + { + "epoch": 0.15, + "grad_norm": 9.909581099746429, + "learning_rate": 9.950950295830995e-06, + "loss": 
1.1629, + "step": 1028 + }, + { + "epoch": 0.15, + "grad_norm": 8.745393504996242, + "learning_rate": 9.950788815031082e-06, + "loss": 1.041, + "step": 1029 + }, + { + "epoch": 0.15, + "grad_norm": 7.696961475643495, + "learning_rate": 9.950627070169873e-06, + "loss": 1.1313, + "step": 1030 + }, + { + "epoch": 0.15, + "grad_norm": 14.557949973367416, + "learning_rate": 9.950465061255996e-06, + "loss": 1.0906, + "step": 1031 + }, + { + "epoch": 0.15, + "grad_norm": 11.426804684195517, + "learning_rate": 9.950302788298093e-06, + "loss": 1.0705, + "step": 1032 + }, + { + "epoch": 0.15, + "grad_norm": 9.426467169887061, + "learning_rate": 9.950140251304819e-06, + "loss": 1.0922, + "step": 1033 + }, + { + "epoch": 0.15, + "grad_norm": 9.206624980133588, + "learning_rate": 9.94997745028484e-06, + "loss": 1.1494, + "step": 1034 + }, + { + "epoch": 0.15, + "grad_norm": 11.965720592307374, + "learning_rate": 9.949814385246845e-06, + "loss": 1.0876, + "step": 1035 + }, + { + "epoch": 0.15, + "grad_norm": 9.528958012867855, + "learning_rate": 9.949651056199528e-06, + "loss": 1.1713, + "step": 1036 + }, + { + "epoch": 0.15, + "grad_norm": 9.349397935952068, + "learning_rate": 9.949487463151601e-06, + "loss": 1.0775, + "step": 1037 + }, + { + "epoch": 0.15, + "grad_norm": 9.940013723616998, + "learning_rate": 9.949323606111789e-06, + "loss": 1.0953, + "step": 1038 + }, + { + "epoch": 0.15, + "grad_norm": 12.548857354856471, + "learning_rate": 9.949159485088834e-06, + "loss": 1.042, + "step": 1039 + }, + { + "epoch": 0.15, + "grad_norm": 12.72638869028312, + "learning_rate": 9.948995100091487e-06, + "loss": 1.1246, + "step": 1040 + }, + { + "epoch": 0.15, + "grad_norm": 12.313508253117641, + "learning_rate": 9.948830451128517e-06, + "loss": 1.1863, + "step": 1041 + }, + { + "epoch": 0.15, + "grad_norm": 7.688740077719735, + "learning_rate": 9.948665538208707e-06, + "loss": 1.1393, + "step": 1042 + }, + { + "epoch": 0.15, + "grad_norm": 10.38689563746036, + "learning_rate": 
9.94850036134085e-06, + "loss": 1.1221, + "step": 1043 + }, + { + "epoch": 0.15, + "grad_norm": 7.995378804138972, + "learning_rate": 9.94833492053376e-06, + "loss": 1.0289, + "step": 1044 + }, + { + "epoch": 0.15, + "grad_norm": 10.950933083863653, + "learning_rate": 9.948169215796257e-06, + "loss": 1.0979, + "step": 1045 + }, + { + "epoch": 0.15, + "grad_norm": 8.512096768856386, + "learning_rate": 9.948003247137184e-06, + "loss": 1.1901, + "step": 1046 + }, + { + "epoch": 0.15, + "grad_norm": 7.947445134240318, + "learning_rate": 9.947837014565388e-06, + "loss": 1.0359, + "step": 1047 + }, + { + "epoch": 0.15, + "grad_norm": 9.893044664801637, + "learning_rate": 9.947670518089738e-06, + "loss": 1.1306, + "step": 1048 + }, + { + "epoch": 0.15, + "grad_norm": 9.260029087642648, + "learning_rate": 9.947503757719117e-06, + "loss": 1.1777, + "step": 1049 + }, + { + "epoch": 0.15, + "grad_norm": 8.655252489384248, + "learning_rate": 9.947336733462415e-06, + "loss": 1.0583, + "step": 1050 + }, + { + "epoch": 0.15, + "grad_norm": 8.674704773483013, + "learning_rate": 9.947169445328543e-06, + "loss": 1.1394, + "step": 1051 + }, + { + "epoch": 0.15, + "grad_norm": 11.070072313127595, + "learning_rate": 9.947001893326424e-06, + "loss": 1.1594, + "step": 1052 + }, + { + "epoch": 0.15, + "grad_norm": 8.659032538333252, + "learning_rate": 9.946834077464992e-06, + "loss": 1.0388, + "step": 1053 + }, + { + "epoch": 0.15, + "grad_norm": 6.473011612246851, + "learning_rate": 9.946665997753204e-06, + "loss": 1.1003, + "step": 1054 + }, + { + "epoch": 0.15, + "grad_norm": 12.31312141889838, + "learning_rate": 9.946497654200017e-06, + "loss": 1.1288, + "step": 1055 + }, + { + "epoch": 0.15, + "grad_norm": 9.417538438917381, + "learning_rate": 9.946329046814416e-06, + "loss": 1.1607, + "step": 1056 + }, + { + "epoch": 0.15, + "grad_norm": 8.676177876835448, + "learning_rate": 9.94616017560539e-06, + "loss": 1.1137, + "step": 1057 + }, + { + "epoch": 0.15, + "grad_norm": 
8.75666693252564, + "learning_rate": 9.94599104058195e-06, + "loss": 1.1512, + "step": 1058 + }, + { + "epoch": 0.15, + "grad_norm": 11.401141963419965, + "learning_rate": 9.945821641753115e-06, + "loss": 1.0677, + "step": 1059 + }, + { + "epoch": 0.15, + "grad_norm": 8.431111169608647, + "learning_rate": 9.94565197912792e-06, + "loss": 1.0652, + "step": 1060 + }, + { + "epoch": 0.15, + "grad_norm": 10.06305006599899, + "learning_rate": 9.945482052715415e-06, + "loss": 1.1006, + "step": 1061 + }, + { + "epoch": 0.15, + "grad_norm": 8.069799365468269, + "learning_rate": 9.945311862524661e-06, + "loss": 1.1405, + "step": 1062 + }, + { + "epoch": 0.15, + "grad_norm": 13.617956335300693, + "learning_rate": 9.945141408564739e-06, + "loss": 1.1066, + "step": 1063 + }, + { + "epoch": 0.15, + "grad_norm": 9.447719674971562, + "learning_rate": 9.944970690844739e-06, + "loss": 1.1103, + "step": 1064 + }, + { + "epoch": 0.15, + "grad_norm": 10.078434418156876, + "learning_rate": 9.944799709373767e-06, + "loss": 1.108, + "step": 1065 + }, + { + "epoch": 0.15, + "grad_norm": 7.851295021513174, + "learning_rate": 9.944628464160944e-06, + "loss": 1.0722, + "step": 1066 + }, + { + "epoch": 0.15, + "grad_norm": 6.815117759681772, + "learning_rate": 9.9444569552154e-06, + "loss": 1.074, + "step": 1067 + }, + { + "epoch": 0.15, + "grad_norm": 10.611821689030869, + "learning_rate": 9.944285182546284e-06, + "loss": 1.1419, + "step": 1068 + }, + { + "epoch": 0.15, + "grad_norm": 7.907887658806863, + "learning_rate": 9.944113146162761e-06, + "loss": 1.117, + "step": 1069 + }, + { + "epoch": 0.15, + "grad_norm": 11.773439981834114, + "learning_rate": 9.943940846074005e-06, + "loss": 1.1309, + "step": 1070 + }, + { + "epoch": 0.15, + "grad_norm": 10.420008788164198, + "learning_rate": 9.943768282289205e-06, + "loss": 1.1226, + "step": 1071 + }, + { + "epoch": 0.15, + "grad_norm": 8.023590198058459, + "learning_rate": 9.943595454817567e-06, + "loss": 1.1152, + "step": 1072 + }, + { + 
"epoch": 0.15, + "grad_norm": 9.160352130710256, + "learning_rate": 9.943422363668308e-06, + "loss": 1.1476, + "step": 1073 + }, + { + "epoch": 0.15, + "grad_norm": 9.75833777748016, + "learning_rate": 9.943249008850658e-06, + "loss": 1.0749, + "step": 1074 + }, + { + "epoch": 0.15, + "grad_norm": 9.885724031718935, + "learning_rate": 9.943075390373866e-06, + "loss": 1.127, + "step": 1075 + }, + { + "epoch": 0.15, + "grad_norm": 11.121791430389655, + "learning_rate": 9.942901508247194e-06, + "loss": 1.0932, + "step": 1076 + }, + { + "epoch": 0.15, + "grad_norm": 8.64314113348335, + "learning_rate": 9.942727362479912e-06, + "loss": 1.1164, + "step": 1077 + }, + { + "epoch": 0.15, + "grad_norm": 11.919438447770984, + "learning_rate": 9.942552953081312e-06, + "loss": 1.1667, + "step": 1078 + }, + { + "epoch": 0.15, + "grad_norm": 9.011653688660052, + "learning_rate": 9.942378280060695e-06, + "loss": 1.0707, + "step": 1079 + }, + { + "epoch": 0.15, + "grad_norm": 9.882718788620776, + "learning_rate": 9.942203343427377e-06, + "loss": 1.1267, + "step": 1080 + }, + { + "epoch": 0.15, + "grad_norm": 6.600312258030099, + "learning_rate": 9.94202814319069e-06, + "loss": 1.1286, + "step": 1081 + }, + { + "epoch": 0.15, + "grad_norm": 6.78118283025409, + "learning_rate": 9.941852679359978e-06, + "loss": 1.0236, + "step": 1082 + }, + { + "epoch": 0.15, + "grad_norm": 8.759577552731114, + "learning_rate": 9.9416769519446e-06, + "loss": 1.0587, + "step": 1083 + }, + { + "epoch": 0.15, + "grad_norm": 9.849748614567302, + "learning_rate": 9.941500960953928e-06, + "loss": 1.1399, + "step": 1084 + }, + { + "epoch": 0.15, + "grad_norm": 5.674829393330436, + "learning_rate": 9.941324706397351e-06, + "loss": 1.0995, + "step": 1085 + }, + { + "epoch": 0.15, + "grad_norm": 8.357367692342663, + "learning_rate": 9.941148188284267e-06, + "loss": 1.1151, + "step": 1086 + }, + { + "epoch": 0.16, + "grad_norm": 6.382603520934284, + "learning_rate": 9.940971406624091e-06, + "loss": 1.1149, + 
"step": 1087 + }, + { + "epoch": 0.16, + "grad_norm": 11.420256640477774, + "learning_rate": 9.940794361426257e-06, + "loss": 1.145, + "step": 1088 + }, + { + "epoch": 0.16, + "grad_norm": 10.467049956387411, + "learning_rate": 9.940617052700202e-06, + "loss": 1.1055, + "step": 1089 + }, + { + "epoch": 0.16, + "grad_norm": 10.691078863938994, + "learning_rate": 9.940439480455386e-06, + "loss": 1.071, + "step": 1090 + }, + { + "epoch": 0.16, + "grad_norm": 9.556685863386972, + "learning_rate": 9.940261644701281e-06, + "loss": 1.1616, + "step": 1091 + }, + { + "epoch": 0.16, + "grad_norm": 9.059802682792562, + "learning_rate": 9.94008354544737e-06, + "loss": 1.1, + "step": 1092 + }, + { + "epoch": 0.16, + "grad_norm": 10.952581477431941, + "learning_rate": 9.939905182703155e-06, + "loss": 1.0409, + "step": 1093 + }, + { + "epoch": 0.16, + "grad_norm": 13.166701878199547, + "learning_rate": 9.939726556478146e-06, + "loss": 1.0698, + "step": 1094 + }, + { + "epoch": 0.16, + "grad_norm": 7.938073302418943, + "learning_rate": 9.939547666781873e-06, + "loss": 1.054, + "step": 1095 + }, + { + "epoch": 0.16, + "grad_norm": 8.549749864068065, + "learning_rate": 9.939368513623877e-06, + "loss": 1.1493, + "step": 1096 + }, + { + "epoch": 0.16, + "grad_norm": 8.167717988847198, + "learning_rate": 9.939189097013714e-06, + "loss": 1.0565, + "step": 1097 + }, + { + "epoch": 0.16, + "grad_norm": 10.53712802988943, + "learning_rate": 9.939009416960952e-06, + "loss": 1.1387, + "step": 1098 + }, + { + "epoch": 0.16, + "grad_norm": 8.927596736838835, + "learning_rate": 9.938829473475176e-06, + "loss": 1.1199, + "step": 1099 + }, + { + "epoch": 0.16, + "grad_norm": 9.003714268212727, + "learning_rate": 9.938649266565982e-06, + "loss": 1.0704, + "step": 1100 + }, + { + "epoch": 0.16, + "grad_norm": 10.218335249864973, + "learning_rate": 9.938468796242985e-06, + "loss": 1.1574, + "step": 1101 + }, + { + "epoch": 0.16, + "grad_norm": 10.842996309720688, + "learning_rate": 
9.938288062515809e-06, + "loss": 1.0518, + "step": 1102 + }, + { + "epoch": 0.16, + "grad_norm": 8.67024709558197, + "learning_rate": 9.938107065394091e-06, + "loss": 1.1095, + "step": 1103 + }, + { + "epoch": 0.16, + "grad_norm": 9.697326940692474, + "learning_rate": 9.937925804887488e-06, + "loss": 1.0719, + "step": 1104 + }, + { + "epoch": 0.16, + "grad_norm": 10.300336309760205, + "learning_rate": 9.937744281005667e-06, + "loss": 1.111, + "step": 1105 + }, + { + "epoch": 0.16, + "grad_norm": 6.557570709887889, + "learning_rate": 9.937562493758313e-06, + "loss": 1.2012, + "step": 1106 + }, + { + "epoch": 0.16, + "grad_norm": 9.951228824659351, + "learning_rate": 9.937380443155118e-06, + "loss": 1.0739, + "step": 1107 + }, + { + "epoch": 0.16, + "grad_norm": 8.662720029631094, + "learning_rate": 9.937198129205792e-06, + "loss": 1.0685, + "step": 1108 + }, + { + "epoch": 0.16, + "grad_norm": 9.991730433417482, + "learning_rate": 9.937015551920063e-06, + "loss": 1.1269, + "step": 1109 + }, + { + "epoch": 0.16, + "grad_norm": 10.177878576653105, + "learning_rate": 9.936832711307664e-06, + "loss": 1.0773, + "step": 1110 + }, + { + "epoch": 0.16, + "grad_norm": 8.91736270260804, + "learning_rate": 9.936649607378353e-06, + "loss": 1.1264, + "step": 1111 + }, + { + "epoch": 0.16, + "grad_norm": 10.09236315398289, + "learning_rate": 9.936466240141892e-06, + "loss": 1.0796, + "step": 1112 + }, + { + "epoch": 0.16, + "grad_norm": 7.794799566108366, + "learning_rate": 9.936282609608061e-06, + "loss": 1.1944, + "step": 1113 + }, + { + "epoch": 0.16, + "grad_norm": 10.315400137508453, + "learning_rate": 9.936098715786657e-06, + "loss": 1.0627, + "step": 1114 + }, + { + "epoch": 0.16, + "grad_norm": 9.271133639938963, + "learning_rate": 9.935914558687488e-06, + "loss": 1.0971, + "step": 1115 + }, + { + "epoch": 0.16, + "grad_norm": 7.437439186578602, + "learning_rate": 9.935730138320374e-06, + "loss": 1.1219, + "step": 1116 + }, + { + "epoch": 0.16, + "grad_norm": 
7.865854171628148, + "learning_rate": 9.935545454695155e-06, + "loss": 1.1293, + "step": 1117 + }, + { + "epoch": 0.16, + "grad_norm": 8.123605493797166, + "learning_rate": 9.935360507821678e-06, + "loss": 1.1751, + "step": 1118 + }, + { + "epoch": 0.16, + "grad_norm": 10.988142411407201, + "learning_rate": 9.93517529770981e-06, + "loss": 1.0739, + "step": 1119 + }, + { + "epoch": 0.16, + "grad_norm": 8.186644806773327, + "learning_rate": 9.93498982436943e-06, + "loss": 1.0993, + "step": 1120 + }, + { + "epoch": 0.16, + "grad_norm": 9.499250576963705, + "learning_rate": 9.934804087810426e-06, + "loss": 1.0616, + "step": 1121 + }, + { + "epoch": 0.16, + "grad_norm": 8.72852136927071, + "learning_rate": 9.934618088042711e-06, + "loss": 1.1333, + "step": 1122 + }, + { + "epoch": 0.16, + "grad_norm": 9.483162543574448, + "learning_rate": 9.934431825076202e-06, + "loss": 1.1682, + "step": 1123 + }, + { + "epoch": 0.16, + "grad_norm": 9.69188544595919, + "learning_rate": 9.934245298920834e-06, + "loss": 1.0569, + "step": 1124 + }, + { + "epoch": 0.16, + "grad_norm": 7.497668571800592, + "learning_rate": 9.934058509586556e-06, + "loss": 1.1028, + "step": 1125 + }, + { + "epoch": 0.16, + "grad_norm": 10.653980238047474, + "learning_rate": 9.933871457083333e-06, + "loss": 1.0679, + "step": 1126 + }, + { + "epoch": 0.16, + "grad_norm": 9.573524013587194, + "learning_rate": 9.933684141421138e-06, + "loss": 1.1592, + "step": 1127 + }, + { + "epoch": 0.16, + "grad_norm": 9.707171391717003, + "learning_rate": 9.933496562609966e-06, + "loss": 1.1138, + "step": 1128 + }, + { + "epoch": 0.16, + "grad_norm": 9.655870136642156, + "learning_rate": 9.933308720659818e-06, + "loss": 1.17, + "step": 1129 + }, + { + "epoch": 0.16, + "grad_norm": 10.872126517473504, + "learning_rate": 9.933120615580716e-06, + "loss": 1.0705, + "step": 1130 + }, + { + "epoch": 0.16, + "grad_norm": 7.728682675785882, + "learning_rate": 9.932932247382691e-06, + "loss": 1.0817, + "step": 1131 + }, + { + 
"epoch": 0.16, + "grad_norm": 6.951098393096133, + "learning_rate": 9.93274361607579e-06, + "loss": 1.1232, + "step": 1132 + }, + { + "epoch": 0.16, + "grad_norm": 8.222889446680403, + "learning_rate": 9.932554721670077e-06, + "loss": 1.0974, + "step": 1133 + }, + { + "epoch": 0.16, + "grad_norm": 10.11737357391576, + "learning_rate": 9.932365564175623e-06, + "loss": 1.0797, + "step": 1134 + }, + { + "epoch": 0.16, + "grad_norm": 8.869618646875386, + "learning_rate": 9.93217614360252e-06, + "loss": 1.1822, + "step": 1135 + }, + { + "epoch": 0.16, + "grad_norm": 9.263035916003135, + "learning_rate": 9.931986459960872e-06, + "loss": 1.1447, + "step": 1136 + }, + { + "epoch": 0.16, + "grad_norm": 9.5353912329103, + "learning_rate": 9.931796513260791e-06, + "loss": 1.0784, + "step": 1137 + }, + { + "epoch": 0.16, + "grad_norm": 7.9642078222144725, + "learning_rate": 9.931606303512415e-06, + "loss": 1.0645, + "step": 1138 + }, + { + "epoch": 0.16, + "grad_norm": 10.233196552750902, + "learning_rate": 9.931415830725885e-06, + "loss": 1.1487, + "step": 1139 + }, + { + "epoch": 0.16, + "grad_norm": 11.085232815257786, + "learning_rate": 9.93122509491136e-06, + "loss": 1.0783, + "step": 1140 + }, + { + "epoch": 0.16, + "grad_norm": 8.693242284783665, + "learning_rate": 9.931034096079015e-06, + "loss": 1.0789, + "step": 1141 + }, + { + "epoch": 0.16, + "grad_norm": 9.793143531779897, + "learning_rate": 9.930842834239038e-06, + "loss": 1.1301, + "step": 1142 + }, + { + "epoch": 0.16, + "grad_norm": 10.278036427620934, + "learning_rate": 9.930651309401629e-06, + "loss": 1.1133, + "step": 1143 + }, + { + "epoch": 0.16, + "grad_norm": 8.9576755189414, + "learning_rate": 9.930459521577002e-06, + "loss": 1.1163, + "step": 1144 + }, + { + "epoch": 0.16, + "grad_norm": 9.144329631603688, + "learning_rate": 9.930267470775391e-06, + "loss": 1.1468, + "step": 1145 + }, + { + "epoch": 0.16, + "grad_norm": 8.902591650602561, + "learning_rate": 9.930075157007035e-06, + "loss": 1.0788, + 
"step": 1146 + }, + { + "epoch": 0.16, + "grad_norm": 9.240485680292965, + "learning_rate": 9.929882580282193e-06, + "loss": 1.228, + "step": 1147 + }, + { + "epoch": 0.16, + "grad_norm": 10.941869971113794, + "learning_rate": 9.929689740611137e-06, + "loss": 1.1506, + "step": 1148 + }, + { + "epoch": 0.16, + "grad_norm": 9.573514008516675, + "learning_rate": 9.929496638004152e-06, + "loss": 1.0872, + "step": 1149 + }, + { + "epoch": 0.16, + "grad_norm": 8.4855447818549, + "learning_rate": 9.929303272471537e-06, + "loss": 1.0575, + "step": 1150 + }, + { + "epoch": 0.16, + "grad_norm": 10.6603887656913, + "learning_rate": 9.92910964402361e-06, + "loss": 1.1742, + "step": 1151 + }, + { + "epoch": 0.16, + "grad_norm": 9.608141883779801, + "learning_rate": 9.92891575267069e-06, + "loss": 1.1318, + "step": 1152 + }, + { + "epoch": 0.16, + "grad_norm": 9.338460910075543, + "learning_rate": 9.928721598423125e-06, + "loss": 1.1059, + "step": 1153 + }, + { + "epoch": 0.16, + "grad_norm": 9.518208973627814, + "learning_rate": 9.92852718129127e-06, + "loss": 1.1226, + "step": 1154 + }, + { + "epoch": 0.16, + "grad_norm": 8.766911294985174, + "learning_rate": 9.928332501285493e-06, + "loss": 1.2089, + "step": 1155 + }, + { + "epoch": 0.16, + "grad_norm": 7.83005399680488, + "learning_rate": 9.92813755841618e-06, + "loss": 1.0808, + "step": 1156 + }, + { + "epoch": 0.17, + "grad_norm": 11.944218537224852, + "learning_rate": 9.927942352693725e-06, + "loss": 1.0966, + "step": 1157 + }, + { + "epoch": 0.17, + "grad_norm": 11.172286932537125, + "learning_rate": 9.927746884128542e-06, + "loss": 1.0596, + "step": 1158 + }, + { + "epoch": 0.17, + "grad_norm": 10.587650790488437, + "learning_rate": 9.92755115273106e-06, + "loss": 1.1561, + "step": 1159 + }, + { + "epoch": 0.17, + "grad_norm": 7.238050478205782, + "learning_rate": 9.92735515851171e-06, + "loss": 1.1141, + "step": 1160 + }, + { + "epoch": 0.17, + "grad_norm": 10.408677155862119, + "learning_rate": 9.927158901480956e-06, 
+ "loss": 1.128, + "step": 1161 + }, + { + "epoch": 0.17, + "grad_norm": 9.820218492219816, + "learning_rate": 9.926962381649259e-06, + "loss": 1.1546, + "step": 1162 + }, + { + "epoch": 0.17, + "grad_norm": 8.074341415996674, + "learning_rate": 9.926765599027103e-06, + "loss": 1.0512, + "step": 1163 + }, + { + "epoch": 0.17, + "grad_norm": 7.895566309483361, + "learning_rate": 9.926568553624983e-06, + "loss": 1.0672, + "step": 1164 + }, + { + "epoch": 0.17, + "grad_norm": 10.065156401547997, + "learning_rate": 9.92637124545341e-06, + "loss": 1.0686, + "step": 1165 + }, + { + "epoch": 0.17, + "grad_norm": 9.558295037215455, + "learning_rate": 9.926173674522908e-06, + "loss": 1.0281, + "step": 1166 + }, + { + "epoch": 0.17, + "grad_norm": 9.651671913815035, + "learning_rate": 9.925975840844014e-06, + "loss": 1.2218, + "step": 1167 + }, + { + "epoch": 0.17, + "grad_norm": 7.892806167092602, + "learning_rate": 9.92577774442728e-06, + "loss": 1.0851, + "step": 1168 + }, + { + "epoch": 0.17, + "grad_norm": 10.215999276479408, + "learning_rate": 9.925579385283273e-06, + "loss": 1.119, + "step": 1169 + }, + { + "epoch": 0.17, + "grad_norm": 9.69593539415043, + "learning_rate": 9.925380763422573e-06, + "loss": 1.0814, + "step": 1170 + }, + { + "epoch": 0.17, + "grad_norm": 7.583154813745731, + "learning_rate": 9.92518187885577e-06, + "loss": 1.0652, + "step": 1171 + }, + { + "epoch": 0.17, + "grad_norm": 7.519637478025454, + "learning_rate": 9.924982731593476e-06, + "loss": 1.0955, + "step": 1172 + }, + { + "epoch": 0.17, + "grad_norm": 6.731614530536422, + "learning_rate": 9.924783321646312e-06, + "loss": 1.078, + "step": 1173 + }, + { + "epoch": 0.17, + "grad_norm": 9.080855313710074, + "learning_rate": 9.924583649024915e-06, + "loss": 1.0988, + "step": 1174 + }, + { + "epoch": 0.17, + "grad_norm": 13.16584445550648, + "learning_rate": 9.924383713739932e-06, + "loss": 1.1529, + "step": 1175 + }, + { + "epoch": 0.17, + "grad_norm": 7.5331242589649205, + "learning_rate": 
9.92418351580203e-06, + "loss": 1.1433, + "step": 1176 + }, + { + "epoch": 0.17, + "grad_norm": 10.353407842560696, + "learning_rate": 9.923983055221885e-06, + "loss": 1.117, + "step": 1177 + }, + { + "epoch": 0.17, + "grad_norm": 8.125858486653119, + "learning_rate": 9.92378233201019e-06, + "loss": 1.1231, + "step": 1178 + }, + { + "epoch": 0.17, + "grad_norm": 7.52763915963895, + "learning_rate": 9.923581346177652e-06, + "loss": 1.0823, + "step": 1179 + }, + { + "epoch": 0.17, + "grad_norm": 8.525951412163911, + "learning_rate": 9.923380097734989e-06, + "loss": 1.0286, + "step": 1180 + }, + { + "epoch": 0.17, + "grad_norm": 10.699040605232776, + "learning_rate": 9.923178586692936e-06, + "loss": 1.1493, + "step": 1181 + }, + { + "epoch": 0.17, + "grad_norm": 7.236946126704272, + "learning_rate": 9.922976813062241e-06, + "loss": 1.1349, + "step": 1182 + }, + { + "epoch": 0.17, + "grad_norm": 7.684409155105941, + "learning_rate": 9.922774776853665e-06, + "loss": 1.1229, + "step": 1183 + }, + { + "epoch": 0.17, + "grad_norm": 8.790681697480581, + "learning_rate": 9.922572478077986e-06, + "loss": 1.0646, + "step": 1184 + }, + { + "epoch": 0.17, + "grad_norm": 7.518241619922023, + "learning_rate": 9.922369916745994e-06, + "loss": 1.1041, + "step": 1185 + }, + { + "epoch": 0.17, + "grad_norm": 7.575594600638225, + "learning_rate": 9.922167092868492e-06, + "loss": 1.1119, + "step": 1186 + }, + { + "epoch": 0.17, + "grad_norm": 8.5877403999594, + "learning_rate": 9.921964006456296e-06, + "loss": 1.1935, + "step": 1187 + }, + { + "epoch": 0.17, + "grad_norm": 8.253723853349976, + "learning_rate": 9.921760657520241e-06, + "loss": 1.1668, + "step": 1188 + }, + { + "epoch": 0.17, + "grad_norm": 8.566468412846003, + "learning_rate": 9.921557046071175e-06, + "loss": 1.1491, + "step": 1189 + }, + { + "epoch": 0.17, + "grad_norm": 9.130611797854765, + "learning_rate": 9.921353172119952e-06, + "loss": 1.0816, + "step": 1190 + }, + { + "epoch": 0.17, + "grad_norm": 
8.084449827030424, + "learning_rate": 9.921149035677451e-06, + "loss": 1.1398, + "step": 1191 + }, + { + "epoch": 0.17, + "grad_norm": 9.890855314715285, + "learning_rate": 9.920944636754559e-06, + "loss": 1.2006, + "step": 1192 + }, + { + "epoch": 0.17, + "grad_norm": 9.768590935089069, + "learning_rate": 9.920739975362177e-06, + "loss": 1.0975, + "step": 1193 + }, + { + "epoch": 0.17, + "grad_norm": 7.669707183699388, + "learning_rate": 9.920535051511221e-06, + "loss": 1.1324, + "step": 1194 + }, + { + "epoch": 0.17, + "grad_norm": 10.286244827901188, + "learning_rate": 9.920329865212624e-06, + "loss": 1.0176, + "step": 1195 + }, + { + "epoch": 0.17, + "grad_norm": 11.246548274506127, + "learning_rate": 9.920124416477326e-06, + "loss": 1.0698, + "step": 1196 + }, + { + "epoch": 0.17, + "grad_norm": 8.996620273858186, + "learning_rate": 9.919918705316287e-06, + "loss": 1.0399, + "step": 1197 + }, + { + "epoch": 0.17, + "grad_norm": 8.40393458668356, + "learning_rate": 9.919712731740478e-06, + "loss": 1.1761, + "step": 1198 + }, + { + "epoch": 0.17, + "grad_norm": 6.785591174302111, + "learning_rate": 9.919506495760888e-06, + "loss": 1.1448, + "step": 1199 + }, + { + "epoch": 0.17, + "grad_norm": 7.359066233221308, + "learning_rate": 9.919299997388514e-06, + "loss": 1.1235, + "step": 1200 + }, + { + "epoch": 0.17, + "grad_norm": 7.6582287571889625, + "learning_rate": 9.919093236634372e-06, + "loss": 1.0926, + "step": 1201 + }, + { + "epoch": 0.17, + "grad_norm": 7.178647300279617, + "learning_rate": 9.918886213509488e-06, + "loss": 1.13, + "step": 1202 + }, + { + "epoch": 0.17, + "grad_norm": 11.675906846859982, + "learning_rate": 9.918678928024905e-06, + "loss": 1.175, + "step": 1203 + }, + { + "epoch": 0.17, + "grad_norm": 9.22450656389854, + "learning_rate": 9.918471380191681e-06, + "loss": 1.1221, + "step": 1204 + }, + { + "epoch": 0.17, + "grad_norm": 7.0711040883469565, + "learning_rate": 9.918263570020883e-06, + "loss": 1.0912, + "step": 1205 + }, + { + 
"epoch": 0.17, + "grad_norm": 12.069192757275381, + "learning_rate": 9.918055497523595e-06, + "loss": 1.0995, + "step": 1206 + }, + { + "epoch": 0.17, + "grad_norm": 7.7400688085591725, + "learning_rate": 9.917847162710918e-06, + "loss": 1.1604, + "step": 1207 + }, + { + "epoch": 0.17, + "grad_norm": 8.32493729047497, + "learning_rate": 9.917638565593964e-06, + "loss": 1.0895, + "step": 1208 + }, + { + "epoch": 0.17, + "grad_norm": 7.536412295234804, + "learning_rate": 9.917429706183854e-06, + "loss": 1.1496, + "step": 1209 + }, + { + "epoch": 0.17, + "grad_norm": 9.38755777145255, + "learning_rate": 9.917220584491731e-06, + "loss": 1.0578, + "step": 1210 + }, + { + "epoch": 0.17, + "grad_norm": 8.26379243253755, + "learning_rate": 9.917011200528752e-06, + "loss": 1.1287, + "step": 1211 + }, + { + "epoch": 0.17, + "grad_norm": 8.555884969231833, + "learning_rate": 9.91680155430608e-06, + "loss": 1.1236, + "step": 1212 + }, + { + "epoch": 0.17, + "grad_norm": 6.961048863146423, + "learning_rate": 9.9165916458349e-06, + "loss": 1.0863, + "step": 1213 + }, + { + "epoch": 0.17, + "grad_norm": 8.304862077737468, + "learning_rate": 9.916381475126406e-06, + "loss": 1.09, + "step": 1214 + }, + { + "epoch": 0.17, + "grad_norm": 9.809664883390326, + "learning_rate": 9.916171042191811e-06, + "loss": 1.0941, + "step": 1215 + }, + { + "epoch": 0.17, + "grad_norm": 7.04950623952086, + "learning_rate": 9.915960347042335e-06, + "loss": 1.1485, + "step": 1216 + }, + { + "epoch": 0.17, + "grad_norm": 7.72391735997582, + "learning_rate": 9.91574938968922e-06, + "loss": 1.1312, + "step": 1217 + }, + { + "epoch": 0.17, + "grad_norm": 8.347047873944346, + "learning_rate": 9.915538170143712e-06, + "loss": 1.2004, + "step": 1218 + }, + { + "epoch": 0.17, + "grad_norm": 11.812629064978228, + "learning_rate": 9.915326688417084e-06, + "loss": 1.1227, + "step": 1219 + }, + { + "epoch": 0.17, + "grad_norm": 9.43911249701816, + "learning_rate": 9.915114944520611e-06, + "loss": 1.1117, + "step": 
1220 + }, + { + "epoch": 0.17, + "grad_norm": 11.452816306017228, + "learning_rate": 9.914902938465588e-06, + "loss": 1.1051, + "step": 1221 + }, + { + "epoch": 0.17, + "grad_norm": 8.230758551528167, + "learning_rate": 9.914690670263323e-06, + "loss": 1.1377, + "step": 1222 + }, + { + "epoch": 0.17, + "grad_norm": 8.203147308547761, + "learning_rate": 9.914478139925138e-06, + "loss": 1.12, + "step": 1223 + }, + { + "epoch": 0.17, + "grad_norm": 9.990511713042482, + "learning_rate": 9.914265347462368e-06, + "loss": 1.1435, + "step": 1224 + }, + { + "epoch": 0.17, + "grad_norm": 9.248510645098353, + "learning_rate": 9.914052292886364e-06, + "loss": 1.0619, + "step": 1225 + }, + { + "epoch": 0.17, + "grad_norm": 9.965518302975386, + "learning_rate": 9.91383897620849e-06, + "loss": 1.0998, + "step": 1226 + }, + { + "epoch": 0.17, + "grad_norm": 7.703746207188083, + "learning_rate": 9.913625397440122e-06, + "loss": 1.0897, + "step": 1227 + }, + { + "epoch": 0.18, + "grad_norm": 9.636956329083505, + "learning_rate": 9.913411556592652e-06, + "loss": 1.1521, + "step": 1228 + }, + { + "epoch": 0.18, + "grad_norm": 10.51732661648066, + "learning_rate": 9.913197453677486e-06, + "loss": 1.0762, + "step": 1229 + }, + { + "epoch": 0.18, + "grad_norm": 8.075248181113443, + "learning_rate": 9.912983088706044e-06, + "loss": 1.0881, + "step": 1230 + }, + { + "epoch": 0.18, + "grad_norm": 7.438294892766044, + "learning_rate": 9.912768461689758e-06, + "loss": 1.1117, + "step": 1231 + }, + { + "epoch": 0.18, + "grad_norm": 8.628883657988728, + "learning_rate": 9.912553572640079e-06, + "loss": 1.064, + "step": 1232 + }, + { + "epoch": 0.18, + "grad_norm": 9.573158726952954, + "learning_rate": 9.912338421568466e-06, + "loss": 1.1014, + "step": 1233 + }, + { + "epoch": 0.18, + "grad_norm": 10.629330293030952, + "learning_rate": 9.912123008486395e-06, + "loss": 1.0725, + "step": 1234 + }, + { + "epoch": 0.18, + "grad_norm": 8.486324542451438, + "learning_rate": 9.911907333405356e-06, + 
"loss": 1.1802, + "step": 1235 + }, + { + "epoch": 0.18, + "grad_norm": 12.620971919016942, + "learning_rate": 9.911691396336852e-06, + "loss": 1.0468, + "step": 1236 + }, + { + "epoch": 0.18, + "grad_norm": 7.425829957762033, + "learning_rate": 9.911475197292401e-06, + "loss": 1.1955, + "step": 1237 + }, + { + "epoch": 0.18, + "grad_norm": 9.827278035955102, + "learning_rate": 9.911258736283535e-06, + "loss": 1.1575, + "step": 1238 + }, + { + "epoch": 0.18, + "grad_norm": 9.861313462453076, + "learning_rate": 9.911042013321797e-06, + "loss": 1.1061, + "step": 1239 + }, + { + "epoch": 0.18, + "grad_norm": 8.697962211405294, + "learning_rate": 9.910825028418748e-06, + "loss": 1.1242, + "step": 1240 + }, + { + "epoch": 0.18, + "grad_norm": 8.1413576436278, + "learning_rate": 9.910607781585963e-06, + "loss": 1.0679, + "step": 1241 + }, + { + "epoch": 0.18, + "grad_norm": 8.165909466990637, + "learning_rate": 9.910390272835027e-06, + "loss": 1.2027, + "step": 1242 + }, + { + "epoch": 0.18, + "grad_norm": 10.680108059384047, + "learning_rate": 9.910172502177542e-06, + "loss": 1.141, + "step": 1243 + }, + { + "epoch": 0.18, + "grad_norm": 9.718302515493681, + "learning_rate": 9.909954469625123e-06, + "loss": 1.1832, + "step": 1244 + }, + { + "epoch": 0.18, + "grad_norm": 7.842229027326358, + "learning_rate": 9.9097361751894e-06, + "loss": 1.0983, + "step": 1245 + }, + { + "epoch": 0.18, + "grad_norm": 7.761326612572715, + "learning_rate": 9.909517618882014e-06, + "loss": 1.139, + "step": 1246 + }, + { + "epoch": 0.18, + "grad_norm": 8.20760804832606, + "learning_rate": 9.909298800714626e-06, + "loss": 1.07, + "step": 1247 + }, + { + "epoch": 0.18, + "grad_norm": 6.593891046959905, + "learning_rate": 9.909079720698904e-06, + "loss": 1.0514, + "step": 1248 + }, + { + "epoch": 0.18, + "grad_norm": 9.287500887152273, + "learning_rate": 9.908860378846534e-06, + "loss": 1.1369, + "step": 1249 + }, + { + "epoch": 0.18, + "grad_norm": 8.067956666439782, + "learning_rate": 
9.908640775169217e-06, + "loss": 1.0764, + "step": 1250 + }, + { + "epoch": 0.18, + "grad_norm": 8.294877246340747, + "learning_rate": 9.908420909678662e-06, + "loss": 1.1995, + "step": 1251 + }, + { + "epoch": 0.18, + "grad_norm": 9.915441076243, + "learning_rate": 9.9082007823866e-06, + "loss": 1.1005, + "step": 1252 + }, + { + "epoch": 0.18, + "grad_norm": 8.380382784302304, + "learning_rate": 9.90798039330477e-06, + "loss": 1.0837, + "step": 1253 + }, + { + "epoch": 0.18, + "grad_norm": 10.84356526957393, + "learning_rate": 9.907759742444927e-06, + "loss": 1.0587, + "step": 1254 + }, + { + "epoch": 0.18, + "grad_norm": 9.01242902812565, + "learning_rate": 9.907538829818841e-06, + "loss": 1.076, + "step": 1255 + }, + { + "epoch": 0.18, + "grad_norm": 8.159171487994158, + "learning_rate": 9.907317655438293e-06, + "loss": 1.0428, + "step": 1256 + }, + { + "epoch": 0.18, + "grad_norm": 8.072501029773175, + "learning_rate": 9.907096219315081e-06, + "loss": 1.061, + "step": 1257 + }, + { + "epoch": 0.18, + "grad_norm": 11.653591822519802, + "learning_rate": 9.906874521461017e-06, + "loss": 1.1111, + "step": 1258 + }, + { + "epoch": 0.18, + "grad_norm": 6.844068742653087, + "learning_rate": 9.906652561887923e-06, + "loss": 1.2678, + "step": 1259 + }, + { + "epoch": 0.18, + "grad_norm": 7.844348979795736, + "learning_rate": 9.90643034060764e-06, + "loss": 1.0906, + "step": 1260 + }, + { + "epoch": 0.18, + "grad_norm": 10.380892340628163, + "learning_rate": 9.906207857632019e-06, + "loss": 1.0058, + "step": 1261 + }, + { + "epoch": 0.18, + "grad_norm": 11.434156889877508, + "learning_rate": 9.905985112972926e-06, + "loss": 1.0552, + "step": 1262 + }, + { + "epoch": 0.18, + "grad_norm": 10.926649244127125, + "learning_rate": 9.905762106642245e-06, + "loss": 1.1399, + "step": 1263 + }, + { + "epoch": 0.18, + "grad_norm": 9.01025421955472, + "learning_rate": 9.905538838651869e-06, + "loss": 1.133, + "step": 1264 + }, + { + "epoch": 0.18, + "grad_norm": 8.159331091402619, + 
"learning_rate": 9.905315309013705e-06, + "loss": 1.0826, + "step": 1265 + }, + { + "epoch": 0.18, + "grad_norm": 8.67613982482397, + "learning_rate": 9.905091517739676e-06, + "loss": 1.0953, + "step": 1266 + }, + { + "epoch": 0.18, + "grad_norm": 11.315598981002115, + "learning_rate": 9.904867464841719e-06, + "loss": 1.0805, + "step": 1267 + }, + { + "epoch": 0.18, + "grad_norm": 8.086374968046101, + "learning_rate": 9.904643150331785e-06, + "loss": 0.9861, + "step": 1268 + }, + { + "epoch": 0.18, + "grad_norm": 6.4192787182320945, + "learning_rate": 9.904418574221838e-06, + "loss": 1.059, + "step": 1269 + }, + { + "epoch": 0.18, + "grad_norm": 6.4659681736207535, + "learning_rate": 9.904193736523855e-06, + "loss": 1.1393, + "step": 1270 + }, + { + "epoch": 0.18, + "grad_norm": 10.437940279433848, + "learning_rate": 9.903968637249828e-06, + "loss": 1.1286, + "step": 1271 + }, + { + "epoch": 0.18, + "grad_norm": 6.896976775083747, + "learning_rate": 9.903743276411766e-06, + "loss": 1.1434, + "step": 1272 + }, + { + "epoch": 0.18, + "grad_norm": 8.758648751204992, + "learning_rate": 9.903517654021687e-06, + "loss": 1.109, + "step": 1273 + }, + { + "epoch": 0.18, + "grad_norm": 8.969642259662876, + "learning_rate": 9.903291770091625e-06, + "loss": 1.2409, + "step": 1274 + }, + { + "epoch": 0.18, + "grad_norm": 8.465172380726782, + "learning_rate": 9.903065624633628e-06, + "loss": 1.1474, + "step": 1275 + }, + { + "epoch": 0.18, + "grad_norm": 12.062271387051156, + "learning_rate": 9.902839217659759e-06, + "loss": 1.0562, + "step": 1276 + }, + { + "epoch": 0.18, + "grad_norm": 9.902833500167436, + "learning_rate": 9.902612549182092e-06, + "loss": 1.1698, + "step": 1277 + }, + { + "epoch": 0.18, + "grad_norm": 9.263464193331686, + "learning_rate": 9.90238561921272e-06, + "loss": 1.0362, + "step": 1278 + }, + { + "epoch": 0.18, + "grad_norm": 9.315837162745957, + "learning_rate": 9.902158427763744e-06, + "loss": 1.1416, + "step": 1279 + }, + { + "epoch": 0.18, + 
"grad_norm": 10.71238017113065, + "learning_rate": 9.901930974847283e-06, + "loss": 0.9973, + "step": 1280 + }, + { + "epoch": 0.18, + "grad_norm": 8.44921393349967, + "learning_rate": 9.901703260475468e-06, + "loss": 1.1469, + "step": 1281 + }, + { + "epoch": 0.18, + "grad_norm": 10.740833628009195, + "learning_rate": 9.901475284660445e-06, + "loss": 1.1033, + "step": 1282 + }, + { + "epoch": 0.18, + "grad_norm": 10.951077672446223, + "learning_rate": 9.901247047414373e-06, + "loss": 1.0932, + "step": 1283 + }, + { + "epoch": 0.18, + "grad_norm": 8.480630589384056, + "learning_rate": 9.901018548749427e-06, + "loss": 1.2583, + "step": 1284 + }, + { + "epoch": 0.18, + "grad_norm": 8.856738290227007, + "learning_rate": 9.900789788677793e-06, + "loss": 1.1405, + "step": 1285 + }, + { + "epoch": 0.18, + "grad_norm": 11.253696724197782, + "learning_rate": 9.900560767211676e-06, + "loss": 1.1756, + "step": 1286 + }, + { + "epoch": 0.18, + "grad_norm": 7.499434653875398, + "learning_rate": 9.900331484363286e-06, + "loss": 1.0936, + "step": 1287 + }, + { + "epoch": 0.18, + "grad_norm": 8.525339482839252, + "learning_rate": 9.900101940144855e-06, + "loss": 1.0577, + "step": 1288 + }, + { + "epoch": 0.18, + "grad_norm": 8.200719069058923, + "learning_rate": 9.899872134568626e-06, + "loss": 1.0701, + "step": 1289 + }, + { + "epoch": 0.18, + "grad_norm": 9.14579737181837, + "learning_rate": 9.899642067646856e-06, + "loss": 1.1076, + "step": 1290 + }, + { + "epoch": 0.18, + "grad_norm": 9.017072400096708, + "learning_rate": 9.899411739391816e-06, + "loss": 1.0659, + "step": 1291 + }, + { + "epoch": 0.18, + "grad_norm": 11.16967339409397, + "learning_rate": 9.899181149815793e-06, + "loss": 1.2106, + "step": 1292 + }, + { + "epoch": 0.18, + "grad_norm": 10.273965044497501, + "learning_rate": 9.898950298931083e-06, + "loss": 1.2038, + "step": 1293 + }, + { + "epoch": 0.18, + "grad_norm": 8.277925942660675, + "learning_rate": 9.898719186750002e-06, + "loss": 1.1687, + "step": 1294 
+ }, + { + "epoch": 0.18, + "grad_norm": 7.865667785145235, + "learning_rate": 9.898487813284874e-06, + "loss": 1.0929, + "step": 1295 + }, + { + "epoch": 0.18, + "grad_norm": 8.50286794084111, + "learning_rate": 9.89825617854804e-06, + "loss": 1.0849, + "step": 1296 + }, + { + "epoch": 0.18, + "grad_norm": 8.270164715713308, + "learning_rate": 9.898024282551858e-06, + "loss": 1.0303, + "step": 1297 + }, + { + "epoch": 0.19, + "grad_norm": 8.93759744799604, + "learning_rate": 9.897792125308694e-06, + "loss": 1.1398, + "step": 1298 + }, + { + "epoch": 0.19, + "grad_norm": 9.532873545646835, + "learning_rate": 9.89755970683093e-06, + "loss": 1.0645, + "step": 1299 + }, + { + "epoch": 0.19, + "grad_norm": 8.480074940408537, + "learning_rate": 9.897327027130965e-06, + "loss": 1.0805, + "step": 1300 + }, + { + "epoch": 0.19, + "grad_norm": 9.859359721787444, + "learning_rate": 9.897094086221209e-06, + "loss": 1.1334, + "step": 1301 + }, + { + "epoch": 0.19, + "grad_norm": 6.841955029426265, + "learning_rate": 9.896860884114084e-06, + "loss": 1.0896, + "step": 1302 + }, + { + "epoch": 0.19, + "grad_norm": 11.435473053090842, + "learning_rate": 9.89662742082203e-06, + "loss": 1.1435, + "step": 1303 + }, + { + "epoch": 0.19, + "grad_norm": 12.767234370223889, + "learning_rate": 9.896393696357499e-06, + "loss": 1.0713, + "step": 1304 + }, + { + "epoch": 0.19, + "grad_norm": 11.035326165425028, + "learning_rate": 9.896159710732957e-06, + "loss": 1.101, + "step": 1305 + }, + { + "epoch": 0.19, + "grad_norm": 10.264801118175946, + "learning_rate": 9.895925463960885e-06, + "loss": 1.1054, + "step": 1306 + }, + { + "epoch": 0.19, + "grad_norm": 8.116622030780347, + "learning_rate": 9.895690956053779e-06, + "loss": 1.0939, + "step": 1307 + }, + { + "epoch": 0.19, + "grad_norm": 8.24355238602407, + "learning_rate": 9.895456187024141e-06, + "loss": 1.1687, + "step": 1308 + }, + { + "epoch": 0.19, + "grad_norm": 9.559593624183648, + "learning_rate": 9.895221156884499e-06, + "loss": 
1.1538, + "step": 1309 + }, + { + "epoch": 0.19, + "grad_norm": 7.365882969915963, + "learning_rate": 9.894985865647386e-06, + "loss": 1.1303, + "step": 1310 + }, + { + "epoch": 0.19, + "grad_norm": 10.014790341512489, + "learning_rate": 9.894750313325351e-06, + "loss": 1.0736, + "step": 1311 + }, + { + "epoch": 0.19, + "grad_norm": 9.430309384085529, + "learning_rate": 9.894514499930961e-06, + "loss": 1.0795, + "step": 1312 + }, + { + "epoch": 0.19, + "grad_norm": 10.819475546112459, + "learning_rate": 9.89427842547679e-06, + "loss": 1.0988, + "step": 1313 + }, + { + "epoch": 0.19, + "grad_norm": 9.513441966638668, + "learning_rate": 9.894042089975431e-06, + "loss": 1.12, + "step": 1314 + }, + { + "epoch": 0.19, + "grad_norm": 7.068936322002252, + "learning_rate": 9.89380549343949e-06, + "loss": 1.151, + "step": 1315 + }, + { + "epoch": 0.19, + "grad_norm": 6.7810555204655465, + "learning_rate": 9.893568635881588e-06, + "loss": 1.0979, + "step": 1316 + }, + { + "epoch": 0.19, + "grad_norm": 10.064111214920217, + "learning_rate": 9.893331517314353e-06, + "loss": 1.0529, + "step": 1317 + }, + { + "epoch": 0.19, + "grad_norm": 10.321075518645644, + "learning_rate": 9.893094137750438e-06, + "loss": 1.0711, + "step": 1318 + }, + { + "epoch": 0.19, + "grad_norm": 9.962156324663372, + "learning_rate": 9.8928564972025e-06, + "loss": 1.0656, + "step": 1319 + }, + { + "epoch": 0.19, + "grad_norm": 10.438790547780533, + "learning_rate": 9.892618595683217e-06, + "loss": 1.1003, + "step": 1320 + }, + { + "epoch": 0.19, + "grad_norm": 8.695514709170899, + "learning_rate": 9.892380433205276e-06, + "loss": 1.0509, + "step": 1321 + }, + { + "epoch": 0.19, + "grad_norm": 9.689025490389616, + "learning_rate": 9.89214200978138e-06, + "loss": 1.0986, + "step": 1322 + }, + { + "epoch": 0.19, + "grad_norm": 9.38180385607492, + "learning_rate": 9.891903325424249e-06, + "loss": 1.1075, + "step": 1323 + }, + { + "epoch": 0.19, + "grad_norm": 7.72115859414938, + "learning_rate": 
9.891664380146608e-06, + "loss": 1.0971, + "step": 1324 + }, + { + "epoch": 0.19, + "grad_norm": 7.840130257795688, + "learning_rate": 9.89142517396121e-06, + "loss": 1.0321, + "step": 1325 + }, + { + "epoch": 0.19, + "grad_norm": 10.728197571973592, + "learning_rate": 9.891185706880804e-06, + "loss": 1.1497, + "step": 1326 + }, + { + "epoch": 0.19, + "grad_norm": 8.328871103127625, + "learning_rate": 9.890945978918168e-06, + "loss": 1.1247, + "step": 1327 + }, + { + "epoch": 0.19, + "grad_norm": 8.750762107149452, + "learning_rate": 9.890705990086089e-06, + "loss": 1.0653, + "step": 1328 + }, + { + "epoch": 0.19, + "grad_norm": 8.9942081425713, + "learning_rate": 9.890465740397363e-06, + "loss": 1.1477, + "step": 1329 + }, + { + "epoch": 0.19, + "grad_norm": 8.232939202067826, + "learning_rate": 9.89022522986481e-06, + "loss": 1.0685, + "step": 1330 + }, + { + "epoch": 0.19, + "grad_norm": 7.533918795940488, + "learning_rate": 9.889984458501255e-06, + "loss": 1.0603, + "step": 1331 + }, + { + "epoch": 0.19, + "grad_norm": 8.201546305740987, + "learning_rate": 9.88974342631954e-06, + "loss": 1.0863, + "step": 1332 + }, + { + "epoch": 0.19, + "grad_norm": 7.511749692286603, + "learning_rate": 9.88950213333252e-06, + "loss": 1.0764, + "step": 1333 + }, + { + "epoch": 0.19, + "grad_norm": 9.59757284590374, + "learning_rate": 9.889260579553069e-06, + "loss": 0.9983, + "step": 1334 + }, + { + "epoch": 0.19, + "grad_norm": 19.972604180183513, + "learning_rate": 9.889018764994065e-06, + "loss": 1.1393, + "step": 1335 + }, + { + "epoch": 0.19, + "grad_norm": 7.5059053588344415, + "learning_rate": 9.88877668966841e-06, + "loss": 1.1273, + "step": 1336 + }, + { + "epoch": 0.19, + "grad_norm": 9.982908718030197, + "learning_rate": 9.888534353589015e-06, + "loss": 1.118, + "step": 1337 + }, + { + "epoch": 0.19, + "grad_norm": 8.361827620053255, + "learning_rate": 9.888291756768804e-06, + "loss": 1.1457, + "step": 1338 + }, + { + "epoch": 0.19, + "grad_norm": 8.520196367729666, 
+ "learning_rate": 9.888048899220718e-06, + "loss": 1.102, + "step": 1339 + }, + { + "epoch": 0.19, + "grad_norm": 9.586038667117107, + "learning_rate": 9.887805780957709e-06, + "loss": 1.0603, + "step": 1340 + }, + { + "epoch": 0.19, + "grad_norm": 9.170998248910923, + "learning_rate": 9.887562401992746e-06, + "loss": 1.141, + "step": 1341 + }, + { + "epoch": 0.19, + "grad_norm": 8.89246504003959, + "learning_rate": 9.887318762338808e-06, + "loss": 1.1113, + "step": 1342 + }, + { + "epoch": 0.19, + "grad_norm": 9.185728280762534, + "learning_rate": 9.887074862008892e-06, + "loss": 1.1337, + "step": 1343 + }, + { + "epoch": 0.19, + "grad_norm": 9.07507270994606, + "learning_rate": 9.886830701016006e-06, + "loss": 1.1202, + "step": 1344 + }, + { + "epoch": 0.19, + "grad_norm": 9.705450214592341, + "learning_rate": 9.886586279373172e-06, + "loss": 1.0804, + "step": 1345 + }, + { + "epoch": 0.19, + "grad_norm": 9.696656622652153, + "learning_rate": 9.886341597093429e-06, + "loss": 1.0968, + "step": 1346 + }, + { + "epoch": 0.19, + "grad_norm": 8.273207135928763, + "learning_rate": 9.886096654189824e-06, + "loss": 1.1753, + "step": 1347 + }, + { + "epoch": 0.19, + "grad_norm": 7.802545936914654, + "learning_rate": 9.885851450675427e-06, + "loss": 1.1269, + "step": 1348 + }, + { + "epoch": 0.19, + "grad_norm": 10.423294199464626, + "learning_rate": 9.885605986563314e-06, + "loss": 1.1511, + "step": 1349 + }, + { + "epoch": 0.19, + "grad_norm": 8.814768805515317, + "learning_rate": 9.885360261866574e-06, + "loss": 1.1122, + "step": 1350 + }, + { + "epoch": 0.19, + "grad_norm": 8.416621103456706, + "learning_rate": 9.885114276598317e-06, + "loss": 1.141, + "step": 1351 + }, + { + "epoch": 0.19, + "grad_norm": 6.976788817751798, + "learning_rate": 9.884868030771663e-06, + "loss": 1.0599, + "step": 1352 + }, + { + "epoch": 0.19, + "grad_norm": 8.902508801871718, + "learning_rate": 9.884621524399745e-06, + "loss": 1.0968, + "step": 1353 + }, + { + "epoch": 0.19, + 
"grad_norm": 10.155543638221795, + "learning_rate": 9.884374757495712e-06, + "loss": 1.0431, + "step": 1354 + }, + { + "epoch": 0.19, + "grad_norm": 9.795014198647554, + "learning_rate": 9.884127730072723e-06, + "loss": 1.0825, + "step": 1355 + }, + { + "epoch": 0.19, + "grad_norm": 10.67955666040836, + "learning_rate": 9.883880442143959e-06, + "loss": 1.1106, + "step": 1356 + }, + { + "epoch": 0.19, + "grad_norm": 9.308415637536383, + "learning_rate": 9.883632893722605e-06, + "loss": 1.1022, + "step": 1357 + }, + { + "epoch": 0.19, + "grad_norm": 8.579716655173304, + "learning_rate": 9.883385084821866e-06, + "loss": 1.1185, + "step": 1358 + }, + { + "epoch": 0.19, + "grad_norm": 9.768877611976377, + "learning_rate": 9.88313701545496e-06, + "loss": 1.1153, + "step": 1359 + }, + { + "epoch": 0.19, + "grad_norm": 7.744783574418334, + "learning_rate": 9.882888685635117e-06, + "loss": 1.0854, + "step": 1360 + }, + { + "epoch": 0.19, + "grad_norm": 8.01572243005487, + "learning_rate": 9.882640095375585e-06, + "loss": 1.0518, + "step": 1361 + }, + { + "epoch": 0.19, + "grad_norm": 6.912020116468136, + "learning_rate": 9.88239124468962e-06, + "loss": 1.08, + "step": 1362 + }, + { + "epoch": 0.19, + "grad_norm": 7.241810152910932, + "learning_rate": 9.882142133590496e-06, + "loss": 1.1081, + "step": 1363 + }, + { + "epoch": 0.19, + "grad_norm": 7.58706209725831, + "learning_rate": 9.8818927620915e-06, + "loss": 1.1188, + "step": 1364 + }, + { + "epoch": 0.19, + "grad_norm": 10.362955083218433, + "learning_rate": 9.881643130205933e-06, + "loss": 1.1757, + "step": 1365 + }, + { + "epoch": 0.19, + "grad_norm": 8.743373599888153, + "learning_rate": 9.88139323794711e-06, + "loss": 1.129, + "step": 1366 + }, + { + "epoch": 0.19, + "grad_norm": 9.955918131094565, + "learning_rate": 9.88114308532836e-06, + "loss": 1.0588, + "step": 1367 + }, + { + "epoch": 0.2, + "grad_norm": 8.58569870766606, + "learning_rate": 9.880892672363022e-06, + "loss": 1.0912, + "step": 1368 + }, + { + 
"epoch": 0.2, + "grad_norm": 9.468104326715835, + "learning_rate": 9.880641999064457e-06, + "loss": 1.0647, + "step": 1369 + }, + { + "epoch": 0.2, + "grad_norm": 12.198770369589026, + "learning_rate": 9.880391065446033e-06, + "loss": 1.0651, + "step": 1370 + }, + { + "epoch": 0.2, + "grad_norm": 8.566845040702932, + "learning_rate": 9.880139871521134e-06, + "loss": 1.1265, + "step": 1371 + }, + { + "epoch": 0.2, + "grad_norm": 6.95649110533655, + "learning_rate": 9.879888417303159e-06, + "loss": 1.1139, + "step": 1372 + }, + { + "epoch": 0.2, + "grad_norm": 10.027103041345741, + "learning_rate": 9.879636702805518e-06, + "loss": 1.0377, + "step": 1373 + }, + { + "epoch": 0.2, + "grad_norm": 10.50417057558522, + "learning_rate": 9.879384728041637e-06, + "loss": 1.1045, + "step": 1374 + }, + { + "epoch": 0.2, + "grad_norm": 7.040809141810538, + "learning_rate": 9.879132493024959e-06, + "loss": 1.1177, + "step": 1375 + }, + { + "epoch": 0.2, + "grad_norm": 8.964958519054225, + "learning_rate": 9.878879997768934e-06, + "loss": 1.1537, + "step": 1376 + }, + { + "epoch": 0.2, + "grad_norm": 10.200010580488462, + "learning_rate": 9.878627242287028e-06, + "loss": 1.1377, + "step": 1377 + }, + { + "epoch": 0.2, + "grad_norm": 9.695945678269311, + "learning_rate": 9.878374226592727e-06, + "loss": 1.1428, + "step": 1378 + }, + { + "epoch": 0.2, + "grad_norm": 10.022598145372845, + "learning_rate": 9.878120950699523e-06, + "loss": 1.0773, + "step": 1379 + }, + { + "epoch": 0.2, + "grad_norm": 10.650280484086563, + "learning_rate": 9.877867414620926e-06, + "loss": 1.0972, + "step": 1380 + }, + { + "epoch": 0.2, + "grad_norm": 9.051597577385865, + "learning_rate": 9.87761361837046e-06, + "loss": 1.0471, + "step": 1381 + }, + { + "epoch": 0.2, + "grad_norm": 8.167422996336063, + "learning_rate": 9.877359561961658e-06, + "loss": 1.1116, + "step": 1382 + }, + { + "epoch": 0.2, + "grad_norm": 6.765715415133771, + "learning_rate": 9.877105245408075e-06, + "loss": 1.1593, + "step": 
1383 + }, + { + "epoch": 0.2, + "grad_norm": 9.072133668980818, + "learning_rate": 9.876850668723274e-06, + "loss": 1.1307, + "step": 1384 + }, + { + "epoch": 0.2, + "grad_norm": 9.47516979325466, + "learning_rate": 9.876595831920832e-06, + "loss": 1.1886, + "step": 1385 + }, + { + "epoch": 0.2, + "grad_norm": 8.053446983830877, + "learning_rate": 9.876340735014343e-06, + "loss": 1.0929, + "step": 1386 + }, + { + "epoch": 0.2, + "grad_norm": 7.240724235939722, + "learning_rate": 9.876085378017412e-06, + "loss": 1.1488, + "step": 1387 + }, + { + "epoch": 0.2, + "grad_norm": 8.617182214861822, + "learning_rate": 9.87582976094366e-06, + "loss": 1.0928, + "step": 1388 + }, + { + "epoch": 0.2, + "grad_norm": 8.65585738563645, + "learning_rate": 9.875573883806722e-06, + "loss": 1.0324, + "step": 1389 + }, + { + "epoch": 0.2, + "grad_norm": 5.979232683824391, + "learning_rate": 9.875317746620242e-06, + "loss": 1.1194, + "step": 1390 + }, + { + "epoch": 0.2, + "grad_norm": 9.01370460429015, + "learning_rate": 9.875061349397885e-06, + "loss": 1.1987, + "step": 1391 + }, + { + "epoch": 0.2, + "grad_norm": 8.899050684367616, + "learning_rate": 9.874804692153325e-06, + "loss": 1.1005, + "step": 1392 + }, + { + "epoch": 0.2, + "grad_norm": 10.950134739384639, + "learning_rate": 9.874547774900252e-06, + "loss": 1.0841, + "step": 1393 + }, + { + "epoch": 0.2, + "grad_norm": 6.559779582223225, + "learning_rate": 9.874290597652369e-06, + "loss": 1.1257, + "step": 1394 + }, + { + "epoch": 0.2, + "grad_norm": 6.77641511077059, + "learning_rate": 9.874033160423393e-06, + "loss": 1.0877, + "step": 1395 + }, + { + "epoch": 0.2, + "grad_norm": 12.931009318270513, + "learning_rate": 9.873775463227057e-06, + "loss": 1.009, + "step": 1396 + }, + { + "epoch": 0.2, + "grad_norm": 10.87591248069772, + "learning_rate": 9.873517506077101e-06, + "loss": 1.0177, + "step": 1397 + }, + { + "epoch": 0.2, + "grad_norm": 10.46525080392852, + "learning_rate": 9.87325928898729e-06, + "loss": 1.0663, + 
"step": 1398 + }, + { + "epoch": 0.2, + "grad_norm": 11.073891299726238, + "learning_rate": 9.87300081197139e-06, + "loss": 1.0657, + "step": 1399 + }, + { + "epoch": 0.2, + "grad_norm": 9.216005279949847, + "learning_rate": 9.872742075043193e-06, + "loss": 1.1202, + "step": 1400 + }, + { + "epoch": 0.2, + "grad_norm": 10.387404369782772, + "learning_rate": 9.872483078216497e-06, + "loss": 1.1959, + "step": 1401 + }, + { + "epoch": 0.2, + "grad_norm": 9.373582409738207, + "learning_rate": 9.872223821505115e-06, + "loss": 1.0215, + "step": 1402 + }, + { + "epoch": 0.2, + "grad_norm": 10.988771378530345, + "learning_rate": 9.871964304922879e-06, + "loss": 1.1189, + "step": 1403 + }, + { + "epoch": 0.2, + "grad_norm": 9.482792617936525, + "learning_rate": 9.871704528483626e-06, + "loss": 1.1239, + "step": 1404 + }, + { + "epoch": 0.2, + "grad_norm": 10.413065535267915, + "learning_rate": 9.871444492201214e-06, + "loss": 1.0754, + "step": 1405 + }, + { + "epoch": 0.2, + "grad_norm": 9.388253965147298, + "learning_rate": 9.871184196089512e-06, + "loss": 1.11, + "step": 1406 + }, + { + "epoch": 0.2, + "grad_norm": 11.584458917367463, + "learning_rate": 9.870923640162406e-06, + "loss": 1.0991, + "step": 1407 + }, + { + "epoch": 0.2, + "grad_norm": 7.467998050620955, + "learning_rate": 9.87066282443379e-06, + "loss": 1.0484, + "step": 1408 + }, + { + "epoch": 0.2, + "grad_norm": 8.697864863424849, + "learning_rate": 9.870401748917576e-06, + "loss": 1.0646, + "step": 1409 + }, + { + "epoch": 0.2, + "grad_norm": 6.836579458543193, + "learning_rate": 9.870140413627691e-06, + "loss": 1.0615, + "step": 1410 + }, + { + "epoch": 0.2, + "grad_norm": 8.448351864598353, + "learning_rate": 9.869878818578071e-06, + "loss": 0.982, + "step": 1411 + }, + { + "epoch": 0.2, + "grad_norm": 8.56206881838164, + "learning_rate": 9.869616963782671e-06, + "loss": 1.1174, + "step": 1412 + }, + { + "epoch": 0.2, + "grad_norm": 8.861434615270486, + "learning_rate": 9.869354849255457e-06, + "loss": 
1.0344, + "step": 1413 + }, + { + "epoch": 0.2, + "grad_norm": 7.967263402508492, + "learning_rate": 9.869092475010408e-06, + "loss": 1.0853, + "step": 1414 + }, + { + "epoch": 0.2, + "grad_norm": 10.02726555724784, + "learning_rate": 9.868829841061521e-06, + "loss": 1.0432, + "step": 1415 + }, + { + "epoch": 0.2, + "grad_norm": 7.714555498589986, + "learning_rate": 9.868566947422802e-06, + "loss": 1.1299, + "step": 1416 + }, + { + "epoch": 0.2, + "grad_norm": 10.239163586247084, + "learning_rate": 9.868303794108275e-06, + "loss": 1.1438, + "step": 1417 + }, + { + "epoch": 0.2, + "grad_norm": 5.974134307678351, + "learning_rate": 9.868040381131974e-06, + "loss": 1.1659, + "step": 1418 + }, + { + "epoch": 0.2, + "grad_norm": 8.72805081157597, + "learning_rate": 9.867776708507947e-06, + "loss": 1.0658, + "step": 1419 + }, + { + "epoch": 0.2, + "grad_norm": 8.964091881058126, + "learning_rate": 9.867512776250262e-06, + "loss": 1.0715, + "step": 1420 + }, + { + "epoch": 0.2, + "grad_norm": 8.71848750350186, + "learning_rate": 9.867248584372996e-06, + "loss": 1.043, + "step": 1421 + }, + { + "epoch": 0.2, + "grad_norm": 8.283708048503595, + "learning_rate": 9.866984132890237e-06, + "loss": 1.049, + "step": 1422 + }, + { + "epoch": 0.2, + "grad_norm": 8.901649407429076, + "learning_rate": 9.86671942181609e-06, + "loss": 1.0934, + "step": 1423 + }, + { + "epoch": 0.2, + "grad_norm": 8.282357381819908, + "learning_rate": 9.866454451164678e-06, + "loss": 1.1485, + "step": 1424 + }, + { + "epoch": 0.2, + "grad_norm": 9.651571123653712, + "learning_rate": 9.86618922095013e-06, + "loss": 1.1597, + "step": 1425 + }, + { + "epoch": 0.2, + "grad_norm": 10.687219478435237, + "learning_rate": 9.865923731186597e-06, + "loss": 1.0876, + "step": 1426 + }, + { + "epoch": 0.2, + "grad_norm": 9.513757550411386, + "learning_rate": 9.865657981888235e-06, + "loss": 1.0889, + "step": 1427 + }, + { + "epoch": 0.2, + "grad_norm": 7.419682221735929, + "learning_rate": 9.865391973069218e-06, + 
"loss": 1.089, + "step": 1428 + }, + { + "epoch": 0.2, + "grad_norm": 9.954680154805862, + "learning_rate": 9.865125704743738e-06, + "loss": 1.1366, + "step": 1429 + }, + { + "epoch": 0.2, + "grad_norm": 9.555181854549875, + "learning_rate": 9.864859176925995e-06, + "loss": 1.1406, + "step": 1430 + }, + { + "epoch": 0.2, + "grad_norm": 11.768605645967979, + "learning_rate": 9.864592389630202e-06, + "loss": 1.0539, + "step": 1431 + }, + { + "epoch": 0.2, + "grad_norm": 8.813615792012184, + "learning_rate": 9.864325342870595e-06, + "loss": 1.0346, + "step": 1432 + }, + { + "epoch": 0.2, + "grad_norm": 10.26446025546781, + "learning_rate": 9.864058036661413e-06, + "loss": 1.037, + "step": 1433 + }, + { + "epoch": 0.2, + "grad_norm": 8.865044845882863, + "learning_rate": 9.863790471016915e-06, + "loss": 1.0805, + "step": 1434 + }, + { + "epoch": 0.2, + "grad_norm": 8.95779466039497, + "learning_rate": 9.863522645951371e-06, + "loss": 1.0315, + "step": 1435 + }, + { + "epoch": 0.2, + "grad_norm": 9.87851344773488, + "learning_rate": 9.863254561479067e-06, + "loss": 1.1042, + "step": 1436 + }, + { + "epoch": 0.2, + "grad_norm": 8.673977521849585, + "learning_rate": 9.862986217614303e-06, + "loss": 1.0762, + "step": 1437 + }, + { + "epoch": 0.21, + "grad_norm": 6.694070244988544, + "learning_rate": 9.862717614371389e-06, + "loss": 1.0963, + "step": 1438 + }, + { + "epoch": 0.21, + "grad_norm": 9.161463352059004, + "learning_rate": 9.862448751764653e-06, + "loss": 1.0851, + "step": 1439 + }, + { + "epoch": 0.21, + "grad_norm": 8.546620915740538, + "learning_rate": 9.862179629808435e-06, + "loss": 1.123, + "step": 1440 + }, + { + "epoch": 0.21, + "grad_norm": 7.58895550852893, + "learning_rate": 9.86191024851709e-06, + "loss": 1.0659, + "step": 1441 + }, + { + "epoch": 0.21, + "grad_norm": 9.818848569233412, + "learning_rate": 9.861640607904985e-06, + "loss": 1.1392, + "step": 1442 + }, + { + "epoch": 0.21, + "grad_norm": 15.166110983250643, + "learning_rate": 
9.861370707986504e-06, + "loss": 1.0938, + "step": 1443 + }, + { + "epoch": 0.21, + "grad_norm": 8.733361015627016, + "learning_rate": 9.861100548776039e-06, + "loss": 1.0822, + "step": 1444 + }, + { + "epoch": 0.21, + "grad_norm": 10.617731477350572, + "learning_rate": 9.860830130288003e-06, + "loss": 1.0299, + "step": 1445 + }, + { + "epoch": 0.21, + "grad_norm": 9.765859711365614, + "learning_rate": 9.860559452536818e-06, + "loss": 1.0691, + "step": 1446 + }, + { + "epoch": 0.21, + "grad_norm": 9.984788237947805, + "learning_rate": 9.860288515536922e-06, + "loss": 1.0838, + "step": 1447 + }, + { + "epoch": 0.21, + "grad_norm": 6.680953631036643, + "learning_rate": 9.860017319302763e-06, + "loss": 1.0987, + "step": 1448 + }, + { + "epoch": 0.21, + "grad_norm": 8.624937777675681, + "learning_rate": 9.85974586384881e-06, + "loss": 1.0967, + "step": 1449 + }, + { + "epoch": 0.21, + "grad_norm": 7.9068271195212425, + "learning_rate": 9.859474149189541e-06, + "loss": 1.0471, + "step": 1450 + }, + { + "epoch": 0.21, + "grad_norm": 10.999614408688677, + "learning_rate": 9.859202175339445e-06, + "loss": 1.0858, + "step": 1451 + }, + { + "epoch": 0.21, + "grad_norm": 5.858066363637516, + "learning_rate": 9.858929942313031e-06, + "loss": 1.0111, + "step": 1452 + }, + { + "epoch": 0.21, + "grad_norm": 8.56948561757752, + "learning_rate": 9.85865745012482e-06, + "loss": 1.0681, + "step": 1453 + }, + { + "epoch": 0.21, + "grad_norm": 8.446222672981389, + "learning_rate": 9.858384698789342e-06, + "loss": 1.0557, + "step": 1454 + }, + { + "epoch": 0.21, + "grad_norm": 9.794388634468547, + "learning_rate": 9.85811168832115e-06, + "loss": 1.0433, + "step": 1455 + }, + { + "epoch": 0.21, + "grad_norm": 7.374232394507551, + "learning_rate": 9.857838418734803e-06, + "loss": 1.0115, + "step": 1456 + }, + { + "epoch": 0.21, + "grad_norm": 8.77133033576864, + "learning_rate": 9.857564890044877e-06, + "loss": 1.1276, + "step": 1457 + }, + { + "epoch": 0.21, + "grad_norm": 
11.030142620075264, + "learning_rate": 9.85729110226596e-06, + "loss": 1.0558, + "step": 1458 + }, + { + "epoch": 0.21, + "grad_norm": 5.164197756778955, + "learning_rate": 9.857017055412657e-06, + "loss": 1.1137, + "step": 1459 + }, + { + "epoch": 0.21, + "grad_norm": 6.864424288357119, + "learning_rate": 9.856742749499582e-06, + "loss": 1.0307, + "step": 1460 + }, + { + "epoch": 0.21, + "grad_norm": 9.428486169742907, + "learning_rate": 9.856468184541368e-06, + "loss": 1.0241, + "step": 1461 + }, + { + "epoch": 0.21, + "grad_norm": 9.539421784205132, + "learning_rate": 9.856193360552659e-06, + "loss": 1.102, + "step": 1462 + }, + { + "epoch": 0.21, + "grad_norm": 8.321690362257936, + "learning_rate": 9.855918277548115e-06, + "loss": 1.1333, + "step": 1463 + }, + { + "epoch": 0.21, + "grad_norm": 10.486840941854293, + "learning_rate": 9.855642935542407e-06, + "loss": 1.0713, + "step": 1464 + }, + { + "epoch": 0.21, + "grad_norm": 7.44679292693836, + "learning_rate": 9.855367334550218e-06, + "loss": 1.1227, + "step": 1465 + }, + { + "epoch": 0.21, + "grad_norm": 6.948249878329554, + "learning_rate": 9.855091474586252e-06, + "loss": 1.0902, + "step": 1466 + }, + { + "epoch": 0.21, + "grad_norm": 9.586184858859344, + "learning_rate": 9.854815355665222e-06, + "loss": 1.1066, + "step": 1467 + }, + { + "epoch": 0.21, + "grad_norm": 7.087819863659885, + "learning_rate": 9.854538977801852e-06, + "loss": 1.0887, + "step": 1468 + }, + { + "epoch": 0.21, + "grad_norm": 7.163875129146382, + "learning_rate": 9.854262341010888e-06, + "loss": 1.106, + "step": 1469 + }, + { + "epoch": 0.21, + "grad_norm": 8.01108417930635, + "learning_rate": 9.85398544530708e-06, + "loss": 1.1036, + "step": 1470 + }, + { + "epoch": 0.21, + "grad_norm": 7.405662478870584, + "learning_rate": 9.853708290705201e-06, + "loss": 1.1306, + "step": 1471 + }, + { + "epoch": 0.21, + "grad_norm": 9.233263769578565, + "learning_rate": 9.85343087722003e-06, + "loss": 1.0334, + "step": 1472 + }, + { + "epoch": 
0.21, + "grad_norm": 7.869496718983072, + "learning_rate": 9.85315320486637e-06, + "loss": 1.0917, + "step": 1473 + }, + { + "epoch": 0.21, + "grad_norm": 8.415044466558191, + "learning_rate": 9.852875273659024e-06, + "loss": 1.092, + "step": 1474 + }, + { + "epoch": 0.21, + "grad_norm": 8.633701173355556, + "learning_rate": 9.85259708361282e-06, + "loss": 1.0994, + "step": 1475 + }, + { + "epoch": 0.21, + "grad_norm": 10.250066838594206, + "learning_rate": 9.852318634742594e-06, + "loss": 1.1278, + "step": 1476 + }, + { + "epoch": 0.21, + "grad_norm": 9.54851876471368, + "learning_rate": 9.8520399270632e-06, + "loss": 1.129, + "step": 1477 + }, + { + "epoch": 0.21, + "grad_norm": 8.834047583641535, + "learning_rate": 9.851760960589501e-06, + "loss": 1.1333, + "step": 1478 + }, + { + "epoch": 0.21, + "grad_norm": 8.416602894255465, + "learning_rate": 9.851481735336376e-06, + "loss": 1.0731, + "step": 1479 + }, + { + "epoch": 0.21, + "grad_norm": 6.61060572268527, + "learning_rate": 9.851202251318721e-06, + "loss": 1.0938, + "step": 1480 + }, + { + "epoch": 0.21, + "grad_norm": 8.443421559905326, + "learning_rate": 9.850922508551442e-06, + "loss": 1.0633, + "step": 1481 + }, + { + "epoch": 0.21, + "grad_norm": 6.784214589177235, + "learning_rate": 9.850642507049459e-06, + "loss": 1.0879, + "step": 1482 + }, + { + "epoch": 0.21, + "grad_norm": 9.055445032354138, + "learning_rate": 9.850362246827706e-06, + "loss": 1.0945, + "step": 1483 + }, + { + "epoch": 0.21, + "grad_norm": 7.988344608123538, + "learning_rate": 9.850081727901131e-06, + "loss": 1.0759, + "step": 1484 + }, + { + "epoch": 0.21, + "grad_norm": 8.900823429471703, + "learning_rate": 9.849800950284697e-06, + "loss": 1.1399, + "step": 1485 + }, + { + "epoch": 0.21, + "grad_norm": 9.323295369756504, + "learning_rate": 9.84951991399338e-06, + "loss": 1.0743, + "step": 1486 + }, + { + "epoch": 0.21, + "grad_norm": 11.34556520671954, + "learning_rate": 9.849238619042169e-06, + "loss": 1.0632, + "step": 1487 + 
}, + { + "epoch": 0.21, + "grad_norm": 8.29305599255504, + "learning_rate": 9.84895706544607e-06, + "loss": 1.053, + "step": 1488 + }, + { + "epoch": 0.21, + "grad_norm": 10.454921025823877, + "learning_rate": 9.848675253220097e-06, + "loss": 1.0733, + "step": 1489 + }, + { + "epoch": 0.21, + "grad_norm": 12.585074672999438, + "learning_rate": 9.848393182379282e-06, + "loss": 1.0421, + "step": 1490 + }, + { + "epoch": 0.21, + "grad_norm": 9.44260093547447, + "learning_rate": 9.84811085293867e-06, + "loss": 1.145, + "step": 1491 + }, + { + "epoch": 0.21, + "grad_norm": 9.237388442276615, + "learning_rate": 9.847828264913321e-06, + "loss": 1.0305, + "step": 1492 + }, + { + "epoch": 0.21, + "grad_norm": 8.559930430836443, + "learning_rate": 9.847545418318307e-06, + "loss": 1.0829, + "step": 1493 + }, + { + "epoch": 0.21, + "grad_norm": 10.843664449192106, + "learning_rate": 9.847262313168713e-06, + "loss": 1.0906, + "step": 1494 + }, + { + "epoch": 0.21, + "grad_norm": 11.194585832760692, + "learning_rate": 9.846978949479638e-06, + "loss": 1.0934, + "step": 1495 + }, + { + "epoch": 0.21, + "grad_norm": 6.268319199415883, + "learning_rate": 9.846695327266199e-06, + "loss": 1.098, + "step": 1496 + }, + { + "epoch": 0.21, + "grad_norm": 8.131083406308386, + "learning_rate": 9.846411446543523e-06, + "loss": 1.1513, + "step": 1497 + }, + { + "epoch": 0.21, + "grad_norm": 9.629576101298808, + "learning_rate": 9.846127307326748e-06, + "loss": 1.0774, + "step": 1498 + }, + { + "epoch": 0.21, + "grad_norm": 7.0072233427094135, + "learning_rate": 9.845842909631033e-06, + "loss": 1.08, + "step": 1499 + }, + { + "epoch": 0.21, + "grad_norm": 7.735334842968965, + "learning_rate": 9.845558253471546e-06, + "loss": 1.1015, + "step": 1500 + }, + { + "epoch": 0.21, + "grad_norm": 7.4280270999379026, + "learning_rate": 9.84527333886347e-06, + "loss": 1.0477, + "step": 1501 + }, + { + "epoch": 0.21, + "grad_norm": 8.323133301067443, + "learning_rate": 9.844988165822e-06, + "loss": 
1.0814, + "step": 1502 + }, + { + "epoch": 0.21, + "grad_norm": 8.532769124870752, + "learning_rate": 9.844702734362349e-06, + "loss": 1.0785, + "step": 1503 + }, + { + "epoch": 0.21, + "grad_norm": 7.920206817507332, + "learning_rate": 9.844417044499736e-06, + "loss": 0.9927, + "step": 1504 + }, + { + "epoch": 0.21, + "grad_norm": 8.393500168968927, + "learning_rate": 9.844131096249405e-06, + "loss": 1.1268, + "step": 1505 + }, + { + "epoch": 0.21, + "grad_norm": 8.311633691144975, + "learning_rate": 9.843844889626605e-06, + "loss": 1.0652, + "step": 1506 + }, + { + "epoch": 0.21, + "grad_norm": 7.233319750008654, + "learning_rate": 9.8435584246466e-06, + "loss": 1.116, + "step": 1507 + }, + { + "epoch": 0.22, + "grad_norm": 7.05085055970696, + "learning_rate": 9.843271701324673e-06, + "loss": 1.1495, + "step": 1508 + }, + { + "epoch": 0.22, + "grad_norm": 8.749435765630864, + "learning_rate": 9.842984719676112e-06, + "loss": 1.034, + "step": 1509 + }, + { + "epoch": 0.22, + "grad_norm": 7.097379065529494, + "learning_rate": 9.84269747971623e-06, + "loss": 1.1151, + "step": 1510 + }, + { + "epoch": 0.22, + "grad_norm": 11.241627367675074, + "learning_rate": 9.842409981460341e-06, + "loss": 1.1641, + "step": 1511 + }, + { + "epoch": 0.22, + "grad_norm": 8.488572268186662, + "learning_rate": 9.842122224923785e-06, + "loss": 1.2093, + "step": 1512 + }, + { + "epoch": 0.22, + "grad_norm": 8.755298415903754, + "learning_rate": 9.841834210121906e-06, + "loss": 1.143, + "step": 1513 + }, + { + "epoch": 0.22, + "grad_norm": 8.184961171061964, + "learning_rate": 9.841545937070068e-06, + "loss": 1.0588, + "step": 1514 + }, + { + "epoch": 0.22, + "grad_norm": 8.850440598091556, + "learning_rate": 9.841257405783647e-06, + "loss": 1.1202, + "step": 1515 + }, + { + "epoch": 0.22, + "grad_norm": 10.16090037522501, + "learning_rate": 9.84096861627803e-06, + "loss": 1.0733, + "step": 1516 + }, + { + "epoch": 0.22, + "grad_norm": 9.932120286220833, + "learning_rate": 
9.840679568568623e-06, + "loss": 1.1006, + "step": 1517 + }, + { + "epoch": 0.22, + "grad_norm": 7.836638537774612, + "learning_rate": 9.840390262670842e-06, + "loss": 1.0799, + "step": 1518 + }, + { + "epoch": 0.22, + "grad_norm": 8.711883129431738, + "learning_rate": 9.84010069860012e-06, + "loss": 1.1294, + "step": 1519 + }, + { + "epoch": 0.22, + "grad_norm": 8.508879481301141, + "learning_rate": 9.839810876371896e-06, + "loss": 1.0341, + "step": 1520 + }, + { + "epoch": 0.22, + "grad_norm": 8.08655125385915, + "learning_rate": 9.839520796001632e-06, + "loss": 1.0994, + "step": 1521 + }, + { + "epoch": 0.22, + "grad_norm": 8.638026447233552, + "learning_rate": 9.8392304575048e-06, + "loss": 1.1682, + "step": 1522 + }, + { + "epoch": 0.22, + "grad_norm": 12.467858399236109, + "learning_rate": 9.838939860896886e-06, + "loss": 1.0547, + "step": 1523 + }, + { + "epoch": 0.22, + "grad_norm": 5.432475448312895, + "learning_rate": 9.838649006193388e-06, + "loss": 1.116, + "step": 1524 + }, + { + "epoch": 0.22, + "grad_norm": 9.214074946506262, + "learning_rate": 9.838357893409822e-06, + "loss": 1.0573, + "step": 1525 + }, + { + "epoch": 0.22, + "grad_norm": 8.06366902067041, + "learning_rate": 9.838066522561712e-06, + "loss": 1.1113, + "step": 1526 + }, + { + "epoch": 0.22, + "grad_norm": 8.768078897910172, + "learning_rate": 9.837774893664603e-06, + "loss": 1.1314, + "step": 1527 + }, + { + "epoch": 0.22, + "grad_norm": 10.724982864541449, + "learning_rate": 9.837483006734044e-06, + "loss": 1.1424, + "step": 1528 + }, + { + "epoch": 0.22, + "grad_norm": 8.187815919765015, + "learning_rate": 9.837190861785609e-06, + "loss": 1.036, + "step": 1529 + }, + { + "epoch": 0.22, + "grad_norm": 7.243552497338592, + "learning_rate": 9.836898458834876e-06, + "loss": 1.1213, + "step": 1530 + }, + { + "epoch": 0.22, + "grad_norm": 8.704663906574341, + "learning_rate": 9.836605797897446e-06, + "loss": 1.0847, + "step": 1531 + }, + { + "epoch": 0.22, + "grad_norm": 
8.938019073715143, + "learning_rate": 9.836312878988922e-06, + "loss": 1.1253, + "step": 1532 + }, + { + "epoch": 0.22, + "grad_norm": 7.313175478828008, + "learning_rate": 9.836019702124933e-06, + "loss": 1.1541, + "step": 1533 + }, + { + "epoch": 0.22, + "grad_norm": 9.685886819167742, + "learning_rate": 9.835726267321112e-06, + "loss": 1.0064, + "step": 1534 + }, + { + "epoch": 0.22, + "grad_norm": 10.892271072864812, + "learning_rate": 9.835432574593114e-06, + "loss": 1.0605, + "step": 1535 + }, + { + "epoch": 0.22, + "grad_norm": 7.308619958539015, + "learning_rate": 9.835138623956603e-06, + "loss": 1.0337, + "step": 1536 + }, + { + "epoch": 0.22, + "grad_norm": 9.63534920570695, + "learning_rate": 9.834844415427255e-06, + "loss": 1.1168, + "step": 1537 + }, + { + "epoch": 0.22, + "grad_norm": 6.226011128884159, + "learning_rate": 9.834549949020764e-06, + "loss": 1.0256, + "step": 1538 + }, + { + "epoch": 0.22, + "grad_norm": 13.200496422997857, + "learning_rate": 9.834255224752835e-06, + "loss": 1.0158, + "step": 1539 + }, + { + "epoch": 0.22, + "grad_norm": 8.551878861039919, + "learning_rate": 9.833960242639191e-06, + "loss": 1.0831, + "step": 1540 + }, + { + "epoch": 0.22, + "grad_norm": 7.4806616771647185, + "learning_rate": 9.83366500269556e-06, + "loss": 1.0935, + "step": 1541 + }, + { + "epoch": 0.22, + "grad_norm": 9.814909969836437, + "learning_rate": 9.833369504937695e-06, + "loss": 1.1826, + "step": 1542 + }, + { + "epoch": 0.22, + "grad_norm": 8.074897597647725, + "learning_rate": 9.833073749381353e-06, + "loss": 1.1071, + "step": 1543 + }, + { + "epoch": 0.22, + "grad_norm": 7.176730983579259, + "learning_rate": 9.83277773604231e-06, + "loss": 1.0566, + "step": 1544 + }, + { + "epoch": 0.22, + "grad_norm": 9.041017500286317, + "learning_rate": 9.832481464936357e-06, + "loss": 1.151, + "step": 1545 + }, + { + "epoch": 0.22, + "grad_norm": 7.782114871764142, + "learning_rate": 9.832184936079292e-06, + "loss": 1.0755, + "step": 1546 + }, + { + 
"epoch": 0.22, + "grad_norm": 10.620475452705076, + "learning_rate": 9.831888149486933e-06, + "loss": 1.1864, + "step": 1547 + }, + { + "epoch": 0.22, + "grad_norm": 8.014240293311902, + "learning_rate": 9.831591105175108e-06, + "loss": 1.031, + "step": 1548 + }, + { + "epoch": 0.22, + "grad_norm": 8.806540350348149, + "learning_rate": 9.831293803159666e-06, + "loss": 1.1094, + "step": 1549 + }, + { + "epoch": 0.22, + "grad_norm": 7.569971221723261, + "learning_rate": 9.830996243456458e-06, + "loss": 1.0595, + "step": 1550 + }, + { + "epoch": 0.22, + "grad_norm": 7.971189425104546, + "learning_rate": 9.830698426081357e-06, + "loss": 1.0802, + "step": 1551 + }, + { + "epoch": 0.22, + "grad_norm": 7.901704090735758, + "learning_rate": 9.83040035105025e-06, + "loss": 1.0539, + "step": 1552 + }, + { + "epoch": 0.22, + "grad_norm": 7.380091903799149, + "learning_rate": 9.830102018379032e-06, + "loss": 1.1243, + "step": 1553 + }, + { + "epoch": 0.22, + "grad_norm": 9.613611082477194, + "learning_rate": 9.82980342808362e-06, + "loss": 1.0753, + "step": 1554 + }, + { + "epoch": 0.22, + "grad_norm": 9.582581063522511, + "learning_rate": 9.829504580179934e-06, + "loss": 1.0997, + "step": 1555 + }, + { + "epoch": 0.22, + "grad_norm": 9.574451230802186, + "learning_rate": 9.829205474683919e-06, + "loss": 1.0546, + "step": 1556 + }, + { + "epoch": 0.22, + "grad_norm": 7.738009824432397, + "learning_rate": 9.828906111611525e-06, + "loss": 1.013, + "step": 1557 + }, + { + "epoch": 0.22, + "grad_norm": 8.937820702052193, + "learning_rate": 9.82860649097872e-06, + "loss": 1.1103, + "step": 1558 + }, + { + "epoch": 0.22, + "grad_norm": 11.1432839381848, + "learning_rate": 9.828306612801487e-06, + "loss": 1.0946, + "step": 1559 + }, + { + "epoch": 0.22, + "grad_norm": 8.624966241204776, + "learning_rate": 9.828006477095817e-06, + "loss": 1.089, + "step": 1560 + }, + { + "epoch": 0.22, + "grad_norm": 6.848103005024943, + "learning_rate": 9.827706083877721e-06, + "loss": 1.1316, + 
"step": 1561 + }, + { + "epoch": 0.22, + "grad_norm": 8.54596191586356, + "learning_rate": 9.82740543316322e-06, + "loss": 1.0725, + "step": 1562 + }, + { + "epoch": 0.22, + "grad_norm": 8.908870719091421, + "learning_rate": 9.827104524968351e-06, + "loss": 1.0924, + "step": 1563 + }, + { + "epoch": 0.22, + "grad_norm": 10.907475562258602, + "learning_rate": 9.826803359309164e-06, + "loss": 1.0948, + "step": 1564 + }, + { + "epoch": 0.22, + "grad_norm": 7.805583853154702, + "learning_rate": 9.82650193620172e-06, + "loss": 1.0717, + "step": 1565 + }, + { + "epoch": 0.22, + "grad_norm": 8.528279844331047, + "learning_rate": 9.826200255662097e-06, + "loss": 1.1155, + "step": 1566 + }, + { + "epoch": 0.22, + "grad_norm": 8.135205993333235, + "learning_rate": 9.825898317706388e-06, + "loss": 1.1164, + "step": 1567 + }, + { + "epoch": 0.22, + "grad_norm": 6.661600182094031, + "learning_rate": 9.825596122350694e-06, + "loss": 1.1707, + "step": 1568 + }, + { + "epoch": 0.22, + "grad_norm": 7.5375793744326565, + "learning_rate": 9.825293669611136e-06, + "loss": 1.1337, + "step": 1569 + }, + { + "epoch": 0.22, + "grad_norm": 8.178947436094651, + "learning_rate": 9.824990959503845e-06, + "loss": 1.0498, + "step": 1570 + }, + { + "epoch": 0.22, + "grad_norm": 6.834186605976648, + "learning_rate": 9.824687992044966e-06, + "loss": 1.0748, + "step": 1571 + }, + { + "epoch": 0.22, + "grad_norm": 12.754740059224938, + "learning_rate": 9.824384767250658e-06, + "loss": 1.0505, + "step": 1572 + }, + { + "epoch": 0.22, + "grad_norm": 9.147670105210594, + "learning_rate": 9.824081285137097e-06, + "loss": 1.1374, + "step": 1573 + }, + { + "epoch": 0.22, + "grad_norm": 8.442466742289767, + "learning_rate": 9.823777545720468e-06, + "loss": 1.0199, + "step": 1574 + }, + { + "epoch": 0.22, + "grad_norm": 8.879524437346365, + "learning_rate": 9.823473549016973e-06, + "loss": 1.0511, + "step": 1575 + }, + { + "epoch": 0.22, + "grad_norm": 7.587811855848716, + "learning_rate": 
9.823169295042822e-06, + "loss": 1.1822, + "step": 1576 + }, + { + "epoch": 0.22, + "grad_norm": 8.358163513212219, + "learning_rate": 9.822864783814247e-06, + "loss": 1.0479, + "step": 1577 + }, + { + "epoch": 0.23, + "grad_norm": 10.22169452667116, + "learning_rate": 9.82256001534749e-06, + "loss": 1.0952, + "step": 1578 + }, + { + "epoch": 0.23, + "grad_norm": 7.937468560430927, + "learning_rate": 9.822254989658806e-06, + "loss": 1.0875, + "step": 1579 + }, + { + "epoch": 0.23, + "grad_norm": 8.96762471953662, + "learning_rate": 9.821949706764463e-06, + "loss": 1.0568, + "step": 1580 + }, + { + "epoch": 0.23, + "grad_norm": 8.04611006089608, + "learning_rate": 9.821644166680746e-06, + "loss": 1.0856, + "step": 1581 + }, + { + "epoch": 0.23, + "grad_norm": 8.848519162822345, + "learning_rate": 9.821338369423948e-06, + "loss": 1.0621, + "step": 1582 + }, + { + "epoch": 0.23, + "grad_norm": 7.3166716934837694, + "learning_rate": 9.821032315010381e-06, + "loss": 1.1022, + "step": 1583 + }, + { + "epoch": 0.23, + "grad_norm": 9.91570333720168, + "learning_rate": 9.820726003456372e-06, + "loss": 1.1833, + "step": 1584 + }, + { + "epoch": 0.23, + "grad_norm": 9.306416155966636, + "learning_rate": 9.820419434778256e-06, + "loss": 1.0692, + "step": 1585 + }, + { + "epoch": 0.23, + "grad_norm": 8.0133899952175, + "learning_rate": 9.820112608992385e-06, + "loss": 1.0631, + "step": 1586 + }, + { + "epoch": 0.23, + "grad_norm": 8.499997901038338, + "learning_rate": 9.819805526115126e-06, + "loss": 1.0577, + "step": 1587 + }, + { + "epoch": 0.23, + "grad_norm": 8.656419337230755, + "learning_rate": 9.819498186162852e-06, + "loss": 1.0609, + "step": 1588 + }, + { + "epoch": 0.23, + "grad_norm": 8.01394080247554, + "learning_rate": 9.819190589151964e-06, + "loss": 1.1082, + "step": 1589 + }, + { + "epoch": 0.23, + "grad_norm": 12.28628569377676, + "learning_rate": 9.818882735098862e-06, + "loss": 1.1336, + "step": 1590 + }, + { + "epoch": 0.23, + "grad_norm": 7.823558338485334, 
+ "learning_rate": 9.81857462401997e-06, + "loss": 1.052, + "step": 1591 + }, + { + "epoch": 0.23, + "grad_norm": 9.573556497909772, + "learning_rate": 9.818266255931719e-06, + "loss": 1.0525, + "step": 1592 + }, + { + "epoch": 0.23, + "grad_norm": 9.830941960497366, + "learning_rate": 9.817957630850558e-06, + "loss": 1.105, + "step": 1593 + }, + { + "epoch": 0.23, + "grad_norm": 8.111883702596293, + "learning_rate": 9.817648748792947e-06, + "loss": 1.1046, + "step": 1594 + }, + { + "epoch": 0.23, + "grad_norm": 8.944220896053789, + "learning_rate": 9.817339609775363e-06, + "loss": 1.0198, + "step": 1595 + }, + { + "epoch": 0.23, + "grad_norm": 9.927742512801343, + "learning_rate": 9.817030213814292e-06, + "loss": 1.041, + "step": 1596 + }, + { + "epoch": 0.23, + "grad_norm": 11.027854766441179, + "learning_rate": 9.816720560926238e-06, + "loss": 1.0954, + "step": 1597 + }, + { + "epoch": 0.23, + "grad_norm": 8.490789507896908, + "learning_rate": 9.816410651127719e-06, + "loss": 1.0374, + "step": 1598 + }, + { + "epoch": 0.23, + "grad_norm": 9.261344677188527, + "learning_rate": 9.81610048443526e-06, + "loss": 1.1667, + "step": 1599 + }, + { + "epoch": 0.23, + "grad_norm": 9.376197235791727, + "learning_rate": 9.815790060865407e-06, + "loss": 1.1428, + "step": 1600 + }, + { + "epoch": 0.23, + "grad_norm": 9.528592779933375, + "learning_rate": 9.815479380434718e-06, + "loss": 1.0331, + "step": 1601 + }, + { + "epoch": 0.23, + "grad_norm": 6.643468266388616, + "learning_rate": 9.815168443159762e-06, + "loss": 1.1403, + "step": 1602 + }, + { + "epoch": 0.23, + "grad_norm": 7.094909632092029, + "learning_rate": 9.814857249057125e-06, + "loss": 1.0515, + "step": 1603 + }, + { + "epoch": 0.23, + "grad_norm": 6.982168225461286, + "learning_rate": 9.814545798143404e-06, + "loss": 1.1695, + "step": 1604 + }, + { + "epoch": 0.23, + "grad_norm": 11.76938034925118, + "learning_rate": 9.814234090435212e-06, + "loss": 1.0846, + "step": 1605 + }, + { + "epoch": 0.23, + 
"grad_norm": 6.424145441436249, + "learning_rate": 9.813922125949174e-06, + "loss": 1.0574, + "step": 1606 + }, + { + "epoch": 0.23, + "grad_norm": 8.515941710219085, + "learning_rate": 9.81360990470193e-06, + "loss": 1.0078, + "step": 1607 + }, + { + "epoch": 0.23, + "grad_norm": 9.03686065667271, + "learning_rate": 9.813297426710132e-06, + "loss": 1.0351, + "step": 1608 + }, + { + "epoch": 0.23, + "grad_norm": 10.047439459283305, + "learning_rate": 9.812984691990446e-06, + "loss": 1.0952, + "step": 1609 + }, + { + "epoch": 0.23, + "grad_norm": 10.338070247194104, + "learning_rate": 9.812671700559555e-06, + "loss": 1.062, + "step": 1610 + }, + { + "epoch": 0.23, + "grad_norm": 9.96268744406751, + "learning_rate": 9.812358452434151e-06, + "loss": 1.1035, + "step": 1611 + }, + { + "epoch": 0.23, + "grad_norm": 7.723803228920889, + "learning_rate": 9.812044947630943e-06, + "loss": 1.0513, + "step": 1612 + }, + { + "epoch": 0.23, + "grad_norm": 11.047373072008673, + "learning_rate": 9.81173118616665e-06, + "loss": 1.0678, + "step": 1613 + }, + { + "epoch": 0.23, + "grad_norm": 9.685787524988394, + "learning_rate": 9.811417168058012e-06, + "loss": 1.1212, + "step": 1614 + }, + { + "epoch": 0.23, + "grad_norm": 7.090998577630162, + "learning_rate": 9.811102893321774e-06, + "loss": 1.0555, + "step": 1615 + }, + { + "epoch": 0.23, + "grad_norm": 10.767997127515814, + "learning_rate": 9.810788361974698e-06, + "loss": 1.1217, + "step": 1616 + }, + { + "epoch": 0.23, + "grad_norm": 7.743724238306513, + "learning_rate": 9.810473574033564e-06, + "loss": 1.0886, + "step": 1617 + }, + { + "epoch": 0.23, + "grad_norm": 7.510422526528798, + "learning_rate": 9.810158529515157e-06, + "loss": 1.1114, + "step": 1618 + }, + { + "epoch": 0.23, + "grad_norm": 11.15980641443579, + "learning_rate": 9.809843228436285e-06, + "loss": 1.0572, + "step": 1619 + }, + { + "epoch": 0.23, + "grad_norm": 7.764667031975411, + "learning_rate": 9.809527670813764e-06, + "loss": 1.0276, + "step": 1620 + 
}, + { + "epoch": 0.23, + "grad_norm": 9.328483509108512, + "learning_rate": 9.809211856664423e-06, + "loss": 1.0749, + "step": 1621 + }, + { + "epoch": 0.23, + "grad_norm": 6.852304532854427, + "learning_rate": 9.808895786005108e-06, + "loss": 1.1312, + "step": 1622 + }, + { + "epoch": 0.23, + "grad_norm": 9.49539354773631, + "learning_rate": 9.808579458852678e-06, + "loss": 1.0865, + "step": 1623 + }, + { + "epoch": 0.23, + "grad_norm": 10.21806757944238, + "learning_rate": 9.808262875224005e-06, + "loss": 1.0523, + "step": 1624 + }, + { + "epoch": 0.23, + "grad_norm": 6.98365399300213, + "learning_rate": 9.807946035135974e-06, + "loss": 1.027, + "step": 1625 + }, + { + "epoch": 0.23, + "grad_norm": 7.403083400216751, + "learning_rate": 9.807628938605483e-06, + "loss": 1.104, + "step": 1626 + }, + { + "epoch": 0.23, + "grad_norm": 7.946439755494261, + "learning_rate": 9.807311585649449e-06, + "loss": 1.1221, + "step": 1627 + }, + { + "epoch": 0.23, + "grad_norm": 8.067669850786745, + "learning_rate": 9.806993976284793e-06, + "loss": 1.0613, + "step": 1628 + }, + { + "epoch": 0.23, + "grad_norm": 8.005677143553918, + "learning_rate": 9.806676110528462e-06, + "loss": 0.9983, + "step": 1629 + }, + { + "epoch": 0.23, + "grad_norm": 8.209945980329286, + "learning_rate": 9.806357988397405e-06, + "loss": 1.0352, + "step": 1630 + }, + { + "epoch": 0.23, + "grad_norm": 9.708653578586269, + "learning_rate": 9.80603960990859e-06, + "loss": 1.0541, + "step": 1631 + }, + { + "epoch": 0.23, + "grad_norm": 9.684482198264883, + "learning_rate": 9.805720975079002e-06, + "loss": 1.1028, + "step": 1632 + }, + { + "epoch": 0.23, + "grad_norm": 9.84011738032967, + "learning_rate": 9.805402083925635e-06, + "loss": 1.1098, + "step": 1633 + }, + { + "epoch": 0.23, + "grad_norm": 8.110324616031447, + "learning_rate": 9.805082936465496e-06, + "loss": 1.0815, + "step": 1634 + }, + { + "epoch": 0.23, + "grad_norm": 9.904311818999208, + "learning_rate": 9.804763532715607e-06, + "loss": 
1.1552, + "step": 1635 + }, + { + "epoch": 0.23, + "grad_norm": 7.58153362774017, + "learning_rate": 9.804443872693006e-06, + "loss": 1.025, + "step": 1636 + }, + { + "epoch": 0.23, + "grad_norm": 9.02644864674269, + "learning_rate": 9.804123956414741e-06, + "loss": 1.0719, + "step": 1637 + }, + { + "epoch": 0.23, + "grad_norm": 7.172254202662708, + "learning_rate": 9.803803783897879e-06, + "loss": 1.0855, + "step": 1638 + }, + { + "epoch": 0.23, + "grad_norm": 7.004607912046384, + "learning_rate": 9.803483355159494e-06, + "loss": 1.0802, + "step": 1639 + }, + { + "epoch": 0.23, + "grad_norm": 10.330204092551867, + "learning_rate": 9.803162670216675e-06, + "loss": 1.0594, + "step": 1640 + }, + { + "epoch": 0.23, + "grad_norm": 11.407359866067642, + "learning_rate": 9.80284172908653e-06, + "loss": 1.1185, + "step": 1641 + }, + { + "epoch": 0.23, + "grad_norm": 10.29178705592653, + "learning_rate": 9.802520531786178e-06, + "loss": 1.09, + "step": 1642 + }, + { + "epoch": 0.23, + "grad_norm": 8.635197617084744, + "learning_rate": 9.802199078332746e-06, + "loss": 1.1861, + "step": 1643 + }, + { + "epoch": 0.23, + "grad_norm": 10.299576629720653, + "learning_rate": 9.801877368743385e-06, + "loss": 1.067, + "step": 1644 + }, + { + "epoch": 0.23, + "grad_norm": 8.47579895705731, + "learning_rate": 9.801555403035249e-06, + "loss": 1.0314, + "step": 1645 + }, + { + "epoch": 0.23, + "grad_norm": 6.711985571839852, + "learning_rate": 9.801233181225513e-06, + "loss": 1.0834, + "step": 1646 + }, + { + "epoch": 0.23, + "grad_norm": 6.803697072754232, + "learning_rate": 9.800910703331365e-06, + "loss": 1.0945, + "step": 1647 + }, + { + "epoch": 0.24, + "grad_norm": 8.3434837306761, + "learning_rate": 9.800587969370002e-06, + "loss": 1.047, + "step": 1648 + }, + { + "epoch": 0.24, + "grad_norm": 10.25965315804392, + "learning_rate": 9.80026497935864e-06, + "loss": 1.0402, + "step": 1649 + }, + { + "epoch": 0.24, + "grad_norm": 7.926144448192063, + "learning_rate": 
9.799941733314506e-06, + "loss": 1.0721, + "step": 1650 + }, + { + "epoch": 0.24, + "grad_norm": 8.123339400823626, + "learning_rate": 9.799618231254839e-06, + "loss": 1.0595, + "step": 1651 + }, + { + "epoch": 0.24, + "grad_norm": 9.811201466554396, + "learning_rate": 9.799294473196897e-06, + "loss": 1.0946, + "step": 1652 + }, + { + "epoch": 0.24, + "grad_norm": 9.557587187528373, + "learning_rate": 9.798970459157946e-06, + "loss": 1.1029, + "step": 1653 + }, + { + "epoch": 0.24, + "grad_norm": 6.197346735091606, + "learning_rate": 9.79864618915527e-06, + "loss": 1.0946, + "step": 1654 + }, + { + "epoch": 0.24, + "grad_norm": 9.785904697250869, + "learning_rate": 9.798321663206162e-06, + "loss": 1.0287, + "step": 1655 + }, + { + "epoch": 0.24, + "grad_norm": 7.844065804771389, + "learning_rate": 9.797996881327933e-06, + "loss": 1.1277, + "step": 1656 + }, + { + "epoch": 0.24, + "grad_norm": 6.884237655042217, + "learning_rate": 9.797671843537906e-06, + "loss": 1.0964, + "step": 1657 + }, + { + "epoch": 0.24, + "grad_norm": 9.40678506043681, + "learning_rate": 9.797346549853417e-06, + "loss": 1.121, + "step": 1658 + }, + { + "epoch": 0.24, + "grad_norm": 9.007087423718746, + "learning_rate": 9.797021000291817e-06, + "loss": 1.0775, + "step": 1659 + }, + { + "epoch": 0.24, + "grad_norm": 8.818917581207508, + "learning_rate": 9.796695194870469e-06, + "loss": 1.0395, + "step": 1660 + }, + { + "epoch": 0.24, + "grad_norm": 8.111875374507687, + "learning_rate": 9.79636913360675e-06, + "loss": 1.0984, + "step": 1661 + }, + { + "epoch": 0.24, + "grad_norm": 9.724769740848393, + "learning_rate": 9.796042816518054e-06, + "loss": 1.034, + "step": 1662 + }, + { + "epoch": 0.24, + "grad_norm": 9.266588220805053, + "learning_rate": 9.795716243621784e-06, + "loss": 1.0787, + "step": 1663 + }, + { + "epoch": 0.24, + "grad_norm": 8.178646305867934, + "learning_rate": 9.795389414935357e-06, + "loss": 1.1935, + "step": 1664 + }, + { + "epoch": 0.24, + "grad_norm": 
8.564437285251499, + "learning_rate": 9.795062330476208e-06, + "loss": 1.0402, + "step": 1665 + }, + { + "epoch": 0.24, + "grad_norm": 10.610490050473054, + "learning_rate": 9.794734990261781e-06, + "loss": 1.0324, + "step": 1666 + }, + { + "epoch": 0.24, + "grad_norm": 10.296883004464274, + "learning_rate": 9.794407394309536e-06, + "loss": 1.0956, + "step": 1667 + }, + { + "epoch": 0.24, + "grad_norm": 9.170857250134642, + "learning_rate": 9.794079542636946e-06, + "loss": 1.098, + "step": 1668 + }, + { + "epoch": 0.24, + "grad_norm": 6.252131904611354, + "learning_rate": 9.793751435261495e-06, + "loss": 1.0548, + "step": 1669 + }, + { + "epoch": 0.24, + "grad_norm": 9.168061407180586, + "learning_rate": 9.79342307220069e-06, + "loss": 1.0298, + "step": 1670 + }, + { + "epoch": 0.24, + "grad_norm": 7.843728570787432, + "learning_rate": 9.79309445347204e-06, + "loss": 1.0011, + "step": 1671 + }, + { + "epoch": 0.24, + "grad_norm": 10.386560887658348, + "learning_rate": 9.792765579093071e-06, + "loss": 1.0158, + "step": 1672 + }, + { + "epoch": 0.24, + "grad_norm": 7.681220866642975, + "learning_rate": 9.792436449081329e-06, + "loss": 1.14, + "step": 1673 + }, + { + "epoch": 0.24, + "grad_norm": 8.309069455889867, + "learning_rate": 9.792107063454365e-06, + "loss": 1.0915, + "step": 1674 + }, + { + "epoch": 0.24, + "grad_norm": 7.363313857455048, + "learning_rate": 9.791777422229751e-06, + "loss": 1.0643, + "step": 1675 + }, + { + "epoch": 0.24, + "grad_norm": 7.870082871642953, + "learning_rate": 9.791447525425067e-06, + "loss": 1.1061, + "step": 1676 + }, + { + "epoch": 0.24, + "grad_norm": 11.083346511199146, + "learning_rate": 9.79111737305791e-06, + "loss": 1.0403, + "step": 1677 + }, + { + "epoch": 0.24, + "grad_norm": 9.08584765969905, + "learning_rate": 9.790786965145886e-06, + "loss": 1.0798, + "step": 1678 + }, + { + "epoch": 0.24, + "grad_norm": 6.401070231635871, + "learning_rate": 9.79045630170662e-06, + "loss": 1.0495, + "step": 1679 + }, + { + "epoch": 
0.24, + "grad_norm": 7.670930996507948, + "learning_rate": 9.790125382757753e-06, + "loss": 1.0501, + "step": 1680 + }, + { + "epoch": 0.24, + "grad_norm": 9.481011149067845, + "learning_rate": 9.78979420831693e-06, + "loss": 1.0638, + "step": 1681 + }, + { + "epoch": 0.24, + "grad_norm": 7.393940962303388, + "learning_rate": 9.789462778401814e-06, + "loss": 1.1312, + "step": 1682 + }, + { + "epoch": 0.24, + "grad_norm": 7.5196957904211885, + "learning_rate": 9.789131093030087e-06, + "loss": 1.0099, + "step": 1683 + }, + { + "epoch": 0.24, + "grad_norm": 8.982195775030853, + "learning_rate": 9.788799152219438e-06, + "loss": 1.0774, + "step": 1684 + }, + { + "epoch": 0.24, + "grad_norm": 7.517774909001971, + "learning_rate": 9.788466955987573e-06, + "loss": 1.1444, + "step": 1685 + }, + { + "epoch": 0.24, + "grad_norm": 7.399509165729016, + "learning_rate": 9.788134504352207e-06, + "loss": 1.0738, + "step": 1686 + }, + { + "epoch": 0.24, + "grad_norm": 8.549095520109697, + "learning_rate": 9.787801797331076e-06, + "loss": 1.0638, + "step": 1687 + }, + { + "epoch": 0.24, + "grad_norm": 7.140622276812272, + "learning_rate": 9.787468834941923e-06, + "loss": 1.0931, + "step": 1688 + }, + { + "epoch": 0.24, + "grad_norm": 10.349377945765296, + "learning_rate": 9.78713561720251e-06, + "loss": 1.1046, + "step": 1689 + }, + { + "epoch": 0.24, + "grad_norm": 7.3536666633918255, + "learning_rate": 9.786802144130608e-06, + "loss": 1.1124, + "step": 1690 + }, + { + "epoch": 0.24, + "grad_norm": 10.720031545599259, + "learning_rate": 9.786468415744002e-06, + "loss": 1.09, + "step": 1691 + }, + { + "epoch": 0.24, + "grad_norm": 8.155952208138379, + "learning_rate": 9.786134432060494e-06, + "loss": 1.0625, + "step": 1692 + }, + { + "epoch": 0.24, + "grad_norm": 9.306656356766455, + "learning_rate": 9.785800193097898e-06, + "loss": 1.0259, + "step": 1693 + }, + { + "epoch": 0.24, + "grad_norm": 7.968669740429601, + "learning_rate": 9.785465698874043e-06, + "loss": 1.1545, + "step": 
1694 + }, + { + "epoch": 0.24, + "grad_norm": 8.967599217954227, + "learning_rate": 9.785130949406766e-06, + "loss": 1.1163, + "step": 1695 + }, + { + "epoch": 0.24, + "grad_norm": 7.999833644093303, + "learning_rate": 9.784795944713925e-06, + "loss": 1.1144, + "step": 1696 + }, + { + "epoch": 0.24, + "grad_norm": 9.932170710915925, + "learning_rate": 9.784460684813386e-06, + "loss": 1.1791, + "step": 1697 + }, + { + "epoch": 0.24, + "grad_norm": 9.941357312639408, + "learning_rate": 9.784125169723033e-06, + "loss": 1.123, + "step": 1698 + }, + { + "epoch": 0.24, + "grad_norm": 9.302419205102504, + "learning_rate": 9.783789399460758e-06, + "loss": 0.9731, + "step": 1699 + }, + { + "epoch": 0.24, + "grad_norm": 5.7014446776356555, + "learning_rate": 9.783453374044474e-06, + "loss": 1.1268, + "step": 1700 + }, + { + "epoch": 0.24, + "grad_norm": 6.031924762719288, + "learning_rate": 9.783117093492101e-06, + "loss": 1.1649, + "step": 1701 + }, + { + "epoch": 0.24, + "grad_norm": 9.766412669614574, + "learning_rate": 9.782780557821576e-06, + "loss": 1.0983, + "step": 1702 + }, + { + "epoch": 0.24, + "grad_norm": 9.141879336713824, + "learning_rate": 9.782443767050849e-06, + "loss": 1.1667, + "step": 1703 + }, + { + "epoch": 0.24, + "grad_norm": 7.720896453916393, + "learning_rate": 9.782106721197884e-06, + "loss": 1.1178, + "step": 1704 + }, + { + "epoch": 0.24, + "grad_norm": 11.874878911924263, + "learning_rate": 9.781769420280657e-06, + "loss": 1.0857, + "step": 1705 + }, + { + "epoch": 0.24, + "grad_norm": 11.344709924264496, + "learning_rate": 9.78143186431716e-06, + "loss": 1.0262, + "step": 1706 + }, + { + "epoch": 0.24, + "grad_norm": 7.839103282635465, + "learning_rate": 9.781094053325397e-06, + "loss": 1.0731, + "step": 1707 + }, + { + "epoch": 0.24, + "grad_norm": 7.772314908260823, + "learning_rate": 9.780755987323385e-06, + "loss": 0.9949, + "step": 1708 + }, + { + "epoch": 0.24, + "grad_norm": 8.321789953828215, + "learning_rate": 9.780417666329154e-06, + 
"loss": 1.053, + "step": 1709 + }, + { + "epoch": 0.24, + "grad_norm": 5.489780325147955, + "learning_rate": 9.780079090360754e-06, + "loss": 1.0431, + "step": 1710 + }, + { + "epoch": 0.24, + "grad_norm": 9.928678638301925, + "learning_rate": 9.779740259436239e-06, + "loss": 1.1582, + "step": 1711 + }, + { + "epoch": 0.24, + "grad_norm": 9.000365514381803, + "learning_rate": 9.779401173573684e-06, + "loss": 1.16, + "step": 1712 + }, + { + "epoch": 0.24, + "grad_norm": 8.950532270770035, + "learning_rate": 9.779061832791172e-06, + "loss": 1.0797, + "step": 1713 + }, + { + "epoch": 0.24, + "grad_norm": 10.011539586020742, + "learning_rate": 9.778722237106808e-06, + "loss": 1.2013, + "step": 1714 + }, + { + "epoch": 0.24, + "grad_norm": 8.309154234056198, + "learning_rate": 9.778382386538698e-06, + "loss": 1.0376, + "step": 1715 + }, + { + "epoch": 0.24, + "grad_norm": 8.1569732595347, + "learning_rate": 9.778042281104973e-06, + "loss": 1.081, + "step": 1716 + }, + { + "epoch": 0.24, + "grad_norm": 6.670512953543746, + "learning_rate": 9.777701920823775e-06, + "loss": 1.0865, + "step": 1717 + }, + { + "epoch": 0.25, + "grad_norm": 10.081181690646513, + "learning_rate": 9.777361305713253e-06, + "loss": 1.0788, + "step": 1718 + }, + { + "epoch": 0.25, + "grad_norm": 8.364381017086526, + "learning_rate": 9.777020435791578e-06, + "loss": 1.0318, + "step": 1719 + }, + { + "epoch": 0.25, + "grad_norm": 7.493853697652318, + "learning_rate": 9.77667931107693e-06, + "loss": 1.0371, + "step": 1720 + }, + { + "epoch": 0.25, + "grad_norm": 7.996357024237208, + "learning_rate": 9.776337931587502e-06, + "loss": 1.1033, + "step": 1721 + }, + { + "epoch": 0.25, + "grad_norm": 8.902127597088118, + "learning_rate": 9.775996297341504e-06, + "loss": 1.1026, + "step": 1722 + }, + { + "epoch": 0.25, + "grad_norm": 9.123404979300268, + "learning_rate": 9.775654408357158e-06, + "loss": 1.1558, + "step": 1723 + }, + { + "epoch": 0.25, + "grad_norm": 8.433242990166676, + "learning_rate": 
9.775312264652699e-06, + "loss": 1.1478, + "step": 1724 + }, + { + "epoch": 0.25, + "grad_norm": 8.12659635648995, + "learning_rate": 9.774969866246376e-06, + "loss": 1.0819, + "step": 1725 + }, + { + "epoch": 0.25, + "grad_norm": 7.546283577905585, + "learning_rate": 9.774627213156453e-06, + "loss": 1.1358, + "step": 1726 + }, + { + "epoch": 0.25, + "grad_norm": 8.526162004346322, + "learning_rate": 9.774284305401202e-06, + "loss": 1.0899, + "step": 1727 + }, + { + "epoch": 0.25, + "grad_norm": 8.644441381868665, + "learning_rate": 9.773941142998918e-06, + "loss": 1.0446, + "step": 1728 + }, + { + "epoch": 0.25, + "grad_norm": 9.510925787802002, + "learning_rate": 9.773597725967901e-06, + "loss": 1.1189, + "step": 1729 + }, + { + "epoch": 0.25, + "grad_norm": 8.846951348778848, + "learning_rate": 9.773254054326468e-06, + "loss": 1.0915, + "step": 1730 + }, + { + "epoch": 0.25, + "grad_norm": 8.07216400834873, + "learning_rate": 9.772910128092949e-06, + "loss": 1.1418, + "step": 1731 + }, + { + "epoch": 0.25, + "grad_norm": 8.26323479084306, + "learning_rate": 9.772565947285691e-06, + "loss": 1.1266, + "step": 1732 + }, + { + "epoch": 0.25, + "grad_norm": 9.207554298725062, + "learning_rate": 9.77222151192305e-06, + "loss": 1.0838, + "step": 1733 + }, + { + "epoch": 0.25, + "grad_norm": 9.808571058573314, + "learning_rate": 9.771876822023398e-06, + "loss": 1.1846, + "step": 1734 + }, + { + "epoch": 0.25, + "grad_norm": 7.603432185158972, + "learning_rate": 9.771531877605117e-06, + "loss": 1.068, + "step": 1735 + }, + { + "epoch": 0.25, + "grad_norm": 7.694817475271414, + "learning_rate": 9.771186678686608e-06, + "loss": 1.0825, + "step": 1736 + }, + { + "epoch": 0.25, + "grad_norm": 6.147409576138206, + "learning_rate": 9.770841225286283e-06, + "loss": 1.072, + "step": 1737 + }, + { + "epoch": 0.25, + "grad_norm": 7.959314578460055, + "learning_rate": 9.770495517422565e-06, + "loss": 1.0303, + "step": 1738 + }, + { + "epoch": 0.25, + "grad_norm": 7.01686779539492, 
+ "learning_rate": 9.770149555113896e-06, + "loss": 1.0608, + "step": 1739 + }, + { + "epoch": 0.25, + "grad_norm": 9.868568250052846, + "learning_rate": 9.769803338378728e-06, + "loss": 1.0335, + "step": 1740 + }, + { + "epoch": 0.25, + "grad_norm": 10.913204891289388, + "learning_rate": 9.769456867235525e-06, + "loss": 1.066, + "step": 1741 + }, + { + "epoch": 0.25, + "grad_norm": 9.316284972258437, + "learning_rate": 9.76911014170277e-06, + "loss": 1.1776, + "step": 1742 + }, + { + "epoch": 0.25, + "grad_norm": 11.15848156602723, + "learning_rate": 9.768763161798955e-06, + "loss": 1.0653, + "step": 1743 + }, + { + "epoch": 0.25, + "grad_norm": 10.579646942749394, + "learning_rate": 9.768415927542588e-06, + "loss": 1.0589, + "step": 1744 + }, + { + "epoch": 0.25, + "grad_norm": 6.877115458526287, + "learning_rate": 9.768068438952185e-06, + "loss": 1.0908, + "step": 1745 + }, + { + "epoch": 0.25, + "grad_norm": 8.29982594420452, + "learning_rate": 9.767720696046286e-06, + "loss": 1.1318, + "step": 1746 + }, + { + "epoch": 0.25, + "grad_norm": 8.439699732973127, + "learning_rate": 9.767372698843436e-06, + "loss": 1.175, + "step": 1747 + }, + { + "epoch": 0.25, + "grad_norm": 8.487926220709221, + "learning_rate": 9.767024447362195e-06, + "loss": 1.0851, + "step": 1748 + }, + { + "epoch": 0.25, + "grad_norm": 6.7050805121716754, + "learning_rate": 9.76667594162114e-06, + "loss": 1.0839, + "step": 1749 + }, + { + "epoch": 0.25, + "grad_norm": 7.16543917824283, + "learning_rate": 9.766327181638859e-06, + "loss": 1.1751, + "step": 1750 + }, + { + "epoch": 0.25, + "grad_norm": 9.777850288464554, + "learning_rate": 9.765978167433952e-06, + "loss": 1.1287, + "step": 1751 + }, + { + "epoch": 0.25, + "grad_norm": 10.576118738024663, + "learning_rate": 9.765628899025038e-06, + "loss": 1.0656, + "step": 1752 + }, + { + "epoch": 0.25, + "grad_norm": 10.209269528509912, + "learning_rate": 9.765279376430741e-06, + "loss": 1.052, + "step": 1753 + }, + { + "epoch": 0.25, + 
"grad_norm": 11.282401417244268, + "learning_rate": 9.764929599669707e-06, + "loss": 1.0978, + "step": 1754 + }, + { + "epoch": 0.25, + "grad_norm": 7.146330291381881, + "learning_rate": 9.764579568760593e-06, + "loss": 1.0698, + "step": 1755 + }, + { + "epoch": 0.25, + "grad_norm": 9.045959225275798, + "learning_rate": 9.764229283722066e-06, + "loss": 1.1512, + "step": 1756 + }, + { + "epoch": 0.25, + "grad_norm": 8.254335490136603, + "learning_rate": 9.76387874457281e-06, + "loss": 1.063, + "step": 1757 + }, + { + "epoch": 0.25, + "grad_norm": 8.116788901978294, + "learning_rate": 9.763527951331524e-06, + "loss": 1.0277, + "step": 1758 + }, + { + "epoch": 0.25, + "grad_norm": 7.2529491396949135, + "learning_rate": 9.763176904016914e-06, + "loss": 1.0452, + "step": 1759 + }, + { + "epoch": 0.25, + "grad_norm": 7.528675944301933, + "learning_rate": 9.762825602647707e-06, + "loss": 1.0346, + "step": 1760 + }, + { + "epoch": 0.25, + "grad_norm": 6.489928316596163, + "learning_rate": 9.76247404724264e-06, + "loss": 1.0238, + "step": 1761 + }, + { + "epoch": 0.25, + "grad_norm": 8.516181776782767, + "learning_rate": 9.762122237820464e-06, + "loss": 0.961, + "step": 1762 + }, + { + "epoch": 0.25, + "grad_norm": 9.795938645608004, + "learning_rate": 9.761770174399943e-06, + "loss": 1.0695, + "step": 1763 + }, + { + "epoch": 0.25, + "grad_norm": 7.263183470181056, + "learning_rate": 9.761417856999854e-06, + "loss": 1.1098, + "step": 1764 + }, + { + "epoch": 0.25, + "grad_norm": 7.940573060454096, + "learning_rate": 9.761065285638993e-06, + "loss": 1.0401, + "step": 1765 + }, + { + "epoch": 0.25, + "grad_norm": 12.235491299527732, + "learning_rate": 9.76071246033616e-06, + "loss": 1.0647, + "step": 1766 + }, + { + "epoch": 0.25, + "grad_norm": 8.519662153269966, + "learning_rate": 9.760359381110176e-06, + "loss": 1.0955, + "step": 1767 + }, + { + "epoch": 0.25, + "grad_norm": 9.63163640300171, + "learning_rate": 9.760006047979874e-06, + "loss": 1.078, + "step": 1768 + }, + 
{ + "epoch": 0.25, + "grad_norm": 9.66788634030905, + "learning_rate": 9.7596524609641e-06, + "loss": 1.0041, + "step": 1769 + }, + { + "epoch": 0.25, + "grad_norm": 8.638314840597198, + "learning_rate": 9.75929862008171e-06, + "loss": 1.0634, + "step": 1770 + }, + { + "epoch": 0.25, + "grad_norm": 8.99498100039058, + "learning_rate": 9.75894452535158e-06, + "loss": 1.1388, + "step": 1771 + }, + { + "epoch": 0.25, + "grad_norm": 9.969914703302713, + "learning_rate": 9.758590176792596e-06, + "loss": 1.0126, + "step": 1772 + }, + { + "epoch": 0.25, + "grad_norm": 8.92860447171762, + "learning_rate": 9.758235574423658e-06, + "loss": 1.1124, + "step": 1773 + }, + { + "epoch": 0.25, + "grad_norm": 6.811180501435217, + "learning_rate": 9.757880718263677e-06, + "loss": 0.9612, + "step": 1774 + }, + { + "epoch": 0.25, + "grad_norm": 8.519272225900284, + "learning_rate": 9.757525608331584e-06, + "loss": 0.9947, + "step": 1775 + }, + { + "epoch": 0.25, + "grad_norm": 7.0231956068180486, + "learning_rate": 9.757170244646319e-06, + "loss": 1.0262, + "step": 1776 + }, + { + "epoch": 0.25, + "grad_norm": 9.278132264608853, + "learning_rate": 9.756814627226831e-06, + "loss": 1.0675, + "step": 1777 + }, + { + "epoch": 0.25, + "grad_norm": 10.320913186329504, + "learning_rate": 9.756458756092096e-06, + "loss": 1.0175, + "step": 1778 + }, + { + "epoch": 0.25, + "grad_norm": 7.464416586908363, + "learning_rate": 9.756102631261086e-06, + "loss": 1.1214, + "step": 1779 + }, + { + "epoch": 0.25, + "grad_norm": 7.149542174782649, + "learning_rate": 9.755746252752803e-06, + "loss": 1.1413, + "step": 1780 + }, + { + "epoch": 0.25, + "grad_norm": 8.426632308311332, + "learning_rate": 9.755389620586253e-06, + "loss": 0.9974, + "step": 1781 + }, + { + "epoch": 0.25, + "grad_norm": 6.703557451631696, + "learning_rate": 9.755032734780456e-06, + "loss": 0.9993, + "step": 1782 + }, + { + "epoch": 0.25, + "grad_norm": 6.761337906415755, + "learning_rate": 9.75467559535445e-06, + "loss": 1.0758, + 
"step": 1783 + }, + { + "epoch": 0.25, + "grad_norm": 8.384642422092437, + "learning_rate": 9.75431820232728e-06, + "loss": 1.0413, + "step": 1784 + }, + { + "epoch": 0.25, + "grad_norm": 10.912784328415839, + "learning_rate": 9.753960555718014e-06, + "loss": 1.0801, + "step": 1785 + }, + { + "epoch": 0.25, + "grad_norm": 8.962911055537024, + "learning_rate": 9.753602655545723e-06, + "loss": 1.0489, + "step": 1786 + }, + { + "epoch": 0.25, + "grad_norm": 9.506772182143795, + "learning_rate": 9.753244501829498e-06, + "loss": 1.0835, + "step": 1787 + }, + { + "epoch": 0.25, + "grad_norm": 8.982658853675254, + "learning_rate": 9.752886094588443e-06, + "loss": 1.0832, + "step": 1788 + }, + { + "epoch": 0.26, + "grad_norm": 7.1343037157806215, + "learning_rate": 9.752527433841672e-06, + "loss": 1.0391, + "step": 1789 + }, + { + "epoch": 0.26, + "grad_norm": 8.211824208158808, + "learning_rate": 9.752168519608318e-06, + "loss": 1.0497, + "step": 1790 + }, + { + "epoch": 0.26, + "grad_norm": 8.579476508631428, + "learning_rate": 9.751809351907522e-06, + "loss": 1.0411, + "step": 1791 + }, + { + "epoch": 0.26, + "grad_norm": 9.788443706976496, + "learning_rate": 9.751449930758441e-06, + "loss": 1.0927, + "step": 1792 + }, + { + "epoch": 0.26, + "grad_norm": 8.432454650949245, + "learning_rate": 9.75109025618025e-06, + "loss": 1.0411, + "step": 1793 + }, + { + "epoch": 0.26, + "grad_norm": 8.886251448335285, + "learning_rate": 9.750730328192126e-06, + "loss": 1.0463, + "step": 1794 + }, + { + "epoch": 0.26, + "grad_norm": 7.761246163868942, + "learning_rate": 9.75037014681327e-06, + "loss": 1.0679, + "step": 1795 + }, + { + "epoch": 0.26, + "grad_norm": 7.914977887836181, + "learning_rate": 9.750009712062895e-06, + "loss": 1.069, + "step": 1796 + }, + { + "epoch": 0.26, + "grad_norm": 9.145302515053837, + "learning_rate": 9.749649023960222e-06, + "loss": 1.0021, + "step": 1797 + }, + { + "epoch": 0.26, + "grad_norm": 8.549403272560635, + "learning_rate": 
9.749288082524491e-06, + "loss": 1.0701, + "step": 1798 + }, + { + "epoch": 0.26, + "grad_norm": 7.243456933720911, + "learning_rate": 9.748926887774954e-06, + "loss": 1.1828, + "step": 1799 + }, + { + "epoch": 0.26, + "grad_norm": 10.227495723650708, + "learning_rate": 9.748565439730877e-06, + "loss": 1.0725, + "step": 1800 + }, + { + "epoch": 0.26, + "grad_norm": 7.364865610094956, + "learning_rate": 9.748203738411535e-06, + "loss": 0.9933, + "step": 1801 + }, + { + "epoch": 0.26, + "grad_norm": 8.421668191688257, + "learning_rate": 9.747841783836223e-06, + "loss": 0.9853, + "step": 1802 + }, + { + "epoch": 0.26, + "grad_norm": 8.507694953166984, + "learning_rate": 9.747479576024246e-06, + "loss": 1.1551, + "step": 1803 + }, + { + "epoch": 0.26, + "grad_norm": 7.673453335987755, + "learning_rate": 9.747117114994924e-06, + "loss": 1.0287, + "step": 1804 + }, + { + "epoch": 0.26, + "grad_norm": 5.805097316591804, + "learning_rate": 9.746754400767588e-06, + "loss": 1.0724, + "step": 1805 + }, + { + "epoch": 0.26, + "grad_norm": 7.919899584676218, + "learning_rate": 9.746391433361584e-06, + "loss": 1.1226, + "step": 1806 + }, + { + "epoch": 0.26, + "grad_norm": 6.86059453611173, + "learning_rate": 9.746028212796275e-06, + "loss": 1.0718, + "step": 1807 + }, + { + "epoch": 0.26, + "grad_norm": 9.203691981363555, + "learning_rate": 9.74566473909103e-06, + "loss": 1.1136, + "step": 1808 + }, + { + "epoch": 0.26, + "grad_norm": 7.8953335448639175, + "learning_rate": 9.745301012265238e-06, + "loss": 1.0835, + "step": 1809 + }, + { + "epoch": 0.26, + "grad_norm": 9.837507425979283, + "learning_rate": 9.744937032338297e-06, + "loss": 1.0792, + "step": 1810 + }, + { + "epoch": 0.26, + "grad_norm": 8.393896161742747, + "learning_rate": 9.744572799329624e-06, + "loss": 1.0271, + "step": 1811 + }, + { + "epoch": 0.26, + "grad_norm": 6.755894388984939, + "learning_rate": 9.744208313258645e-06, + "loss": 1.0432, + "step": 1812 + }, + { + "epoch": 0.26, + "grad_norm": 
8.330973243541656, + "learning_rate": 9.743843574144797e-06, + "loss": 1.0737, + "step": 1813 + }, + { + "epoch": 0.26, + "grad_norm": 7.41294057904192, + "learning_rate": 9.743478582007542e-06, + "loss": 1.1045, + "step": 1814 + }, + { + "epoch": 0.26, + "grad_norm": 6.721051821625842, + "learning_rate": 9.74311333686634e-06, + "loss": 1.1015, + "step": 1815 + }, + { + "epoch": 0.26, + "grad_norm": 7.609255774156206, + "learning_rate": 9.742747838740675e-06, + "loss": 1.0429, + "step": 1816 + }, + { + "epoch": 0.26, + "grad_norm": 11.211992939788578, + "learning_rate": 9.742382087650044e-06, + "loss": 1.0187, + "step": 1817 + }, + { + "epoch": 0.26, + "grad_norm": 9.329061670404236, + "learning_rate": 9.74201608361395e-06, + "loss": 1.1313, + "step": 1818 + }, + { + "epoch": 0.26, + "grad_norm": 10.255599915644918, + "learning_rate": 9.74164982665192e-06, + "loss": 1.0727, + "step": 1819 + }, + { + "epoch": 0.26, + "grad_norm": 9.435170349320124, + "learning_rate": 9.741283316783486e-06, + "loss": 1.0228, + "step": 1820 + }, + { + "epoch": 0.26, + "grad_norm": 8.407216913811713, + "learning_rate": 9.740916554028197e-06, + "loss": 1.0855, + "step": 1821 + }, + { + "epoch": 0.26, + "grad_norm": 8.140628186195123, + "learning_rate": 9.740549538405615e-06, + "loss": 1.0541, + "step": 1822 + }, + { + "epoch": 0.26, + "grad_norm": 8.16841377995441, + "learning_rate": 9.740182269935317e-06, + "loss": 1.0807, + "step": 1823 + }, + { + "epoch": 0.26, + "grad_norm": 8.618772593916017, + "learning_rate": 9.739814748636892e-06, + "loss": 1.1384, + "step": 1824 + }, + { + "epoch": 0.26, + "grad_norm": 9.987821987681542, + "learning_rate": 9.73944697452994e-06, + "loss": 1.0617, + "step": 1825 + }, + { + "epoch": 0.26, + "grad_norm": 8.20434548015875, + "learning_rate": 9.73907894763408e-06, + "loss": 1.1144, + "step": 1826 + }, + { + "epoch": 0.26, + "grad_norm": 6.030832859820695, + "learning_rate": 9.73871066796894e-06, + "loss": 1.0984, + "step": 1827 + }, + { + "epoch": 
0.26, + "grad_norm": 7.96105574320662, + "learning_rate": 9.738342135554162e-06, + "loss": 1.0385, + "step": 1828 + }, + { + "epoch": 0.26, + "grad_norm": 6.619424481180078, + "learning_rate": 9.737973350409404e-06, + "loss": 1.0282, + "step": 1829 + }, + { + "epoch": 0.26, + "grad_norm": 8.311315858549467, + "learning_rate": 9.737604312554337e-06, + "loss": 1.0631, + "step": 1830 + }, + { + "epoch": 0.26, + "grad_norm": 9.963523658366004, + "learning_rate": 9.737235022008641e-06, + "loss": 1.0548, + "step": 1831 + }, + { + "epoch": 0.26, + "grad_norm": 8.244425997999086, + "learning_rate": 9.736865478792017e-06, + "loss": 1.1714, + "step": 1832 + }, + { + "epoch": 0.26, + "grad_norm": 9.134395622967661, + "learning_rate": 9.736495682924175e-06, + "loss": 1.0389, + "step": 1833 + }, + { + "epoch": 0.26, + "grad_norm": 9.739559889402505, + "learning_rate": 9.736125634424833e-06, + "loss": 0.9475, + "step": 1834 + }, + { + "epoch": 0.26, + "grad_norm": 6.614965278770593, + "learning_rate": 9.735755333313737e-06, + "loss": 1.076, + "step": 1835 + }, + { + "epoch": 0.26, + "grad_norm": 10.781495180772634, + "learning_rate": 9.73538477961063e-06, + "loss": 1.12, + "step": 1836 + }, + { + "epoch": 0.26, + "grad_norm": 10.220018604939568, + "learning_rate": 9.735013973335282e-06, + "loss": 1.0714, + "step": 1837 + }, + { + "epoch": 0.26, + "grad_norm": 8.756391081733959, + "learning_rate": 9.734642914507468e-06, + "loss": 1.0882, + "step": 1838 + }, + { + "epoch": 0.26, + "grad_norm": 8.069207073782605, + "learning_rate": 9.734271603146981e-06, + "loss": 1.0675, + "step": 1839 + }, + { + "epoch": 0.26, + "grad_norm": 4.886388257047724, + "learning_rate": 9.733900039273625e-06, + "loss": 1.1118, + "step": 1840 + }, + { + "epoch": 0.26, + "grad_norm": 8.618332017458787, + "learning_rate": 9.733528222907216e-06, + "loss": 1.0, + "step": 1841 + }, + { + "epoch": 0.26, + "grad_norm": 7.155150621697735, + "learning_rate": 9.733156154067587e-06, + "loss": 1.054, + "step": 1842 + 
}, + { + "epoch": 0.26, + "grad_norm": 5.866289543265329, + "learning_rate": 9.732783832774585e-06, + "loss": 1.0106, + "step": 1843 + }, + { + "epoch": 0.26, + "grad_norm": 11.310103228650215, + "learning_rate": 9.732411259048066e-06, + "loss": 1.0869, + "step": 1844 + }, + { + "epoch": 0.26, + "grad_norm": 8.9297110478907, + "learning_rate": 9.732038432907906e-06, + "loss": 1.0162, + "step": 1845 + }, + { + "epoch": 0.26, + "grad_norm": 7.227864563142094, + "learning_rate": 9.731665354373985e-06, + "loss": 1.1025, + "step": 1846 + }, + { + "epoch": 0.26, + "grad_norm": 8.949750463695482, + "learning_rate": 9.731292023466205e-06, + "loss": 1.051, + "step": 1847 + }, + { + "epoch": 0.26, + "grad_norm": 8.080940771430853, + "learning_rate": 9.730918440204479e-06, + "loss": 1.0773, + "step": 1848 + }, + { + "epoch": 0.26, + "grad_norm": 6.229070465532633, + "learning_rate": 9.730544604608731e-06, + "loss": 0.9769, + "step": 1849 + }, + { + "epoch": 0.26, + "grad_norm": 6.892307553727597, + "learning_rate": 9.730170516698902e-06, + "loss": 1.0922, + "step": 1850 + }, + { + "epoch": 0.26, + "grad_norm": 7.895046113254825, + "learning_rate": 9.729796176494944e-06, + "loss": 1.1551, + "step": 1851 + }, + { + "epoch": 0.26, + "grad_norm": 10.068088695766155, + "learning_rate": 9.729421584016825e-06, + "loss": 1.129, + "step": 1852 + }, + { + "epoch": 0.26, + "grad_norm": 10.654981527327827, + "learning_rate": 9.72904673928452e-06, + "loss": 1.0064, + "step": 1853 + }, + { + "epoch": 0.26, + "grad_norm": 8.052165146505489, + "learning_rate": 9.728671642318028e-06, + "loss": 1.1163, + "step": 1854 + }, + { + "epoch": 0.26, + "grad_norm": 8.848460932414124, + "learning_rate": 9.728296293137351e-06, + "loss": 1.1215, + "step": 1855 + }, + { + "epoch": 0.26, + "grad_norm": 8.854484096147697, + "learning_rate": 9.727920691762513e-06, + "loss": 1.0496, + "step": 1856 + }, + { + "epoch": 0.26, + "grad_norm": 9.025798475071197, + "learning_rate": 9.727544838213544e-06, + "loss": 
0.9596, + "step": 1857 + }, + { + "epoch": 0.26, + "grad_norm": 8.48743655650903, + "learning_rate": 9.727168732510493e-06, + "loss": 1.0322, + "step": 1858 + }, + { + "epoch": 0.27, + "grad_norm": 8.156308915274696, + "learning_rate": 9.72679237467342e-06, + "loss": 1.0494, + "step": 1859 + }, + { + "epoch": 0.27, + "grad_norm": 5.622078775939051, + "learning_rate": 9.726415764722399e-06, + "loss": 0.9953, + "step": 1860 + }, + { + "epoch": 0.27, + "grad_norm": 8.432298896928181, + "learning_rate": 9.726038902677517e-06, + "loss": 1.0834, + "step": 1861 + }, + { + "epoch": 0.27, + "grad_norm": 7.462861485205671, + "learning_rate": 9.725661788558874e-06, + "loss": 1.0902, + "step": 1862 + }, + { + "epoch": 0.27, + "grad_norm": 7.239535943463746, + "learning_rate": 9.725284422386587e-06, + "loss": 1.1075, + "step": 1863 + }, + { + "epoch": 0.27, + "grad_norm": 9.482082604939995, + "learning_rate": 9.72490680418078e-06, + "loss": 1.0168, + "step": 1864 + }, + { + "epoch": 0.27, + "grad_norm": 8.108430953301179, + "learning_rate": 9.724528933961597e-06, + "loss": 1.0141, + "step": 1865 + }, + { + "epoch": 0.27, + "grad_norm": 9.775426481552271, + "learning_rate": 9.724150811749191e-06, + "loss": 1.091, + "step": 1866 + }, + { + "epoch": 0.27, + "grad_norm": 8.52219424150638, + "learning_rate": 9.723772437563732e-06, + "loss": 1.0317, + "step": 1867 + }, + { + "epoch": 0.27, + "grad_norm": 7.463553291274778, + "learning_rate": 9.723393811425398e-06, + "loss": 1.1223, + "step": 1868 + }, + { + "epoch": 0.27, + "grad_norm": 9.392381422946055, + "learning_rate": 9.723014933354386e-06, + "loss": 1.0791, + "step": 1869 + }, + { + "epoch": 0.27, + "grad_norm": 9.500197847108847, + "learning_rate": 9.722635803370906e-06, + "loss": 0.9946, + "step": 1870 + }, + { + "epoch": 0.27, + "grad_norm": 8.851872959367773, + "learning_rate": 9.722256421495175e-06, + "loss": 1.0129, + "step": 1871 + }, + { + "epoch": 0.27, + "grad_norm": 8.418343491846773, + "learning_rate": 
9.721876787747433e-06, + "loss": 1.1381, + "step": 1872 + }, + { + "epoch": 0.27, + "grad_norm": 8.381673169933446, + "learning_rate": 9.721496902147925e-06, + "loss": 1.1505, + "step": 1873 + }, + { + "epoch": 0.27, + "grad_norm": 6.628307152703445, + "learning_rate": 9.721116764716916e-06, + "loss": 1.1079, + "step": 1874 + }, + { + "epoch": 0.27, + "grad_norm": 11.377591668247469, + "learning_rate": 9.72073637547468e-06, + "loss": 1.0543, + "step": 1875 + }, + { + "epoch": 0.27, + "grad_norm": 9.10218560617536, + "learning_rate": 9.720355734441506e-06, + "loss": 1.0143, + "step": 1876 + }, + { + "epoch": 0.27, + "grad_norm": 6.849374990119779, + "learning_rate": 9.719974841637695e-06, + "loss": 1.1045, + "step": 1877 + }, + { + "epoch": 0.27, + "grad_norm": 6.507919719800491, + "learning_rate": 9.719593697083565e-06, + "loss": 1.1363, + "step": 1878 + }, + { + "epoch": 0.27, + "grad_norm": 8.062434652555405, + "learning_rate": 9.719212300799444e-06, + "loss": 1.0845, + "step": 1879 + }, + { + "epoch": 0.27, + "grad_norm": 6.392767229274878, + "learning_rate": 9.718830652805675e-06, + "loss": 1.1489, + "step": 1880 + }, + { + "epoch": 0.27, + "grad_norm": 7.488867639720518, + "learning_rate": 9.718448753122614e-06, + "loss": 1.083, + "step": 1881 + }, + { + "epoch": 0.27, + "grad_norm": 8.88533014341793, + "learning_rate": 9.71806660177063e-06, + "loss": 1.1576, + "step": 1882 + }, + { + "epoch": 0.27, + "grad_norm": 6.499255731824836, + "learning_rate": 9.717684198770108e-06, + "loss": 1.1644, + "step": 1883 + }, + { + "epoch": 0.27, + "grad_norm": 9.437916864210983, + "learning_rate": 9.717301544141442e-06, + "loss": 1.0853, + "step": 1884 + }, + { + "epoch": 0.27, + "grad_norm": 9.589503659495966, + "learning_rate": 9.716918637905042e-06, + "loss": 1.0795, + "step": 1885 + }, + { + "epoch": 0.27, + "grad_norm": 7.518326758864115, + "learning_rate": 9.71653548008133e-06, + "loss": 1.0703, + "step": 1886 + }, + { + "epoch": 0.27, + "grad_norm": 
7.903819253227301, + "learning_rate": 9.716152070690745e-06, + "loss": 1.0315, + "step": 1887 + }, + { + "epoch": 0.27, + "grad_norm": 7.889853187871952, + "learning_rate": 9.715768409753737e-06, + "loss": 1.0047, + "step": 1888 + }, + { + "epoch": 0.27, + "grad_norm": 7.554918690104995, + "learning_rate": 9.715384497290768e-06, + "loss": 1.0283, + "step": 1889 + }, + { + "epoch": 0.27, + "grad_norm": 7.898628534393702, + "learning_rate": 9.715000333322316e-06, + "loss": 1.0764, + "step": 1890 + }, + { + "epoch": 0.27, + "grad_norm": 8.876173644647116, + "learning_rate": 9.714615917868869e-06, + "loss": 1.1084, + "step": 1891 + }, + { + "epoch": 0.27, + "grad_norm": 8.304564685718887, + "learning_rate": 9.714231250950931e-06, + "loss": 1.1481, + "step": 1892 + }, + { + "epoch": 0.27, + "grad_norm": 11.756058324886576, + "learning_rate": 9.713846332589022e-06, + "loss": 1.1254, + "step": 1893 + }, + { + "epoch": 0.27, + "grad_norm": 7.141880809495982, + "learning_rate": 9.71346116280367e-06, + "loss": 1.0619, + "step": 1894 + }, + { + "epoch": 0.27, + "grad_norm": 10.907442301259326, + "learning_rate": 9.713075741615421e-06, + "loss": 1.0852, + "step": 1895 + }, + { + "epoch": 0.27, + "grad_norm": 7.697943915482012, + "learning_rate": 9.71269006904483e-06, + "loss": 1.063, + "step": 1896 + }, + { + "epoch": 0.27, + "grad_norm": 7.326586559910199, + "learning_rate": 9.71230414511247e-06, + "loss": 1.0544, + "step": 1897 + }, + { + "epoch": 0.27, + "grad_norm": 7.979935292218374, + "learning_rate": 9.711917969838922e-06, + "loss": 1.0853, + "step": 1898 + }, + { + "epoch": 0.27, + "grad_norm": 9.385008196302069, + "learning_rate": 9.711531543244786e-06, + "loss": 1.0515, + "step": 1899 + }, + { + "epoch": 0.27, + "grad_norm": 9.12280660064469, + "learning_rate": 9.711144865350671e-06, + "loss": 0.996, + "step": 1900 + }, + { + "epoch": 0.27, + "grad_norm": 8.234705931365804, + "learning_rate": 9.710757936177203e-06, + "loss": 1.1321, + "step": 1901 + }, + { + "epoch": 
0.27, + "grad_norm": 7.526430988143436, + "learning_rate": 9.71037075574502e-06, + "loss": 1.1503, + "step": 1902 + }, + { + "epoch": 0.27, + "grad_norm": 9.668841608073315, + "learning_rate": 9.70998332407477e-06, + "loss": 1.1641, + "step": 1903 + }, + { + "epoch": 0.27, + "grad_norm": 10.278033672524346, + "learning_rate": 9.709595641187121e-06, + "loss": 1.092, + "step": 1904 + }, + { + "epoch": 0.27, + "grad_norm": 10.178663619914232, + "learning_rate": 9.709207707102753e-06, + "loss": 1.0149, + "step": 1905 + }, + { + "epoch": 0.27, + "grad_norm": 8.21325926441692, + "learning_rate": 9.70881952184235e-06, + "loss": 1.016, + "step": 1906 + }, + { + "epoch": 0.27, + "grad_norm": 7.367424099508718, + "learning_rate": 9.708431085426622e-06, + "loss": 1.0559, + "step": 1907 + }, + { + "epoch": 0.27, + "grad_norm": 8.297806538444771, + "learning_rate": 9.708042397876286e-06, + "loss": 1.0749, + "step": 1908 + }, + { + "epoch": 0.27, + "grad_norm": 8.175664297011862, + "learning_rate": 9.707653459212073e-06, + "loss": 1.0106, + "step": 1909 + }, + { + "epoch": 0.27, + "grad_norm": 9.04277740405712, + "learning_rate": 9.707264269454729e-06, + "loss": 1.0698, + "step": 1910 + }, + { + "epoch": 0.27, + "grad_norm": 10.232065484982723, + "learning_rate": 9.706874828625011e-06, + "loss": 1.1113, + "step": 1911 + }, + { + "epoch": 0.27, + "grad_norm": 9.182858981134565, + "learning_rate": 9.706485136743691e-06, + "loss": 1.0582, + "step": 1912 + }, + { + "epoch": 0.27, + "grad_norm": 6.334563358735637, + "learning_rate": 9.706095193831557e-06, + "loss": 1.1221, + "step": 1913 + }, + { + "epoch": 0.27, + "grad_norm": 9.460861843204416, + "learning_rate": 9.705704999909403e-06, + "loss": 1.0366, + "step": 1914 + }, + { + "epoch": 0.27, + "grad_norm": 6.004507302207999, + "learning_rate": 9.705314554998042e-06, + "loss": 1.0981, + "step": 1915 + }, + { + "epoch": 0.27, + "grad_norm": 7.909840563811242, + "learning_rate": 9.7049238591183e-06, + "loss": 1.0786, + "step": 1916 
+ }, + { + "epoch": 0.27, + "grad_norm": 10.394196439516701, + "learning_rate": 9.704532912291016e-06, + "loss": 1.1335, + "step": 1917 + }, + { + "epoch": 0.27, + "grad_norm": 9.109149150536595, + "learning_rate": 9.704141714537041e-06, + "loss": 1.0488, + "step": 1918 + }, + { + "epoch": 0.27, + "grad_norm": 6.9093968900417115, + "learning_rate": 9.703750265877244e-06, + "loss": 1.1108, + "step": 1919 + }, + { + "epoch": 0.27, + "grad_norm": 9.605444460052428, + "learning_rate": 9.703358566332498e-06, + "loss": 0.9668, + "step": 1920 + }, + { + "epoch": 0.27, + "grad_norm": 8.890714490875933, + "learning_rate": 9.702966615923698e-06, + "loss": 1.1225, + "step": 1921 + }, + { + "epoch": 0.27, + "grad_norm": 6.564548003798345, + "learning_rate": 9.70257441467175e-06, + "loss": 1.0618, + "step": 1922 + }, + { + "epoch": 0.27, + "grad_norm": 9.11621360990754, + "learning_rate": 9.702181962597572e-06, + "loss": 1.0361, + "step": 1923 + }, + { + "epoch": 0.27, + "grad_norm": 9.952465458045952, + "learning_rate": 9.701789259722098e-06, + "loss": 1.0539, + "step": 1924 + }, + { + "epoch": 0.27, + "grad_norm": 6.914655335155224, + "learning_rate": 9.701396306066273e-06, + "loss": 1.0649, + "step": 1925 + }, + { + "epoch": 0.27, + "grad_norm": 8.818776099905453, + "learning_rate": 9.701003101651052e-06, + "loss": 1.0973, + "step": 1926 + }, + { + "epoch": 0.27, + "grad_norm": 9.605035141237373, + "learning_rate": 9.700609646497414e-06, + "loss": 1.0641, + "step": 1927 + }, + { + "epoch": 0.27, + "grad_norm": 7.968298099787627, + "learning_rate": 9.70021594062634e-06, + "loss": 1.1275, + "step": 1928 + }, + { + "epoch": 0.28, + "grad_norm": 8.655499060480553, + "learning_rate": 9.699821984058833e-06, + "loss": 1.1017, + "step": 1929 + }, + { + "epoch": 0.28, + "grad_norm": 7.793830687212693, + "learning_rate": 9.699427776815903e-06, + "loss": 1.1554, + "step": 1930 + }, + { + "epoch": 0.28, + "grad_norm": 8.43731689377238, + "learning_rate": 9.699033318918574e-06, + "loss": 
1.1048, + "step": 1931 + }, + { + "epoch": 0.28, + "grad_norm": 10.745793894727655, + "learning_rate": 9.698638610387888e-06, + "loss": 1.0343, + "step": 1932 + }, + { + "epoch": 0.28, + "grad_norm": 8.864012103238606, + "learning_rate": 9.698243651244899e-06, + "loss": 1.0887, + "step": 1933 + }, + { + "epoch": 0.28, + "grad_norm": 7.925651851071471, + "learning_rate": 9.69784844151067e-06, + "loss": 1.1138, + "step": 1934 + }, + { + "epoch": 0.28, + "grad_norm": 7.436222764882423, + "learning_rate": 9.697452981206282e-06, + "loss": 1.0915, + "step": 1935 + }, + { + "epoch": 0.28, + "grad_norm": 10.255176533557728, + "learning_rate": 9.697057270352828e-06, + "loss": 1.0762, + "step": 1936 + }, + { + "epoch": 0.28, + "grad_norm": 9.95934195494274, + "learning_rate": 9.696661308971413e-06, + "loss": 1.119, + "step": 1937 + }, + { + "epoch": 0.28, + "grad_norm": 5.333448339842371, + "learning_rate": 9.696265097083157e-06, + "loss": 1.0254, + "step": 1938 + }, + { + "epoch": 0.28, + "grad_norm": 6.973684189210821, + "learning_rate": 9.695868634709193e-06, + "loss": 1.0807, + "step": 1939 + }, + { + "epoch": 0.28, + "grad_norm": 8.569364264046076, + "learning_rate": 9.695471921870668e-06, + "loss": 1.0422, + "step": 1940 + }, + { + "epoch": 0.28, + "grad_norm": 5.5173089212915745, + "learning_rate": 9.69507495858874e-06, + "loss": 1.0677, + "step": 1941 + }, + { + "epoch": 0.28, + "grad_norm": 8.908812769294203, + "learning_rate": 9.694677744884581e-06, + "loss": 1.0471, + "step": 1942 + }, + { + "epoch": 0.28, + "grad_norm": 10.58771406915501, + "learning_rate": 9.69428028077938e-06, + "loss": 1.107, + "step": 1943 + }, + { + "epoch": 0.28, + "grad_norm": 10.095143219764077, + "learning_rate": 9.693882566294336e-06, + "loss": 1.0207, + "step": 1944 + }, + { + "epoch": 0.28, + "grad_norm": 7.446873545314485, + "learning_rate": 9.69348460145066e-06, + "loss": 1.0169, + "step": 1945 + }, + { + "epoch": 0.28, + "grad_norm": 8.504205703760805, + "learning_rate": 
9.693086386269581e-06, + "loss": 1.0735, + "step": 1946 + }, + { + "epoch": 0.28, + "grad_norm": 7.019384825889969, + "learning_rate": 9.692687920772336e-06, + "loss": 1.0183, + "step": 1947 + }, + { + "epoch": 0.28, + "grad_norm": 7.841859042620759, + "learning_rate": 9.692289204980181e-06, + "loss": 1.0762, + "step": 1948 + }, + { + "epoch": 0.28, + "grad_norm": 9.670391557313724, + "learning_rate": 9.691890238914381e-06, + "loss": 1.0355, + "step": 1949 + }, + { + "epoch": 0.28, + "grad_norm": 9.496257753166704, + "learning_rate": 9.691491022596216e-06, + "loss": 1.0594, + "step": 1950 + }, + { + "epoch": 0.28, + "grad_norm": 8.009340365368363, + "learning_rate": 9.691091556046977e-06, + "loss": 1.0794, + "step": 1951 + }, + { + "epoch": 0.28, + "grad_norm": 8.570180244246314, + "learning_rate": 9.690691839287973e-06, + "loss": 1.0846, + "step": 1952 + }, + { + "epoch": 0.28, + "grad_norm": 7.711292607419504, + "learning_rate": 9.690291872340523e-06, + "loss": 1.0797, + "step": 1953 + }, + { + "epoch": 0.28, + "grad_norm": 9.461981522493806, + "learning_rate": 9.68989165522596e-06, + "loss": 1.0962, + "step": 1954 + }, + { + "epoch": 0.28, + "grad_norm": 7.750322904853485, + "learning_rate": 9.689491187965632e-06, + "loss": 1.0269, + "step": 1955 + }, + { + "epoch": 0.28, + "grad_norm": 7.55979457708167, + "learning_rate": 9.689090470580895e-06, + "loss": 1.0213, + "step": 1956 + }, + { + "epoch": 0.28, + "grad_norm": 8.95748598167721, + "learning_rate": 9.688689503093124e-06, + "loss": 1.0906, + "step": 1957 + }, + { + "epoch": 0.28, + "grad_norm": 10.377825380698361, + "learning_rate": 9.688288285523708e-06, + "loss": 1.0287, + "step": 1958 + }, + { + "epoch": 0.28, + "grad_norm": 9.164031478918458, + "learning_rate": 9.687886817894044e-06, + "loss": 0.9869, + "step": 1959 + }, + { + "epoch": 0.28, + "grad_norm": 6.988307210973038, + "learning_rate": 9.687485100225547e-06, + "loss": 1.0573, + "step": 1960 + }, + { + "epoch": 0.28, + "grad_norm": 
9.574325051217542, + "learning_rate": 9.68708313253964e-06, + "loss": 1.0621, + "step": 1961 + }, + { + "epoch": 0.28, + "grad_norm": 7.1797711752886695, + "learning_rate": 9.686680914857767e-06, + "loss": 1.1433, + "step": 1962 + }, + { + "epoch": 0.28, + "grad_norm": 6.430075566336591, + "learning_rate": 9.68627844720138e-06, + "loss": 1.0324, + "step": 1963 + }, + { + "epoch": 0.28, + "grad_norm": 6.935496710938556, + "learning_rate": 9.685875729591944e-06, + "loss": 1.0284, + "step": 1964 + }, + { + "epoch": 0.28, + "grad_norm": 10.923155050155284, + "learning_rate": 9.68547276205094e-06, + "loss": 1.0069, + "step": 1965 + }, + { + "epoch": 0.28, + "grad_norm": 8.946584749951636, + "learning_rate": 9.68506954459986e-06, + "loss": 1.1099, + "step": 1966 + }, + { + "epoch": 0.28, + "grad_norm": 9.469867020341594, + "learning_rate": 9.684666077260214e-06, + "loss": 1.0818, + "step": 1967 + }, + { + "epoch": 0.28, + "grad_norm": 6.940078169611366, + "learning_rate": 9.684262360053519e-06, + "loss": 1.094, + "step": 1968 + }, + { + "epoch": 0.28, + "grad_norm": 9.82314090690651, + "learning_rate": 9.683858393001306e-06, + "loss": 1.0511, + "step": 1969 + }, + { + "epoch": 0.28, + "grad_norm": 9.908564687034062, + "learning_rate": 9.683454176125125e-06, + "loss": 1.0557, + "step": 1970 + }, + { + "epoch": 0.28, + "grad_norm": 7.108994151636048, + "learning_rate": 9.683049709446536e-06, + "loss": 1.1052, + "step": 1971 + }, + { + "epoch": 0.28, + "grad_norm": 9.040138442676797, + "learning_rate": 9.682644992987108e-06, + "loss": 1.0489, + "step": 1972 + }, + { + "epoch": 0.28, + "grad_norm": 8.840101644934654, + "learning_rate": 9.682240026768433e-06, + "loss": 1.0882, + "step": 1973 + }, + { + "epoch": 0.28, + "grad_norm": 7.215798398188661, + "learning_rate": 9.681834810812106e-06, + "loss": 1.114, + "step": 1974 + }, + { + "epoch": 0.28, + "grad_norm": 9.204998760782473, + "learning_rate": 9.681429345139744e-06, + "loss": 1.0475, + "step": 1975 + }, + { + "epoch": 
0.28, + "grad_norm": 7.951909835714886, + "learning_rate": 9.68102362977297e-06, + "loss": 1.0109, + "step": 1976 + }, + { + "epoch": 0.28, + "grad_norm": 8.446200572528738, + "learning_rate": 9.680617664733425e-06, + "loss": 1.0681, + "step": 1977 + }, + { + "epoch": 0.28, + "grad_norm": 9.286277746729002, + "learning_rate": 9.680211450042765e-06, + "loss": 1.0709, + "step": 1978 + }, + { + "epoch": 0.28, + "grad_norm": 11.881476663144168, + "learning_rate": 9.679804985722652e-06, + "loss": 1.1068, + "step": 1979 + }, + { + "epoch": 0.28, + "grad_norm": 9.588560422884665, + "learning_rate": 9.679398271794767e-06, + "loss": 1.0381, + "step": 1980 + }, + { + "epoch": 0.28, + "grad_norm": 6.561645557161283, + "learning_rate": 9.678991308280804e-06, + "loss": 1.0933, + "step": 1981 + }, + { + "epoch": 0.28, + "grad_norm": 9.559324184271865, + "learning_rate": 9.678584095202468e-06, + "loss": 1.068, + "step": 1982 + }, + { + "epoch": 0.28, + "grad_norm": 7.391284282111403, + "learning_rate": 9.67817663258148e-06, + "loss": 1.0626, + "step": 1983 + }, + { + "epoch": 0.28, + "grad_norm": 7.718548212632007, + "learning_rate": 9.677768920439573e-06, + "loss": 1.0769, + "step": 1984 + }, + { + "epoch": 0.28, + "grad_norm": 7.745830350154188, + "learning_rate": 9.677360958798491e-06, + "loss": 1.0773, + "step": 1985 + }, + { + "epoch": 0.28, + "grad_norm": 6.6466810459529615, + "learning_rate": 9.676952747679997e-06, + "loss": 1.0683, + "step": 1986 + }, + { + "epoch": 0.28, + "grad_norm": 5.551652540913206, + "learning_rate": 9.67654428710586e-06, + "loss": 0.9796, + "step": 1987 + }, + { + "epoch": 0.28, + "grad_norm": 6.182013366483654, + "learning_rate": 9.676135577097866e-06, + "loss": 1.1179, + "step": 1988 + }, + { + "epoch": 0.28, + "grad_norm": 8.629381331460294, + "learning_rate": 9.67572661767782e-06, + "loss": 1.0413, + "step": 1989 + }, + { + "epoch": 0.28, + "grad_norm": 9.321479462244655, + "learning_rate": 9.67531740886753e-06, + "loss": 1.1194, + "step": 
1990 + }, + { + "epoch": 0.28, + "grad_norm": 9.751278446917919, + "learning_rate": 9.674907950688823e-06, + "loss": 1.111, + "step": 1991 + }, + { + "epoch": 0.28, + "grad_norm": 9.268191401016734, + "learning_rate": 9.67449824316354e-06, + "loss": 1.0145, + "step": 1992 + }, + { + "epoch": 0.28, + "grad_norm": 7.6302964904771935, + "learning_rate": 9.674088286313531e-06, + "loss": 1.0842, + "step": 1993 + }, + { + "epoch": 0.28, + "grad_norm": 7.389867823683224, + "learning_rate": 9.673678080160664e-06, + "loss": 1.0749, + "step": 1994 + }, + { + "epoch": 0.28, + "grad_norm": 9.982905501751503, + "learning_rate": 9.673267624726817e-06, + "loss": 1.0903, + "step": 1995 + }, + { + "epoch": 0.28, + "grad_norm": 7.882608096726191, + "learning_rate": 9.672856920033882e-06, + "loss": 1.0657, + "step": 1996 + }, + { + "epoch": 0.28, + "grad_norm": 9.650866068531851, + "learning_rate": 9.672445966103767e-06, + "loss": 1.0721, + "step": 1997 + }, + { + "epoch": 0.28, + "grad_norm": 10.777164927474502, + "learning_rate": 9.67203476295839e-06, + "loss": 1.014, + "step": 1998 + }, + { + "epoch": 0.29, + "grad_norm": 7.9972056742669455, + "learning_rate": 9.671623310619682e-06, + "loss": 1.1539, + "step": 1999 + }, + { + "epoch": 0.29, + "grad_norm": 9.118542198092975, + "learning_rate": 9.671211609109592e-06, + "loss": 1.0719, + "step": 2000 + }, + { + "epoch": 0.29, + "grad_norm": 10.736502886000924, + "learning_rate": 9.670799658450077e-06, + "loss": 1.0689, + "step": 2001 + }, + { + "epoch": 0.29, + "grad_norm": 7.707490492779027, + "learning_rate": 9.670387458663109e-06, + "loss": 1.0526, + "step": 2002 + }, + { + "epoch": 0.29, + "grad_norm": 9.006781311761605, + "learning_rate": 9.669975009770673e-06, + "loss": 1.0055, + "step": 2003 + }, + { + "epoch": 0.29, + "grad_norm": 7.609997077138359, + "learning_rate": 9.66956231179477e-06, + "loss": 1.0528, + "step": 2004 + }, + { + "epoch": 0.29, + "grad_norm": 10.438313047544682, + "learning_rate": 9.669149364757411e-06, + 
"loss": 1.1282, + "step": 2005 + }, + { + "epoch": 0.29, + "grad_norm": 7.900004773618261, + "learning_rate": 9.66873616868062e-06, + "loss": 1.0684, + "step": 2006 + }, + { + "epoch": 0.29, + "grad_norm": 8.30981015812915, + "learning_rate": 9.66832272358644e-06, + "loss": 1.0897, + "step": 2007 + }, + { + "epoch": 0.29, + "grad_norm": 9.777453652125754, + "learning_rate": 9.66790902949692e-06, + "loss": 1.0981, + "step": 2008 + }, + { + "epoch": 0.29, + "grad_norm": 6.9377906223933765, + "learning_rate": 9.667495086434125e-06, + "loss": 1.0837, + "step": 2009 + }, + { + "epoch": 0.29, + "grad_norm": 8.822254767480109, + "learning_rate": 9.667080894420134e-06, + "loss": 1.0625, + "step": 2010 + }, + { + "epoch": 0.29, + "grad_norm": 8.549796515906943, + "learning_rate": 9.666666453477037e-06, + "loss": 1.0595, + "step": 2011 + }, + { + "epoch": 0.29, + "grad_norm": 8.870511141219238, + "learning_rate": 9.666251763626943e-06, + "loss": 1.0195, + "step": 2012 + }, + { + "epoch": 0.29, + "grad_norm": 8.337312328870603, + "learning_rate": 9.66583682489197e-06, + "loss": 1.0555, + "step": 2013 + }, + { + "epoch": 0.29, + "grad_norm": 9.769687584743561, + "learning_rate": 9.665421637294247e-06, + "loss": 1.0549, + "step": 2014 + }, + { + "epoch": 0.29, + "grad_norm": 8.693252486924296, + "learning_rate": 9.66500620085592e-06, + "loss": 1.061, + "step": 2015 + }, + { + "epoch": 0.29, + "grad_norm": 7.528981269065984, + "learning_rate": 9.664590515599146e-06, + "loss": 1.0396, + "step": 2016 + }, + { + "epoch": 0.29, + "grad_norm": 9.356777736088075, + "learning_rate": 9.6641745815461e-06, + "loss": 1.0085, + "step": 2017 + }, + { + "epoch": 0.29, + "grad_norm": 7.507130253278719, + "learning_rate": 9.663758398718965e-06, + "loss": 1.0808, + "step": 2018 + }, + { + "epoch": 0.29, + "grad_norm": 10.202647600256007, + "learning_rate": 9.663341967139937e-06, + "loss": 1.0561, + "step": 2019 + }, + { + "epoch": 0.29, + "grad_norm": 7.86514772149138, + "learning_rate": 
9.662925286831228e-06, + "loss": 1.0418, + "step": 2020 + }, + { + "epoch": 0.29, + "grad_norm": 7.507323993078089, + "learning_rate": 9.662508357815065e-06, + "loss": 1.0336, + "step": 2021 + }, + { + "epoch": 0.29, + "grad_norm": 7.038451970344083, + "learning_rate": 9.662091180113685e-06, + "loss": 1.0393, + "step": 2022 + }, + { + "epoch": 0.29, + "grad_norm": 9.209446708798785, + "learning_rate": 9.661673753749338e-06, + "loss": 1.0798, + "step": 2023 + }, + { + "epoch": 0.29, + "grad_norm": 8.442139221873873, + "learning_rate": 9.661256078744289e-06, + "loss": 0.9914, + "step": 2024 + }, + { + "epoch": 0.29, + "grad_norm": 7.509963649303283, + "learning_rate": 9.660838155120816e-06, + "loss": 1.0219, + "step": 2025 + }, + { + "epoch": 0.29, + "grad_norm": 6.849853422176337, + "learning_rate": 9.660419982901208e-06, + "loss": 1.0191, + "step": 2026 + }, + { + "epoch": 0.29, + "grad_norm": 7.689745686433694, + "learning_rate": 9.66000156210777e-06, + "loss": 1.0768, + "step": 2027 + }, + { + "epoch": 0.29, + "grad_norm": 9.175916230576386, + "learning_rate": 9.659582892762822e-06, + "loss": 1.1737, + "step": 2028 + }, + { + "epoch": 0.29, + "grad_norm": 6.696654503643452, + "learning_rate": 9.65916397488869e-06, + "loss": 1.0059, + "step": 2029 + }, + { + "epoch": 0.29, + "grad_norm": 7.054464396831368, + "learning_rate": 9.658744808507724e-06, + "loss": 1.0509, + "step": 2030 + }, + { + "epoch": 0.29, + "grad_norm": 7.921764542616475, + "learning_rate": 9.658325393642274e-06, + "loss": 0.9336, + "step": 2031 + }, + { + "epoch": 0.29, + "grad_norm": 8.657813181967308, + "learning_rate": 9.657905730314716e-06, + "loss": 1.0242, + "step": 2032 + }, + { + "epoch": 0.29, + "grad_norm": 7.151539850328694, + "learning_rate": 9.65748581854743e-06, + "loss": 1.1031, + "step": 2033 + }, + { + "epoch": 0.29, + "grad_norm": 8.267207744339643, + "learning_rate": 9.657065658362816e-06, + "loss": 1.0854, + "step": 2034 + }, + { + "epoch": 0.29, + "grad_norm": 
10.478557323232193, + "learning_rate": 9.656645249783284e-06, + "loss": 1.0387, + "step": 2035 + }, + { + "epoch": 0.29, + "grad_norm": 10.707462310904496, + "learning_rate": 9.656224592831253e-06, + "loss": 1.091, + "step": 2036 + }, + { + "epoch": 0.29, + "grad_norm": 8.740519658353039, + "learning_rate": 9.655803687529165e-06, + "loss": 1.1111, + "step": 2037 + }, + { + "epoch": 0.29, + "grad_norm": 9.051659968281324, + "learning_rate": 9.65538253389947e-06, + "loss": 1.1221, + "step": 2038 + }, + { + "epoch": 0.29, + "grad_norm": 9.181274510935006, + "learning_rate": 9.654961131964624e-06, + "loss": 1.1088, + "step": 2039 + }, + { + "epoch": 0.29, + "grad_norm": 8.628733124859114, + "learning_rate": 9.654539481747111e-06, + "loss": 1.0943, + "step": 2040 + }, + { + "epoch": 0.29, + "grad_norm": 7.191720464779557, + "learning_rate": 9.65411758326942e-06, + "loss": 1.0393, + "step": 2041 + }, + { + "epoch": 0.29, + "grad_norm": 7.192243453442892, + "learning_rate": 9.653695436554049e-06, + "loss": 1.1421, + "step": 2042 + }, + { + "epoch": 0.29, + "grad_norm": 10.16462661572445, + "learning_rate": 9.653273041623518e-06, + "loss": 1.1281, + "step": 2043 + }, + { + "epoch": 0.29, + "grad_norm": 6.425671241901625, + "learning_rate": 9.652850398500355e-06, + "loss": 0.9938, + "step": 2044 + }, + { + "epoch": 0.29, + "grad_norm": 11.24769995418706, + "learning_rate": 9.652427507207105e-06, + "loss": 1.0953, + "step": 2045 + }, + { + "epoch": 0.29, + "grad_norm": 7.710047740459721, + "learning_rate": 9.65200436776632e-06, + "loss": 1.0289, + "step": 2046 + }, + { + "epoch": 0.29, + "grad_norm": 8.677828377750195, + "learning_rate": 9.651580980200572e-06, + "loss": 1.0814, + "step": 2047 + }, + { + "epoch": 0.29, + "grad_norm": 7.879340970652558, + "learning_rate": 9.651157344532441e-06, + "loss": 1.0364, + "step": 2048 + }, + { + "epoch": 0.29, + "grad_norm": 10.250442028083263, + "learning_rate": 9.650733460784525e-06, + "loss": 0.9997, + "step": 2049 + }, + { + 
"epoch": 0.29, + "grad_norm": 10.873559505029274, + "learning_rate": 9.65030932897943e-06, + "loss": 1.0781, + "step": 2050 + }, + { + "epoch": 0.29, + "grad_norm": 5.753204169588961, + "learning_rate": 9.64988494913978e-06, + "loss": 1.0331, + "step": 2051 + }, + { + "epoch": 0.29, + "grad_norm": 8.085744397145628, + "learning_rate": 9.649460321288211e-06, + "loss": 1.1077, + "step": 2052 + }, + { + "epoch": 0.29, + "grad_norm": 7.024671619218055, + "learning_rate": 9.64903544544737e-06, + "loss": 1.0943, + "step": 2053 + }, + { + "epoch": 0.29, + "grad_norm": 8.041025112802977, + "learning_rate": 9.64861032163992e-06, + "loss": 1.115, + "step": 2054 + }, + { + "epoch": 0.29, + "grad_norm": 7.731517698203999, + "learning_rate": 9.648184949888532e-06, + "loss": 1.167, + "step": 2055 + }, + { + "epoch": 0.29, + "grad_norm": 9.087377110456426, + "learning_rate": 9.6477593302159e-06, + "loss": 1.0472, + "step": 2056 + }, + { + "epoch": 0.29, + "grad_norm": 7.4762204868527045, + "learning_rate": 9.647333462644722e-06, + "loss": 1.0689, + "step": 2057 + }, + { + "epoch": 0.29, + "grad_norm": 11.146138983902588, + "learning_rate": 9.646907347197712e-06, + "loss": 1.1483, + "step": 2058 + }, + { + "epoch": 0.29, + "grad_norm": 6.520164528057731, + "learning_rate": 9.6464809838976e-06, + "loss": 1.0394, + "step": 2059 + }, + { + "epoch": 0.29, + "grad_norm": 7.528407048070159, + "learning_rate": 9.646054372767125e-06, + "loss": 1.0744, + "step": 2060 + }, + { + "epoch": 0.29, + "grad_norm": 12.376177416473231, + "learning_rate": 9.645627513829041e-06, + "loss": 1.0835, + "step": 2061 + }, + { + "epoch": 0.29, + "grad_norm": 7.83831556619916, + "learning_rate": 9.645200407106119e-06, + "loss": 1.0801, + "step": 2062 + }, + { + "epoch": 0.29, + "grad_norm": 8.84043173380835, + "learning_rate": 9.644773052621137e-06, + "loss": 1.0267, + "step": 2063 + }, + { + "epoch": 0.29, + "grad_norm": 8.14001636630391, + "learning_rate": 9.644345450396888e-06, + "loss": 1.0016, + "step": 
2064 + }, + { + "epoch": 0.29, + "grad_norm": 11.385091197157804, + "learning_rate": 9.64391760045618e-06, + "loss": 0.9918, + "step": 2065 + }, + { + "epoch": 0.29, + "grad_norm": 8.510116541362503, + "learning_rate": 9.643489502821834e-06, + "loss": 0.993, + "step": 2066 + }, + { + "epoch": 0.29, + "grad_norm": 7.332159824101713, + "learning_rate": 9.643061157516684e-06, + "loss": 1.1229, + "step": 2067 + }, + { + "epoch": 0.29, + "grad_norm": 7.858878957788132, + "learning_rate": 9.642632564563576e-06, + "loss": 1.0401, + "step": 2068 + }, + { + "epoch": 0.3, + "grad_norm": 9.546796542375134, + "learning_rate": 9.642203723985368e-06, + "loss": 1.024, + "step": 2069 + }, + { + "epoch": 0.3, + "grad_norm": 9.229237839283428, + "learning_rate": 9.641774635804936e-06, + "loss": 0.9946, + "step": 2070 + }, + { + "epoch": 0.3, + "grad_norm": 9.122236688099429, + "learning_rate": 9.641345300045165e-06, + "loss": 1.0191, + "step": 2071 + }, + { + "epoch": 0.3, + "grad_norm": 6.538443342242851, + "learning_rate": 9.640915716728958e-06, + "loss": 1.0675, + "step": 2072 + }, + { + "epoch": 0.3, + "grad_norm": 8.380795915655806, + "learning_rate": 9.64048588587922e-06, + "loss": 1.0995, + "step": 2073 + }, + { + "epoch": 0.3, + "grad_norm": 7.624246779940584, + "learning_rate": 9.640055807518885e-06, + "loss": 1.1111, + "step": 2074 + }, + { + "epoch": 0.3, + "grad_norm": 7.956358725634775, + "learning_rate": 9.63962548167089e-06, + "loss": 0.9772, + "step": 2075 + }, + { + "epoch": 0.3, + "grad_norm": 10.181828578560243, + "learning_rate": 9.639194908358183e-06, + "loss": 0.9949, + "step": 2076 + }, + { + "epoch": 0.3, + "grad_norm": 8.285022434749509, + "learning_rate": 9.638764087603734e-06, + "loss": 1.0435, + "step": 2077 + }, + { + "epoch": 0.3, + "grad_norm": 8.947137501023203, + "learning_rate": 9.638333019430522e-06, + "loss": 1.0724, + "step": 2078 + }, + { + "epoch": 0.3, + "grad_norm": 7.174106091109159, + "learning_rate": 9.637901703861535e-06, + "loss": 
1.0217, + "step": 2079 + }, + { + "epoch": 0.3, + "grad_norm": 9.31594110608283, + "learning_rate": 9.637470140919782e-06, + "loss": 1.0469, + "step": 2080 + }, + { + "epoch": 0.3, + "grad_norm": 8.87873651540629, + "learning_rate": 9.637038330628281e-06, + "loss": 1.052, + "step": 2081 + }, + { + "epoch": 0.3, + "grad_norm": 9.94173739140344, + "learning_rate": 9.636606273010062e-06, + "loss": 1.0484, + "step": 2082 + }, + { + "epoch": 0.3, + "grad_norm": 8.005803500496363, + "learning_rate": 9.63617396808817e-06, + "loss": 1.0489, + "step": 2083 + }, + { + "epoch": 0.3, + "grad_norm": 8.191442950085042, + "learning_rate": 9.635741415885664e-06, + "loss": 1.055, + "step": 2084 + }, + { + "epoch": 0.3, + "grad_norm": 8.163599700417802, + "learning_rate": 9.635308616425616e-06, + "loss": 1.0325, + "step": 2085 + }, + { + "epoch": 0.3, + "grad_norm": 11.302264697588845, + "learning_rate": 9.634875569731108e-06, + "loss": 1.068, + "step": 2086 + }, + { + "epoch": 0.3, + "grad_norm": 8.390912545166904, + "learning_rate": 9.634442275825238e-06, + "loss": 1.0067, + "step": 2087 + }, + { + "epoch": 0.3, + "grad_norm": 8.779625317174991, + "learning_rate": 9.634008734731117e-06, + "loss": 1.0529, + "step": 2088 + }, + { + "epoch": 0.3, + "grad_norm": 10.957708470522926, + "learning_rate": 9.633574946471868e-06, + "loss": 1.0424, + "step": 2089 + }, + { + "epoch": 0.3, + "grad_norm": 12.551785843842627, + "learning_rate": 9.633140911070629e-06, + "loss": 1.0281, + "step": 2090 + }, + { + "epoch": 0.3, + "grad_norm": 8.586079315017114, + "learning_rate": 9.632706628550553e-06, + "loss": 1.0729, + "step": 2091 + }, + { + "epoch": 0.3, + "grad_norm": 7.669264096683216, + "learning_rate": 9.632272098934798e-06, + "loss": 1.0445, + "step": 2092 + }, + { + "epoch": 0.3, + "grad_norm": 10.032114214635005, + "learning_rate": 9.631837322246544e-06, + "loss": 1.0767, + "step": 2093 + }, + { + "epoch": 0.3, + "grad_norm": 8.957156283562162, + "learning_rate": 9.63140229850898e-06, + 
"loss": 1.0368, + "step": 2094 + }, + { + "epoch": 0.3, + "grad_norm": 6.4373006846877905, + "learning_rate": 9.630967027745307e-06, + "loss": 1.0407, + "step": 2095 + }, + { + "epoch": 0.3, + "grad_norm": 7.348525033862229, + "learning_rate": 9.630531509978746e-06, + "loss": 1.0619, + "step": 2096 + }, + { + "epoch": 0.3, + "grad_norm": 9.535497840188345, + "learning_rate": 9.630095745232524e-06, + "loss": 1.2093, + "step": 2097 + }, + { + "epoch": 0.3, + "grad_norm": 9.446171339505872, + "learning_rate": 9.629659733529882e-06, + "loss": 1.1234, + "step": 2098 + }, + { + "epoch": 0.3, + "grad_norm": 7.006351021616579, + "learning_rate": 9.629223474894076e-06, + "loss": 1.095, + "step": 2099 + }, + { + "epoch": 0.3, + "grad_norm": 7.5565027738052395, + "learning_rate": 9.628786969348374e-06, + "loss": 1.1924, + "step": 2100 + }, + { + "epoch": 0.3, + "grad_norm": 9.775133582080006, + "learning_rate": 9.628350216916062e-06, + "loss": 1.0817, + "step": 2101 + }, + { + "epoch": 0.3, + "grad_norm": 5.6273000616664, + "learning_rate": 9.62791321762043e-06, + "loss": 1.0649, + "step": 2102 + }, + { + "epoch": 0.3, + "grad_norm": 7.377795555207009, + "learning_rate": 9.627475971484792e-06, + "loss": 1.1539, + "step": 2103 + }, + { + "epoch": 0.3, + "grad_norm": 6.982305249669038, + "learning_rate": 9.627038478532465e-06, + "loss": 1.0958, + "step": 2104 + }, + { + "epoch": 0.3, + "grad_norm": 8.21417556403247, + "learning_rate": 9.626600738786784e-06, + "loss": 1.1271, + "step": 2105 + }, + { + "epoch": 0.3, + "grad_norm": 6.031548733252828, + "learning_rate": 9.626162752271098e-06, + "loss": 1.0752, + "step": 2106 + }, + { + "epoch": 0.3, + "grad_norm": 10.1997763187363, + "learning_rate": 9.625724519008768e-06, + "loss": 1.0653, + "step": 2107 + }, + { + "epoch": 0.3, + "grad_norm": 6.91770518625655, + "learning_rate": 9.625286039023168e-06, + "loss": 1.0433, + "step": 2108 + }, + { + "epoch": 0.3, + "grad_norm": 7.817351811187885, + "learning_rate": 
9.624847312337687e-06, + "loss": 1.0374, + "step": 2109 + }, + { + "epoch": 0.3, + "grad_norm": 8.795799719908066, + "learning_rate": 9.624408338975722e-06, + "loss": 1.0352, + "step": 2110 + }, + { + "epoch": 0.3, + "grad_norm": 8.640170396423619, + "learning_rate": 9.623969118960688e-06, + "loss": 0.9525, + "step": 2111 + }, + { + "epoch": 0.3, + "grad_norm": 7.441484424796507, + "learning_rate": 9.623529652316012e-06, + "loss": 1.0374, + "step": 2112 + }, + { + "epoch": 0.3, + "grad_norm": 8.904538065785363, + "learning_rate": 9.623089939065135e-06, + "loss": 0.9936, + "step": 2113 + }, + { + "epoch": 0.3, + "grad_norm": 9.628452262183064, + "learning_rate": 9.62264997923151e-06, + "loss": 1.0511, + "step": 2114 + }, + { + "epoch": 0.3, + "grad_norm": 8.424939995202886, + "learning_rate": 9.6222097728386e-06, + "loss": 0.9575, + "step": 2115 + }, + { + "epoch": 0.3, + "grad_norm": 7.723630012113662, + "learning_rate": 9.621769319909885e-06, + "loss": 1.0855, + "step": 2116 + }, + { + "epoch": 0.3, + "grad_norm": 9.106656354607685, + "learning_rate": 9.621328620468865e-06, + "loss": 0.9901, + "step": 2117 + }, + { + "epoch": 0.3, + "grad_norm": 7.172786308182292, + "learning_rate": 9.620887674539035e-06, + "loss": 1.0429, + "step": 2118 + }, + { + "epoch": 0.3, + "grad_norm": 7.329246084334181, + "learning_rate": 9.620446482143921e-06, + "loss": 1.0266, + "step": 2119 + }, + { + "epoch": 0.3, + "grad_norm": 7.5011926211350595, + "learning_rate": 9.620005043307053e-06, + "loss": 1.0279, + "step": 2120 + }, + { + "epoch": 0.3, + "grad_norm": 6.664362844871323, + "learning_rate": 9.619563358051976e-06, + "loss": 1.0281, + "step": 2121 + }, + { + "epoch": 0.3, + "grad_norm": 9.167707184262524, + "learning_rate": 9.619121426402247e-06, + "loss": 1.091, + "step": 2122 + }, + { + "epoch": 0.3, + "grad_norm": 7.77783320634712, + "learning_rate": 9.61867924838144e-06, + "loss": 1.0549, + "step": 2123 + }, + { + "epoch": 0.3, + "grad_norm": 8.503674253730134, + 
"learning_rate": 9.618236824013137e-06, + "loss": 1.0453, + "step": 2124 + }, + { + "epoch": 0.3, + "grad_norm": 11.637002846233978, + "learning_rate": 9.617794153320936e-06, + "loss": 1.1237, + "step": 2125 + }, + { + "epoch": 0.3, + "grad_norm": 7.776751776941537, + "learning_rate": 9.61735123632845e-06, + "loss": 1.0635, + "step": 2126 + }, + { + "epoch": 0.3, + "grad_norm": 6.886769423929575, + "learning_rate": 9.6169080730593e-06, + "loss": 1.1515, + "step": 2127 + }, + { + "epoch": 0.3, + "grad_norm": 7.480397191558232, + "learning_rate": 9.616464663537127e-06, + "loss": 1.115, + "step": 2128 + }, + { + "epoch": 0.3, + "grad_norm": 7.835094914215936, + "learning_rate": 9.616021007785576e-06, + "loss": 1.0779, + "step": 2129 + }, + { + "epoch": 0.3, + "grad_norm": 7.849190687958196, + "learning_rate": 9.615577105828315e-06, + "loss": 1.0512, + "step": 2130 + }, + { + "epoch": 0.3, + "grad_norm": 7.963320761682995, + "learning_rate": 9.615132957689018e-06, + "loss": 1.0031, + "step": 2131 + }, + { + "epoch": 0.3, + "grad_norm": 5.755355713981442, + "learning_rate": 9.614688563391376e-06, + "loss": 1.0631, + "step": 2132 + }, + { + "epoch": 0.3, + "grad_norm": 7.189805673385065, + "learning_rate": 9.61424392295909e-06, + "loss": 1.0375, + "step": 2133 + }, + { + "epoch": 0.3, + "grad_norm": 11.2438013521433, + "learning_rate": 9.613799036415878e-06, + "loss": 1.0282, + "step": 2134 + }, + { + "epoch": 0.3, + "grad_norm": 8.062128104918983, + "learning_rate": 9.61335390378547e-06, + "loss": 1.0592, + "step": 2135 + }, + { + "epoch": 0.3, + "grad_norm": 10.707684014550134, + "learning_rate": 9.612908525091602e-06, + "loss": 0.9918, + "step": 2136 + }, + { + "epoch": 0.3, + "grad_norm": 8.405732243621534, + "learning_rate": 9.612462900358034e-06, + "loss": 0.9936, + "step": 2137 + }, + { + "epoch": 0.3, + "grad_norm": 5.740884433600016, + "learning_rate": 9.612017029608537e-06, + "loss": 0.9868, + "step": 2138 + }, + { + "epoch": 0.31, + "grad_norm": 
7.961321018355006, + "learning_rate": 9.611570912866888e-06, + "loss": 1.1276, + "step": 2139 + }, + { + "epoch": 0.31, + "grad_norm": 7.463066800148788, + "learning_rate": 9.61112455015688e-06, + "loss": 1.04, + "step": 2140 + }, + { + "epoch": 0.31, + "grad_norm": 8.924382052934371, + "learning_rate": 9.610677941502327e-06, + "loss": 1.0227, + "step": 2141 + }, + { + "epoch": 0.31, + "grad_norm": 9.111775934277325, + "learning_rate": 9.610231086927047e-06, + "loss": 1.0711, + "step": 2142 + }, + { + "epoch": 0.31, + "grad_norm": 9.098440813939852, + "learning_rate": 9.609783986454871e-06, + "loss": 1.0038, + "step": 2143 + }, + { + "epoch": 0.31, + "grad_norm": 7.584994481319092, + "learning_rate": 9.60933664010965e-06, + "loss": 1.0422, + "step": 2144 + }, + { + "epoch": 0.31, + "grad_norm": 8.69487637747444, + "learning_rate": 9.608889047915241e-06, + "loss": 1.0226, + "step": 2145 + }, + { + "epoch": 0.31, + "grad_norm": 10.330553447351994, + "learning_rate": 9.60844120989552e-06, + "loss": 1.1695, + "step": 2146 + }, + { + "epoch": 0.31, + "grad_norm": 8.712084038442681, + "learning_rate": 9.607993126074374e-06, + "loss": 1.0353, + "step": 2147 + }, + { + "epoch": 0.31, + "grad_norm": 9.581255662576392, + "learning_rate": 9.6075447964757e-06, + "loss": 1.0957, + "step": 2148 + }, + { + "epoch": 0.31, + "grad_norm": 9.521459246284385, + "learning_rate": 9.607096221123411e-06, + "loss": 1.0536, + "step": 2149 + }, + { + "epoch": 0.31, + "grad_norm": 9.007546695781919, + "learning_rate": 9.606647400041436e-06, + "loss": 1.0372, + "step": 2150 + }, + { + "epoch": 0.31, + "grad_norm": 8.498443004832637, + "learning_rate": 9.60619833325371e-06, + "loss": 1.0779, + "step": 2151 + }, + { + "epoch": 0.31, + "grad_norm": 7.774681283723685, + "learning_rate": 9.605749020784185e-06, + "loss": 1.0677, + "step": 2152 + }, + { + "epoch": 0.31, + "grad_norm": 8.387361793669214, + "learning_rate": 9.605299462656828e-06, + "loss": 0.9984, + "step": 2153 + }, + { + "epoch": 
0.31, + "grad_norm": 8.296882833706519, + "learning_rate": 9.604849658895617e-06, + "loss": 1.1405, + "step": 2154 + }, + { + "epoch": 0.31, + "grad_norm": 7.768169185640491, + "learning_rate": 9.604399609524543e-06, + "loss": 0.9635, + "step": 2155 + }, + { + "epoch": 0.31, + "grad_norm": 7.103132950160686, + "learning_rate": 9.603949314567612e-06, + "loss": 1.0373, + "step": 2156 + }, + { + "epoch": 0.31, + "grad_norm": 7.529350627552371, + "learning_rate": 9.603498774048836e-06, + "loss": 0.9827, + "step": 2157 + }, + { + "epoch": 0.31, + "grad_norm": 6.205579421754026, + "learning_rate": 9.603047987992252e-06, + "loss": 1.0807, + "step": 2158 + }, + { + "epoch": 0.31, + "grad_norm": 7.714925196374659, + "learning_rate": 9.6025969564219e-06, + "loss": 1.0006, + "step": 2159 + }, + { + "epoch": 0.31, + "grad_norm": 7.170376183941092, + "learning_rate": 9.602145679361839e-06, + "loss": 1.0197, + "step": 2160 + }, + { + "epoch": 0.31, + "grad_norm": 8.184316640446129, + "learning_rate": 9.601694156836136e-06, + "loss": 1.135, + "step": 2161 + }, + { + "epoch": 0.31, + "grad_norm": 7.148667606574356, + "learning_rate": 9.601242388868876e-06, + "loss": 0.9765, + "step": 2162 + }, + { + "epoch": 0.31, + "grad_norm": 7.430890881971933, + "learning_rate": 9.600790375484155e-06, + "loss": 0.9014, + "step": 2163 + }, + { + "epoch": 0.31, + "grad_norm": 6.864953718054729, + "learning_rate": 9.60033811670608e-06, + "loss": 1.0135, + "step": 2164 + }, + { + "epoch": 0.31, + "grad_norm": 8.65236328038838, + "learning_rate": 9.599885612558778e-06, + "loss": 1.0221, + "step": 2165 + }, + { + "epoch": 0.31, + "grad_norm": 7.193155248642213, + "learning_rate": 9.59943286306638e-06, + "loss": 1.045, + "step": 2166 + }, + { + "epoch": 0.31, + "grad_norm": 9.867874432367643, + "learning_rate": 9.598979868253035e-06, + "loss": 1.0802, + "step": 2167 + }, + { + "epoch": 0.31, + "grad_norm": 6.17785981581886, + "learning_rate": 9.598526628142906e-06, + "loss": 1.0264, + "step": 2168 + 
}, + { + "epoch": 0.31, + "grad_norm": 6.7330059374966496, + "learning_rate": 9.598073142760166e-06, + "loss": 0.9929, + "step": 2169 + }, + { + "epoch": 0.31, + "grad_norm": 10.26933621454625, + "learning_rate": 9.597619412129005e-06, + "loss": 1.0717, + "step": 2170 + }, + { + "epoch": 0.31, + "grad_norm": 7.3337880263143935, + "learning_rate": 9.59716543627362e-06, + "loss": 1.0728, + "step": 2171 + }, + { + "epoch": 0.31, + "grad_norm": 10.341336281315773, + "learning_rate": 9.596711215218228e-06, + "loss": 1.1363, + "step": 2172 + }, + { + "epoch": 0.31, + "grad_norm": 6.446547051441781, + "learning_rate": 9.596256748987055e-06, + "loss": 1.0872, + "step": 2173 + }, + { + "epoch": 0.31, + "grad_norm": 8.927897544255718, + "learning_rate": 9.59580203760434e-06, + "loss": 1.1313, + "step": 2174 + }, + { + "epoch": 0.31, + "grad_norm": 7.309231821670476, + "learning_rate": 9.595347081094338e-06, + "loss": 1.0416, + "step": 2175 + }, + { + "epoch": 0.31, + "grad_norm": 7.370369801571239, + "learning_rate": 9.594891879481313e-06, + "loss": 0.9035, + "step": 2176 + }, + { + "epoch": 0.31, + "grad_norm": 9.994597045607804, + "learning_rate": 9.594436432789544e-06, + "loss": 1.0535, + "step": 2177 + }, + { + "epoch": 0.31, + "grad_norm": 7.301456920618664, + "learning_rate": 9.593980741043327e-06, + "loss": 1.075, + "step": 2178 + }, + { + "epoch": 0.31, + "grad_norm": 9.85854573662901, + "learning_rate": 9.593524804266963e-06, + "loss": 1.0652, + "step": 2179 + }, + { + "epoch": 0.31, + "grad_norm": 6.713901065636905, + "learning_rate": 9.593068622484774e-06, + "loss": 1.0024, + "step": 2180 + }, + { + "epoch": 0.31, + "grad_norm": 6.9621895284232584, + "learning_rate": 9.592612195721087e-06, + "loss": 0.9944, + "step": 2181 + }, + { + "epoch": 0.31, + "grad_norm": 8.05721003845873, + "learning_rate": 9.592155524000251e-06, + "loss": 1.1368, + "step": 2182 + }, + { + "epoch": 0.31, + "grad_norm": 10.588226265688835, + "learning_rate": 9.59169860734662e-06, + "loss": 
1.0853, + "step": 2183 + }, + { + "epoch": 0.31, + "grad_norm": 8.03822943737401, + "learning_rate": 9.591241445784568e-06, + "loss": 1.0475, + "step": 2184 + }, + { + "epoch": 0.31, + "grad_norm": 7.999269742760204, + "learning_rate": 9.590784039338476e-06, + "loss": 1.0919, + "step": 2185 + }, + { + "epoch": 0.31, + "grad_norm": 11.25605853583919, + "learning_rate": 9.590326388032742e-06, + "loss": 1.0911, + "step": 2186 + }, + { + "epoch": 0.31, + "grad_norm": 8.428339048914516, + "learning_rate": 9.589868491891775e-06, + "loss": 1.052, + "step": 2187 + }, + { + "epoch": 0.31, + "grad_norm": 10.292416267886098, + "learning_rate": 9.589410350940001e-06, + "loss": 1.0667, + "step": 2188 + }, + { + "epoch": 0.31, + "grad_norm": 8.032845290660584, + "learning_rate": 9.588951965201851e-06, + "loss": 1.0817, + "step": 2189 + }, + { + "epoch": 0.31, + "grad_norm": 8.028546502806828, + "learning_rate": 9.588493334701778e-06, + "loss": 1.082, + "step": 2190 + }, + { + "epoch": 0.31, + "grad_norm": 9.107260494806159, + "learning_rate": 9.588034459464244e-06, + "loss": 1.0617, + "step": 2191 + }, + { + "epoch": 0.31, + "grad_norm": 6.744237548749001, + "learning_rate": 9.58757533951372e-06, + "loss": 1.0576, + "step": 2192 + }, + { + "epoch": 0.31, + "grad_norm": 7.450626760928272, + "learning_rate": 9.5871159748747e-06, + "loss": 1.0749, + "step": 2193 + }, + { + "epoch": 0.31, + "grad_norm": 7.871680748493663, + "learning_rate": 9.586656365571679e-06, + "loss": 1.1564, + "step": 2194 + }, + { + "epoch": 0.31, + "grad_norm": 8.582547182287465, + "learning_rate": 9.586196511629176e-06, + "loss": 0.9883, + "step": 2195 + }, + { + "epoch": 0.31, + "grad_norm": 8.409678869350286, + "learning_rate": 9.585736413071718e-06, + "loss": 1.1043, + "step": 2196 + }, + { + "epoch": 0.31, + "grad_norm": 6.909348384091736, + "learning_rate": 9.585276069923843e-06, + "loss": 1.1768, + "step": 2197 + }, + { + "epoch": 0.31, + "grad_norm": 10.542928342334992, + "learning_rate": 
9.584815482210105e-06, + "loss": 1.0731, + "step": 2198 + }, + { + "epoch": 0.31, + "grad_norm": 10.905056353968735, + "learning_rate": 9.584354649955072e-06, + "loss": 1.0796, + "step": 2199 + }, + { + "epoch": 0.31, + "grad_norm": 7.485371975286139, + "learning_rate": 9.583893573183323e-06, + "loss": 1.0649, + "step": 2200 + }, + { + "epoch": 0.31, + "grad_norm": 8.606021880061077, + "learning_rate": 9.583432251919448e-06, + "loss": 1.13, + "step": 2201 + }, + { + "epoch": 0.31, + "grad_norm": 10.660306351375874, + "learning_rate": 9.582970686188058e-06, + "loss": 1.0261, + "step": 2202 + }, + { + "epoch": 0.31, + "grad_norm": 9.272955101928384, + "learning_rate": 9.582508876013767e-06, + "loss": 1.0808, + "step": 2203 + }, + { + "epoch": 0.31, + "grad_norm": 8.618661410188803, + "learning_rate": 9.582046821421207e-06, + "loss": 1.0174, + "step": 2204 + }, + { + "epoch": 0.31, + "grad_norm": 8.195076192885104, + "learning_rate": 9.581584522435025e-06, + "loss": 1.0566, + "step": 2205 + }, + { + "epoch": 0.31, + "grad_norm": 10.3535333187097, + "learning_rate": 9.581121979079878e-06, + "loss": 1.0124, + "step": 2206 + }, + { + "epoch": 0.31, + "grad_norm": 11.244489366383936, + "learning_rate": 9.580659191380434e-06, + "loss": 1.0319, + "step": 2207 + }, + { + "epoch": 0.31, + "grad_norm": 10.278593803389317, + "learning_rate": 9.58019615936138e-06, + "loss": 1.082, + "step": 2208 + }, + { + "epoch": 0.32, + "grad_norm": 7.610926213116721, + "learning_rate": 9.579732883047413e-06, + "loss": 1.0681, + "step": 2209 + }, + { + "epoch": 0.32, + "grad_norm": 9.081550065491143, + "learning_rate": 9.57926936246324e-06, + "loss": 1.0428, + "step": 2210 + }, + { + "epoch": 0.32, + "grad_norm": 9.829110693826664, + "learning_rate": 9.578805597633587e-06, + "loss": 1.0726, + "step": 2211 + }, + { + "epoch": 0.32, + "grad_norm": 8.5441931748098, + "learning_rate": 9.578341588583188e-06, + "loss": 1.0674, + "step": 2212 + }, + { + "epoch": 0.32, + "grad_norm": 
9.661461758975802, + "learning_rate": 9.577877335336794e-06, + "loss": 1.1188, + "step": 2213 + }, + { + "epoch": 0.32, + "grad_norm": 7.289461623463051, + "learning_rate": 9.577412837919164e-06, + "loss": 0.9961, + "step": 2214 + }, + { + "epoch": 0.32, + "grad_norm": 6.840558995002687, + "learning_rate": 9.576948096355076e-06, + "loss": 0.9798, + "step": 2215 + }, + { + "epoch": 0.32, + "grad_norm": 5.512098052691788, + "learning_rate": 9.576483110669314e-06, + "loss": 1.105, + "step": 2216 + }, + { + "epoch": 0.32, + "grad_norm": 8.80092768892146, + "learning_rate": 9.576017880886685e-06, + "loss": 1.1777, + "step": 2217 + }, + { + "epoch": 0.32, + "grad_norm": 9.887231003715133, + "learning_rate": 9.575552407031997e-06, + "loss": 0.9976, + "step": 2218 + }, + { + "epoch": 0.32, + "grad_norm": 7.726788417390652, + "learning_rate": 9.575086689130081e-06, + "loss": 1.1018, + "step": 2219 + }, + { + "epoch": 0.32, + "grad_norm": 8.900642350990903, + "learning_rate": 9.574620727205776e-06, + "loss": 1.0607, + "step": 2220 + }, + { + "epoch": 0.32, + "grad_norm": 7.8155188108462506, + "learning_rate": 9.574154521283933e-06, + "loss": 1.0258, + "step": 2221 + }, + { + "epoch": 0.32, + "grad_norm": 9.32382295223785, + "learning_rate": 9.573688071389423e-06, + "loss": 1.0473, + "step": 2222 + }, + { + "epoch": 0.32, + "grad_norm": 9.554531188610872, + "learning_rate": 9.573221377547119e-06, + "loss": 1.0478, + "step": 2223 + }, + { + "epoch": 0.32, + "grad_norm": 10.119855749420037, + "learning_rate": 9.572754439781919e-06, + "loss": 1.066, + "step": 2224 + }, + { + "epoch": 0.32, + "grad_norm": 7.5733940236493575, + "learning_rate": 9.572287258118724e-06, + "loss": 1.0748, + "step": 2225 + }, + { + "epoch": 0.32, + "grad_norm": 9.177719399190766, + "learning_rate": 9.571819832582456e-06, + "loss": 1.0667, + "step": 2226 + }, + { + "epoch": 0.32, + "grad_norm": 10.604441385915276, + "learning_rate": 9.571352163198043e-06, + "loss": 1.0242, + "step": 2227 + }, + { + 
"epoch": 0.32, + "grad_norm": 6.200887943379558, + "learning_rate": 9.570884249990429e-06, + "loss": 1.1587, + "step": 2228 + }, + { + "epoch": 0.32, + "grad_norm": 8.119719126685471, + "learning_rate": 9.570416092984571e-06, + "loss": 1.069, + "step": 2229 + }, + { + "epoch": 0.32, + "grad_norm": 8.73204441813622, + "learning_rate": 9.569947692205443e-06, + "loss": 1.147, + "step": 2230 + }, + { + "epoch": 0.32, + "grad_norm": 8.21185015899105, + "learning_rate": 9.569479047678022e-06, + "loss": 1.0298, + "step": 2231 + }, + { + "epoch": 0.32, + "grad_norm": 7.654605272192691, + "learning_rate": 9.569010159427311e-06, + "loss": 1.1006, + "step": 2232 + }, + { + "epoch": 0.32, + "grad_norm": 8.291978052349284, + "learning_rate": 9.568541027478317e-06, + "loss": 1.0507, + "step": 2233 + }, + { + "epoch": 0.32, + "grad_norm": 7.637628597585577, + "learning_rate": 9.56807165185606e-06, + "loss": 1.0872, + "step": 2234 + }, + { + "epoch": 0.32, + "grad_norm": 7.816016710386535, + "learning_rate": 9.567602032585576e-06, + "loss": 0.9278, + "step": 2235 + }, + { + "epoch": 0.32, + "grad_norm": 8.886602128778007, + "learning_rate": 9.567132169691914e-06, + "loss": 1.0387, + "step": 2236 + }, + { + "epoch": 0.32, + "grad_norm": 5.662187781015992, + "learning_rate": 9.566662063200134e-06, + "loss": 1.041, + "step": 2237 + }, + { + "epoch": 0.32, + "grad_norm": 9.621655776193288, + "learning_rate": 9.566191713135311e-06, + "loss": 1.0336, + "step": 2238 + }, + { + "epoch": 0.32, + "grad_norm": 8.98086217760965, + "learning_rate": 9.565721119522534e-06, + "loss": 1.0546, + "step": 2239 + }, + { + "epoch": 0.32, + "grad_norm": 7.78814294043072, + "learning_rate": 9.565250282386898e-06, + "loss": 1.1032, + "step": 2240 + }, + { + "epoch": 0.32, + "grad_norm": 6.529368134513618, + "learning_rate": 9.564779201753522e-06, + "loss": 1.1028, + "step": 2241 + }, + { + "epoch": 0.32, + "grad_norm": 6.588588992888949, + "learning_rate": 9.564307877647527e-06, + "loss": 1.0456, + 
"step": 2242 + }, + { + "epoch": 0.32, + "grad_norm": 5.858269505153884, + "learning_rate": 9.563836310094058e-06, + "loss": 1.0605, + "step": 2243 + }, + { + "epoch": 0.32, + "grad_norm": 9.223387426360457, + "learning_rate": 9.563364499118261e-06, + "loss": 1.0396, + "step": 2244 + }, + { + "epoch": 0.32, + "grad_norm": 7.237556874170526, + "learning_rate": 9.562892444745306e-06, + "loss": 1.0344, + "step": 2245 + }, + { + "epoch": 0.32, + "grad_norm": 9.3220184611485, + "learning_rate": 9.562420147000366e-06, + "loss": 1.0919, + "step": 2246 + }, + { + "epoch": 0.32, + "grad_norm": 9.719756663397863, + "learning_rate": 9.561947605908638e-06, + "loss": 0.9036, + "step": 2247 + }, + { + "epoch": 0.32, + "grad_norm": 9.978135926050424, + "learning_rate": 9.561474821495321e-06, + "loss": 1.0421, + "step": 2248 + }, + { + "epoch": 0.32, + "grad_norm": 5.686660346983927, + "learning_rate": 9.561001793785635e-06, + "loss": 1.1319, + "step": 2249 + }, + { + "epoch": 0.32, + "grad_norm": 7.31703470378724, + "learning_rate": 9.560528522804808e-06, + "loss": 1.1045, + "step": 2250 + }, + { + "epoch": 0.32, + "grad_norm": 8.512432451978666, + "learning_rate": 9.560055008578086e-06, + "loss": 1.0613, + "step": 2251 + }, + { + "epoch": 0.32, + "grad_norm": 7.736150387701011, + "learning_rate": 9.55958125113072e-06, + "loss": 1.0207, + "step": 2252 + }, + { + "epoch": 0.32, + "grad_norm": 9.31164783837015, + "learning_rate": 9.559107250487984e-06, + "loss": 1.0445, + "step": 2253 + }, + { + "epoch": 0.32, + "grad_norm": 11.159774877277732, + "learning_rate": 9.558633006675157e-06, + "loss": 1.0836, + "step": 2254 + }, + { + "epoch": 0.32, + "grad_norm": 8.661179634332042, + "learning_rate": 9.558158519717534e-06, + "loss": 1.0225, + "step": 2255 + }, + { + "epoch": 0.32, + "grad_norm": 9.560269909366928, + "learning_rate": 9.557683789640424e-06, + "loss": 1.0615, + "step": 2256 + }, + { + "epoch": 0.32, + "grad_norm": 6.82849869373667, + "learning_rate": 9.557208816469145e-06, 
+ "loss": 0.9946, + "step": 2257 + }, + { + "epoch": 0.32, + "grad_norm": 9.077856004651343, + "learning_rate": 9.556733600229035e-06, + "loss": 1.0633, + "step": 2258 + }, + { + "epoch": 0.32, + "grad_norm": 7.520106782709531, + "learning_rate": 9.556258140945438e-06, + "loss": 0.9994, + "step": 2259 + }, + { + "epoch": 0.32, + "grad_norm": 7.0731679434150445, + "learning_rate": 9.555782438643715e-06, + "loss": 1.0771, + "step": 2260 + }, + { + "epoch": 0.32, + "grad_norm": 8.608043260186026, + "learning_rate": 9.555306493349237e-06, + "loss": 1.1611, + "step": 2261 + }, + { + "epoch": 0.32, + "grad_norm": 6.524486224082052, + "learning_rate": 9.554830305087389e-06, + "loss": 1.1139, + "step": 2262 + }, + { + "epoch": 0.32, + "grad_norm": 8.790823590059075, + "learning_rate": 9.554353873883572e-06, + "loss": 1.065, + "step": 2263 + }, + { + "epoch": 0.32, + "grad_norm": 9.786062507031392, + "learning_rate": 9.553877199763197e-06, + "loss": 1.1594, + "step": 2264 + }, + { + "epoch": 0.32, + "grad_norm": 6.57226582737238, + "learning_rate": 9.553400282751687e-06, + "loss": 1.0143, + "step": 2265 + }, + { + "epoch": 0.32, + "grad_norm": 8.388427701345156, + "learning_rate": 9.552923122874483e-06, + "loss": 0.9551, + "step": 2266 + }, + { + "epoch": 0.32, + "grad_norm": 5.202619792323546, + "learning_rate": 9.55244572015703e-06, + "loss": 1.1025, + "step": 2267 + }, + { + "epoch": 0.32, + "grad_norm": 7.358500407937211, + "learning_rate": 9.551968074624795e-06, + "loss": 1.1497, + "step": 2268 + }, + { + "epoch": 0.32, + "grad_norm": 7.856433022820206, + "learning_rate": 9.551490186303254e-06, + "loss": 1.015, + "step": 2269 + }, + { + "epoch": 0.32, + "grad_norm": 7.62706741604239, + "learning_rate": 9.551012055217895e-06, + "loss": 1.1163, + "step": 2270 + }, + { + "epoch": 0.32, + "grad_norm": 5.964379740356077, + "learning_rate": 9.550533681394221e-06, + "loss": 1.0559, + "step": 2271 + }, + { + "epoch": 0.32, + "grad_norm": 9.463397305458004, + "learning_rate": 
9.550055064857747e-06, + "loss": 1.1322, + "step": 2272 + }, + { + "epoch": 0.32, + "grad_norm": 7.972258605619659, + "learning_rate": 9.549576205634e-06, + "loss": 0.9735, + "step": 2273 + }, + { + "epoch": 0.32, + "grad_norm": 9.404973360948476, + "learning_rate": 9.549097103748524e-06, + "loss": 1.0514, + "step": 2274 + }, + { + "epoch": 0.32, + "grad_norm": 10.604280609000943, + "learning_rate": 9.548617759226868e-06, + "loss": 1.0337, + "step": 2275 + }, + { + "epoch": 0.32, + "grad_norm": 6.453770373713218, + "learning_rate": 9.548138172094606e-06, + "loss": 1.1013, + "step": 2276 + }, + { + "epoch": 0.32, + "grad_norm": 10.546161222234982, + "learning_rate": 9.54765834237731e-06, + "loss": 0.9947, + "step": 2277 + }, + { + "epoch": 0.32, + "grad_norm": 5.394817121969008, + "learning_rate": 9.547178270100578e-06, + "loss": 1.0485, + "step": 2278 + }, + { + "epoch": 0.33, + "grad_norm": 9.342027170831969, + "learning_rate": 9.546697955290014e-06, + "loss": 1.127, + "step": 2279 + }, + { + "epoch": 0.33, + "grad_norm": 8.587062784094343, + "learning_rate": 9.546217397971237e-06, + "loss": 1.0825, + "step": 2280 + }, + { + "epoch": 0.33, + "grad_norm": 7.294277742740312, + "learning_rate": 9.54573659816988e-06, + "loss": 1.1219, + "step": 2281 + }, + { + "epoch": 0.33, + "grad_norm": 8.352519838959001, + "learning_rate": 9.545255555911584e-06, + "loss": 1.0571, + "step": 2282 + }, + { + "epoch": 0.33, + "grad_norm": 6.794009776292457, + "learning_rate": 9.544774271222007e-06, + "loss": 1.082, + "step": 2283 + }, + { + "epoch": 0.33, + "grad_norm": 9.282681610365373, + "learning_rate": 9.544292744126824e-06, + "loss": 1.0834, + "step": 2284 + }, + { + "epoch": 0.33, + "grad_norm": 8.708921670728063, + "learning_rate": 9.543810974651714e-06, + "loss": 1.1551, + "step": 2285 + }, + { + "epoch": 0.33, + "grad_norm": 8.236036646783106, + "learning_rate": 9.543328962822374e-06, + "loss": 1.03, + "step": 2286 + }, + { + "epoch": 0.33, + "grad_norm": 9.463917674658287, 
+ "learning_rate": 9.542846708664515e-06, + "loss": 1.0775, + "step": 2287 + }, + { + "epoch": 0.33, + "grad_norm": 8.044679366758128, + "learning_rate": 9.542364212203856e-06, + "loss": 1.0154, + "step": 2288 + }, + { + "epoch": 0.33, + "grad_norm": 6.377565210663258, + "learning_rate": 9.541881473466134e-06, + "loss": 1.0884, + "step": 2289 + }, + { + "epoch": 0.33, + "grad_norm": 7.994055764668307, + "learning_rate": 9.541398492477096e-06, + "loss": 1.0718, + "step": 2290 + }, + { + "epoch": 0.33, + "grad_norm": 9.619059072338807, + "learning_rate": 9.540915269262505e-06, + "loss": 1.0168, + "step": 2291 + }, + { + "epoch": 0.33, + "grad_norm": 9.175999541726444, + "learning_rate": 9.540431803848134e-06, + "loss": 0.9272, + "step": 2292 + }, + { + "epoch": 0.33, + "grad_norm": 9.513710466879582, + "learning_rate": 9.539948096259766e-06, + "loss": 1.0576, + "step": 2293 + }, + { + "epoch": 0.33, + "grad_norm": 11.620364096308787, + "learning_rate": 9.539464146523206e-06, + "loss": 0.9839, + "step": 2294 + }, + { + "epoch": 0.33, + "grad_norm": 8.960261814655196, + "learning_rate": 9.538979954664262e-06, + "loss": 1.0961, + "step": 2295 + }, + { + "epoch": 0.33, + "grad_norm": 9.51984158602657, + "learning_rate": 9.538495520708764e-06, + "loss": 1.0438, + "step": 2296 + }, + { + "epoch": 0.33, + "grad_norm": 10.059921198183563, + "learning_rate": 9.538010844682546e-06, + "loss": 1.1296, + "step": 2297 + }, + { + "epoch": 0.33, + "grad_norm": 7.747270314539842, + "learning_rate": 9.537525926611462e-06, + "loss": 1.073, + "step": 2298 + }, + { + "epoch": 0.33, + "grad_norm": 11.16469421457769, + "learning_rate": 9.537040766521374e-06, + "loss": 1.1133, + "step": 2299 + }, + { + "epoch": 0.33, + "grad_norm": 7.0379419551978355, + "learning_rate": 9.536555364438163e-06, + "loss": 1.011, + "step": 2300 + }, + { + "epoch": 0.33, + "grad_norm": 5.7839037624944805, + "learning_rate": 9.536069720387715e-06, + "loss": 1.0062, + "step": 2301 + }, + { + "epoch": 0.33, + 
"grad_norm": 7.66845468992758, + "learning_rate": 9.535583834395935e-06, + "loss": 1.113, + "step": 2302 + }, + { + "epoch": 0.33, + "grad_norm": 5.846160576732793, + "learning_rate": 9.53509770648874e-06, + "loss": 1.0449, + "step": 2303 + }, + { + "epoch": 0.33, + "grad_norm": 7.894154841550897, + "learning_rate": 9.534611336692053e-06, + "loss": 1.06, + "step": 2304 + }, + { + "epoch": 0.33, + "grad_norm": 9.63990774827937, + "learning_rate": 9.53412472503182e-06, + "loss": 1.0481, + "step": 2305 + }, + { + "epoch": 0.33, + "grad_norm": 6.249698144749873, + "learning_rate": 9.533637871533996e-06, + "loss": 1.0387, + "step": 2306 + }, + { + "epoch": 0.33, + "grad_norm": 9.251706574487882, + "learning_rate": 9.533150776224547e-06, + "loss": 0.9857, + "step": 2307 + }, + { + "epoch": 0.33, + "grad_norm": 7.101333757065721, + "learning_rate": 9.532663439129454e-06, + "loss": 1.0918, + "step": 2308 + }, + { + "epoch": 0.33, + "grad_norm": 7.0298603906872845, + "learning_rate": 9.53217586027471e-06, + "loss": 1.0025, + "step": 2309 + }, + { + "epoch": 0.33, + "grad_norm": 8.917064980447492, + "learning_rate": 9.531688039686322e-06, + "loss": 1.0104, + "step": 2310 + }, + { + "epoch": 0.33, + "grad_norm": 7.230019144247643, + "learning_rate": 9.531199977390308e-06, + "loss": 1.0801, + "step": 2311 + }, + { + "epoch": 0.33, + "grad_norm": 9.330957147142353, + "learning_rate": 9.530711673412698e-06, + "loss": 1.0372, + "step": 2312 + }, + { + "epoch": 0.33, + "grad_norm": 7.5932823355359105, + "learning_rate": 9.53022312777954e-06, + "loss": 1.0245, + "step": 2313 + }, + { + "epoch": 0.33, + "grad_norm": 8.96197691379599, + "learning_rate": 9.52973434051689e-06, + "loss": 1.0243, + "step": 2314 + }, + { + "epoch": 0.33, + "grad_norm": 9.287345759641198, + "learning_rate": 9.52924531165082e-06, + "loss": 1.041, + "step": 2315 + }, + { + "epoch": 0.33, + "grad_norm": 8.89646689190675, + "learning_rate": 9.52875604120741e-06, + "loss": 0.9999, + "step": 2316 + }, + { + 
"epoch": 0.33, + "grad_norm": 7.772474182997783, + "learning_rate": 9.52826652921276e-06, + "loss": 1.0616, + "step": 2317 + }, + { + "epoch": 0.33, + "grad_norm": 8.089539822185815, + "learning_rate": 9.527776775692978e-06, + "loss": 1.0701, + "step": 2318 + }, + { + "epoch": 0.33, + "grad_norm": 6.427073728263865, + "learning_rate": 9.527286780674185e-06, + "loss": 1.1045, + "step": 2319 + }, + { + "epoch": 0.33, + "grad_norm": 8.445833154218667, + "learning_rate": 9.526796544182518e-06, + "loss": 0.9873, + "step": 2320 + }, + { + "epoch": 0.33, + "grad_norm": 8.39099058626755, + "learning_rate": 9.526306066244121e-06, + "loss": 1.085, + "step": 2321 + }, + { + "epoch": 0.33, + "grad_norm": 9.931320351385004, + "learning_rate": 9.525815346885161e-06, + "loss": 1.0387, + "step": 2322 + }, + { + "epoch": 0.33, + "grad_norm": 10.592138623697581, + "learning_rate": 9.525324386131806e-06, + "loss": 1.076, + "step": 2323 + }, + { + "epoch": 0.33, + "grad_norm": 8.464592456904, + "learning_rate": 9.524833184010246e-06, + "loss": 1.0698, + "step": 2324 + }, + { + "epoch": 0.33, + "grad_norm": 9.374503160135049, + "learning_rate": 9.524341740546679e-06, + "loss": 1.0167, + "step": 2325 + }, + { + "epoch": 0.33, + "grad_norm": 7.9347030040943185, + "learning_rate": 9.523850055767316e-06, + "loss": 1.0888, + "step": 2326 + }, + { + "epoch": 0.33, + "grad_norm": 7.855425173207875, + "learning_rate": 9.523358129698384e-06, + "loss": 1.0435, + "step": 2327 + }, + { + "epoch": 0.33, + "grad_norm": 7.095238482475478, + "learning_rate": 9.52286596236612e-06, + "loss": 1.0215, + "step": 2328 + }, + { + "epoch": 0.33, + "grad_norm": 7.792251592098202, + "learning_rate": 9.522373553796775e-06, + "loss": 1.093, + "step": 2329 + }, + { + "epoch": 0.33, + "grad_norm": 6.808661193125151, + "learning_rate": 9.52188090401661e-06, + "loss": 1.0497, + "step": 2330 + }, + { + "epoch": 0.33, + "grad_norm": 10.526627875695116, + "learning_rate": 9.52138801305191e-06, + "loss": 1.0872, + 
"step": 2331 + }, + { + "epoch": 0.33, + "grad_norm": 7.73875385294479, + "learning_rate": 9.520894880928955e-06, + "loss": 1.0111, + "step": 2332 + }, + { + "epoch": 0.33, + "grad_norm": 7.520483659700958, + "learning_rate": 9.520401507674053e-06, + "loss": 0.9538, + "step": 2333 + }, + { + "epoch": 0.33, + "grad_norm": 10.414661424108491, + "learning_rate": 9.519907893313516e-06, + "loss": 1.07, + "step": 2334 + }, + { + "epoch": 0.33, + "grad_norm": 8.033899302083071, + "learning_rate": 9.519414037873673e-06, + "loss": 0.9903, + "step": 2335 + }, + { + "epoch": 0.33, + "grad_norm": 6.61754811442501, + "learning_rate": 9.518919941380864e-06, + "loss": 1.0397, + "step": 2336 + }, + { + "epoch": 0.33, + "grad_norm": 11.20370887419738, + "learning_rate": 9.518425603861445e-06, + "loss": 1.068, + "step": 2337 + }, + { + "epoch": 0.33, + "grad_norm": 6.805510386853588, + "learning_rate": 9.517931025341783e-06, + "loss": 1.0644, + "step": 2338 + }, + { + "epoch": 0.33, + "grad_norm": 6.903496855110971, + "learning_rate": 9.517436205848254e-06, + "loss": 1.0864, + "step": 2339 + }, + { + "epoch": 0.33, + "grad_norm": 8.13946519884566, + "learning_rate": 9.516941145407252e-06, + "loss": 1.0723, + "step": 2340 + }, + { + "epoch": 0.33, + "grad_norm": 8.922178130164253, + "learning_rate": 9.516445844045183e-06, + "loss": 1.0437, + "step": 2341 + }, + { + "epoch": 0.33, + "grad_norm": 9.719039406846255, + "learning_rate": 9.515950301788464e-06, + "loss": 1.0587, + "step": 2342 + }, + { + "epoch": 0.33, + "grad_norm": 11.142463465825443, + "learning_rate": 9.515454518663527e-06, + "loss": 0.972, + "step": 2343 + }, + { + "epoch": 0.33, + "grad_norm": 6.039098618745283, + "learning_rate": 9.514958494696814e-06, + "loss": 1.1298, + "step": 2344 + }, + { + "epoch": 0.33, + "grad_norm": 7.677304396674528, + "learning_rate": 9.514462229914782e-06, + "loss": 0.9998, + "step": 2345 + }, + { + "epoch": 0.33, + "grad_norm": 8.054185665354234, + "learning_rate": 9.513965724343901e-06, 
+ "loss": 1.0901, + "step": 2346 + }, + { + "epoch": 0.33, + "grad_norm": 8.6351290396527, + "learning_rate": 9.513468978010654e-06, + "loss": 1.0439, + "step": 2347 + }, + { + "epoch": 0.33, + "grad_norm": 6.822293152572387, + "learning_rate": 9.512971990941534e-06, + "loss": 1.018, + "step": 2348 + }, + { + "epoch": 0.33, + "grad_norm": 9.624104822565638, + "learning_rate": 9.512474763163051e-06, + "loss": 1.0731, + "step": 2349 + }, + { + "epoch": 0.34, + "grad_norm": 6.7942685400457625, + "learning_rate": 9.511977294701724e-06, + "loss": 1.0474, + "step": 2350 + }, + { + "epoch": 0.34, + "grad_norm": 7.660062404009196, + "learning_rate": 9.511479585584086e-06, + "loss": 1.0943, + "step": 2351 + }, + { + "epoch": 0.34, + "grad_norm": 8.338619458658906, + "learning_rate": 9.510981635836687e-06, + "loss": 1.0126, + "step": 2352 + }, + { + "epoch": 0.34, + "grad_norm": 9.373283611940568, + "learning_rate": 9.510483445486082e-06, + "loss": 0.9965, + "step": 2353 + }, + { + "epoch": 0.34, + "grad_norm": 7.807759457592766, + "learning_rate": 9.509985014558847e-06, + "loss": 1.054, + "step": 2354 + }, + { + "epoch": 0.34, + "grad_norm": 7.379519998469399, + "learning_rate": 9.509486343081564e-06, + "loss": 1.1422, + "step": 2355 + }, + { + "epoch": 0.34, + "grad_norm": 8.990444853273551, + "learning_rate": 9.508987431080831e-06, + "loss": 1.0983, + "step": 2356 + }, + { + "epoch": 0.34, + "grad_norm": 5.872748675421672, + "learning_rate": 9.508488278583259e-06, + "loss": 1.0365, + "step": 2357 + }, + { + "epoch": 0.34, + "grad_norm": 9.920397777187508, + "learning_rate": 9.507988885615474e-06, + "loss": 0.9998, + "step": 2358 + }, + { + "epoch": 0.34, + "grad_norm": 9.24634814392762, + "learning_rate": 9.507489252204107e-06, + "loss": 1.0651, + "step": 2359 + }, + { + "epoch": 0.34, + "grad_norm": 6.693820818853802, + "learning_rate": 9.506989378375812e-06, + "loss": 0.9933, + "step": 2360 + }, + { + "epoch": 0.34, + "grad_norm": 11.724269103046808, + "learning_rate": 
9.506489264157248e-06, + "loss": 1.0836, + "step": 2361 + }, + { + "epoch": 0.34, + "grad_norm": 7.354555634989588, + "learning_rate": 9.505988909575091e-06, + "loss": 1.1081, + "step": 2362 + }, + { + "epoch": 0.34, + "grad_norm": 7.6788169583916295, + "learning_rate": 9.505488314656027e-06, + "loss": 0.9916, + "step": 2363 + }, + { + "epoch": 0.34, + "grad_norm": 7.918439153232494, + "learning_rate": 9.50498747942676e-06, + "loss": 1.0401, + "step": 2364 + }, + { + "epoch": 0.34, + "grad_norm": 6.322413116584273, + "learning_rate": 9.504486403913997e-06, + "loss": 1.0187, + "step": 2365 + }, + { + "epoch": 0.34, + "grad_norm": 7.481555862871005, + "learning_rate": 9.50398508814447e-06, + "loss": 1.0171, + "step": 2366 + }, + { + "epoch": 0.34, + "grad_norm": 7.048147355570595, + "learning_rate": 9.503483532144915e-06, + "loss": 1.0777, + "step": 2367 + }, + { + "epoch": 0.34, + "grad_norm": 6.866215673952513, + "learning_rate": 9.502981735942083e-06, + "loss": 1.1099, + "step": 2368 + }, + { + "epoch": 0.34, + "grad_norm": 6.398183564909931, + "learning_rate": 9.50247969956274e-06, + "loss": 1.1091, + "step": 2369 + }, + { + "epoch": 0.34, + "grad_norm": 9.484515526470778, + "learning_rate": 9.501977423033662e-06, + "loss": 1.0441, + "step": 2370 + }, + { + "epoch": 0.34, + "grad_norm": 6.018276502228533, + "learning_rate": 9.50147490638164e-06, + "loss": 1.0163, + "step": 2371 + }, + { + "epoch": 0.34, + "grad_norm": 7.922006641620337, + "learning_rate": 9.500972149633477e-06, + "loss": 1.0663, + "step": 2372 + }, + { + "epoch": 0.34, + "grad_norm": 5.71102425154715, + "learning_rate": 9.500469152815987e-06, + "loss": 1.0406, + "step": 2373 + }, + { + "epoch": 0.34, + "grad_norm": 7.05057184167281, + "learning_rate": 9.499965915956001e-06, + "loss": 1.0441, + "step": 2374 + }, + { + "epoch": 0.34, + "grad_norm": 6.2212272536366475, + "learning_rate": 9.499462439080357e-06, + "loss": 1.1035, + "step": 2375 + }, + { + "epoch": 0.34, + "grad_norm": 
6.89624119491981, + "learning_rate": 9.498958722215914e-06, + "loss": 1.0936, + "step": 2376 + }, + { + "epoch": 0.34, + "grad_norm": 7.420143337043775, + "learning_rate": 9.498454765389532e-06, + "loss": 1.1103, + "step": 2377 + }, + { + "epoch": 0.34, + "grad_norm": 7.669957677864721, + "learning_rate": 9.497950568628096e-06, + "loss": 1.0816, + "step": 2378 + }, + { + "epoch": 0.34, + "grad_norm": 11.006800189536415, + "learning_rate": 9.497446131958496e-06, + "loss": 1.0793, + "step": 2379 + }, + { + "epoch": 0.34, + "grad_norm": 9.661661607502703, + "learning_rate": 9.49694145540764e-06, + "loss": 0.9453, + "step": 2380 + }, + { + "epoch": 0.34, + "grad_norm": 6.414608907125888, + "learning_rate": 9.496436539002443e-06, + "loss": 1.0526, + "step": 2381 + }, + { + "epoch": 0.34, + "grad_norm": 8.377515363514085, + "learning_rate": 9.495931382769837e-06, + "loss": 1.0669, + "step": 2382 + }, + { + "epoch": 0.34, + "grad_norm": 7.336443238975458, + "learning_rate": 9.495425986736763e-06, + "loss": 1.1261, + "step": 2383 + }, + { + "epoch": 0.34, + "grad_norm": 8.045596424899399, + "learning_rate": 9.494920350930184e-06, + "loss": 1.0215, + "step": 2384 + }, + { + "epoch": 0.34, + "grad_norm": 9.522629733295513, + "learning_rate": 9.494414475377062e-06, + "loss": 1.0468, + "step": 2385 + }, + { + "epoch": 0.34, + "grad_norm": 7.9188540027075245, + "learning_rate": 9.493908360104385e-06, + "loss": 1.0509, + "step": 2386 + }, + { + "epoch": 0.34, + "grad_norm": 9.404878793501766, + "learning_rate": 9.493402005139142e-06, + "loss": 1.0052, + "step": 2387 + }, + { + "epoch": 0.34, + "grad_norm": 6.673105670737775, + "learning_rate": 9.492895410508346e-06, + "loss": 1.0343, + "step": 2388 + }, + { + "epoch": 0.34, + "grad_norm": 9.225332033809787, + "learning_rate": 9.492388576239014e-06, + "loss": 1.0131, + "step": 2389 + }, + { + "epoch": 0.34, + "grad_norm": 9.380528977347023, + "learning_rate": 9.49188150235818e-06, + "loss": 1.0276, + "step": 2390 + }, + { + 
"epoch": 0.34, + "grad_norm": 9.989068673674808, + "learning_rate": 9.491374188892888e-06, + "loss": 1.163, + "step": 2391 + }, + { + "epoch": 0.34, + "grad_norm": 8.597816575291175, + "learning_rate": 9.490866635870201e-06, + "loss": 1.1249, + "step": 2392 + }, + { + "epoch": 0.34, + "grad_norm": 7.779076415477766, + "learning_rate": 9.490358843317188e-06, + "loss": 1.0054, + "step": 2393 + }, + { + "epoch": 0.34, + "grad_norm": 7.686187654286731, + "learning_rate": 9.489850811260932e-06, + "loss": 1.0066, + "step": 2394 + }, + { + "epoch": 0.34, + "grad_norm": 10.074048566364178, + "learning_rate": 9.489342539728532e-06, + "loss": 1.046, + "step": 2395 + }, + { + "epoch": 0.34, + "grad_norm": 7.785725872455818, + "learning_rate": 9.488834028747098e-06, + "loss": 1.0891, + "step": 2396 + }, + { + "epoch": 0.34, + "grad_norm": 7.814441978044528, + "learning_rate": 9.488325278343751e-06, + "loss": 1.1304, + "step": 2397 + }, + { + "epoch": 0.34, + "grad_norm": 8.321559911265076, + "learning_rate": 9.487816288545625e-06, + "loss": 1.0619, + "step": 2398 + }, + { + "epoch": 0.34, + "grad_norm": 8.504672944878786, + "learning_rate": 9.487307059379873e-06, + "loss": 1.0817, + "step": 2399 + }, + { + "epoch": 0.34, + "grad_norm": 10.451993195662764, + "learning_rate": 9.486797590873652e-06, + "loss": 1.0366, + "step": 2400 + }, + { + "epoch": 0.34, + "grad_norm": 6.949679637256395, + "learning_rate": 9.486287883054137e-06, + "loss": 1.0945, + "step": 2401 + }, + { + "epoch": 0.34, + "grad_norm": 7.742373638955562, + "learning_rate": 9.485777935948512e-06, + "loss": 1.03, + "step": 2402 + }, + { + "epoch": 0.34, + "grad_norm": 7.156546394551169, + "learning_rate": 9.48526774958398e-06, + "loss": 1.0602, + "step": 2403 + }, + { + "epoch": 0.34, + "grad_norm": 6.398609599210751, + "learning_rate": 9.484757323987751e-06, + "loss": 1.0456, + "step": 2404 + }, + { + "epoch": 0.34, + "grad_norm": 9.03440708035918, + "learning_rate": 9.484246659187049e-06, + "loss": 1.1027, + 
"step": 2405 + }, + { + "epoch": 0.34, + "grad_norm": 9.187894088196714, + "learning_rate": 9.483735755209114e-06, + "loss": 1.0396, + "step": 2406 + }, + { + "epoch": 0.34, + "grad_norm": 7.529071967506626, + "learning_rate": 9.483224612081192e-06, + "loss": 1.0901, + "step": 2407 + }, + { + "epoch": 0.34, + "grad_norm": 7.873177033988654, + "learning_rate": 9.482713229830549e-06, + "loss": 0.9977, + "step": 2408 + }, + { + "epoch": 0.34, + "grad_norm": 7.03224109212905, + "learning_rate": 9.482201608484461e-06, + "loss": 1.006, + "step": 2409 + }, + { + "epoch": 0.34, + "grad_norm": 8.042856081487642, + "learning_rate": 9.481689748070215e-06, + "loss": 1.0123, + "step": 2410 + }, + { + "epoch": 0.34, + "grad_norm": 8.223533187995377, + "learning_rate": 9.481177648615112e-06, + "loss": 1.0197, + "step": 2411 + }, + { + "epoch": 0.34, + "grad_norm": 6.8790642366515335, + "learning_rate": 9.480665310146467e-06, + "loss": 0.9814, + "step": 2412 + }, + { + "epoch": 0.34, + "grad_norm": 8.058145337083317, + "learning_rate": 9.480152732691607e-06, + "loss": 1.0451, + "step": 2413 + }, + { + "epoch": 0.34, + "grad_norm": 5.499421392283852, + "learning_rate": 9.47963991627787e-06, + "loss": 1.064, + "step": 2414 + }, + { + "epoch": 0.34, + "grad_norm": 7.086009865846564, + "learning_rate": 9.479126860932609e-06, + "loss": 1.0821, + "step": 2415 + }, + { + "epoch": 0.34, + "grad_norm": 5.143393481015037, + "learning_rate": 9.47861356668319e-06, + "loss": 1.0235, + "step": 2416 + }, + { + "epoch": 0.34, + "grad_norm": 6.633070916493558, + "learning_rate": 9.478100033556989e-06, + "loss": 1.0914, + "step": 2417 + }, + { + "epoch": 0.34, + "grad_norm": 7.115455630194129, + "learning_rate": 9.477586261581398e-06, + "loss": 1.0532, + "step": 2418 + }, + { + "epoch": 0.34, + "grad_norm": 6.613047874299119, + "learning_rate": 9.477072250783817e-06, + "loss": 1.0694, + "step": 2419 + }, + { + "epoch": 0.35, + "grad_norm": 6.949816294938999, + "learning_rate": 
9.476558001191665e-06, + "loss": 1.1271, + "step": 2420 + }, + { + "epoch": 0.35, + "grad_norm": 8.09186640081472, + "learning_rate": 9.47604351283237e-06, + "loss": 1.0007, + "step": 2421 + }, + { + "epoch": 0.35, + "grad_norm": 9.74635486451282, + "learning_rate": 9.475528785733374e-06, + "loss": 1.0964, + "step": 2422 + }, + { + "epoch": 0.35, + "grad_norm": 8.041739093256759, + "learning_rate": 9.47501381992213e-06, + "loss": 1.1025, + "step": 2423 + }, + { + "epoch": 0.35, + "grad_norm": 10.538558700912471, + "learning_rate": 9.474498615426106e-06, + "loss": 1.0766, + "step": 2424 + }, + { + "epoch": 0.35, + "grad_norm": 12.392812698944896, + "learning_rate": 9.473983172272777e-06, + "loss": 1.1115, + "step": 2425 + }, + { + "epoch": 0.35, + "grad_norm": 8.565101739578719, + "learning_rate": 9.473467490489642e-06, + "loss": 1.1541, + "step": 2426 + }, + { + "epoch": 0.35, + "grad_norm": 8.079685991892555, + "learning_rate": 9.472951570104204e-06, + "loss": 1.0776, + "step": 2427 + }, + { + "epoch": 0.35, + "grad_norm": 8.915904365113814, + "learning_rate": 9.472435411143979e-06, + "loss": 1.1253, + "step": 2428 + }, + { + "epoch": 0.35, + "grad_norm": 9.539561092057802, + "learning_rate": 9.471919013636497e-06, + "loss": 1.0079, + "step": 2429 + }, + { + "epoch": 0.35, + "grad_norm": 9.62787495216152, + "learning_rate": 9.471402377609305e-06, + "loss": 1.0995, + "step": 2430 + }, + { + "epoch": 0.35, + "grad_norm": 6.453353786839894, + "learning_rate": 9.470885503089952e-06, + "loss": 1.0197, + "step": 2431 + }, + { + "epoch": 0.35, + "grad_norm": 7.212221710901317, + "learning_rate": 9.470368390106015e-06, + "loss": 1.0287, + "step": 2432 + }, + { + "epoch": 0.35, + "grad_norm": 7.720081705881015, + "learning_rate": 9.46985103868507e-06, + "loss": 1.0499, + "step": 2433 + }, + { + "epoch": 0.35, + "grad_norm": 7.566512852740881, + "learning_rate": 9.469333448854713e-06, + "loss": 1.0203, + "step": 2434 + }, + { + "epoch": 0.35, + "grad_norm": 
7.637772570459396, + "learning_rate": 9.46881562064255e-06, + "loss": 1.0052, + "step": 2435 + }, + { + "epoch": 0.35, + "grad_norm": 7.366945673547932, + "learning_rate": 9.4682975540762e-06, + "loss": 1.1659, + "step": 2436 + }, + { + "epoch": 0.35, + "grad_norm": 8.177826875760706, + "learning_rate": 9.467779249183297e-06, + "loss": 1.0227, + "step": 2437 + }, + { + "epoch": 0.35, + "grad_norm": 10.526193809719338, + "learning_rate": 9.467260705991486e-06, + "loss": 1.009, + "step": 2438 + }, + { + "epoch": 0.35, + "grad_norm": 7.0511443022445555, + "learning_rate": 9.466741924528423e-06, + "loss": 1.0252, + "step": 2439 + }, + { + "epoch": 0.35, + "grad_norm": 7.2415227693373385, + "learning_rate": 9.466222904821778e-06, + "loss": 0.9892, + "step": 2440 + }, + { + "epoch": 0.35, + "grad_norm": 8.239709417558212, + "learning_rate": 9.465703646899234e-06, + "loss": 1.0839, + "step": 2441 + }, + { + "epoch": 0.35, + "grad_norm": 8.09119938687991, + "learning_rate": 9.465184150788492e-06, + "loss": 1.0334, + "step": 2442 + }, + { + "epoch": 0.35, + "grad_norm": 6.677555410361969, + "learning_rate": 9.464664416517253e-06, + "loss": 1.0167, + "step": 2443 + }, + { + "epoch": 0.35, + "grad_norm": 7.074163532494952, + "learning_rate": 9.464144444113242e-06, + "loss": 1.0465, + "step": 2444 + }, + { + "epoch": 0.35, + "grad_norm": 9.015617945982513, + "learning_rate": 9.463624233604192e-06, + "loss": 1.141, + "step": 2445 + }, + { + "epoch": 0.35, + "grad_norm": 7.639530829125731, + "learning_rate": 9.463103785017851e-06, + "loss": 0.9866, + "step": 2446 + }, + { + "epoch": 0.35, + "grad_norm": 7.527621631622783, + "learning_rate": 9.462583098381977e-06, + "loss": 1.0376, + "step": 2447 + }, + { + "epoch": 0.35, + "grad_norm": 7.977327629276228, + "learning_rate": 9.462062173724343e-06, + "loss": 1.0843, + "step": 2448 + }, + { + "epoch": 0.35, + "grad_norm": 10.239581367197182, + "learning_rate": 9.46154101107273e-06, + "loss": 1.0974, + "step": 2449 + }, + { + 
"epoch": 0.35, + "grad_norm": 7.346269795039807, + "learning_rate": 9.461019610454941e-06, + "loss": 1.0953, + "step": 2450 + }, + { + "epoch": 0.35, + "grad_norm": 9.177324357654253, + "learning_rate": 9.460497971898782e-06, + "loss": 0.9782, + "step": 2451 + }, + { + "epoch": 0.35, + "grad_norm": 9.915068396078551, + "learning_rate": 9.459976095432077e-06, + "loss": 1.0177, + "step": 2452 + }, + { + "epoch": 0.35, + "grad_norm": 7.648212309697665, + "learning_rate": 9.459453981082663e-06, + "loss": 1.0169, + "step": 2453 + }, + { + "epoch": 0.35, + "grad_norm": 7.775477642420972, + "learning_rate": 9.458931628878385e-06, + "loss": 1.0146, + "step": 2454 + }, + { + "epoch": 0.35, + "grad_norm": 7.806126785093311, + "learning_rate": 9.458409038847107e-06, + "loss": 1.0833, + "step": 2455 + }, + { + "epoch": 0.35, + "grad_norm": 10.135807817135493, + "learning_rate": 9.4578862110167e-06, + "loss": 1.061, + "step": 2456 + }, + { + "epoch": 0.35, + "grad_norm": 8.302868782645444, + "learning_rate": 9.457363145415051e-06, + "loss": 1.0645, + "step": 2457 + }, + { + "epoch": 0.35, + "grad_norm": 9.339857461918808, + "learning_rate": 9.456839842070058e-06, + "loss": 1.108, + "step": 2458 + }, + { + "epoch": 0.35, + "grad_norm": 8.018222161478024, + "learning_rate": 9.456316301009636e-06, + "loss": 1.1134, + "step": 2459 + }, + { + "epoch": 0.35, + "grad_norm": 7.281300469874181, + "learning_rate": 9.455792522261706e-06, + "loss": 1.0923, + "step": 2460 + }, + { + "epoch": 0.35, + "grad_norm": 9.098055094720687, + "learning_rate": 9.455268505854205e-06, + "loss": 1.0483, + "step": 2461 + }, + { + "epoch": 0.35, + "grad_norm": 6.566455074789505, + "learning_rate": 9.454744251815082e-06, + "loss": 1.0159, + "step": 2462 + }, + { + "epoch": 0.35, + "grad_norm": 6.99857876201228, + "learning_rate": 9.454219760172302e-06, + "loss": 1.0835, + "step": 2463 + }, + { + "epoch": 0.35, + "grad_norm": 8.838612522634183, + "learning_rate": 9.453695030953838e-06, + "loss": 1.0279, + 
"step": 2464 + }, + { + "epoch": 0.35, + "grad_norm": 8.465303694335514, + "learning_rate": 9.453170064187678e-06, + "loss": 1.0366, + "step": 2465 + }, + { + "epoch": 0.35, + "grad_norm": 8.623605435492822, + "learning_rate": 9.452644859901823e-06, + "loss": 1.1029, + "step": 2466 + }, + { + "epoch": 0.35, + "grad_norm": 6.142531145330086, + "learning_rate": 9.452119418124283e-06, + "loss": 1.0968, + "step": 2467 + }, + { + "epoch": 0.35, + "grad_norm": 10.184243908125472, + "learning_rate": 9.45159373888309e-06, + "loss": 1.0071, + "step": 2468 + }, + { + "epoch": 0.35, + "grad_norm": 9.234384292035452, + "learning_rate": 9.451067822206274e-06, + "loss": 1.0656, + "step": 2469 + }, + { + "epoch": 0.35, + "grad_norm": 10.770584462314238, + "learning_rate": 9.45054166812189e-06, + "loss": 1.0355, + "step": 2470 + }, + { + "epoch": 0.35, + "grad_norm": 7.484222947963238, + "learning_rate": 9.450015276658005e-06, + "loss": 1.0528, + "step": 2471 + }, + { + "epoch": 0.35, + "grad_norm": 7.727774619403611, + "learning_rate": 9.44948864784269e-06, + "loss": 1.1309, + "step": 2472 + }, + { + "epoch": 0.35, + "grad_norm": 7.511964700216296, + "learning_rate": 9.448961781704036e-06, + "loss": 1.044, + "step": 2473 + }, + { + "epoch": 0.35, + "grad_norm": 12.064780270716877, + "learning_rate": 9.448434678270144e-06, + "loss": 1.105, + "step": 2474 + }, + { + "epoch": 0.35, + "grad_norm": 7.156400483569124, + "learning_rate": 9.447907337569127e-06, + "loss": 0.9907, + "step": 2475 + }, + { + "epoch": 0.35, + "grad_norm": 9.924247987137152, + "learning_rate": 9.447379759629116e-06, + "loss": 0.9941, + "step": 2476 + }, + { + "epoch": 0.35, + "grad_norm": 8.614950749567894, + "learning_rate": 9.446851944478247e-06, + "loss": 0.9747, + "step": 2477 + }, + { + "epoch": 0.35, + "grad_norm": 8.435359883213497, + "learning_rate": 9.446323892144673e-06, + "loss": 1.0493, + "step": 2478 + }, + { + "epoch": 0.35, + "grad_norm": 8.731996670601635, + "learning_rate": 
9.44579560265656e-06, + "loss": 1.012, + "step": 2479 + }, + { + "epoch": 0.35, + "grad_norm": 6.4383477297085365, + "learning_rate": 9.445267076042084e-06, + "loss": 1.0275, + "step": 2480 + }, + { + "epoch": 0.35, + "grad_norm": 9.144554465935112, + "learning_rate": 9.444738312329435e-06, + "loss": 1.0631, + "step": 2481 + }, + { + "epoch": 0.35, + "grad_norm": 8.348295253417007, + "learning_rate": 9.444209311546815e-06, + "loss": 1.0627, + "step": 2482 + }, + { + "epoch": 0.35, + "grad_norm": 6.851707155793982, + "learning_rate": 9.443680073722444e-06, + "loss": 1.0119, + "step": 2483 + }, + { + "epoch": 0.35, + "grad_norm": 8.998191858999405, + "learning_rate": 9.443150598884545e-06, + "loss": 1.007, + "step": 2484 + }, + { + "epoch": 0.35, + "grad_norm": 9.624690970186206, + "learning_rate": 9.442620887061363e-06, + "loss": 1.0859, + "step": 2485 + }, + { + "epoch": 0.35, + "grad_norm": 7.63162912876737, + "learning_rate": 9.442090938281145e-06, + "loss": 1.0811, + "step": 2486 + }, + { + "epoch": 0.35, + "grad_norm": 7.962230442179276, + "learning_rate": 9.441560752572163e-06, + "loss": 1.0497, + "step": 2487 + }, + { + "epoch": 0.35, + "grad_norm": 8.677259230481678, + "learning_rate": 9.441030329962695e-06, + "loss": 1.088, + "step": 2488 + }, + { + "epoch": 0.35, + "grad_norm": 9.054082077266392, + "learning_rate": 9.44049967048103e-06, + "loss": 1.0201, + "step": 2489 + }, + { + "epoch": 0.36, + "grad_norm": 10.017028079184156, + "learning_rate": 9.439968774155473e-06, + "loss": 1.0068, + "step": 2490 + }, + { + "epoch": 0.36, + "grad_norm": 5.974344420740714, + "learning_rate": 9.439437641014339e-06, + "loss": 1.0897, + "step": 2491 + }, + { + "epoch": 0.36, + "grad_norm": 9.702913516162456, + "learning_rate": 9.43890627108596e-06, + "loss": 0.9975, + "step": 2492 + }, + { + "epoch": 0.36, + "grad_norm": 10.698409285354671, + "learning_rate": 9.438374664398674e-06, + "loss": 1.1101, + "step": 2493 + }, + { + "epoch": 0.36, + "grad_norm": 
7.06134665503265, + "learning_rate": 9.43784282098084e-06, + "loss": 1.0692, + "step": 2494 + }, + { + "epoch": 0.36, + "grad_norm": 8.63216346360713, + "learning_rate": 9.437310740860822e-06, + "loss": 1.1397, + "step": 2495 + }, + { + "epoch": 0.36, + "grad_norm": 8.133018377162138, + "learning_rate": 9.436778424067002e-06, + "loss": 1.0523, + "step": 2496 + }, + { + "epoch": 0.36, + "grad_norm": 7.448556009901485, + "learning_rate": 9.436245870627768e-06, + "loss": 1.0529, + "step": 2497 + }, + { + "epoch": 0.36, + "grad_norm": 8.555993096236199, + "learning_rate": 9.43571308057153e-06, + "loss": 1.0555, + "step": 2498 + }, + { + "epoch": 0.36, + "grad_norm": 7.649601458722023, + "learning_rate": 9.435180053926701e-06, + "loss": 1.0419, + "step": 2499 + }, + { + "epoch": 0.36, + "grad_norm": 11.097485011539195, + "learning_rate": 9.434646790721714e-06, + "loss": 1.0553, + "step": 2500 + }, + { + "epoch": 0.36, + "grad_norm": 6.048821849697724, + "learning_rate": 9.434113290985012e-06, + "loss": 1.0811, + "step": 2501 + }, + { + "epoch": 0.36, + "grad_norm": 7.659417908406662, + "learning_rate": 9.43357955474505e-06, + "loss": 1.0101, + "step": 2502 + }, + { + "epoch": 0.36, + "grad_norm": 8.801661366345956, + "learning_rate": 9.433045582030293e-06, + "loss": 1.1019, + "step": 2503 + }, + { + "epoch": 0.36, + "grad_norm": 9.988179680785397, + "learning_rate": 9.432511372869226e-06, + "loss": 1.0251, + "step": 2504 + }, + { + "epoch": 0.36, + "grad_norm": 7.045544380434475, + "learning_rate": 9.43197692729034e-06, + "loss": 1.0257, + "step": 2505 + }, + { + "epoch": 0.36, + "grad_norm": 10.262572575048786, + "learning_rate": 9.431442245322142e-06, + "loss": 1.0539, + "step": 2506 + }, + { + "epoch": 0.36, + "grad_norm": 8.110024954551305, + "learning_rate": 9.430907326993148e-06, + "loss": 1.0793, + "step": 2507 + }, + { + "epoch": 0.36, + "grad_norm": 9.71691101625989, + "learning_rate": 9.430372172331891e-06, + "loss": 0.9399, + "step": 2508 + }, + { + "epoch": 
0.36, + "grad_norm": 7.678275273708303, + "learning_rate": 9.429836781366914e-06, + "loss": 0.9765, + "step": 2509 + }, + { + "epoch": 0.36, + "grad_norm": 10.187970835011242, + "learning_rate": 9.429301154126775e-06, + "loss": 1.0479, + "step": 2510 + }, + { + "epoch": 0.36, + "grad_norm": 8.543567812316605, + "learning_rate": 9.428765290640041e-06, + "loss": 1.0363, + "step": 2511 + }, + { + "epoch": 0.36, + "grad_norm": 9.39669214792208, + "learning_rate": 9.428229190935294e-06, + "loss": 1.1377, + "step": 2512 + }, + { + "epoch": 0.36, + "grad_norm": 8.77934056159523, + "learning_rate": 9.427692855041128e-06, + "loss": 0.9605, + "step": 2513 + }, + { + "epoch": 0.36, + "grad_norm": 9.77864933075892, + "learning_rate": 9.42715628298615e-06, + "loss": 1.0528, + "step": 2514 + }, + { + "epoch": 0.36, + "grad_norm": 8.153033040844017, + "learning_rate": 9.426619474798978e-06, + "loss": 1.0049, + "step": 2515 + }, + { + "epoch": 0.36, + "grad_norm": 7.483504585020186, + "learning_rate": 9.426082430508246e-06, + "loss": 1.1305, + "step": 2516 + }, + { + "epoch": 0.36, + "grad_norm": 12.031149434862524, + "learning_rate": 9.425545150142597e-06, + "loss": 1.062, + "step": 2517 + }, + { + "epoch": 0.36, + "grad_norm": 9.467421716911893, + "learning_rate": 9.425007633730687e-06, + "loss": 1.0487, + "step": 2518 + }, + { + "epoch": 0.36, + "grad_norm": 8.026374818928732, + "learning_rate": 9.424469881301188e-06, + "loss": 1.0746, + "step": 2519 + }, + { + "epoch": 0.36, + "grad_norm": 10.614651797423177, + "learning_rate": 9.423931892882781e-06, + "loss": 0.9765, + "step": 2520 + }, + { + "epoch": 0.36, + "grad_norm": 10.960901260852426, + "learning_rate": 9.42339366850416e-06, + "loss": 1.0854, + "step": 2521 + }, + { + "epoch": 0.36, + "grad_norm": 8.254194586664097, + "learning_rate": 9.422855208194035e-06, + "loss": 1.0836, + "step": 2522 + }, + { + "epoch": 0.36, + "grad_norm": 11.13279542831735, + "learning_rate": 9.422316511981121e-06, + "loss": 1.0146, + "step": 
2523 + }, + { + "epoch": 0.36, + "grad_norm": 7.868245650153167, + "learning_rate": 9.421777579894159e-06, + "loss": 1.0935, + "step": 2524 + }, + { + "epoch": 0.36, + "grad_norm": 7.281314212913462, + "learning_rate": 9.421238411961885e-06, + "loss": 1.0908, + "step": 2525 + }, + { + "epoch": 0.36, + "grad_norm": 7.101855792679677, + "learning_rate": 9.420699008213062e-06, + "loss": 1.0009, + "step": 2526 + }, + { + "epoch": 0.36, + "grad_norm": 8.184063253279605, + "learning_rate": 9.42015936867646e-06, + "loss": 1.1136, + "step": 2527 + }, + { + "epoch": 0.36, + "grad_norm": 9.425850628871896, + "learning_rate": 9.419619493380858e-06, + "loss": 1.027, + "step": 2528 + }, + { + "epoch": 0.36, + "grad_norm": 8.659747715317447, + "learning_rate": 9.419079382355057e-06, + "loss": 1.0203, + "step": 2529 + }, + { + "epoch": 0.36, + "grad_norm": 7.795125976165045, + "learning_rate": 9.418539035627862e-06, + "loss": 1.0285, + "step": 2530 + }, + { + "epoch": 0.36, + "grad_norm": 5.977305178870498, + "learning_rate": 9.417998453228092e-06, + "loss": 1.085, + "step": 2531 + }, + { + "epoch": 0.36, + "grad_norm": 8.507523236635494, + "learning_rate": 9.417457635184584e-06, + "loss": 1.0478, + "step": 2532 + }, + { + "epoch": 0.36, + "grad_norm": 6.356306967533413, + "learning_rate": 9.416916581526182e-06, + "loss": 1.0565, + "step": 2533 + }, + { + "epoch": 0.36, + "grad_norm": 6.223651197854272, + "learning_rate": 9.416375292281743e-06, + "loss": 0.9875, + "step": 2534 + }, + { + "epoch": 0.36, + "grad_norm": 8.095612184093708, + "learning_rate": 9.41583376748014e-06, + "loss": 0.987, + "step": 2535 + }, + { + "epoch": 0.36, + "grad_norm": 8.272492179879439, + "learning_rate": 9.415292007150253e-06, + "loss": 1.0252, + "step": 2536 + }, + { + "epoch": 0.36, + "grad_norm": 6.371754195978913, + "learning_rate": 9.414750011320983e-06, + "loss": 1.1351, + "step": 2537 + }, + { + "epoch": 0.36, + "grad_norm": 7.224527981530823, + "learning_rate": 9.414207780021236e-06, + 
"loss": 0.976, + "step": 2538 + }, + { + "epoch": 0.36, + "grad_norm": 5.203692256810612, + "learning_rate": 9.413665313279932e-06, + "loss": 1.0527, + "step": 2539 + }, + { + "epoch": 0.36, + "grad_norm": 8.700044886210229, + "learning_rate": 9.413122611126006e-06, + "loss": 1.0455, + "step": 2540 + }, + { + "epoch": 0.36, + "grad_norm": 8.06623146196666, + "learning_rate": 9.412579673588404e-06, + "loss": 1.04, + "step": 2541 + }, + { + "epoch": 0.36, + "grad_norm": 8.607717464147578, + "learning_rate": 9.412036500696084e-06, + "loss": 1.0353, + "step": 2542 + }, + { + "epoch": 0.36, + "grad_norm": 8.590455187257483, + "learning_rate": 9.41149309247802e-06, + "loss": 1.0671, + "step": 2543 + }, + { + "epoch": 0.36, + "grad_norm": 8.907922458328105, + "learning_rate": 9.41094944896319e-06, + "loss": 1.0445, + "step": 2544 + }, + { + "epoch": 0.36, + "grad_norm": 9.4690791431151, + "learning_rate": 9.410405570180599e-06, + "loss": 1.0044, + "step": 2545 + }, + { + "epoch": 0.36, + "grad_norm": 8.684189275266245, + "learning_rate": 9.409861456159248e-06, + "loss": 1.0719, + "step": 2546 + }, + { + "epoch": 0.36, + "grad_norm": 7.075233971177194, + "learning_rate": 9.409317106928163e-06, + "loss": 1.0416, + "step": 2547 + }, + { + "epoch": 0.36, + "grad_norm": 7.004240956273251, + "learning_rate": 9.408772522516377e-06, + "loss": 1.0755, + "step": 2548 + }, + { + "epoch": 0.36, + "grad_norm": 8.599571374091859, + "learning_rate": 9.408227702952937e-06, + "loss": 1.0747, + "step": 2549 + }, + { + "epoch": 0.36, + "grad_norm": 7.200744191763406, + "learning_rate": 9.4076826482669e-06, + "loss": 1.1049, + "step": 2550 + }, + { + "epoch": 0.36, + "grad_norm": 10.44384682516799, + "learning_rate": 9.407137358487342e-06, + "loss": 1.0793, + "step": 2551 + }, + { + "epoch": 0.36, + "grad_norm": 7.743579479433129, + "learning_rate": 9.406591833643343e-06, + "loss": 0.9575, + "step": 2552 + }, + { + "epoch": 0.36, + "grad_norm": 7.715271941250298, + "learning_rate": 
9.406046073764002e-06, + "loss": 1.0081, + "step": 2553 + }, + { + "epoch": 0.36, + "grad_norm": 6.831813523023332, + "learning_rate": 9.405500078878427e-06, + "loss": 1.0129, + "step": 2554 + }, + { + "epoch": 0.36, + "grad_norm": 7.258464860962174, + "learning_rate": 9.404953849015742e-06, + "loss": 1.0381, + "step": 2555 + }, + { + "epoch": 0.36, + "grad_norm": 10.020802393168392, + "learning_rate": 9.404407384205078e-06, + "loss": 1.031, + "step": 2556 + }, + { + "epoch": 0.36, + "grad_norm": 7.8428612660071675, + "learning_rate": 9.403860684475585e-06, + "loss": 1.0577, + "step": 2557 + }, + { + "epoch": 0.36, + "grad_norm": 8.757834132391757, + "learning_rate": 9.403313749856422e-06, + "loss": 1.1244, + "step": 2558 + }, + { + "epoch": 0.36, + "grad_norm": 8.65609621198654, + "learning_rate": 9.402766580376758e-06, + "loss": 1.0565, + "step": 2559 + }, + { + "epoch": 0.37, + "grad_norm": 8.017632528065592, + "learning_rate": 9.402219176065783e-06, + "loss": 1.1206, + "step": 2560 + }, + { + "epoch": 0.37, + "grad_norm": 8.167007917302419, + "learning_rate": 9.401671536952689e-06, + "loss": 1.0276, + "step": 2561 + }, + { + "epoch": 0.37, + "grad_norm": 7.950317364867685, + "learning_rate": 9.401123663066687e-06, + "loss": 1.0157, + "step": 2562 + }, + { + "epoch": 0.37, + "grad_norm": 5.840640413504008, + "learning_rate": 9.400575554436999e-06, + "loss": 1.0758, + "step": 2563 + }, + { + "epoch": 0.37, + "grad_norm": 8.45796428607723, + "learning_rate": 9.400027211092862e-06, + "loss": 1.0125, + "step": 2564 + }, + { + "epoch": 0.37, + "grad_norm": 8.241029605871926, + "learning_rate": 9.39947863306352e-06, + "loss": 1.0306, + "step": 2565 + }, + { + "epoch": 0.37, + "grad_norm": 8.751724697839267, + "learning_rate": 9.398929820378233e-06, + "loss": 1.0149, + "step": 2566 + }, + { + "epoch": 0.37, + "grad_norm": 8.299546396336286, + "learning_rate": 9.398380773066276e-06, + "loss": 1.0271, + "step": 2567 + }, + { + "epoch": 0.37, + "grad_norm": 
8.450235047364623, + "learning_rate": 9.39783149115693e-06, + "loss": 1.0598, + "step": 2568 + }, + { + "epoch": 0.37, + "grad_norm": 6.284108988490921, + "learning_rate": 9.397281974679493e-06, + "loss": 0.9837, + "step": 2569 + }, + { + "epoch": 0.37, + "grad_norm": 8.491419533309703, + "learning_rate": 9.396732223663277e-06, + "loss": 0.9112, + "step": 2570 + }, + { + "epoch": 0.37, + "grad_norm": 10.671208275378904, + "learning_rate": 9.396182238137601e-06, + "loss": 1.1049, + "step": 2571 + }, + { + "epoch": 0.37, + "grad_norm": 7.784191864814369, + "learning_rate": 9.395632018131802e-06, + "loss": 1.0479, + "step": 2572 + }, + { + "epoch": 0.37, + "grad_norm": 7.390062837551184, + "learning_rate": 9.395081563675226e-06, + "loss": 1.062, + "step": 2573 + }, + { + "epoch": 0.37, + "grad_norm": 5.1546819478744865, + "learning_rate": 9.394530874797234e-06, + "loss": 1.0439, + "step": 2574 + }, + { + "epoch": 0.37, + "grad_norm": 6.988010553869544, + "learning_rate": 9.393979951527198e-06, + "loss": 1.1079, + "step": 2575 + }, + { + "epoch": 0.37, + "grad_norm": 9.219692939282428, + "learning_rate": 9.393428793894502e-06, + "loss": 0.987, + "step": 2576 + }, + { + "epoch": 0.37, + "grad_norm": 11.520947475264075, + "learning_rate": 9.392877401928541e-06, + "loss": 1.0807, + "step": 2577 + }, + { + "epoch": 0.37, + "grad_norm": 9.882180825190957, + "learning_rate": 9.392325775658729e-06, + "loss": 0.9936, + "step": 2578 + }, + { + "epoch": 0.37, + "grad_norm": 10.091580626472375, + "learning_rate": 9.391773915114486e-06, + "loss": 1.0334, + "step": 2579 + }, + { + "epoch": 0.37, + "grad_norm": 9.145686680229197, + "learning_rate": 9.391221820325246e-06, + "loss": 0.998, + "step": 2580 + }, + { + "epoch": 0.37, + "grad_norm": 8.174608410508952, + "learning_rate": 9.390669491320458e-06, + "loss": 1.1172, + "step": 2581 + }, + { + "epoch": 0.37, + "grad_norm": 7.589580284139084, + "learning_rate": 9.390116928129583e-06, + "loss": 1.0599, + "step": 2582 + }, + { + 
"epoch": 0.37, + "grad_norm": 9.75705756900486, + "learning_rate": 9.389564130782088e-06, + "loss": 1.0388, + "step": 2583 + }, + { + "epoch": 0.37, + "grad_norm": 9.176210467947431, + "learning_rate": 9.389011099307462e-06, + "loss": 1.0397, + "step": 2584 + }, + { + "epoch": 0.37, + "grad_norm": 9.491011154954291, + "learning_rate": 9.3884578337352e-06, + "loss": 1.1156, + "step": 2585 + }, + { + "epoch": 0.37, + "grad_norm": 7.189735101499042, + "learning_rate": 9.387904334094814e-06, + "loss": 1.0602, + "step": 2586 + }, + { + "epoch": 0.37, + "grad_norm": 7.25414298273057, + "learning_rate": 9.387350600415823e-06, + "loss": 1.0046, + "step": 2587 + }, + { + "epoch": 0.37, + "grad_norm": 10.181995740776687, + "learning_rate": 9.386796632727764e-06, + "loss": 1.0858, + "step": 2588 + }, + { + "epoch": 0.37, + "grad_norm": 9.270913791176909, + "learning_rate": 9.386242431060183e-06, + "loss": 1.092, + "step": 2589 + }, + { + "epoch": 0.37, + "grad_norm": 9.177518257087067, + "learning_rate": 9.385687995442639e-06, + "loss": 1.1323, + "step": 2590 + }, + { + "epoch": 0.37, + "grad_norm": 6.626247627652941, + "learning_rate": 9.385133325904708e-06, + "loss": 1.0556, + "step": 2591 + }, + { + "epoch": 0.37, + "grad_norm": 6.703122460028861, + "learning_rate": 9.384578422475968e-06, + "loss": 1.107, + "step": 2592 + }, + { + "epoch": 0.37, + "grad_norm": 6.95179470072993, + "learning_rate": 9.384023285186022e-06, + "loss": 1.0438, + "step": 2593 + }, + { + "epoch": 0.37, + "grad_norm": 7.229848893489577, + "learning_rate": 9.383467914064474e-06, + "loss": 1.0672, + "step": 2594 + }, + { + "epoch": 0.37, + "grad_norm": 10.235632484667935, + "learning_rate": 9.382912309140953e-06, + "loss": 1.096, + "step": 2595 + }, + { + "epoch": 0.37, + "grad_norm": 8.423212448059722, + "learning_rate": 9.382356470445087e-06, + "loss": 1.0779, + "step": 2596 + }, + { + "epoch": 0.37, + "grad_norm": 8.016984053305894, + "learning_rate": 9.381800398006525e-06, + "loss": 1.1621, + 
"step": 2597 + }, + { + "epoch": 0.37, + "grad_norm": 8.022411640679517, + "learning_rate": 9.381244091854927e-06, + "loss": 1.0133, + "step": 2598 + }, + { + "epoch": 0.37, + "grad_norm": 8.448558583809747, + "learning_rate": 9.380687552019965e-06, + "loss": 1.0409, + "step": 2599 + }, + { + "epoch": 0.37, + "grad_norm": 9.875988237630251, + "learning_rate": 9.380130778531322e-06, + "loss": 1.0715, + "step": 2600 + }, + { + "epoch": 0.37, + "grad_norm": 6.4422897480158525, + "learning_rate": 9.379573771418696e-06, + "loss": 1.0584, + "step": 2601 + }, + { + "epoch": 0.37, + "grad_norm": 10.980274003062494, + "learning_rate": 9.379016530711795e-06, + "loss": 1.0171, + "step": 2602 + }, + { + "epoch": 0.37, + "grad_norm": 8.31251138361806, + "learning_rate": 9.378459056440343e-06, + "loss": 1.0802, + "step": 2603 + }, + { + "epoch": 0.37, + "grad_norm": 8.405050897139754, + "learning_rate": 9.37790134863407e-06, + "loss": 1.0657, + "step": 2604 + }, + { + "epoch": 0.37, + "grad_norm": 8.516340320801168, + "learning_rate": 9.377343407322727e-06, + "loss": 1.0936, + "step": 2605 + }, + { + "epoch": 0.37, + "grad_norm": 9.529126096383056, + "learning_rate": 9.37678523253607e-06, + "loss": 1.0175, + "step": 2606 + }, + { + "epoch": 0.37, + "grad_norm": 6.151785792649855, + "learning_rate": 9.376226824303871e-06, + "loss": 1.0705, + "step": 2607 + }, + { + "epoch": 0.37, + "grad_norm": 8.0528711234107, + "learning_rate": 9.375668182655918e-06, + "loss": 0.9918, + "step": 2608 + }, + { + "epoch": 0.37, + "grad_norm": 7.898780070996434, + "learning_rate": 9.375109307622e-06, + "loss": 1.0254, + "step": 2609 + }, + { + "epoch": 0.37, + "grad_norm": 6.955551300906097, + "learning_rate": 9.37455019923193e-06, + "loss": 0.9361, + "step": 2610 + }, + { + "epoch": 0.37, + "grad_norm": 8.688774014391436, + "learning_rate": 9.37399085751553e-06, + "loss": 0.9896, + "step": 2611 + }, + { + "epoch": 0.37, + "grad_norm": 10.762194415837465, + "learning_rate": 9.373431282502632e-06, + 
"loss": 1.1037, + "step": 2612 + }, + { + "epoch": 0.37, + "grad_norm": 12.00808156167861, + "learning_rate": 9.372871474223085e-06, + "loss": 1.0948, + "step": 2613 + }, + { + "epoch": 0.37, + "grad_norm": 7.822708030465508, + "learning_rate": 9.372311432706744e-06, + "loss": 1.0169, + "step": 2614 + }, + { + "epoch": 0.37, + "grad_norm": 6.921085046395318, + "learning_rate": 9.371751157983483e-06, + "loss": 1.0196, + "step": 2615 + }, + { + "epoch": 0.37, + "grad_norm": 5.8229821340849615, + "learning_rate": 9.371190650083181e-06, + "loss": 1.0661, + "step": 2616 + }, + { + "epoch": 0.37, + "grad_norm": 8.35698220789519, + "learning_rate": 9.37062990903574e-06, + "loss": 1.0744, + "step": 2617 + }, + { + "epoch": 0.37, + "grad_norm": 7.036624399976806, + "learning_rate": 9.370068934871065e-06, + "loss": 1.0173, + "step": 2618 + }, + { + "epoch": 0.37, + "grad_norm": 6.590892849522806, + "learning_rate": 9.369507727619076e-06, + "loss": 0.9942, + "step": 2619 + }, + { + "epoch": 0.37, + "grad_norm": 8.878335060678594, + "learning_rate": 9.368946287309707e-06, + "loss": 1.0015, + "step": 2620 + }, + { + "epoch": 0.37, + "grad_norm": 6.858001561496145, + "learning_rate": 9.368384613972905e-06, + "loss": 1.0965, + "step": 2621 + }, + { + "epoch": 0.37, + "grad_norm": 10.437505613341756, + "learning_rate": 9.367822707638628e-06, + "loss": 0.9819, + "step": 2622 + }, + { + "epoch": 0.37, + "grad_norm": 7.072730169677802, + "learning_rate": 9.367260568336844e-06, + "loss": 1.0335, + "step": 2623 + }, + { + "epoch": 0.37, + "grad_norm": 8.138922468736643, + "learning_rate": 9.366698196097539e-06, + "loss": 1.0515, + "step": 2624 + }, + { + "epoch": 0.37, + "grad_norm": 9.611416777955016, + "learning_rate": 9.366135590950705e-06, + "loss": 1.0606, + "step": 2625 + }, + { + "epoch": 0.37, + "grad_norm": 7.851804792997554, + "learning_rate": 9.365572752926354e-06, + "loss": 1.0188, + "step": 2626 + }, + { + "epoch": 0.37, + "grad_norm": 9.223843878310904, + "learning_rate": 
9.365009682054503e-06, + "loss": 1.0615, + "step": 2627 + }, + { + "epoch": 0.37, + "grad_norm": 6.3561108817011345, + "learning_rate": 9.364446378365186e-06, + "loss": 1.0848, + "step": 2628 + }, + { + "epoch": 0.37, + "grad_norm": 7.424792035640964, + "learning_rate": 9.363882841888448e-06, + "loss": 1.0236, + "step": 2629 + }, + { + "epoch": 0.38, + "grad_norm": 6.483112490924482, + "learning_rate": 9.363319072654345e-06, + "loss": 0.9857, + "step": 2630 + }, + { + "epoch": 0.38, + "grad_norm": 8.11934834714659, + "learning_rate": 9.36275507069295e-06, + "loss": 1.0228, + "step": 2631 + }, + { + "epoch": 0.38, + "grad_norm": 8.844939018394689, + "learning_rate": 9.362190836034342e-06, + "loss": 1.1705, + "step": 2632 + }, + { + "epoch": 0.38, + "grad_norm": 6.153057212455661, + "learning_rate": 9.361626368708617e-06, + "loss": 1.0618, + "step": 2633 + }, + { + "epoch": 0.38, + "grad_norm": 7.279628619880609, + "learning_rate": 9.361061668745882e-06, + "loss": 1.0319, + "step": 2634 + }, + { + "epoch": 0.38, + "grad_norm": 7.609227969556367, + "learning_rate": 9.360496736176257e-06, + "loss": 1.0566, + "step": 2635 + }, + { + "epoch": 0.38, + "grad_norm": 7.305446353836873, + "learning_rate": 9.359931571029875e-06, + "loss": 1.0938, + "step": 2636 + }, + { + "epoch": 0.38, + "grad_norm": 7.083568011091226, + "learning_rate": 9.359366173336877e-06, + "loss": 1.0284, + "step": 2637 + }, + { + "epoch": 0.38, + "grad_norm": 5.929204585698418, + "learning_rate": 9.358800543127422e-06, + "loss": 1.0432, + "step": 2638 + }, + { + "epoch": 0.38, + "grad_norm": 6.883042855868488, + "learning_rate": 9.35823468043168e-06, + "loss": 1.0092, + "step": 2639 + }, + { + "epoch": 0.38, + "grad_norm": 9.438765199702821, + "learning_rate": 9.35766858527983e-06, + "loss": 1.0763, + "step": 2640 + }, + { + "epoch": 0.38, + "grad_norm": 8.258133524990033, + "learning_rate": 9.357102257702067e-06, + "loss": 0.9377, + "step": 2641 + }, + { + "epoch": 0.38, + "grad_norm": 
10.585900313102677, + "learning_rate": 9.3565356977286e-06, + "loss": 1.0408, + "step": 2642 + }, + { + "epoch": 0.38, + "grad_norm": 7.875693167845807, + "learning_rate": 9.355968905389644e-06, + "loss": 0.9993, + "step": 2643 + }, + { + "epoch": 0.38, + "grad_norm": 8.1588817060263, + "learning_rate": 9.35540188071543e-06, + "loss": 1.0893, + "step": 2644 + }, + { + "epoch": 0.38, + "grad_norm": 9.205424116289764, + "learning_rate": 9.354834623736204e-06, + "loss": 1.0426, + "step": 2645 + }, + { + "epoch": 0.38, + "grad_norm": 8.288800707919322, + "learning_rate": 9.354267134482223e-06, + "loss": 0.9709, + "step": 2646 + }, + { + "epoch": 0.38, + "grad_norm": 7.7921942832234485, + "learning_rate": 9.35369941298375e-06, + "loss": 1.037, + "step": 2647 + }, + { + "epoch": 0.38, + "grad_norm": 11.539839024254395, + "learning_rate": 9.353131459271073e-06, + "loss": 1.031, + "step": 2648 + }, + { + "epoch": 0.38, + "grad_norm": 8.353844908146815, + "learning_rate": 9.352563273374476e-06, + "loss": 1.0281, + "step": 2649 + }, + { + "epoch": 0.38, + "grad_norm": 7.732052979234183, + "learning_rate": 9.351994855324273e-06, + "loss": 1.1181, + "step": 2650 + }, + { + "epoch": 0.38, + "grad_norm": 5.992742666369961, + "learning_rate": 9.351426205150778e-06, + "loss": 1.0894, + "step": 2651 + }, + { + "epoch": 0.38, + "grad_norm": 7.700596744852043, + "learning_rate": 9.350857322884319e-06, + "loss": 1.1209, + "step": 2652 + }, + { + "epoch": 0.38, + "grad_norm": 8.147381869743972, + "learning_rate": 9.350288208555244e-06, + "loss": 1.0191, + "step": 2653 + }, + { + "epoch": 0.38, + "grad_norm": 7.134059368041308, + "learning_rate": 9.349718862193904e-06, + "loss": 1.0384, + "step": 2654 + }, + { + "epoch": 0.38, + "grad_norm": 5.965324608566804, + "learning_rate": 9.349149283830667e-06, + "loss": 1.0584, + "step": 2655 + }, + { + "epoch": 0.38, + "grad_norm": 9.803583639599518, + "learning_rate": 9.348579473495913e-06, + "loss": 1.0312, + "step": 2656 + }, + { + "epoch": 
0.38, + "grad_norm": 8.46388410280263, + "learning_rate": 9.348009431220036e-06, + "loss": 1.0011, + "step": 2657 + }, + { + "epoch": 0.38, + "grad_norm": 9.482227670397663, + "learning_rate": 9.347439157033435e-06, + "loss": 1.0127, + "step": 2658 + }, + { + "epoch": 0.38, + "grad_norm": 7.796996923751869, + "learning_rate": 9.346868650966534e-06, + "loss": 1.1332, + "step": 2659 + }, + { + "epoch": 0.38, + "grad_norm": 6.820829132367153, + "learning_rate": 9.346297913049757e-06, + "loss": 1.0246, + "step": 2660 + }, + { + "epoch": 0.38, + "grad_norm": 9.639156476950982, + "learning_rate": 9.34572694331355e-06, + "loss": 1.0479, + "step": 2661 + }, + { + "epoch": 0.38, + "grad_norm": 7.45789210720963, + "learning_rate": 9.345155741788362e-06, + "loss": 1.1183, + "step": 2662 + }, + { + "epoch": 0.38, + "grad_norm": 7.245697225637441, + "learning_rate": 9.344584308504661e-06, + "loss": 1.0321, + "step": 2663 + }, + { + "epoch": 0.38, + "grad_norm": 8.424286392015613, + "learning_rate": 9.344012643492927e-06, + "loss": 1.0447, + "step": 2664 + }, + { + "epoch": 0.38, + "grad_norm": 8.036216884045906, + "learning_rate": 9.343440746783649e-06, + "loss": 1.1221, + "step": 2665 + }, + { + "epoch": 0.38, + "grad_norm": 11.430384657436983, + "learning_rate": 9.342868618407334e-06, + "loss": 0.985, + "step": 2666 + }, + { + "epoch": 0.38, + "grad_norm": 8.728246211682265, + "learning_rate": 9.342296258394495e-06, + "loss": 1.0394, + "step": 2667 + }, + { + "epoch": 0.38, + "grad_norm": 8.197835806116226, + "learning_rate": 9.341723666775658e-06, + "loss": 1.0551, + "step": 2668 + }, + { + "epoch": 0.38, + "grad_norm": 5.501666951341626, + "learning_rate": 9.341150843581369e-06, + "loss": 1.0042, + "step": 2669 + }, + { + "epoch": 0.38, + "grad_norm": 10.882295168829616, + "learning_rate": 9.340577788842176e-06, + "loss": 1.0314, + "step": 2670 + }, + { + "epoch": 0.38, + "grad_norm": 8.447296696492751, + "learning_rate": 9.340004502588646e-06, + "loss": 1.0204, + "step": 
2671 + }, + { + "epoch": 0.38, + "grad_norm": 8.349298020854189, + "learning_rate": 9.339430984851357e-06, + "loss": 1.0232, + "step": 2672 + }, + { + "epoch": 0.38, + "grad_norm": 6.853859219575004, + "learning_rate": 9.338857235660897e-06, + "loss": 0.9898, + "step": 2673 + }, + { + "epoch": 0.38, + "grad_norm": 6.213096856589902, + "learning_rate": 9.338283255047868e-06, + "loss": 1.0522, + "step": 2674 + }, + { + "epoch": 0.38, + "grad_norm": 10.340316553119347, + "learning_rate": 9.337709043042889e-06, + "loss": 1.0881, + "step": 2675 + }, + { + "epoch": 0.38, + "grad_norm": 8.338771736777787, + "learning_rate": 9.337134599676583e-06, + "loss": 0.9886, + "step": 2676 + }, + { + "epoch": 0.38, + "grad_norm": 6.692695900060564, + "learning_rate": 9.33655992497959e-06, + "loss": 1.0471, + "step": 2677 + }, + { + "epoch": 0.38, + "grad_norm": 9.162222000162028, + "learning_rate": 9.33598501898256e-06, + "loss": 1.0748, + "step": 2678 + }, + { + "epoch": 0.38, + "grad_norm": 8.940620438434383, + "learning_rate": 9.335409881716158e-06, + "loss": 1.0728, + "step": 2679 + }, + { + "epoch": 0.38, + "grad_norm": 6.095147298693245, + "learning_rate": 9.334834513211063e-06, + "loss": 1.0049, + "step": 2680 + }, + { + "epoch": 0.38, + "grad_norm": 8.467137824062192, + "learning_rate": 9.33425891349796e-06, + "loss": 1.0812, + "step": 2681 + }, + { + "epoch": 0.38, + "grad_norm": 7.617670242207183, + "learning_rate": 9.333683082607553e-06, + "loss": 1.0445, + "step": 2682 + }, + { + "epoch": 0.38, + "grad_norm": 8.656599551038722, + "learning_rate": 9.333107020570549e-06, + "loss": 1.057, + "step": 2683 + }, + { + "epoch": 0.38, + "grad_norm": 9.277994277476385, + "learning_rate": 9.332530727417681e-06, + "loss": 0.9984, + "step": 2684 + }, + { + "epoch": 0.38, + "grad_norm": 7.454361227786968, + "learning_rate": 9.331954203179683e-06, + "loss": 0.9965, + "step": 2685 + }, + { + "epoch": 0.38, + "grad_norm": 7.275188212027179, + "learning_rate": 9.331377447887306e-06, + 
"loss": 1.0464, + "step": 2686 + }, + { + "epoch": 0.38, + "grad_norm": 7.645207681645768, + "learning_rate": 9.330800461571311e-06, + "loss": 1.1509, + "step": 2687 + }, + { + "epoch": 0.38, + "grad_norm": 7.538637150506669, + "learning_rate": 9.330223244262474e-06, + "loss": 1.0805, + "step": 2688 + }, + { + "epoch": 0.38, + "grad_norm": 10.42225934146801, + "learning_rate": 9.329645795991585e-06, + "loss": 1.0368, + "step": 2689 + }, + { + "epoch": 0.38, + "grad_norm": 6.870676261352656, + "learning_rate": 9.32906811678944e-06, + "loss": 0.9998, + "step": 2690 + }, + { + "epoch": 0.38, + "grad_norm": 8.782548072241013, + "learning_rate": 9.32849020668685e-06, + "loss": 1.0019, + "step": 2691 + }, + { + "epoch": 0.38, + "grad_norm": 7.411406296105442, + "learning_rate": 9.327912065714644e-06, + "loss": 0.9646, + "step": 2692 + }, + { + "epoch": 0.38, + "grad_norm": 9.15855643658938, + "learning_rate": 9.327333693903653e-06, + "loss": 1.0108, + "step": 2693 + }, + { + "epoch": 0.38, + "grad_norm": 7.732788300204467, + "learning_rate": 9.32675509128473e-06, + "loss": 1.0668, + "step": 2694 + }, + { + "epoch": 0.38, + "grad_norm": 7.599775489169585, + "learning_rate": 9.32617625788873e-06, + "loss": 1.0064, + "step": 2695 + }, + { + "epoch": 0.38, + "grad_norm": 7.667656240456941, + "learning_rate": 9.325597193746535e-06, + "loss": 1.0728, + "step": 2696 + }, + { + "epoch": 0.38, + "grad_norm": 8.70157159998695, + "learning_rate": 9.325017898889024e-06, + "loss": 1.0764, + "step": 2697 + }, + { + "epoch": 0.38, + "grad_norm": 7.422617395972281, + "learning_rate": 9.324438373347096e-06, + "loss": 1.0773, + "step": 2698 + }, + { + "epoch": 0.38, + "grad_norm": 7.054835715602778, + "learning_rate": 9.323858617151664e-06, + "loss": 1.0257, + "step": 2699 + }, + { + "epoch": 0.39, + "grad_norm": 8.603265649711066, + "learning_rate": 9.323278630333648e-06, + "loss": 1.0195, + "step": 2700 + }, + { + "epoch": 0.39, + "grad_norm": 8.616770239612341, + "learning_rate": 
9.322698412923984e-06, + "loss": 0.9621, + "step": 2701 + }, + { + "epoch": 0.39, + "grad_norm": 9.415847297252416, + "learning_rate": 9.322117964953618e-06, + "loss": 1.0154, + "step": 2702 + }, + { + "epoch": 0.39, + "grad_norm": 6.5586945100171254, + "learning_rate": 9.321537286453512e-06, + "loss": 1.0229, + "step": 2703 + }, + { + "epoch": 0.39, + "grad_norm": 8.0175570947549, + "learning_rate": 9.320956377454635e-06, + "loss": 1.0818, + "step": 2704 + }, + { + "epoch": 0.39, + "grad_norm": 10.33666815484962, + "learning_rate": 9.320375237987974e-06, + "loss": 1.0436, + "step": 2705 + }, + { + "epoch": 0.39, + "grad_norm": 7.199101984750588, + "learning_rate": 9.31979386808452e-06, + "loss": 1.0377, + "step": 2706 + }, + { + "epoch": 0.39, + "grad_norm": 8.84065980256212, + "learning_rate": 9.31921226777529e-06, + "loss": 1.0447, + "step": 2707 + }, + { + "epoch": 0.39, + "grad_norm": 9.4497108877033, + "learning_rate": 9.318630437091298e-06, + "loss": 1.0421, + "step": 2708 + }, + { + "epoch": 0.39, + "grad_norm": 9.277653578340512, + "learning_rate": 9.31804837606358e-06, + "loss": 1.0508, + "step": 2709 + }, + { + "epoch": 0.39, + "grad_norm": 9.451744830056034, + "learning_rate": 9.31746608472318e-06, + "loss": 1.0278, + "step": 2710 + }, + { + "epoch": 0.39, + "grad_norm": 6.5351930368019255, + "learning_rate": 9.316883563101158e-06, + "loss": 1.0615, + "step": 2711 + }, + { + "epoch": 0.39, + "grad_norm": 7.796309288620775, + "learning_rate": 9.316300811228583e-06, + "loss": 1.0427, + "step": 2712 + }, + { + "epoch": 0.39, + "grad_norm": 7.524488788499173, + "learning_rate": 9.315717829136535e-06, + "loss": 1.0667, + "step": 2713 + }, + { + "epoch": 0.39, + "grad_norm": 5.930575353813115, + "learning_rate": 9.315134616856112e-06, + "loss": 0.9939, + "step": 2714 + }, + { + "epoch": 0.39, + "grad_norm": 7.417243879690828, + "learning_rate": 9.31455117441842e-06, + "loss": 1.0196, + "step": 2715 + }, + { + "epoch": 0.39, + "grad_norm": 10.96835692760635, + 
"learning_rate": 9.313967501854579e-06, + "loss": 1.1205, + "step": 2716 + }, + { + "epoch": 0.39, + "grad_norm": 11.324570472559287, + "learning_rate": 9.313383599195718e-06, + "loss": 1.0097, + "step": 2717 + }, + { + "epoch": 0.39, + "grad_norm": 6.369657126585696, + "learning_rate": 9.312799466472984e-06, + "loss": 1.0392, + "step": 2718 + }, + { + "epoch": 0.39, + "grad_norm": 7.65027133171125, + "learning_rate": 9.312215103717529e-06, + "loss": 0.9781, + "step": 2719 + }, + { + "epoch": 0.39, + "grad_norm": 8.519150046982682, + "learning_rate": 9.311630510960526e-06, + "loss": 0.9727, + "step": 2720 + }, + { + "epoch": 0.39, + "grad_norm": 7.960905666471633, + "learning_rate": 9.311045688233151e-06, + "loss": 1.0939, + "step": 2721 + }, + { + "epoch": 0.39, + "grad_norm": 7.693701481598684, + "learning_rate": 9.310460635566599e-06, + "loss": 1.0055, + "step": 2722 + }, + { + "epoch": 0.39, + "grad_norm": 8.006148881078182, + "learning_rate": 9.309875352992075e-06, + "loss": 1.0384, + "step": 2723 + }, + { + "epoch": 0.39, + "grad_norm": 9.380547439894237, + "learning_rate": 9.309289840540796e-06, + "loss": 1.0126, + "step": 2724 + }, + { + "epoch": 0.39, + "grad_norm": 8.800761865737305, + "learning_rate": 9.308704098243994e-06, + "loss": 0.988, + "step": 2725 + }, + { + "epoch": 0.39, + "grad_norm": 6.251824754652018, + "learning_rate": 9.308118126132907e-06, + "loss": 1.1176, + "step": 2726 + }, + { + "epoch": 0.39, + "grad_norm": 6.110476840310048, + "learning_rate": 9.307531924238791e-06, + "loss": 1.0258, + "step": 2727 + }, + { + "epoch": 0.39, + "grad_norm": 7.621437557592616, + "learning_rate": 9.306945492592912e-06, + "loss": 1.0216, + "step": 2728 + }, + { + "epoch": 0.39, + "grad_norm": 9.257273001904911, + "learning_rate": 9.30635883122655e-06, + "loss": 1.0122, + "step": 2729 + }, + { + "epoch": 0.39, + "grad_norm": 8.19024827226243, + "learning_rate": 9.305771940170993e-06, + "loss": 1.1339, + "step": 2730 + }, + { + "epoch": 0.39, + 
"grad_norm": 9.092123237121418, + "learning_rate": 9.305184819457547e-06, + "loss": 0.9977, + "step": 2731 + }, + { + "epoch": 0.39, + "grad_norm": 6.000024979775382, + "learning_rate": 9.304597469117527e-06, + "loss": 1.005, + "step": 2732 + }, + { + "epoch": 0.39, + "grad_norm": 8.402929547686009, + "learning_rate": 9.30400988918226e-06, + "loss": 1.0512, + "step": 2733 + }, + { + "epoch": 0.39, + "grad_norm": 7.843430327572067, + "learning_rate": 9.303422079683085e-06, + "loss": 0.9965, + "step": 2734 + }, + { + "epoch": 0.39, + "grad_norm": 7.8013065888361615, + "learning_rate": 9.302834040651354e-06, + "loss": 1.0206, + "step": 2735 + }, + { + "epoch": 0.39, + "grad_norm": 6.983807826991791, + "learning_rate": 9.302245772118435e-06, + "loss": 1.0618, + "step": 2736 + }, + { + "epoch": 0.39, + "grad_norm": 6.81254667518136, + "learning_rate": 9.301657274115698e-06, + "loss": 1.0735, + "step": 2737 + }, + { + "epoch": 0.39, + "grad_norm": 8.180825112508378, + "learning_rate": 9.30106854667454e-06, + "loss": 1.0587, + "step": 2738 + }, + { + "epoch": 0.39, + "grad_norm": 5.949581153752731, + "learning_rate": 9.300479589826354e-06, + "loss": 1.0749, + "step": 2739 + }, + { + "epoch": 0.39, + "grad_norm": 8.484184778912436, + "learning_rate": 9.29989040360256e-06, + "loss": 1.0455, + "step": 2740 + }, + { + "epoch": 0.39, + "grad_norm": 9.110855406187143, + "learning_rate": 9.299300988034579e-06, + "loss": 1.0112, + "step": 2741 + }, + { + "epoch": 0.39, + "grad_norm": 6.750203841099222, + "learning_rate": 9.298711343153851e-06, + "loss": 1.1019, + "step": 2742 + }, + { + "epoch": 0.39, + "grad_norm": 6.042924629850923, + "learning_rate": 9.298121468991825e-06, + "loss": 0.9942, + "step": 2743 + }, + { + "epoch": 0.39, + "grad_norm": 7.30501371520001, + "learning_rate": 9.297531365579966e-06, + "loss": 1.052, + "step": 2744 + }, + { + "epoch": 0.39, + "grad_norm": 8.46430106313334, + "learning_rate": 9.296941032949744e-06, + "loss": 1.0463, + "step": 2745 + }, + { 
+ "epoch": 0.39, + "grad_norm": 8.131077296984015, + "learning_rate": 9.296350471132649e-06, + "loss": 1.0546, + "step": 2746 + }, + { + "epoch": 0.39, + "grad_norm": 7.379597561162943, + "learning_rate": 9.295759680160177e-06, + "loss": 1.0153, + "step": 2747 + }, + { + "epoch": 0.39, + "grad_norm": 7.015918748455668, + "learning_rate": 9.295168660063842e-06, + "loss": 1.0394, + "step": 2748 + }, + { + "epoch": 0.39, + "grad_norm": 6.64783173718036, + "learning_rate": 9.294577410875167e-06, + "loss": 0.9859, + "step": 2749 + }, + { + "epoch": 0.39, + "grad_norm": 5.516562282192306, + "learning_rate": 9.293985932625686e-06, + "loss": 1.0192, + "step": 2750 + }, + { + "epoch": 0.39, + "grad_norm": 6.658216713087952, + "learning_rate": 9.29339422534695e-06, + "loss": 1.0123, + "step": 2751 + }, + { + "epoch": 0.39, + "grad_norm": 7.970249549447439, + "learning_rate": 9.292802289070514e-06, + "loss": 1.0068, + "step": 2752 + }, + { + "epoch": 0.39, + "grad_norm": 5.40919639766164, + "learning_rate": 9.292210123827955e-06, + "loss": 0.9615, + "step": 2753 + }, + { + "epoch": 0.39, + "grad_norm": 6.65387896484262, + "learning_rate": 9.291617729650855e-06, + "loss": 1.0291, + "step": 2754 + }, + { + "epoch": 0.39, + "grad_norm": 8.627033717248565, + "learning_rate": 9.29102510657081e-06, + "loss": 0.9925, + "step": 2755 + }, + { + "epoch": 0.39, + "grad_norm": 7.669536515219448, + "learning_rate": 9.29043225461943e-06, + "loss": 0.9887, + "step": 2756 + }, + { + "epoch": 0.39, + "grad_norm": 7.5676930812598915, + "learning_rate": 9.289839173828337e-06, + "loss": 1.1248, + "step": 2757 + }, + { + "epoch": 0.39, + "grad_norm": 7.986893150252165, + "learning_rate": 9.289245864229163e-06, + "loss": 1.0449, + "step": 2758 + }, + { + "epoch": 0.39, + "grad_norm": 7.082026458046305, + "learning_rate": 9.288652325853553e-06, + "loss": 1.0474, + "step": 2759 + }, + { + "epoch": 0.39, + "grad_norm": 9.434131825254072, + "learning_rate": 9.288058558733168e-06, + "loss": 0.9815, + 
"step": 2760 + }, + { + "epoch": 0.39, + "grad_norm": 9.968540837755866, + "learning_rate": 9.287464562899672e-06, + "loss": 1.0239, + "step": 2761 + }, + { + "epoch": 0.39, + "grad_norm": 8.348364542443624, + "learning_rate": 9.286870338384752e-06, + "loss": 1.1085, + "step": 2762 + }, + { + "epoch": 0.39, + "grad_norm": 8.397681857764644, + "learning_rate": 9.286275885220102e-06, + "loss": 1.0638, + "step": 2763 + }, + { + "epoch": 0.39, + "grad_norm": 9.243392057504973, + "learning_rate": 9.285681203437427e-06, + "loss": 1.071, + "step": 2764 + }, + { + "epoch": 0.39, + "grad_norm": 12.004678525226444, + "learning_rate": 9.285086293068443e-06, + "loss": 0.9949, + "step": 2765 + }, + { + "epoch": 0.39, + "grad_norm": 7.639174633974003, + "learning_rate": 9.284491154144886e-06, + "loss": 1.0091, + "step": 2766 + }, + { + "epoch": 0.39, + "grad_norm": 7.498657830020085, + "learning_rate": 9.283895786698496e-06, + "loss": 1.077, + "step": 2767 + }, + { + "epoch": 0.39, + "grad_norm": 8.856435933245098, + "learning_rate": 9.283300190761032e-06, + "loss": 1.0722, + "step": 2768 + }, + { + "epoch": 0.39, + "grad_norm": 7.779204401532822, + "learning_rate": 9.282704366364255e-06, + "loss": 1.084, + "step": 2769 + }, + { + "epoch": 0.4, + "grad_norm": 7.9091656769150225, + "learning_rate": 9.282108313539948e-06, + "loss": 1.0298, + "step": 2770 + }, + { + "epoch": 0.4, + "grad_norm": 6.966644664287864, + "learning_rate": 9.281512032319904e-06, + "loss": 1.0891, + "step": 2771 + }, + { + "epoch": 0.4, + "grad_norm": 8.920446104946322, + "learning_rate": 9.280915522735926e-06, + "loss": 1.1087, + "step": 2772 + }, + { + "epoch": 0.4, + "grad_norm": 7.101825827971282, + "learning_rate": 9.28031878481983e-06, + "loss": 1.0628, + "step": 2773 + }, + { + "epoch": 0.4, + "grad_norm": 8.435886521028284, + "learning_rate": 9.279721818603444e-06, + "loss": 1.0627, + "step": 2774 + }, + { + "epoch": 0.4, + "grad_norm": 8.734884902052622, + "learning_rate": 9.27912462411861e-06, + 
"loss": 1.0283, + "step": 2775 + }, + { + "epoch": 0.4, + "grad_norm": 7.488965742296708, + "learning_rate": 9.278527201397179e-06, + "loss": 1.0316, + "step": 2776 + }, + { + "epoch": 0.4, + "grad_norm": 7.992596244349267, + "learning_rate": 9.277929550471016e-06, + "loss": 1.022, + "step": 2777 + }, + { + "epoch": 0.4, + "grad_norm": 9.758589899023832, + "learning_rate": 9.277331671371999e-06, + "loss": 1.0537, + "step": 2778 + }, + { + "epoch": 0.4, + "grad_norm": 9.189198904316486, + "learning_rate": 9.276733564132014e-06, + "loss": 1.0313, + "step": 2779 + }, + { + "epoch": 0.4, + "grad_norm": 8.274594374518744, + "learning_rate": 9.276135228782969e-06, + "loss": 1.0654, + "step": 2780 + }, + { + "epoch": 0.4, + "grad_norm": 7.181560431535962, + "learning_rate": 9.275536665356772e-06, + "loss": 1.0956, + "step": 2781 + }, + { + "epoch": 0.4, + "grad_norm": 8.393431581882348, + "learning_rate": 9.27493787388535e-06, + "loss": 1.0667, + "step": 2782 + }, + { + "epoch": 0.4, + "grad_norm": 9.528231708925277, + "learning_rate": 9.27433885440064e-06, + "loss": 1.0023, + "step": 2783 + }, + { + "epoch": 0.4, + "grad_norm": 6.87886673536211, + "learning_rate": 9.273739606934595e-06, + "loss": 1.0288, + "step": 2784 + }, + { + "epoch": 0.4, + "grad_norm": 6.4617389630297986, + "learning_rate": 9.273140131519175e-06, + "loss": 1.0523, + "step": 2785 + }, + { + "epoch": 0.4, + "grad_norm": 8.434342043765897, + "learning_rate": 9.272540428186355e-06, + "loss": 1.0841, + "step": 2786 + }, + { + "epoch": 0.4, + "grad_norm": 5.258841312756764, + "learning_rate": 9.271940496968121e-06, + "loss": 1.216, + "step": 2787 + }, + { + "epoch": 0.4, + "grad_norm": 6.908019960599113, + "learning_rate": 9.271340337896472e-06, + "loss": 1.0006, + "step": 2788 + }, + { + "epoch": 0.4, + "grad_norm": 5.499825256386682, + "learning_rate": 9.270739951003418e-06, + "loss": 1.0558, + "step": 2789 + }, + { + "epoch": 0.4, + "grad_norm": 8.077279181064196, + "learning_rate": 
9.270139336320984e-06, + "loss": 1.0181, + "step": 2790 + }, + { + "epoch": 0.4, + "grad_norm": 6.155609876146255, + "learning_rate": 9.269538493881204e-06, + "loss": 1.1177, + "step": 2791 + }, + { + "epoch": 0.4, + "grad_norm": 9.11034926344333, + "learning_rate": 9.268937423716124e-06, + "loss": 1.0588, + "step": 2792 + }, + { + "epoch": 0.4, + "grad_norm": 7.80151802880451, + "learning_rate": 9.268336125857804e-06, + "loss": 1.0594, + "step": 2793 + }, + { + "epoch": 0.4, + "grad_norm": 8.733423993232318, + "learning_rate": 9.267734600338317e-06, + "loss": 1.0304, + "step": 2794 + }, + { + "epoch": 0.4, + "grad_norm": 7.463482653993631, + "learning_rate": 9.267132847189746e-06, + "loss": 0.9776, + "step": 2795 + }, + { + "epoch": 0.4, + "grad_norm": 6.1792146429433075, + "learning_rate": 9.266530866444185e-06, + "loss": 1.1199, + "step": 2796 + }, + { + "epoch": 0.4, + "grad_norm": 8.622548448229491, + "learning_rate": 9.265928658133745e-06, + "loss": 1.0385, + "step": 2797 + }, + { + "epoch": 0.4, + "grad_norm": 13.23362948649079, + "learning_rate": 9.265326222290545e-06, + "loss": 1.0374, + "step": 2798 + }, + { + "epoch": 0.4, + "grad_norm": 7.213313270016978, + "learning_rate": 9.264723558946717e-06, + "loss": 1.065, + "step": 2799 + }, + { + "epoch": 0.4, + "grad_norm": 14.437781232843012, + "learning_rate": 9.264120668134405e-06, + "loss": 1.0706, + "step": 2800 + }, + { + "epoch": 0.4, + "grad_norm": 8.758071407444385, + "learning_rate": 9.263517549885765e-06, + "loss": 1.0998, + "step": 2801 + }, + { + "epoch": 0.4, + "grad_norm": 6.236138010250233, + "learning_rate": 9.262914204232966e-06, + "loss": 1.0175, + "step": 2802 + }, + { + "epoch": 0.4, + "grad_norm": 9.018313121419641, + "learning_rate": 9.26231063120819e-06, + "loss": 0.9605, + "step": 2803 + }, + { + "epoch": 0.4, + "grad_norm": 6.449199904886371, + "learning_rate": 9.26170683084363e-06, + "loss": 0.9998, + "step": 2804 + }, + { + "epoch": 0.4, + "grad_norm": 6.649426464605187, + 
"learning_rate": 9.26110280317149e-06, + "loss": 1.0288, + "step": 2805 + }, + { + "epoch": 0.4, + "grad_norm": 7.440321389242025, + "learning_rate": 9.260498548223986e-06, + "loss": 1.0421, + "step": 2806 + }, + { + "epoch": 0.4, + "grad_norm": 8.355422741719591, + "learning_rate": 9.25989406603335e-06, + "loss": 0.9945, + "step": 2807 + }, + { + "epoch": 0.4, + "grad_norm": 9.146186445138524, + "learning_rate": 9.25928935663182e-06, + "loss": 0.9659, + "step": 2808 + }, + { + "epoch": 0.4, + "grad_norm": 8.531603789290127, + "learning_rate": 9.258684420051653e-06, + "loss": 0.9843, + "step": 2809 + }, + { + "epoch": 0.4, + "grad_norm": 8.830709454116334, + "learning_rate": 9.258079256325114e-06, + "loss": 0.9701, + "step": 2810 + }, + { + "epoch": 0.4, + "grad_norm": 8.085823811957624, + "learning_rate": 9.25747386548448e-06, + "loss": 0.984, + "step": 2811 + }, + { + "epoch": 0.4, + "grad_norm": 8.506725413437657, + "learning_rate": 9.256868247562039e-06, + "loss": 1.0648, + "step": 2812 + }, + { + "epoch": 0.4, + "grad_norm": 7.2695083433320224, + "learning_rate": 9.256262402590095e-06, + "loss": 1.056, + "step": 2813 + }, + { + "epoch": 0.4, + "grad_norm": 6.437340027355859, + "learning_rate": 9.255656330600962e-06, + "loss": 1.0057, + "step": 2814 + }, + { + "epoch": 0.4, + "grad_norm": 8.032602172862468, + "learning_rate": 9.255050031626967e-06, + "loss": 1.1247, + "step": 2815 + }, + { + "epoch": 0.4, + "grad_norm": 8.419664268142473, + "learning_rate": 9.254443505700445e-06, + "loss": 1.001, + "step": 2816 + }, + { + "epoch": 0.4, + "grad_norm": 10.824374730214494, + "learning_rate": 9.253836752853752e-06, + "loss": 1.1395, + "step": 2817 + }, + { + "epoch": 0.4, + "grad_norm": 8.552865627823985, + "learning_rate": 9.253229773119245e-06, + "loss": 0.9904, + "step": 2818 + }, + { + "epoch": 0.4, + "grad_norm": 5.852196976075313, + "learning_rate": 9.252622566529302e-06, + "loss": 1.0219, + "step": 2819 + }, + { + "epoch": 0.4, + "grad_norm": 
8.440014664885412, + "learning_rate": 9.252015133116308e-06, + "loss": 1.0135, + "step": 2820 + }, + { + "epoch": 0.4, + "grad_norm": 8.445808709549643, + "learning_rate": 9.251407472912664e-06, + "loss": 1.0346, + "step": 2821 + }, + { + "epoch": 0.4, + "grad_norm": 8.10610618548008, + "learning_rate": 9.250799585950776e-06, + "loss": 1.0285, + "step": 2822 + }, + { + "epoch": 0.4, + "grad_norm": 5.6589584744485695, + "learning_rate": 9.250191472263073e-06, + "loss": 1.0813, + "step": 2823 + }, + { + "epoch": 0.4, + "grad_norm": 7.831500254448914, + "learning_rate": 9.249583131881987e-06, + "loss": 1.081, + "step": 2824 + }, + { + "epoch": 0.4, + "grad_norm": 5.961536180918424, + "learning_rate": 9.248974564839965e-06, + "loss": 1.0197, + "step": 2825 + }, + { + "epoch": 0.4, + "grad_norm": 7.209338438072945, + "learning_rate": 9.248365771169465e-06, + "loss": 1.0163, + "step": 2826 + }, + { + "epoch": 0.4, + "grad_norm": 5.925631819045517, + "learning_rate": 9.247756750902962e-06, + "loss": 1.0459, + "step": 2827 + }, + { + "epoch": 0.4, + "grad_norm": 9.075598756437047, + "learning_rate": 9.247147504072937e-06, + "loss": 1.0445, + "step": 2828 + }, + { + "epoch": 0.4, + "grad_norm": 9.334286047092416, + "learning_rate": 9.246538030711888e-06, + "loss": 1.0604, + "step": 2829 + }, + { + "epoch": 0.4, + "grad_norm": 8.188267512999658, + "learning_rate": 9.245928330852318e-06, + "loss": 1.0113, + "step": 2830 + }, + { + "epoch": 0.4, + "grad_norm": 6.74442373977378, + "learning_rate": 9.245318404526753e-06, + "loss": 1.1032, + "step": 2831 + }, + { + "epoch": 0.4, + "grad_norm": 8.285214314415786, + "learning_rate": 9.244708251767718e-06, + "loss": 1.0261, + "step": 2832 + }, + { + "epoch": 0.4, + "grad_norm": 5.726298333195831, + "learning_rate": 9.24409787260776e-06, + "loss": 1.0715, + "step": 2833 + }, + { + "epoch": 0.4, + "grad_norm": 6.369504755176582, + "learning_rate": 9.24348726707944e-06, + "loss": 0.9775, + "step": 2834 + }, + { + "epoch": 0.4, + 
"grad_norm": 7.166347332295521, + "learning_rate": 9.242876435215315e-06, + "loss": 1.0037, + "step": 2835 + }, + { + "epoch": 0.4, + "grad_norm": 8.85713139716033, + "learning_rate": 9.242265377047974e-06, + "loss": 1.0693, + "step": 2836 + }, + { + "epoch": 0.4, + "grad_norm": 10.191862043117323, + "learning_rate": 9.241654092610004e-06, + "loss": 0.9704, + "step": 2837 + }, + { + "epoch": 0.4, + "grad_norm": 4.936433769073417, + "learning_rate": 9.241042581934013e-06, + "loss": 1.0814, + "step": 2838 + }, + { + "epoch": 0.4, + "grad_norm": 7.772036909761508, + "learning_rate": 9.240430845052616e-06, + "loss": 0.9693, + "step": 2839 + }, + { + "epoch": 0.41, + "grad_norm": 6.815923974507775, + "learning_rate": 9.23981888199844e-06, + "loss": 1.0133, + "step": 2840 + }, + { + "epoch": 0.41, + "grad_norm": 7.315992870595341, + "learning_rate": 9.239206692804129e-06, + "loss": 1.0539, + "step": 2841 + }, + { + "epoch": 0.41, + "grad_norm": 6.056268942605209, + "learning_rate": 9.23859427750233e-06, + "loss": 1.0586, + "step": 2842 + }, + { + "epoch": 0.41, + "grad_norm": 9.083605678786812, + "learning_rate": 9.237981636125711e-06, + "loss": 1.0595, + "step": 2843 + }, + { + "epoch": 0.41, + "grad_norm": 7.613122472419661, + "learning_rate": 9.23736876870695e-06, + "loss": 1.0774, + "step": 2844 + }, + { + "epoch": 0.41, + "grad_norm": 8.599615945847779, + "learning_rate": 9.23675567527873e-06, + "loss": 1.0516, + "step": 2845 + }, + { + "epoch": 0.41, + "grad_norm": 6.870083921317387, + "learning_rate": 9.236142355873759e-06, + "loss": 1.0567, + "step": 2846 + }, + { + "epoch": 0.41, + "grad_norm": 9.503128105772054, + "learning_rate": 9.235528810524745e-06, + "loss": 1.1182, + "step": 2847 + }, + { + "epoch": 0.41, + "grad_norm": 7.045693535924987, + "learning_rate": 9.234915039264414e-06, + "loss": 0.9914, + "step": 2848 + }, + { + "epoch": 0.41, + "grad_norm": 8.015569262136212, + "learning_rate": 9.234301042125501e-06, + "loss": 1.0477, + "step": 2849 + }, + { + 
"epoch": 0.41, + "grad_norm": 8.986054388650151, + "learning_rate": 9.233686819140758e-06, + "loss": 1.0211, + "step": 2850 + }, + { + "epoch": 0.41, + "grad_norm": 6.42043790950698, + "learning_rate": 9.233072370342945e-06, + "loss": 0.9666, + "step": 2851 + }, + { + "epoch": 0.41, + "grad_norm": 6.311181330652802, + "learning_rate": 9.232457695764834e-06, + "loss": 1.0239, + "step": 2852 + }, + { + "epoch": 0.41, + "grad_norm": 8.081388076520355, + "learning_rate": 9.23184279543921e-06, + "loss": 0.9876, + "step": 2853 + }, + { + "epoch": 0.41, + "grad_norm": 8.245403905564945, + "learning_rate": 9.231227669398872e-06, + "loss": 0.9977, + "step": 2854 + }, + { + "epoch": 0.41, + "grad_norm": 7.001462470986749, + "learning_rate": 9.230612317676628e-06, + "loss": 0.9693, + "step": 2855 + }, + { + "epoch": 0.41, + "grad_norm": 8.674609390659539, + "learning_rate": 9.229996740305299e-06, + "loss": 1.0086, + "step": 2856 + }, + { + "epoch": 0.41, + "grad_norm": 7.490316904752884, + "learning_rate": 9.229380937317715e-06, + "loss": 0.9865, + "step": 2857 + }, + { + "epoch": 0.41, + "grad_norm": 7.388494822331136, + "learning_rate": 9.228764908746728e-06, + "loss": 1.0531, + "step": 2858 + }, + { + "epoch": 0.41, + "grad_norm": 6.446609774112938, + "learning_rate": 9.22814865462519e-06, + "loss": 0.9944, + "step": 2859 + }, + { + "epoch": 0.41, + "grad_norm": 6.200068172085794, + "learning_rate": 9.227532174985974e-06, + "loss": 1.1363, + "step": 2860 + }, + { + "epoch": 0.41, + "grad_norm": 7.873769313317069, + "learning_rate": 9.226915469861957e-06, + "loss": 1.0064, + "step": 2861 + }, + { + "epoch": 0.41, + "grad_norm": 8.653639201133208, + "learning_rate": 9.226298539286035e-06, + "loss": 1.0622, + "step": 2862 + }, + { + "epoch": 0.41, + "grad_norm": 7.177037365808149, + "learning_rate": 9.225681383291113e-06, + "loss": 1.0405, + "step": 2863 + }, + { + "epoch": 0.41, + "grad_norm": 7.74860085191124, + "learning_rate": 9.225064001910109e-06, + "loss": 1.0064, + 
"step": 2864 + }, + { + "epoch": 0.41, + "grad_norm": 5.92905761656786, + "learning_rate": 9.22444639517595e-06, + "loss": 0.9712, + "step": 2865 + }, + { + "epoch": 0.41, + "grad_norm": 8.018480115325563, + "learning_rate": 9.22382856312158e-06, + "loss": 1.0111, + "step": 2866 + }, + { + "epoch": 0.41, + "grad_norm": 7.778598178666075, + "learning_rate": 9.223210505779952e-06, + "loss": 1.1295, + "step": 2867 + }, + { + "epoch": 0.41, + "grad_norm": 7.975748588516808, + "learning_rate": 9.22259222318403e-06, + "loss": 1.0079, + "step": 2868 + }, + { + "epoch": 0.41, + "grad_norm": 7.116828757999752, + "learning_rate": 9.221973715366794e-06, + "loss": 1.0689, + "step": 2869 + }, + { + "epoch": 0.41, + "grad_norm": 8.306057913008644, + "learning_rate": 9.22135498236123e-06, + "loss": 1.0563, + "step": 2870 + }, + { + "epoch": 0.41, + "grad_norm": 9.199829364653358, + "learning_rate": 9.220736024200343e-06, + "loss": 1.0223, + "step": 2871 + }, + { + "epoch": 0.41, + "grad_norm": 7.753628772734618, + "learning_rate": 9.220116840917145e-06, + "loss": 1.0058, + "step": 2872 + }, + { + "epoch": 0.41, + "grad_norm": 9.943845023138007, + "learning_rate": 9.219497432544661e-06, + "loss": 0.9987, + "step": 2873 + }, + { + "epoch": 0.41, + "grad_norm": 7.8942125147617235, + "learning_rate": 9.218877799115929e-06, + "loss": 1.0874, + "step": 2874 + }, + { + "epoch": 0.41, + "grad_norm": 7.370823259932932, + "learning_rate": 9.218257940664e-06, + "loss": 1.0098, + "step": 2875 + }, + { + "epoch": 0.41, + "grad_norm": 8.43122933055547, + "learning_rate": 9.217637857221931e-06, + "loss": 1.0563, + "step": 2876 + }, + { + "epoch": 0.41, + "grad_norm": 8.65000730943961, + "learning_rate": 9.217017548822799e-06, + "loss": 1.0804, + "step": 2877 + }, + { + "epoch": 0.41, + "grad_norm": 8.86872372142779, + "learning_rate": 9.216397015499692e-06, + "loss": 1.0581, + "step": 2878 + }, + { + "epoch": 0.41, + "grad_norm": 7.3040568124180885, + "learning_rate": 9.215776257285702e-06, + 
"loss": 1.0184, + "step": 2879 + }, + { + "epoch": 0.41, + "grad_norm": 11.258435279603283, + "learning_rate": 9.215155274213943e-06, + "loss": 1.0813, + "step": 2880 + }, + { + "epoch": 0.41, + "grad_norm": 5.967786673813, + "learning_rate": 9.214534066317532e-06, + "loss": 1.0409, + "step": 2881 + }, + { + "epoch": 0.41, + "grad_norm": 7.542877189865114, + "learning_rate": 9.213912633629608e-06, + "loss": 1.039, + "step": 2882 + }, + { + "epoch": 0.41, + "grad_norm": 7.480938189459273, + "learning_rate": 9.213290976183311e-06, + "loss": 1.0347, + "step": 2883 + }, + { + "epoch": 0.41, + "grad_norm": 7.3158361754997285, + "learning_rate": 9.212669094011803e-06, + "loss": 1.1408, + "step": 2884 + }, + { + "epoch": 0.41, + "grad_norm": 7.956487498056371, + "learning_rate": 9.212046987148251e-06, + "loss": 1.068, + "step": 2885 + }, + { + "epoch": 0.41, + "grad_norm": 6.725836470810443, + "learning_rate": 9.211424655625838e-06, + "loss": 1.0591, + "step": 2886 + }, + { + "epoch": 0.41, + "grad_norm": 9.172499626713497, + "learning_rate": 9.210802099477755e-06, + "loss": 1.0925, + "step": 2887 + }, + { + "epoch": 0.41, + "grad_norm": 8.518386709076623, + "learning_rate": 9.210179318737208e-06, + "loss": 1.0689, + "step": 2888 + }, + { + "epoch": 0.41, + "grad_norm": 9.61195188497229, + "learning_rate": 9.209556313437417e-06, + "loss": 1.0442, + "step": 2889 + }, + { + "epoch": 0.41, + "grad_norm": 7.945992335398295, + "learning_rate": 9.208933083611611e-06, + "loss": 1.1272, + "step": 2890 + }, + { + "epoch": 0.41, + "grad_norm": 8.006296346508393, + "learning_rate": 9.208309629293029e-06, + "loss": 1.085, + "step": 2891 + }, + { + "epoch": 0.41, + "grad_norm": 8.16119179219909, + "learning_rate": 9.207685950514924e-06, + "loss": 1.0493, + "step": 2892 + }, + { + "epoch": 0.41, + "grad_norm": 7.525395727011995, + "learning_rate": 9.207062047310562e-06, + "loss": 1.0508, + "step": 2893 + }, + { + "epoch": 0.41, + "grad_norm": 9.948766011268221, + "learning_rate": 
9.206437919713223e-06, + "loss": 1.0538, + "step": 2894 + }, + { + "epoch": 0.41, + "grad_norm": 9.533545226418534, + "learning_rate": 9.205813567756193e-06, + "loss": 1.0535, + "step": 2895 + }, + { + "epoch": 0.41, + "grad_norm": 9.317863204594195, + "learning_rate": 9.205188991472773e-06, + "loss": 1.0287, + "step": 2896 + }, + { + "epoch": 0.41, + "grad_norm": 7.617495764186184, + "learning_rate": 9.20456419089628e-06, + "loss": 0.9844, + "step": 2897 + }, + { + "epoch": 0.41, + "grad_norm": 11.093806655691477, + "learning_rate": 9.203939166060034e-06, + "loss": 1.063, + "step": 2898 + }, + { + "epoch": 0.41, + "grad_norm": 7.652149682232128, + "learning_rate": 9.203313916997377e-06, + "loss": 1.1069, + "step": 2899 + }, + { + "epoch": 0.41, + "grad_norm": 7.713872749368201, + "learning_rate": 9.202688443741655e-06, + "loss": 1.0068, + "step": 2900 + }, + { + "epoch": 0.41, + "grad_norm": 7.703714370902976, + "learning_rate": 9.202062746326228e-06, + "loss": 1.0873, + "step": 2901 + }, + { + "epoch": 0.41, + "grad_norm": 7.422901079867813, + "learning_rate": 9.201436824784471e-06, + "loss": 1.0587, + "step": 2902 + }, + { + "epoch": 0.41, + "grad_norm": 6.872442458477529, + "learning_rate": 9.20081067914977e-06, + "loss": 1.0492, + "step": 2903 + }, + { + "epoch": 0.41, + "grad_norm": 9.38856091099215, + "learning_rate": 9.20018430945552e-06, + "loss": 0.9592, + "step": 2904 + }, + { + "epoch": 0.41, + "grad_norm": 8.872797908377011, + "learning_rate": 9.19955771573513e-06, + "loss": 1.1087, + "step": 2905 + }, + { + "epoch": 0.41, + "grad_norm": 9.910689381510252, + "learning_rate": 9.19893089802202e-06, + "loss": 1.1011, + "step": 2906 + }, + { + "epoch": 0.41, + "grad_norm": 7.627190587612383, + "learning_rate": 9.198303856349627e-06, + "loss": 0.9986, + "step": 2907 + }, + { + "epoch": 0.41, + "grad_norm": 7.1868359393321954, + "learning_rate": 9.197676590751389e-06, + "loss": 1.0069, + "step": 2908 + }, + { + "epoch": 0.41, + "grad_norm": 
7.911683903178032, + "learning_rate": 9.197049101260767e-06, + "loss": 1.0033, + "step": 2909 + }, + { + "epoch": 0.42, + "grad_norm": 9.467924067604594, + "learning_rate": 9.196421387911228e-06, + "loss": 1.0456, + "step": 2910 + }, + { + "epoch": 0.42, + "grad_norm": 8.857602814242583, + "learning_rate": 9.195793450736255e-06, + "loss": 1.0134, + "step": 2911 + }, + { + "epoch": 0.42, + "grad_norm": 8.550528977209728, + "learning_rate": 9.195165289769337e-06, + "loss": 0.9728, + "step": 2912 + }, + { + "epoch": 0.42, + "grad_norm": 9.82511896740366, + "learning_rate": 9.194536905043977e-06, + "loss": 1.0111, + "step": 2913 + }, + { + "epoch": 0.42, + "grad_norm": 9.286155165349438, + "learning_rate": 9.1939082965937e-06, + "loss": 1.0955, + "step": 2914 + }, + { + "epoch": 0.42, + "grad_norm": 8.05003606248581, + "learning_rate": 9.193279464452022e-06, + "loss": 0.9983, + "step": 2915 + }, + { + "epoch": 0.42, + "grad_norm": 7.452274196185942, + "learning_rate": 9.192650408652493e-06, + "loss": 1.0189, + "step": 2916 + }, + { + "epoch": 0.42, + "grad_norm": 7.3461138441327645, + "learning_rate": 9.192021129228661e-06, + "loss": 1.0523, + "step": 2917 + }, + { + "epoch": 0.42, + "grad_norm": 9.481442331082443, + "learning_rate": 9.19139162621409e-06, + "loss": 1.025, + "step": 2918 + }, + { + "epoch": 0.42, + "grad_norm": 7.651733298373431, + "learning_rate": 9.190761899642357e-06, + "loss": 1.0192, + "step": 2919 + }, + { + "epoch": 0.42, + "grad_norm": 6.4605602321881515, + "learning_rate": 9.190131949547051e-06, + "loss": 0.9869, + "step": 2920 + }, + { + "epoch": 0.42, + "grad_norm": 7.1206585510517515, + "learning_rate": 9.189501775961769e-06, + "loss": 1.1026, + "step": 2921 + }, + { + "epoch": 0.42, + "grad_norm": 6.091077679278348, + "learning_rate": 9.188871378920123e-06, + "loss": 1.0034, + "step": 2922 + }, + { + "epoch": 0.42, + "grad_norm": 8.703898514478372, + "learning_rate": 9.188240758455738e-06, + "loss": 0.9774, + "step": 2923 + }, + { + 
"epoch": 0.42, + "grad_norm": 6.899837366932251, + "learning_rate": 9.187609914602247e-06, + "loss": 1.0183, + "step": 2924 + }, + { + "epoch": 0.42, + "grad_norm": 8.517008439736335, + "learning_rate": 9.186978847393302e-06, + "loss": 1.0208, + "step": 2925 + }, + { + "epoch": 0.42, + "grad_norm": 7.448055142267179, + "learning_rate": 9.186347556862559e-06, + "loss": 1.0885, + "step": 2926 + }, + { + "epoch": 0.42, + "grad_norm": 8.458257274643904, + "learning_rate": 9.18571604304369e-06, + "loss": 1.059, + "step": 2927 + }, + { + "epoch": 0.42, + "grad_norm": 8.463879595366494, + "learning_rate": 9.185084305970378e-06, + "loss": 0.9642, + "step": 2928 + }, + { + "epoch": 0.42, + "grad_norm": 7.371897146292639, + "learning_rate": 9.18445234567632e-06, + "loss": 1.0095, + "step": 2929 + }, + { + "epoch": 0.42, + "grad_norm": 8.221927730013286, + "learning_rate": 9.18382016219522e-06, + "loss": 1.0771, + "step": 2930 + }, + { + "epoch": 0.42, + "grad_norm": 9.229317726877731, + "learning_rate": 9.183187755560799e-06, + "loss": 1.0597, + "step": 2931 + }, + { + "epoch": 0.42, + "grad_norm": 7.713678263054453, + "learning_rate": 9.182555125806784e-06, + "loss": 1.0078, + "step": 2932 + }, + { + "epoch": 0.42, + "grad_norm": 7.371887298900009, + "learning_rate": 9.181922272966926e-06, + "loss": 1.0233, + "step": 2933 + }, + { + "epoch": 0.42, + "grad_norm": 6.8136527362944905, + "learning_rate": 9.18128919707497e-06, + "loss": 0.9937, + "step": 2934 + }, + { + "epoch": 0.42, + "grad_norm": 10.551868639349184, + "learning_rate": 9.18065589816469e-06, + "loss": 1.0328, + "step": 2935 + }, + { + "epoch": 0.42, + "grad_norm": 8.449064206804502, + "learning_rate": 9.180022376269859e-06, + "loss": 1.0704, + "step": 2936 + }, + { + "epoch": 0.42, + "grad_norm": 8.271016848019752, + "learning_rate": 9.179388631424271e-06, + "loss": 0.9946, + "step": 2937 + }, + { + "epoch": 0.42, + "grad_norm": 6.647196055967439, + "learning_rate": 9.178754663661727e-06, + "loss": 1.0731, + 
"step": 2938 + }, + { + "epoch": 0.42, + "grad_norm": 9.600754178875452, + "learning_rate": 9.17812047301604e-06, + "loss": 1.0705, + "step": 2939 + }, + { + "epoch": 0.42, + "grad_norm": 10.105219216514765, + "learning_rate": 9.177486059521037e-06, + "loss": 1.0316, + "step": 2940 + }, + { + "epoch": 0.42, + "grad_norm": 7.6052585659338705, + "learning_rate": 9.176851423210555e-06, + "loss": 1.0605, + "step": 2941 + }, + { + "epoch": 0.42, + "grad_norm": 9.929464401534828, + "learning_rate": 9.176216564118445e-06, + "loss": 1.0194, + "step": 2942 + }, + { + "epoch": 0.42, + "grad_norm": 6.754149354027648, + "learning_rate": 9.17558148227857e-06, + "loss": 1.0095, + "step": 2943 + }, + { + "epoch": 0.42, + "grad_norm": 6.392704900810029, + "learning_rate": 9.174946177724799e-06, + "loss": 0.9854, + "step": 2944 + }, + { + "epoch": 0.42, + "grad_norm": 9.504411588825485, + "learning_rate": 9.17431065049102e-06, + "loss": 1.0452, + "step": 2945 + }, + { + "epoch": 0.42, + "grad_norm": 11.124356825472745, + "learning_rate": 9.17367490061113e-06, + "loss": 0.9632, + "step": 2946 + }, + { + "epoch": 0.42, + "grad_norm": 7.029115858182446, + "learning_rate": 9.173038928119041e-06, + "loss": 0.9214, + "step": 2947 + }, + { + "epoch": 0.42, + "grad_norm": 10.839382699824995, + "learning_rate": 9.17240273304867e-06, + "loss": 1.0635, + "step": 2948 + }, + { + "epoch": 0.42, + "grad_norm": 7.126362382126733, + "learning_rate": 9.171766315433951e-06, + "loss": 1.0222, + "step": 2949 + }, + { + "epoch": 0.42, + "grad_norm": 6.960475659930672, + "learning_rate": 9.171129675308829e-06, + "loss": 1.0338, + "step": 2950 + }, + { + "epoch": 0.42, + "grad_norm": 8.83549196335587, + "learning_rate": 9.170492812707258e-06, + "loss": 1.1062, + "step": 2951 + }, + { + "epoch": 0.42, + "grad_norm": 7.831000416045291, + "learning_rate": 9.169855727663213e-06, + "loss": 0.9892, + "step": 2952 + }, + { + "epoch": 0.42, + "grad_norm": 8.542058691489792, + "learning_rate": 
9.169218420210668e-06, + "loss": 1.0224, + "step": 2953 + }, + { + "epoch": 0.42, + "grad_norm": 7.696636591238837, + "learning_rate": 9.168580890383618e-06, + "loss": 1.0006, + "step": 2954 + }, + { + "epoch": 0.42, + "grad_norm": 6.835969513290239, + "learning_rate": 9.167943138216068e-06, + "loss": 1.0549, + "step": 2955 + }, + { + "epoch": 0.42, + "grad_norm": 7.388519685789637, + "learning_rate": 9.167305163742031e-06, + "loss": 1.0538, + "step": 2956 + }, + { + "epoch": 0.42, + "grad_norm": 6.963389492237515, + "learning_rate": 9.166666966995537e-06, + "loss": 1.1161, + "step": 2957 + }, + { + "epoch": 0.42, + "grad_norm": 8.427586807263571, + "learning_rate": 9.166028548010623e-06, + "loss": 0.9594, + "step": 2958 + }, + { + "epoch": 0.42, + "grad_norm": 7.024182862743915, + "learning_rate": 9.165389906821346e-06, + "loss": 1.0372, + "step": 2959 + }, + { + "epoch": 0.42, + "grad_norm": 7.155547502307013, + "learning_rate": 9.164751043461764e-06, + "loss": 1.0238, + "step": 2960 + }, + { + "epoch": 0.42, + "grad_norm": 10.139416471692254, + "learning_rate": 9.164111957965953e-06, + "loss": 1.0277, + "step": 2961 + }, + { + "epoch": 0.42, + "grad_norm": 6.873740196066008, + "learning_rate": 9.163472650368002e-06, + "loss": 1.1005, + "step": 2962 + }, + { + "epoch": 0.42, + "grad_norm": 7.978832310390871, + "learning_rate": 9.16283312070201e-06, + "loss": 0.9873, + "step": 2963 + }, + { + "epoch": 0.42, + "grad_norm": 8.157496942498433, + "learning_rate": 9.162193369002086e-06, + "loss": 1.0662, + "step": 2964 + }, + { + "epoch": 0.42, + "grad_norm": 9.905026501833856, + "learning_rate": 9.161553395302352e-06, + "loss": 1.066, + "step": 2965 + }, + { + "epoch": 0.42, + "grad_norm": 8.02577277159656, + "learning_rate": 9.160913199636944e-06, + "loss": 1.0186, + "step": 2966 + }, + { + "epoch": 0.42, + "grad_norm": 9.239447593428258, + "learning_rate": 9.160272782040009e-06, + "loss": 0.9868, + "step": 2967 + }, + { + "epoch": 0.42, + "grad_norm": 
8.589968662422496, + "learning_rate": 9.159632142545703e-06, + "loss": 1.0622, + "step": 2968 + }, + { + "epoch": 0.42, + "grad_norm": 9.373621824014586, + "learning_rate": 9.158991281188198e-06, + "loss": 1.0119, + "step": 2969 + }, + { + "epoch": 0.42, + "grad_norm": 6.921180059721789, + "learning_rate": 9.158350198001675e-06, + "loss": 0.9781, + "step": 2970 + }, + { + "epoch": 0.42, + "grad_norm": 6.116764279069625, + "learning_rate": 9.157708893020325e-06, + "loss": 1.0393, + "step": 2971 + }, + { + "epoch": 0.42, + "grad_norm": 8.752125547607312, + "learning_rate": 9.157067366278357e-06, + "loss": 1.0301, + "step": 2972 + }, + { + "epoch": 0.42, + "grad_norm": 8.501071405189826, + "learning_rate": 9.156425617809988e-06, + "loss": 0.966, + "step": 2973 + }, + { + "epoch": 0.42, + "grad_norm": 7.534789990038464, + "learning_rate": 9.155783647649445e-06, + "loss": 0.9371, + "step": 2974 + }, + { + "epoch": 0.42, + "grad_norm": 6.085620124191985, + "learning_rate": 9.155141455830969e-06, + "loss": 0.9594, + "step": 2975 + }, + { + "epoch": 0.42, + "grad_norm": 10.08118555851295, + "learning_rate": 9.154499042388816e-06, + "loss": 1.0846, + "step": 2976 + }, + { + "epoch": 0.42, + "grad_norm": 7.374919724126967, + "learning_rate": 9.153856407357247e-06, + "loss": 1.0254, + "step": 2977 + }, + { + "epoch": 0.42, + "grad_norm": 9.220959928705073, + "learning_rate": 9.15321355077054e-06, + "loss": 0.9899, + "step": 2978 + }, + { + "epoch": 0.42, + "grad_norm": 8.361120710084249, + "learning_rate": 9.152570472662981e-06, + "loss": 1.0165, + "step": 2979 + }, + { + "epoch": 0.42, + "grad_norm": 8.568824454323476, + "learning_rate": 9.151927173068874e-06, + "loss": 1.0805, + "step": 2980 + }, + { + "epoch": 0.43, + "grad_norm": 8.223014720921713, + "learning_rate": 9.151283652022527e-06, + "loss": 0.9903, + "step": 2981 + }, + { + "epoch": 0.43, + "grad_norm": 9.026039951571942, + "learning_rate": 9.150639909558265e-06, + "loss": 0.9702, + "step": 2982 + }, + { + 
"epoch": 0.43, + "grad_norm": 7.451109103568678, + "learning_rate": 9.149995945710424e-06, + "loss": 0.9772, + "step": 2983 + }, + { + "epoch": 0.43, + "grad_norm": 9.45581295757948, + "learning_rate": 9.149351760513351e-06, + "loss": 1.1214, + "step": 2984 + }, + { + "epoch": 0.43, + "grad_norm": 8.668024135075441, + "learning_rate": 9.148707354001405e-06, + "loss": 1.0007, + "step": 2985 + }, + { + "epoch": 0.43, + "grad_norm": 7.302265454030366, + "learning_rate": 9.148062726208956e-06, + "loss": 1.0445, + "step": 2986 + }, + { + "epoch": 0.43, + "grad_norm": 6.444386760352945, + "learning_rate": 9.14741787717039e-06, + "loss": 1.0135, + "step": 2987 + }, + { + "epoch": 0.43, + "grad_norm": 7.604009164944802, + "learning_rate": 9.146772806920096e-06, + "loss": 1.0612, + "step": 2988 + }, + { + "epoch": 0.43, + "grad_norm": 7.570838397526774, + "learning_rate": 9.146127515492484e-06, + "loss": 1.0227, + "step": 2989 + }, + { + "epoch": 0.43, + "grad_norm": 6.642841732413437, + "learning_rate": 9.145482002921972e-06, + "loss": 1.1483, + "step": 2990 + }, + { + "epoch": 0.43, + "grad_norm": 8.560853339911723, + "learning_rate": 9.144836269242988e-06, + "loss": 1.0045, + "step": 2991 + }, + { + "epoch": 0.43, + "grad_norm": 8.829318765799316, + "learning_rate": 9.144190314489975e-06, + "loss": 1.1568, + "step": 2992 + }, + { + "epoch": 0.43, + "grad_norm": 7.22309113584755, + "learning_rate": 9.143544138697386e-06, + "loss": 1.0053, + "step": 2993 + }, + { + "epoch": 0.43, + "grad_norm": 8.451578872975093, + "learning_rate": 9.142897741899686e-06, + "loss": 1.0766, + "step": 2994 + }, + { + "epoch": 0.43, + "grad_norm": 8.544292784561378, + "learning_rate": 9.142251124131353e-06, + "loss": 1.1363, + "step": 2995 + }, + { + "epoch": 0.43, + "grad_norm": 8.3061657742343, + "learning_rate": 9.141604285426874e-06, + "loss": 1.0146, + "step": 2996 + }, + { + "epoch": 0.43, + "grad_norm": 7.173771182232024, + "learning_rate": 9.14095722582075e-06, + "loss": 1.0698, + 
"step": 2997 + }, + { + "epoch": 0.43, + "grad_norm": 6.416369459287501, + "learning_rate": 9.140309945347496e-06, + "loss": 0.9982, + "step": 2998 + }, + { + "epoch": 0.43, + "grad_norm": 10.514567601657337, + "learning_rate": 9.139662444041633e-06, + "loss": 1.0383, + "step": 2999 + }, + { + "epoch": 0.43, + "grad_norm": 7.7238542393905885, + "learning_rate": 9.139014721937699e-06, + "loss": 1.1364, + "step": 3000 + }, + { + "epoch": 0.43, + "grad_norm": 6.857135589069568, + "learning_rate": 9.13836677907024e-06, + "loss": 1.0351, + "step": 3001 + }, + { + "epoch": 0.43, + "grad_norm": 7.608204588599762, + "learning_rate": 9.137718615473816e-06, + "loss": 1.1135, + "step": 3002 + }, + { + "epoch": 0.43, + "grad_norm": 8.184637146785159, + "learning_rate": 9.137070231183e-06, + "loss": 1.0909, + "step": 3003 + }, + { + "epoch": 0.43, + "grad_norm": 7.8517061938444455, + "learning_rate": 9.136421626232373e-06, + "loss": 1.0433, + "step": 3004 + }, + { + "epoch": 0.43, + "grad_norm": 7.491170217572197, + "learning_rate": 9.135772800656528e-06, + "loss": 1.0415, + "step": 3005 + }, + { + "epoch": 0.43, + "grad_norm": 7.645207949911455, + "learning_rate": 9.135123754490076e-06, + "loss": 0.98, + "step": 3006 + }, + { + "epoch": 0.43, + "grad_norm": 8.253140089884061, + "learning_rate": 9.134474487767634e-06, + "loss": 1.0148, + "step": 3007 + }, + { + "epoch": 0.43, + "grad_norm": 9.504066060860225, + "learning_rate": 9.133825000523828e-06, + "loss": 1.0359, + "step": 3008 + }, + { + "epoch": 0.43, + "grad_norm": 6.128567262447935, + "learning_rate": 9.133175292793305e-06, + "loss": 0.9542, + "step": 3009 + }, + { + "epoch": 0.43, + "grad_norm": 7.524503408287729, + "learning_rate": 9.132525364610715e-06, + "loss": 0.9412, + "step": 3010 + }, + { + "epoch": 0.43, + "grad_norm": 11.075150234962168, + "learning_rate": 9.131875216010728e-06, + "loss": 1.0585, + "step": 3011 + }, + { + "epoch": 0.43, + "grad_norm": 6.820184691338084, + "learning_rate": 
9.131224847028018e-06, + "loss": 1.1093, + "step": 3012 + }, + { + "epoch": 0.43, + "grad_norm": 6.19842791967396, + "learning_rate": 9.130574257697271e-06, + "loss": 1.032, + "step": 3013 + }, + { + "epoch": 0.43, + "grad_norm": 6.449576961331512, + "learning_rate": 9.129923448053194e-06, + "loss": 1.0018, + "step": 3014 + }, + { + "epoch": 0.43, + "grad_norm": 10.24274726399833, + "learning_rate": 9.129272418130492e-06, + "loss": 0.9973, + "step": 3015 + }, + { + "epoch": 0.43, + "grad_norm": 8.47202236427221, + "learning_rate": 9.128621167963898e-06, + "loss": 1.0387, + "step": 3016 + }, + { + "epoch": 0.43, + "grad_norm": 8.566767935342028, + "learning_rate": 9.12796969758814e-06, + "loss": 1.0631, + "step": 3017 + }, + { + "epoch": 0.43, + "grad_norm": 6.356274373860288, + "learning_rate": 9.127318007037969e-06, + "loss": 0.9929, + "step": 3018 + }, + { + "epoch": 0.43, + "grad_norm": 7.540655755500782, + "learning_rate": 9.126666096348143e-06, + "loss": 1.0535, + "step": 3019 + }, + { + "epoch": 0.43, + "grad_norm": 8.120938187870049, + "learning_rate": 9.126013965553435e-06, + "loss": 1.0964, + "step": 3020 + }, + { + "epoch": 0.43, + "grad_norm": 9.669823887158218, + "learning_rate": 9.125361614688627e-06, + "loss": 1.1017, + "step": 3021 + }, + { + "epoch": 0.43, + "grad_norm": 11.960797099573837, + "learning_rate": 9.124709043788514e-06, + "loss": 1.1287, + "step": 3022 + }, + { + "epoch": 0.43, + "grad_norm": 7.480358552704659, + "learning_rate": 9.124056252887901e-06, + "loss": 1.0018, + "step": 3023 + }, + { + "epoch": 0.43, + "grad_norm": 9.435505113852757, + "learning_rate": 9.123403242021607e-06, + "loss": 1.0366, + "step": 3024 + }, + { + "epoch": 0.43, + "grad_norm": 9.441260587202043, + "learning_rate": 9.122750011224462e-06, + "loss": 1.0582, + "step": 3025 + }, + { + "epoch": 0.43, + "grad_norm": 6.549265603254031, + "learning_rate": 9.122096560531306e-06, + "loss": 1.0668, + "step": 3026 + }, + { + "epoch": 0.43, + "grad_norm": 
8.09149872881497, + "learning_rate": 9.121442889976995e-06, + "loss": 1.0597, + "step": 3027 + }, + { + "epoch": 0.43, + "grad_norm": 10.830566411656635, + "learning_rate": 9.120788999596391e-06, + "loss": 1.0549, + "step": 3028 + }, + { + "epoch": 0.43, + "grad_norm": 5.185149097800064, + "learning_rate": 9.120134889424374e-06, + "loss": 1.0021, + "step": 3029 + }, + { + "epoch": 0.43, + "grad_norm": 6.501859579751927, + "learning_rate": 9.11948055949583e-06, + "loss": 1.0034, + "step": 3030 + }, + { + "epoch": 0.43, + "grad_norm": 6.997693936359913, + "learning_rate": 9.118826009845658e-06, + "loss": 1.0141, + "step": 3031 + }, + { + "epoch": 0.43, + "grad_norm": 8.369951151673494, + "learning_rate": 9.118171240508772e-06, + "loss": 0.977, + "step": 3032 + }, + { + "epoch": 0.43, + "grad_norm": 9.824179331289685, + "learning_rate": 9.117516251520095e-06, + "loss": 1.0114, + "step": 3033 + }, + { + "epoch": 0.43, + "grad_norm": 9.999147984166049, + "learning_rate": 9.116861042914562e-06, + "loss": 0.9594, + "step": 3034 + }, + { + "epoch": 0.43, + "grad_norm": 6.79984474112979, + "learning_rate": 9.116205614727122e-06, + "loss": 1.0333, + "step": 3035 + }, + { + "epoch": 0.43, + "grad_norm": 7.372864003416208, + "learning_rate": 9.115549966992732e-06, + "loss": 1.0045, + "step": 3036 + }, + { + "epoch": 0.43, + "grad_norm": 12.051423260938684, + "learning_rate": 9.114894099746361e-06, + "loss": 0.9581, + "step": 3037 + }, + { + "epoch": 0.43, + "grad_norm": 9.17327516953504, + "learning_rate": 9.114238013022993e-06, + "loss": 1.0682, + "step": 3038 + }, + { + "epoch": 0.43, + "grad_norm": 7.229246108889428, + "learning_rate": 9.113581706857622e-06, + "loss": 1.0614, + "step": 3039 + }, + { + "epoch": 0.43, + "grad_norm": 6.061393091466746, + "learning_rate": 9.112925181285253e-06, + "loss": 0.93, + "step": 3040 + }, + { + "epoch": 0.43, + "grad_norm": 6.938785776176641, + "learning_rate": 9.112268436340901e-06, + "loss": 1.0777, + "step": 3041 + }, + { + "epoch": 
0.43, + "grad_norm": 6.759076143264093, + "learning_rate": 9.1116114720596e-06, + "loss": 0.9848, + "step": 3042 + }, + { + "epoch": 0.43, + "grad_norm": 9.244211598659216, + "learning_rate": 9.110954288476386e-06, + "loss": 0.9787, + "step": 3043 + }, + { + "epoch": 0.43, + "grad_norm": 9.438743107297407, + "learning_rate": 9.110296885626315e-06, + "loss": 0.9795, + "step": 3044 + }, + { + "epoch": 0.43, + "grad_norm": 7.971258788860944, + "learning_rate": 9.109639263544447e-06, + "loss": 1.0164, + "step": 3045 + }, + { + "epoch": 0.43, + "grad_norm": 11.95951080024645, + "learning_rate": 9.108981422265862e-06, + "loss": 1.0159, + "step": 3046 + }, + { + "epoch": 0.43, + "grad_norm": 7.038391549210722, + "learning_rate": 9.108323361825645e-06, + "loss": 0.9958, + "step": 3047 + }, + { + "epoch": 0.43, + "grad_norm": 6.0391843606498785, + "learning_rate": 9.107665082258893e-06, + "loss": 1.0042, + "step": 3048 + }, + { + "epoch": 0.43, + "grad_norm": 7.244005767874363, + "learning_rate": 9.107006583600723e-06, + "loss": 1.0305, + "step": 3049 + }, + { + "epoch": 0.43, + "grad_norm": 6.517041005529781, + "learning_rate": 9.106347865886252e-06, + "loss": 0.9809, + "step": 3050 + }, + { + "epoch": 0.44, + "grad_norm": 10.454077416460695, + "learning_rate": 9.105688929150615e-06, + "loss": 1.0448, + "step": 3051 + }, + { + "epoch": 0.44, + "grad_norm": 8.480704560933145, + "learning_rate": 9.10502977342896e-06, + "loss": 1.0787, + "step": 3052 + }, + { + "epoch": 0.44, + "grad_norm": 9.26673228622858, + "learning_rate": 9.104370398756444e-06, + "loss": 0.9138, + "step": 3053 + }, + { + "epoch": 0.44, + "grad_norm": 8.599389112189641, + "learning_rate": 9.103710805168233e-06, + "loss": 0.9792, + "step": 3054 + }, + { + "epoch": 0.44, + "grad_norm": 7.498825173103372, + "learning_rate": 9.103050992699513e-06, + "loss": 1.0242, + "step": 3055 + }, + { + "epoch": 0.44, + "grad_norm": 8.532608325242908, + "learning_rate": 9.102390961385472e-06, + "loss": 1.0485, + "step": 
3056 + }, + { + "epoch": 0.44, + "grad_norm": 6.991358157557537, + "learning_rate": 9.101730711261318e-06, + "loss": 0.9618, + "step": 3057 + }, + { + "epoch": 0.44, + "grad_norm": 8.376563502472571, + "learning_rate": 9.101070242362264e-06, + "loss": 1.029, + "step": 3058 + }, + { + "epoch": 0.44, + "grad_norm": 7.583807468799374, + "learning_rate": 9.100409554723539e-06, + "loss": 1.0974, + "step": 3059 + }, + { + "epoch": 0.44, + "grad_norm": 7.137636113800423, + "learning_rate": 9.099748648380382e-06, + "loss": 1.0367, + "step": 3060 + }, + { + "epoch": 0.44, + "grad_norm": 9.9513245829433, + "learning_rate": 9.099087523368043e-06, + "loss": 1.1006, + "step": 3061 + }, + { + "epoch": 0.44, + "grad_norm": 7.7963141949585735, + "learning_rate": 9.098426179721786e-06, + "loss": 1.0553, + "step": 3062 + }, + { + "epoch": 0.44, + "grad_norm": 8.648581351235139, + "learning_rate": 9.097764617476886e-06, + "loss": 1.0466, + "step": 3063 + }, + { + "epoch": 0.44, + "grad_norm": 8.543181110267973, + "learning_rate": 9.097102836668625e-06, + "loss": 1.0064, + "step": 3064 + }, + { + "epoch": 0.44, + "grad_norm": 11.12577420697675, + "learning_rate": 9.096440837332305e-06, + "loss": 1.0736, + "step": 3065 + }, + { + "epoch": 0.44, + "grad_norm": 7.867257991824657, + "learning_rate": 9.095778619503233e-06, + "loss": 1.0503, + "step": 3066 + }, + { + "epoch": 0.44, + "grad_norm": 8.446103192332531, + "learning_rate": 9.095116183216727e-06, + "loss": 0.9753, + "step": 3067 + }, + { + "epoch": 0.44, + "grad_norm": 7.718857315486387, + "learning_rate": 9.094453528508127e-06, + "loss": 1.0038, + "step": 3068 + }, + { + "epoch": 0.44, + "grad_norm": 7.970751719477389, + "learning_rate": 9.093790655412772e-06, + "loss": 1.0125, + "step": 3069 + }, + { + "epoch": 0.44, + "grad_norm": 10.64014023651484, + "learning_rate": 9.093127563966017e-06, + "loss": 1.0336, + "step": 3070 + }, + { + "epoch": 0.44, + "grad_norm": 5.818615997498979, + "learning_rate": 9.09246425420323e-06, + 
"loss": 1.0952, + "step": 3071 + }, + { + "epoch": 0.44, + "grad_norm": 9.184436174204858, + "learning_rate": 9.091800726159794e-06, + "loss": 1.0036, + "step": 3072 + }, + { + "epoch": 0.44, + "grad_norm": 6.187714987029728, + "learning_rate": 9.091136979871095e-06, + "loss": 1.0322, + "step": 3073 + }, + { + "epoch": 0.44, + "grad_norm": 8.349587609146983, + "learning_rate": 9.090473015372538e-06, + "loss": 0.9638, + "step": 3074 + }, + { + "epoch": 0.44, + "grad_norm": 8.633724145732334, + "learning_rate": 9.089808832699536e-06, + "loss": 1.0581, + "step": 3075 + }, + { + "epoch": 0.44, + "grad_norm": 7.830003580592008, + "learning_rate": 9.089144431887515e-06, + "loss": 0.9409, + "step": 3076 + }, + { + "epoch": 0.44, + "grad_norm": 8.77392751046453, + "learning_rate": 9.088479812971913e-06, + "loss": 1.0386, + "step": 3077 + }, + { + "epoch": 0.44, + "grad_norm": 7.071531386970315, + "learning_rate": 9.087814975988179e-06, + "loss": 1.0749, + "step": 3078 + }, + { + "epoch": 0.44, + "grad_norm": 7.499420298832491, + "learning_rate": 9.08714992097177e-06, + "loss": 1.0136, + "step": 3079 + }, + { + "epoch": 0.44, + "grad_norm": 8.221700021303132, + "learning_rate": 9.086484647958161e-06, + "loss": 1.051, + "step": 3080 + }, + { + "epoch": 0.44, + "grad_norm": 6.7489928002808135, + "learning_rate": 9.085819156982838e-06, + "loss": 1.0485, + "step": 3081 + }, + { + "epoch": 0.44, + "grad_norm": 10.709659556289303, + "learning_rate": 9.085153448081292e-06, + "loss": 1.0916, + "step": 3082 + }, + { + "epoch": 0.44, + "grad_norm": 11.840242200193332, + "learning_rate": 9.084487521289033e-06, + "loss": 1.1782, + "step": 3083 + }, + { + "epoch": 0.44, + "grad_norm": 8.691558088706847, + "learning_rate": 9.083821376641579e-06, + "loss": 1.0813, + "step": 3084 + }, + { + "epoch": 0.44, + "grad_norm": 7.104820785755851, + "learning_rate": 9.083155014174461e-06, + "loss": 1.0569, + "step": 3085 + }, + { + "epoch": 0.44, + "grad_norm": 13.005094297427997, + 
"learning_rate": 9.082488433923217e-06, + "loss": 1.048, + "step": 3086 + }, + { + "epoch": 0.44, + "grad_norm": 6.852111362866203, + "learning_rate": 9.081821635923406e-06, + "loss": 1.0779, + "step": 3087 + }, + { + "epoch": 0.44, + "grad_norm": 7.500169018715065, + "learning_rate": 9.081154620210592e-06, + "loss": 1.0002, + "step": 3088 + }, + { + "epoch": 0.44, + "grad_norm": 8.156156769452943, + "learning_rate": 9.080487386820348e-06, + "loss": 0.987, + "step": 3089 + }, + { + "epoch": 0.44, + "grad_norm": 8.500163197813363, + "learning_rate": 9.079819935788266e-06, + "loss": 1.0301, + "step": 3090 + }, + { + "epoch": 0.44, + "grad_norm": 9.54862276877692, + "learning_rate": 9.079152267149944e-06, + "loss": 0.9781, + "step": 3091 + }, + { + "epoch": 0.44, + "grad_norm": 7.582929180240202, + "learning_rate": 9.078484380940997e-06, + "loss": 1.0078, + "step": 3092 + }, + { + "epoch": 0.44, + "grad_norm": 8.537901506663486, + "learning_rate": 9.077816277197043e-06, + "loss": 0.9716, + "step": 3093 + }, + { + "epoch": 0.44, + "grad_norm": 6.736653751535692, + "learning_rate": 9.07714795595372e-06, + "loss": 1.098, + "step": 3094 + }, + { + "epoch": 0.44, + "grad_norm": 7.126942628482016, + "learning_rate": 9.076479417246676e-06, + "loss": 1.0615, + "step": 3095 + }, + { + "epoch": 0.44, + "grad_norm": 8.30116901609754, + "learning_rate": 9.075810661111564e-06, + "loss": 0.9823, + "step": 3096 + }, + { + "epoch": 0.44, + "grad_norm": 7.345828281294618, + "learning_rate": 9.075141687584056e-06, + "loss": 1.0291, + "step": 3097 + }, + { + "epoch": 0.44, + "grad_norm": 7.629035574487208, + "learning_rate": 9.074472496699837e-06, + "loss": 0.9935, + "step": 3098 + }, + { + "epoch": 0.44, + "grad_norm": 8.224791350482617, + "learning_rate": 9.073803088494595e-06, + "loss": 0.9921, + "step": 3099 + }, + { + "epoch": 0.44, + "grad_norm": 8.284687562604178, + "learning_rate": 9.073133463004035e-06, + "loss": 0.9273, + "step": 3100 + }, + { + "epoch": 0.44, + "grad_norm": 
5.981657414276388, + "learning_rate": 9.072463620263874e-06, + "loss": 0.9865, + "step": 3101 + }, + { + "epoch": 0.44, + "grad_norm": 8.892754744186332, + "learning_rate": 9.07179356030984e-06, + "loss": 0.9125, + "step": 3102 + }, + { + "epoch": 0.44, + "grad_norm": 7.724279983629947, + "learning_rate": 9.07112328317767e-06, + "loss": 1.0882, + "step": 3103 + }, + { + "epoch": 0.44, + "grad_norm": 7.759702053678007, + "learning_rate": 9.070452788903117e-06, + "loss": 1.0541, + "step": 3104 + }, + { + "epoch": 0.44, + "grad_norm": 9.140019896510534, + "learning_rate": 9.069782077521943e-06, + "loss": 0.9898, + "step": 3105 + }, + { + "epoch": 0.44, + "grad_norm": 9.95991691461247, + "learning_rate": 9.069111149069919e-06, + "loss": 1.0996, + "step": 3106 + }, + { + "epoch": 0.44, + "grad_norm": 7.473304004674974, + "learning_rate": 9.068440003582835e-06, + "loss": 1.0673, + "step": 3107 + }, + { + "epoch": 0.44, + "grad_norm": 9.12901608589405, + "learning_rate": 9.067768641096485e-06, + "loss": 0.9936, + "step": 3108 + }, + { + "epoch": 0.44, + "grad_norm": 7.237613703447738, + "learning_rate": 9.067097061646679e-06, + "loss": 1.0424, + "step": 3109 + }, + { + "epoch": 0.44, + "grad_norm": 6.83388787037852, + "learning_rate": 9.066425265269235e-06, + "loss": 1.0463, + "step": 3110 + }, + { + "epoch": 0.44, + "grad_norm": 10.4490992407003, + "learning_rate": 9.065753251999988e-06, + "loss": 1.109, + "step": 3111 + }, + { + "epoch": 0.44, + "grad_norm": 8.005806649777623, + "learning_rate": 9.065081021874779e-06, + "loss": 1.0324, + "step": 3112 + }, + { + "epoch": 0.44, + "grad_norm": 6.9191965662872, + "learning_rate": 9.064408574929464e-06, + "loss": 1.0654, + "step": 3113 + }, + { + "epoch": 0.44, + "grad_norm": 8.432032193557545, + "learning_rate": 9.063735911199908e-06, + "loss": 1.0415, + "step": 3114 + }, + { + "epoch": 0.44, + "grad_norm": 7.9498497283799745, + "learning_rate": 9.06306303072199e-06, + "loss": 1.0032, + "step": 3115 + }, + { + "epoch": 
0.44, + "grad_norm": 8.785408670149339, + "learning_rate": 9.062389933531601e-06, + "loss": 1.0872, + "step": 3116 + }, + { + "epoch": 0.44, + "grad_norm": 9.113797820902148, + "learning_rate": 9.061716619664639e-06, + "loss": 0.99, + "step": 3117 + }, + { + "epoch": 0.44, + "grad_norm": 7.308205629648933, + "learning_rate": 9.061043089157019e-06, + "loss": 1.0846, + "step": 3118 + }, + { + "epoch": 0.44, + "grad_norm": 9.449139168642915, + "learning_rate": 9.060369342044666e-06, + "loss": 1.0771, + "step": 3119 + }, + { + "epoch": 0.44, + "grad_norm": 7.2354058144861115, + "learning_rate": 9.05969537836351e-06, + "loss": 1.0127, + "step": 3120 + }, + { + "epoch": 0.45, + "grad_norm": 9.001751075466304, + "learning_rate": 9.059021198149507e-06, + "loss": 1.0476, + "step": 3121 + }, + { + "epoch": 0.45, + "grad_norm": 8.613058436353088, + "learning_rate": 9.05834680143861e-06, + "loss": 1.0464, + "step": 3122 + }, + { + "epoch": 0.45, + "grad_norm": 10.349535670585379, + "learning_rate": 9.057672188266793e-06, + "loss": 1.082, + "step": 3123 + }, + { + "epoch": 0.45, + "grad_norm": 8.125326622267522, + "learning_rate": 9.056997358670035e-06, + "loss": 1.0615, + "step": 3124 + }, + { + "epoch": 0.45, + "grad_norm": 8.858867475887129, + "learning_rate": 9.05632231268433e-06, + "loss": 1.0146, + "step": 3125 + }, + { + "epoch": 0.45, + "grad_norm": 6.825716748442939, + "learning_rate": 9.055647050345684e-06, + "loss": 1.0459, + "step": 3126 + }, + { + "epoch": 0.45, + "grad_norm": 8.314140824883006, + "learning_rate": 9.054971571690115e-06, + "loss": 1.0478, + "step": 3127 + }, + { + "epoch": 0.45, + "grad_norm": 8.183438727498388, + "learning_rate": 9.054295876753646e-06, + "loss": 1.0851, + "step": 3128 + }, + { + "epoch": 0.45, + "grad_norm": 7.624221047286588, + "learning_rate": 9.053619965572323e-06, + "loss": 1.0104, + "step": 3129 + }, + { + "epoch": 0.45, + "grad_norm": 7.9633891079755035, + "learning_rate": 9.052943838182194e-06, + "loss": 1.0473, + "step": 
3130 + }, + { + "epoch": 0.45, + "grad_norm": 8.211024309147973, + "learning_rate": 9.05226749461932e-06, + "loss": 1.0291, + "step": 3131 + }, + { + "epoch": 0.45, + "grad_norm": 7.003929126511194, + "learning_rate": 9.051590934919779e-06, + "loss": 1.0349, + "step": 3132 + }, + { + "epoch": 0.45, + "grad_norm": 8.906995246299386, + "learning_rate": 9.050914159119656e-06, + "loss": 1.0278, + "step": 3133 + }, + { + "epoch": 0.45, + "grad_norm": 9.252280766541174, + "learning_rate": 9.050237167255044e-06, + "loss": 1.0444, + "step": 3134 + }, + { + "epoch": 0.45, + "grad_norm": 7.289410598864617, + "learning_rate": 9.049559959362059e-06, + "loss": 1.0438, + "step": 3135 + }, + { + "epoch": 0.45, + "grad_norm": 6.149706605995033, + "learning_rate": 9.048882535476817e-06, + "loss": 1.124, + "step": 3136 + }, + { + "epoch": 0.45, + "grad_norm": 9.346616734769086, + "learning_rate": 9.04820489563545e-06, + "loss": 1.092, + "step": 3137 + }, + { + "epoch": 0.45, + "grad_norm": 4.873633059392164, + "learning_rate": 9.047527039874105e-06, + "loss": 1.0549, + "step": 3138 + }, + { + "epoch": 0.45, + "grad_norm": 5.9607838325286435, + "learning_rate": 9.04684896822893e-06, + "loss": 0.9858, + "step": 3139 + }, + { + "epoch": 0.45, + "grad_norm": 8.257059502190383, + "learning_rate": 9.046170680736096e-06, + "loss": 0.9901, + "step": 3140 + }, + { + "epoch": 0.45, + "grad_norm": 6.643907952200108, + "learning_rate": 9.045492177431783e-06, + "loss": 1.0137, + "step": 3141 + }, + { + "epoch": 0.45, + "grad_norm": 10.091695259539462, + "learning_rate": 9.044813458352178e-06, + "loss": 1.0038, + "step": 3142 + }, + { + "epoch": 0.45, + "grad_norm": 5.976984097269902, + "learning_rate": 9.04413452353348e-06, + "loss": 0.9856, + "step": 3143 + }, + { + "epoch": 0.45, + "grad_norm": 8.686039059175602, + "learning_rate": 9.043455373011903e-06, + "loss": 1.0388, + "step": 3144 + }, + { + "epoch": 0.45, + "grad_norm": 7.678681282559029, + "learning_rate": 9.042776006823672e-06, + 
"loss": 1.0811, + "step": 3145 + }, + { + "epoch": 0.45, + "grad_norm": 7.673649469147897, + "learning_rate": 9.042096425005024e-06, + "loss": 1.0431, + "step": 3146 + }, + { + "epoch": 0.45, + "grad_norm": 5.701779898677095, + "learning_rate": 9.041416627592201e-06, + "loss": 1.0129, + "step": 3147 + }, + { + "epoch": 0.45, + "grad_norm": 10.953981629715399, + "learning_rate": 9.040736614621467e-06, + "loss": 1.0309, + "step": 3148 + }, + { + "epoch": 0.45, + "grad_norm": 7.226468020736554, + "learning_rate": 9.040056386129088e-06, + "loss": 0.9949, + "step": 3149 + }, + { + "epoch": 0.45, + "grad_norm": 8.469921076853003, + "learning_rate": 9.039375942151347e-06, + "loss": 1.0633, + "step": 3150 + }, + { + "epoch": 0.45, + "grad_norm": 7.245263652338826, + "learning_rate": 9.038695282724536e-06, + "loss": 0.9772, + "step": 3151 + }, + { + "epoch": 0.45, + "grad_norm": 7.799842363857953, + "learning_rate": 9.038014407884962e-06, + "loss": 1.0566, + "step": 3152 + }, + { + "epoch": 0.45, + "grad_norm": 9.883956900496202, + "learning_rate": 9.037333317668939e-06, + "loss": 0.9541, + "step": 3153 + }, + { + "epoch": 0.45, + "grad_norm": 7.998132297027823, + "learning_rate": 9.036652012112796e-06, + "loss": 1.0543, + "step": 3154 + }, + { + "epoch": 0.45, + "grad_norm": 7.878152942595799, + "learning_rate": 9.03597049125287e-06, + "loss": 1.0696, + "step": 3155 + }, + { + "epoch": 0.45, + "grad_norm": 7.979684195837658, + "learning_rate": 9.03528875512551e-06, + "loss": 1.0071, + "step": 3156 + }, + { + "epoch": 0.45, + "grad_norm": 7.486736338338476, + "learning_rate": 9.034606803767082e-06, + "loss": 1.0433, + "step": 3157 + }, + { + "epoch": 0.45, + "grad_norm": 5.79455161994075, + "learning_rate": 9.033924637213957e-06, + "loss": 0.9963, + "step": 3158 + }, + { + "epoch": 0.45, + "grad_norm": 9.531451195987039, + "learning_rate": 9.033242255502522e-06, + "loss": 0.9215, + "step": 3159 + }, + { + "epoch": 0.45, + "grad_norm": 7.543695601169872, + "learning_rate": 
9.03255965866917e-06, + "loss": 1.0317, + "step": 3160 + }, + { + "epoch": 0.45, + "grad_norm": 7.561309938446297, + "learning_rate": 9.031876846750311e-06, + "loss": 0.9597, + "step": 3161 + }, + { + "epoch": 0.45, + "grad_norm": 8.143258207813723, + "learning_rate": 9.031193819782364e-06, + "loss": 0.9448, + "step": 3162 + }, + { + "epoch": 0.45, + "grad_norm": 6.794624849457138, + "learning_rate": 9.030510577801758e-06, + "loss": 0.9951, + "step": 3163 + }, + { + "epoch": 0.45, + "grad_norm": 6.713332739651579, + "learning_rate": 9.029827120844939e-06, + "loss": 0.9526, + "step": 3164 + }, + { + "epoch": 0.45, + "grad_norm": 7.9620303402971375, + "learning_rate": 9.029143448948357e-06, + "loss": 1.1347, + "step": 3165 + }, + { + "epoch": 0.45, + "grad_norm": 5.585665866255266, + "learning_rate": 9.028459562148479e-06, + "loss": 0.9819, + "step": 3166 + }, + { + "epoch": 0.45, + "grad_norm": 7.644201586391027, + "learning_rate": 9.027775460481781e-06, + "loss": 1.0537, + "step": 3167 + }, + { + "epoch": 0.45, + "grad_norm": 9.87043713726975, + "learning_rate": 9.027091143984751e-06, + "loss": 1.025, + "step": 3168 + }, + { + "epoch": 0.45, + "grad_norm": 7.747249959192116, + "learning_rate": 9.02640661269389e-06, + "loss": 1.0976, + "step": 3169 + }, + { + "epoch": 0.45, + "grad_norm": 7.2277917928246795, + "learning_rate": 9.025721866645707e-06, + "loss": 1.0864, + "step": 3170 + }, + { + "epoch": 0.45, + "grad_norm": 6.897834033134975, + "learning_rate": 9.025036905876727e-06, + "loss": 1.0771, + "step": 3171 + }, + { + "epoch": 0.45, + "grad_norm": 8.297749159937984, + "learning_rate": 9.024351730423479e-06, + "loss": 1.1333, + "step": 3172 + }, + { + "epoch": 0.45, + "grad_norm": 9.451797553697148, + "learning_rate": 9.023666340322514e-06, + "loss": 1.0855, + "step": 3173 + }, + { + "epoch": 0.45, + "grad_norm": 8.603860050324728, + "learning_rate": 9.022980735610386e-06, + "loss": 1.0306, + "step": 3174 + }, + { + "epoch": 0.45, + "grad_norm": 
8.085029766950283, + "learning_rate": 9.022294916323665e-06, + "loss": 0.9769, + "step": 3175 + }, + { + "epoch": 0.45, + "grad_norm": 8.531918453385325, + "learning_rate": 9.021608882498927e-06, + "loss": 1.0765, + "step": 3176 + }, + { + "epoch": 0.45, + "grad_norm": 8.164005385160864, + "learning_rate": 9.020922634172767e-06, + "loss": 1.1037, + "step": 3177 + }, + { + "epoch": 0.45, + "grad_norm": 9.850744122324434, + "learning_rate": 9.020236171381785e-06, + "loss": 1.0191, + "step": 3178 + }, + { + "epoch": 0.45, + "grad_norm": 7.906118762654024, + "learning_rate": 9.019549494162596e-06, + "loss": 1.0418, + "step": 3179 + }, + { + "epoch": 0.45, + "grad_norm": 6.497309693495392, + "learning_rate": 9.018862602551826e-06, + "loss": 0.9756, + "step": 3180 + }, + { + "epoch": 0.45, + "grad_norm": 7.692904353032056, + "learning_rate": 9.01817549658611e-06, + "loss": 1.0559, + "step": 3181 + }, + { + "epoch": 0.45, + "grad_norm": 6.978014575097933, + "learning_rate": 9.017488176302098e-06, + "loss": 1.0339, + "step": 3182 + }, + { + "epoch": 0.45, + "grad_norm": 7.496625697773652, + "learning_rate": 9.016800641736452e-06, + "loss": 1.0518, + "step": 3183 + }, + { + "epoch": 0.45, + "grad_norm": 9.865780953178644, + "learning_rate": 9.016112892925837e-06, + "loss": 1.0999, + "step": 3184 + }, + { + "epoch": 0.45, + "grad_norm": 8.2554311766773, + "learning_rate": 9.015424929906941e-06, + "loss": 1.0506, + "step": 3185 + }, + { + "epoch": 0.45, + "grad_norm": 10.16817742800201, + "learning_rate": 9.014736752716457e-06, + "loss": 1.0415, + "step": 3186 + }, + { + "epoch": 0.45, + "grad_norm": 9.364280748884978, + "learning_rate": 9.014048361391088e-06, + "loss": 0.9176, + "step": 3187 + }, + { + "epoch": 0.45, + "grad_norm": 7.866238783830587, + "learning_rate": 9.013359755967554e-06, + "loss": 0.9951, + "step": 3188 + }, + { + "epoch": 0.45, + "grad_norm": 6.3512439984073, + "learning_rate": 9.012670936482582e-06, + "loss": 0.9995, + "step": 3189 + }, + { + "epoch": 
0.45, + "grad_norm": 8.482120811955827, + "learning_rate": 9.01198190297291e-06, + "loss": 1.1322, + "step": 3190 + }, + { + "epoch": 0.46, + "grad_norm": 8.067216713886621, + "learning_rate": 9.011292655475292e-06, + "loss": 1.0422, + "step": 3191 + }, + { + "epoch": 0.46, + "grad_norm": 8.753752852824537, + "learning_rate": 9.01060319402649e-06, + "loss": 1.0013, + "step": 3192 + }, + { + "epoch": 0.46, + "grad_norm": 9.586351850518493, + "learning_rate": 9.009913518663275e-06, + "loss": 0.9844, + "step": 3193 + }, + { + "epoch": 0.46, + "grad_norm": 8.312920848696175, + "learning_rate": 9.009223629422437e-06, + "loss": 1.0037, + "step": 3194 + }, + { + "epoch": 0.46, + "grad_norm": 5.727951991256535, + "learning_rate": 9.008533526340768e-06, + "loss": 1.0567, + "step": 3195 + }, + { + "epoch": 0.46, + "grad_norm": 7.529276404124138, + "learning_rate": 9.007843209455081e-06, + "loss": 1.0495, + "step": 3196 + }, + { + "epoch": 0.46, + "grad_norm": 7.589321963050307, + "learning_rate": 9.007152678802191e-06, + "loss": 1.0655, + "step": 3197 + }, + { + "epoch": 0.46, + "grad_norm": 7.4504233241223075, + "learning_rate": 9.006461934418932e-06, + "loss": 1.0454, + "step": 3198 + }, + { + "epoch": 0.46, + "grad_norm": 7.168085051758832, + "learning_rate": 9.005770976342145e-06, + "loss": 1.0557, + "step": 3199 + }, + { + "epoch": 0.46, + "grad_norm": 8.218218659818266, + "learning_rate": 9.005079804608686e-06, + "loss": 1.0028, + "step": 3200 + }, + { + "epoch": 0.46, + "grad_norm": 7.467205204989653, + "learning_rate": 9.004388419255415e-06, + "loss": 1.0272, + "step": 3201 + }, + { + "epoch": 0.46, + "grad_norm": 7.4269906210117735, + "learning_rate": 9.003696820319214e-06, + "loss": 0.9654, + "step": 3202 + }, + { + "epoch": 0.46, + "grad_norm": 5.645274652873678, + "learning_rate": 9.00300500783697e-06, + "loss": 1.0289, + "step": 3203 + }, + { + "epoch": 0.46, + "grad_norm": 6.139470100230324, + "learning_rate": 9.002312981845581e-06, + "loss": 1.0353, + "step": 
3204 + }, + { + "epoch": 0.46, + "grad_norm": 8.88731765974257, + "learning_rate": 9.001620742381957e-06, + "loss": 1.0482, + "step": 3205 + }, + { + "epoch": 0.46, + "grad_norm": 8.865573110122531, + "learning_rate": 9.000928289483022e-06, + "loss": 1.0637, + "step": 3206 + }, + { + "epoch": 0.46, + "grad_norm": 7.116735817396368, + "learning_rate": 9.000235623185707e-06, + "loss": 1.1507, + "step": 3207 + }, + { + "epoch": 0.46, + "grad_norm": 8.761285752818718, + "learning_rate": 8.999542743526962e-06, + "loss": 1.0254, + "step": 3208 + }, + { + "epoch": 0.46, + "grad_norm": 5.151010728792429, + "learning_rate": 8.998849650543736e-06, + "loss": 1.0924, + "step": 3209 + }, + { + "epoch": 0.46, + "grad_norm": 9.786767049362293, + "learning_rate": 8.998156344273002e-06, + "loss": 1.0191, + "step": 3210 + }, + { + "epoch": 0.46, + "grad_norm": 9.700622694466912, + "learning_rate": 8.997462824751738e-06, + "loss": 1.0816, + "step": 3211 + }, + { + "epoch": 0.46, + "grad_norm": 6.941960507085939, + "learning_rate": 8.996769092016935e-06, + "loss": 0.918, + "step": 3212 + }, + { + "epoch": 0.46, + "grad_norm": 6.013663262103304, + "learning_rate": 8.996075146105592e-06, + "loss": 0.9937, + "step": 3213 + }, + { + "epoch": 0.46, + "grad_norm": 9.922633191022376, + "learning_rate": 8.995380987054724e-06, + "loss": 1.0113, + "step": 3214 + }, + { + "epoch": 0.46, + "grad_norm": 6.456258377268351, + "learning_rate": 8.994686614901356e-06, + "loss": 0.9643, + "step": 3215 + }, + { + "epoch": 0.46, + "grad_norm": 6.694356288077295, + "learning_rate": 8.993992029682523e-06, + "loss": 1.0701, + "step": 3216 + }, + { + "epoch": 0.46, + "grad_norm": 7.810722768323431, + "learning_rate": 8.993297231435272e-06, + "loss": 1.0684, + "step": 3217 + }, + { + "epoch": 0.46, + "grad_norm": 8.67027801261493, + "learning_rate": 8.992602220196662e-06, + "loss": 0.9677, + "step": 3218 + }, + { + "epoch": 0.46, + "grad_norm": 7.766014987775624, + "learning_rate": 8.991906996003765e-06, + 
"loss": 1.0397, + "step": 3219 + }, + { + "epoch": 0.46, + "grad_norm": 9.24876099214665, + "learning_rate": 8.991211558893658e-06, + "loss": 1.0635, + "step": 3220 + }, + { + "epoch": 0.46, + "grad_norm": 7.059750792892453, + "learning_rate": 8.990515908903436e-06, + "loss": 0.9685, + "step": 3221 + }, + { + "epoch": 0.46, + "grad_norm": 8.464469333039059, + "learning_rate": 8.989820046070206e-06, + "loss": 1.0592, + "step": 3222 + }, + { + "epoch": 0.46, + "grad_norm": 6.383799245886626, + "learning_rate": 8.989123970431077e-06, + "loss": 1.0152, + "step": 3223 + }, + { + "epoch": 0.46, + "grad_norm": 10.890147817485712, + "learning_rate": 8.988427682023181e-06, + "loss": 1.0978, + "step": 3224 + }, + { + "epoch": 0.46, + "grad_norm": 7.581531758538741, + "learning_rate": 8.987731180883654e-06, + "loss": 1.0138, + "step": 3225 + }, + { + "epoch": 0.46, + "grad_norm": 7.50839245913826, + "learning_rate": 8.987034467049647e-06, + "loss": 0.9947, + "step": 3226 + }, + { + "epoch": 0.46, + "grad_norm": 8.703298086575739, + "learning_rate": 8.98633754055832e-06, + "loss": 0.9874, + "step": 3227 + }, + { + "epoch": 0.46, + "grad_norm": 7.377543519673175, + "learning_rate": 8.98564040144684e-06, + "loss": 1.0473, + "step": 3228 + }, + { + "epoch": 0.46, + "grad_norm": 8.282365654298168, + "learning_rate": 8.9849430497524e-06, + "loss": 0.9798, + "step": 3229 + }, + { + "epoch": 0.46, + "grad_norm": 6.321959546764116, + "learning_rate": 8.984245485512186e-06, + "loss": 1.0362, + "step": 3230 + }, + { + "epoch": 0.46, + "grad_norm": 7.195615589569339, + "learning_rate": 8.98354770876341e-06, + "loss": 1.0277, + "step": 3231 + }, + { + "epoch": 0.46, + "grad_norm": 7.4128587158352275, + "learning_rate": 8.98284971954329e-06, + "loss": 1.0315, + "step": 3232 + }, + { + "epoch": 0.46, + "grad_norm": 7.859049458017136, + "learning_rate": 8.982151517889049e-06, + "loss": 1.0218, + "step": 3233 + }, + { + "epoch": 0.46, + "grad_norm": 6.520013789177126, + "learning_rate": 
8.981453103837932e-06, + "loss": 0.9259, + "step": 3234 + }, + { + "epoch": 0.46, + "grad_norm": 8.348002104643962, + "learning_rate": 8.980754477427188e-06, + "loss": 1.1056, + "step": 3235 + }, + { + "epoch": 0.46, + "grad_norm": 6.43872238296642, + "learning_rate": 8.98005563869408e-06, + "loss": 1.0166, + "step": 3236 + }, + { + "epoch": 0.46, + "grad_norm": 8.877610730779464, + "learning_rate": 8.979356587675884e-06, + "loss": 0.9637, + "step": 3237 + }, + { + "epoch": 0.46, + "grad_norm": 6.521871862325565, + "learning_rate": 8.978657324409883e-06, + "loss": 1.0158, + "step": 3238 + }, + { + "epoch": 0.46, + "grad_norm": 9.55704371363944, + "learning_rate": 8.977957848933376e-06, + "loss": 0.9922, + "step": 3239 + }, + { + "epoch": 0.46, + "grad_norm": 5.757898126227358, + "learning_rate": 8.97725816128367e-06, + "loss": 1.0145, + "step": 3240 + }, + { + "epoch": 0.46, + "grad_norm": 7.2453768210662055, + "learning_rate": 8.976558261498085e-06, + "loss": 1.0047, + "step": 3241 + }, + { + "epoch": 0.46, + "grad_norm": 8.660573678300269, + "learning_rate": 8.97585814961395e-06, + "loss": 1.0323, + "step": 3242 + }, + { + "epoch": 0.46, + "grad_norm": 5.536634949502295, + "learning_rate": 8.975157825668607e-06, + "loss": 1.0184, + "step": 3243 + }, + { + "epoch": 0.46, + "grad_norm": 7.172302651460712, + "learning_rate": 8.974457289699414e-06, + "loss": 1.0595, + "step": 3244 + }, + { + "epoch": 0.46, + "grad_norm": 8.015479876212035, + "learning_rate": 8.97375654174373e-06, + "loss": 1.0661, + "step": 3245 + }, + { + "epoch": 0.46, + "grad_norm": 6.961025621497804, + "learning_rate": 8.973055581838933e-06, + "loss": 1.0394, + "step": 3246 + }, + { + "epoch": 0.46, + "grad_norm": 8.681211574164264, + "learning_rate": 8.972354410022412e-06, + "loss": 1.0665, + "step": 3247 + }, + { + "epoch": 0.46, + "grad_norm": 6.087340528002642, + "learning_rate": 8.971653026331561e-06, + "loss": 1.1132, + "step": 3248 + }, + { + "epoch": 0.46, + "grad_norm": 
6.966952689382223, + "learning_rate": 8.970951430803797e-06, + "loss": 1.0323, + "step": 3249 + }, + { + "epoch": 0.46, + "grad_norm": 9.692811286833615, + "learning_rate": 8.970249623476532e-06, + "loss": 1.0655, + "step": 3250 + }, + { + "epoch": 0.46, + "grad_norm": 7.352090020422443, + "learning_rate": 8.969547604387206e-06, + "loss": 1.0274, + "step": 3251 + }, + { + "epoch": 0.46, + "grad_norm": 7.0842106632298005, + "learning_rate": 8.96884537357326e-06, + "loss": 1.0503, + "step": 3252 + }, + { + "epoch": 0.46, + "grad_norm": 7.648740155081349, + "learning_rate": 8.968142931072149e-06, + "loss": 1.0283, + "step": 3253 + }, + { + "epoch": 0.46, + "grad_norm": 7.346583442141222, + "learning_rate": 8.96744027692134e-06, + "loss": 0.9978, + "step": 3254 + }, + { + "epoch": 0.46, + "grad_norm": 9.941163508057416, + "learning_rate": 8.966737411158308e-06, + "loss": 0.973, + "step": 3255 + }, + { + "epoch": 0.46, + "grad_norm": 5.175420395748287, + "learning_rate": 8.966034333820546e-06, + "loss": 1.0971, + "step": 3256 + }, + { + "epoch": 0.46, + "grad_norm": 6.795561469262742, + "learning_rate": 8.96533104494555e-06, + "loss": 0.9632, + "step": 3257 + }, + { + "epoch": 0.46, + "grad_norm": 7.334327609132853, + "learning_rate": 8.964627544570837e-06, + "loss": 1.0276, + "step": 3258 + }, + { + "epoch": 0.46, + "grad_norm": 7.72593041522237, + "learning_rate": 8.963923832733925e-06, + "loss": 1.0855, + "step": 3259 + }, + { + "epoch": 0.46, + "grad_norm": 8.609103414163563, + "learning_rate": 8.96321990947235e-06, + "loss": 0.9755, + "step": 3260 + }, + { + "epoch": 0.47, + "grad_norm": 8.645582303981024, + "learning_rate": 8.962515774823655e-06, + "loss": 1.1051, + "step": 3261 + }, + { + "epoch": 0.47, + "grad_norm": 7.331390040484553, + "learning_rate": 8.961811428825398e-06, + "loss": 1.0575, + "step": 3262 + }, + { + "epoch": 0.47, + "grad_norm": 7.158238572794665, + "learning_rate": 8.961106871515148e-06, + "loss": 1.054, + "step": 3263 + }, + { + "epoch": 
0.47, + "grad_norm": 7.414432798535946, + "learning_rate": 8.960402102930483e-06, + "loss": 1.0459, + "step": 3264 + }, + { + "epoch": 0.47, + "grad_norm": 8.29271455690264, + "learning_rate": 8.959697123108995e-06, + "loss": 1.0283, + "step": 3265 + }, + { + "epoch": 0.47, + "grad_norm": 8.974050276011312, + "learning_rate": 8.958991932088283e-06, + "loss": 1.0018, + "step": 3266 + }, + { + "epoch": 0.47, + "grad_norm": 8.666984232207126, + "learning_rate": 8.958286529905964e-06, + "loss": 1.0908, + "step": 3267 + }, + { + "epoch": 0.47, + "grad_norm": 6.904472418235228, + "learning_rate": 8.957580916599656e-06, + "loss": 1.08, + "step": 3268 + }, + { + "epoch": 0.47, + "grad_norm": 7.316563109015683, + "learning_rate": 8.956875092207e-06, + "loss": 0.9762, + "step": 3269 + }, + { + "epoch": 0.47, + "grad_norm": 7.904657520464246, + "learning_rate": 8.956169056765638e-06, + "loss": 1.0842, + "step": 3270 + }, + { + "epoch": 0.47, + "grad_norm": 10.03077680583686, + "learning_rate": 8.955462810313234e-06, + "loss": 0.9283, + "step": 3271 + }, + { + "epoch": 0.47, + "grad_norm": 8.181961059940248, + "learning_rate": 8.954756352887452e-06, + "loss": 1.0208, + "step": 3272 + }, + { + "epoch": 0.47, + "grad_norm": 7.166964246763246, + "learning_rate": 8.954049684525973e-06, + "loss": 0.947, + "step": 3273 + }, + { + "epoch": 0.47, + "grad_norm": 7.77679829280958, + "learning_rate": 8.953342805266492e-06, + "loss": 0.9704, + "step": 3274 + }, + { + "epoch": 0.47, + "grad_norm": 6.946909148039117, + "learning_rate": 8.95263571514671e-06, + "loss": 0.9716, + "step": 3275 + }, + { + "epoch": 0.47, + "grad_norm": 11.2767423452254, + "learning_rate": 8.95192841420434e-06, + "loss": 0.9688, + "step": 3276 + }, + { + "epoch": 0.47, + "grad_norm": 7.742901960000571, + "learning_rate": 8.95122090247711e-06, + "loss": 1.0485, + "step": 3277 + }, + { + "epoch": 0.47, + "grad_norm": 6.646582350485537, + "learning_rate": 8.950513180002754e-06, + "loss": 1.0084, + "step": 3278 + }, + 
{ + "epoch": 0.47, + "grad_norm": 7.777647965543435, + "learning_rate": 8.949805246819022e-06, + "loss": 0.9712, + "step": 3279 + }, + { + "epoch": 0.47, + "grad_norm": 7.494975441491656, + "learning_rate": 8.949097102963674e-06, + "loss": 1.0323, + "step": 3280 + }, + { + "epoch": 0.47, + "grad_norm": 8.478258988210047, + "learning_rate": 8.948388748474477e-06, + "loss": 0.9841, + "step": 3281 + }, + { + "epoch": 0.47, + "grad_norm": 7.2717815748478305, + "learning_rate": 8.947680183389214e-06, + "loss": 1.0786, + "step": 3282 + }, + { + "epoch": 0.47, + "grad_norm": 9.210562293423024, + "learning_rate": 8.946971407745679e-06, + "loss": 0.9552, + "step": 3283 + }, + { + "epoch": 0.47, + "grad_norm": 7.706011383897647, + "learning_rate": 8.946262421581675e-06, + "loss": 1.0537, + "step": 3284 + }, + { + "epoch": 0.47, + "grad_norm": 7.783453662735134, + "learning_rate": 8.94555322493502e-06, + "loss": 1.0377, + "step": 3285 + }, + { + "epoch": 0.47, + "grad_norm": 5.417126121542819, + "learning_rate": 8.944843817843537e-06, + "loss": 1.0305, + "step": 3286 + }, + { + "epoch": 0.47, + "grad_norm": 8.416288860442716, + "learning_rate": 8.944134200345065e-06, + "loss": 1.0492, + "step": 3287 + }, + { + "epoch": 0.47, + "grad_norm": 7.481079660711138, + "learning_rate": 8.943424372477455e-06, + "loss": 0.9547, + "step": 3288 + }, + { + "epoch": 0.47, + "grad_norm": 8.42279462155846, + "learning_rate": 8.942714334278564e-06, + "loss": 1.0178, + "step": 3289 + }, + { + "epoch": 0.47, + "grad_norm": 8.970633127377077, + "learning_rate": 8.942004085786266e-06, + "loss": 1.0056, + "step": 3290 + }, + { + "epoch": 0.47, + "grad_norm": 8.122715764808234, + "learning_rate": 8.941293627038443e-06, + "loss": 1.0564, + "step": 3291 + }, + { + "epoch": 0.47, + "grad_norm": 10.315549878203257, + "learning_rate": 8.940582958072988e-06, + "loss": 1.0235, + "step": 3292 + }, + { + "epoch": 0.47, + "grad_norm": 7.5420950711373544, + "learning_rate": 8.939872078927806e-06, + "loss": 
1.1148, + "step": 3293 + }, + { + "epoch": 0.47, + "grad_norm": 7.462460300042594, + "learning_rate": 8.939160989640813e-06, + "loss": 0.9737, + "step": 3294 + }, + { + "epoch": 0.47, + "grad_norm": 8.633268943916915, + "learning_rate": 8.93844969024994e-06, + "loss": 1.0785, + "step": 3295 + }, + { + "epoch": 0.47, + "grad_norm": 7.477115643822865, + "learning_rate": 8.937738180793123e-06, + "loss": 1.0133, + "step": 3296 + }, + { + "epoch": 0.47, + "grad_norm": 7.747188821533474, + "learning_rate": 8.937026461308311e-06, + "loss": 0.9429, + "step": 3297 + }, + { + "epoch": 0.47, + "grad_norm": 7.670011499955937, + "learning_rate": 8.936314531833467e-06, + "loss": 1.0466, + "step": 3298 + }, + { + "epoch": 0.47, + "grad_norm": 8.593308065894899, + "learning_rate": 8.935602392406565e-06, + "loss": 0.9545, + "step": 3299 + }, + { + "epoch": 0.47, + "grad_norm": 8.707378128418446, + "learning_rate": 8.934890043065583e-06, + "loss": 1.1148, + "step": 3300 + }, + { + "epoch": 0.47, + "grad_norm": 9.834937100010606, + "learning_rate": 8.934177483848518e-06, + "loss": 1.0191, + "step": 3301 + }, + { + "epoch": 0.47, + "grad_norm": 7.885512299626496, + "learning_rate": 8.93346471479338e-06, + "loss": 1.0065, + "step": 3302 + }, + { + "epoch": 0.47, + "grad_norm": 4.916051384163887, + "learning_rate": 8.93275173593818e-06, + "loss": 1.0015, + "step": 3303 + }, + { + "epoch": 0.47, + "grad_norm": 6.662985216992582, + "learning_rate": 8.932038547320953e-06, + "loss": 1.0299, + "step": 3304 + }, + { + "epoch": 0.47, + "grad_norm": 8.676609892294046, + "learning_rate": 8.931325148979733e-06, + "loss": 0.9885, + "step": 3305 + }, + { + "epoch": 0.47, + "grad_norm": 6.034500073330018, + "learning_rate": 8.930611540952571e-06, + "loss": 1.0258, + "step": 3306 + }, + { + "epoch": 0.47, + "grad_norm": 10.105960133709155, + "learning_rate": 8.929897723277531e-06, + "loss": 1.025, + "step": 3307 + }, + { + "epoch": 0.47, + "grad_norm": 7.474575108838838, + "learning_rate": 
8.929183695992688e-06, + "loss": 1.0776, + "step": 3308 + }, + { + "epoch": 0.47, + "grad_norm": 8.830791966052077, + "learning_rate": 8.928469459136121e-06, + "loss": 0.8916, + "step": 3309 + }, + { + "epoch": 0.47, + "grad_norm": 6.8607866699989675, + "learning_rate": 8.92775501274593e-06, + "loss": 1.0139, + "step": 3310 + }, + { + "epoch": 0.47, + "grad_norm": 10.146203042425501, + "learning_rate": 8.927040356860218e-06, + "loss": 1.0512, + "step": 3311 + }, + { + "epoch": 0.47, + "grad_norm": 7.360690813564484, + "learning_rate": 8.926325491517103e-06, + "loss": 1.0004, + "step": 3312 + }, + { + "epoch": 0.47, + "grad_norm": 7.5065219904641705, + "learning_rate": 8.925610416754717e-06, + "loss": 1.0035, + "step": 3313 + }, + { + "epoch": 0.47, + "grad_norm": 6.491704780759864, + "learning_rate": 8.924895132611198e-06, + "loss": 1.0118, + "step": 3314 + }, + { + "epoch": 0.47, + "grad_norm": 7.069482866873269, + "learning_rate": 8.924179639124698e-06, + "loss": 0.9589, + "step": 3315 + }, + { + "epoch": 0.47, + "grad_norm": 7.26690196836218, + "learning_rate": 8.923463936333378e-06, + "loss": 0.9852, + "step": 3316 + }, + { + "epoch": 0.47, + "grad_norm": 9.308050740256508, + "learning_rate": 8.922748024275415e-06, + "loss": 1.0299, + "step": 3317 + }, + { + "epoch": 0.47, + "grad_norm": 9.056945600369472, + "learning_rate": 8.92203190298899e-06, + "loss": 1.0033, + "step": 3318 + }, + { + "epoch": 0.47, + "grad_norm": 6.779212656488148, + "learning_rate": 8.9213155725123e-06, + "loss": 1.1131, + "step": 3319 + }, + { + "epoch": 0.47, + "grad_norm": 8.019837626378571, + "learning_rate": 8.920599032883553e-06, + "loss": 0.9518, + "step": 3320 + }, + { + "epoch": 0.47, + "grad_norm": 5.689651921504285, + "learning_rate": 8.919882284140966e-06, + "loss": 1.0457, + "step": 3321 + }, + { + "epoch": 0.47, + "grad_norm": 5.046953742276906, + "learning_rate": 8.91916532632277e-06, + "loss": 0.9772, + "step": 3322 + }, + { + "epoch": 0.47, + "grad_norm": 
6.7215641663581795, + "learning_rate": 8.918448159467204e-06, + "loss": 1.0461, + "step": 3323 + }, + { + "epoch": 0.47, + "grad_norm": 7.313049510982329, + "learning_rate": 8.91773078361252e-06, + "loss": 0.9738, + "step": 3324 + }, + { + "epoch": 0.47, + "grad_norm": 9.778465072044058, + "learning_rate": 8.917013198796983e-06, + "loss": 1.0344, + "step": 3325 + }, + { + "epoch": 0.47, + "grad_norm": 7.95218571237, + "learning_rate": 8.916295405058863e-06, + "loss": 1.0678, + "step": 3326 + }, + { + "epoch": 0.47, + "grad_norm": 7.159862153984353, + "learning_rate": 8.915577402436446e-06, + "loss": 0.9732, + "step": 3327 + }, + { + "epoch": 0.47, + "grad_norm": 12.508972328399638, + "learning_rate": 8.914859190968031e-06, + "loss": 1.0802, + "step": 3328 + }, + { + "epoch": 0.47, + "grad_norm": 8.397270807898563, + "learning_rate": 8.914140770691925e-06, + "loss": 1.0302, + "step": 3329 + }, + { + "epoch": 0.47, + "grad_norm": 7.208645170995258, + "learning_rate": 8.913422141646444e-06, + "loss": 0.9571, + "step": 3330 + }, + { + "epoch": 0.48, + "grad_norm": 9.590048700001624, + "learning_rate": 8.912703303869919e-06, + "loss": 0.9567, + "step": 3331 + }, + { + "epoch": 0.48, + "grad_norm": 6.646719265660767, + "learning_rate": 8.911984257400692e-06, + "loss": 0.9648, + "step": 3332 + }, + { + "epoch": 0.48, + "grad_norm": 6.4754392216115795, + "learning_rate": 8.911265002277112e-06, + "loss": 1.1176, + "step": 3333 + }, + { + "epoch": 0.48, + "grad_norm": 9.230317014611103, + "learning_rate": 8.910545538537544e-06, + "loss": 1.0149, + "step": 3334 + }, + { + "epoch": 0.48, + "grad_norm": 7.401383178593375, + "learning_rate": 8.909825866220363e-06, + "loss": 1.0612, + "step": 3335 + }, + { + "epoch": 0.48, + "grad_norm": 9.271710740274603, + "learning_rate": 8.909105985363956e-06, + "loss": 1.0228, + "step": 3336 + }, + { + "epoch": 0.48, + "grad_norm": 7.780735227251275, + "learning_rate": 8.908385896006713e-06, + "loss": 1.0138, + "step": 3337 + }, + { + 
"epoch": 0.48, + "grad_norm": 9.191186006336972, + "learning_rate": 8.907665598187045e-06, + "loss": 1.0137, + "step": 3338 + }, + { + "epoch": 0.48, + "grad_norm": 5.65649623032451, + "learning_rate": 8.906945091943372e-06, + "loss": 1.0472, + "step": 3339 + }, + { + "epoch": 0.48, + "grad_norm": 9.03175954878865, + "learning_rate": 8.906224377314125e-06, + "loss": 1.0349, + "step": 3340 + }, + { + "epoch": 0.48, + "grad_norm": 6.465653148172087, + "learning_rate": 8.90550345433774e-06, + "loss": 1.0558, + "step": 3341 + }, + { + "epoch": 0.48, + "grad_norm": 7.432789129350421, + "learning_rate": 8.904782323052675e-06, + "loss": 0.9854, + "step": 3342 + }, + { + "epoch": 0.48, + "grad_norm": 8.74460473035433, + "learning_rate": 8.904060983497388e-06, + "loss": 1.0661, + "step": 3343 + }, + { + "epoch": 0.48, + "grad_norm": 7.054921245486436, + "learning_rate": 8.903339435710355e-06, + "loss": 1.0583, + "step": 3344 + }, + { + "epoch": 0.48, + "grad_norm": 9.850769929854213, + "learning_rate": 8.90261767973006e-06, + "loss": 1.04, + "step": 3345 + }, + { + "epoch": 0.48, + "grad_norm": 5.8178358926660625, + "learning_rate": 8.901895715595003e-06, + "loss": 1.0826, + "step": 3346 + }, + { + "epoch": 0.48, + "grad_norm": 7.923029721887818, + "learning_rate": 8.901173543343689e-06, + "loss": 1.0685, + "step": 3347 + }, + { + "epoch": 0.48, + "grad_norm": 7.60226446549625, + "learning_rate": 8.900451163014636e-06, + "loss": 0.9901, + "step": 3348 + }, + { + "epoch": 0.48, + "grad_norm": 7.962051704738035, + "learning_rate": 8.899728574646376e-06, + "loss": 1.0265, + "step": 3349 + }, + { + "epoch": 0.48, + "grad_norm": 8.509405673692811, + "learning_rate": 8.89900577827745e-06, + "loss": 0.9851, + "step": 3350 + }, + { + "epoch": 0.48, + "grad_norm": 7.439693659465855, + "learning_rate": 8.898282773946408e-06, + "loss": 0.9821, + "step": 3351 + }, + { + "epoch": 0.48, + "grad_norm": 10.055058622536748, + "learning_rate": 8.897559561691814e-06, + "loss": 1.1509, + 
"step": 3352 + }, + { + "epoch": 0.48, + "grad_norm": 6.767207137251235, + "learning_rate": 8.89683614155224e-06, + "loss": 1.0005, + "step": 3353 + }, + { + "epoch": 0.48, + "grad_norm": 8.780839924727085, + "learning_rate": 8.896112513566275e-06, + "loss": 1.093, + "step": 3354 + }, + { + "epoch": 0.48, + "grad_norm": 7.707340277898906, + "learning_rate": 8.895388677772513e-06, + "loss": 1.1135, + "step": 3355 + }, + { + "epoch": 0.48, + "grad_norm": 6.850000482332284, + "learning_rate": 8.894664634209563e-06, + "loss": 0.9699, + "step": 3356 + }, + { + "epoch": 0.48, + "grad_norm": 6.724376826781328, + "learning_rate": 8.89394038291604e-06, + "loss": 1.0386, + "step": 3357 + }, + { + "epoch": 0.48, + "grad_norm": 6.851707867330608, + "learning_rate": 8.893215923930577e-06, + "loss": 1.055, + "step": 3358 + }, + { + "epoch": 0.48, + "grad_norm": 5.946659162358192, + "learning_rate": 8.892491257291813e-06, + "loss": 1.0388, + "step": 3359 + }, + { + "epoch": 0.48, + "grad_norm": 5.623496128926692, + "learning_rate": 8.891766383038401e-06, + "loss": 1.0313, + "step": 3360 + }, + { + "epoch": 0.48, + "grad_norm": 9.343863531656092, + "learning_rate": 8.891041301209003e-06, + "loss": 1.0866, + "step": 3361 + }, + { + "epoch": 0.48, + "grad_norm": 7.341786278819837, + "learning_rate": 8.89031601184229e-06, + "loss": 1.0479, + "step": 3362 + }, + { + "epoch": 0.48, + "grad_norm": 7.716342398154345, + "learning_rate": 8.889590514976953e-06, + "loss": 1.0315, + "step": 3363 + }, + { + "epoch": 0.48, + "grad_norm": 5.372169543776639, + "learning_rate": 8.888864810651683e-06, + "loss": 1.0196, + "step": 3364 + }, + { + "epoch": 0.48, + "grad_norm": 6.660792793595403, + "learning_rate": 8.88813889890519e-06, + "loss": 1.044, + "step": 3365 + }, + { + "epoch": 0.48, + "grad_norm": 5.993301184539166, + "learning_rate": 8.88741277977619e-06, + "loss": 0.9867, + "step": 3366 + }, + { + "epoch": 0.48, + "grad_norm": 6.800827803130947, + "learning_rate": 8.886686453303413e-06, + 
"loss": 1.0111, + "step": 3367 + }, + { + "epoch": 0.48, + "grad_norm": 6.669467257481694, + "learning_rate": 8.8859599195256e-06, + "loss": 1.0642, + "step": 3368 + }, + { + "epoch": 0.48, + "grad_norm": 6.693220811655083, + "learning_rate": 8.8852331784815e-06, + "loss": 1.0686, + "step": 3369 + }, + { + "epoch": 0.48, + "grad_norm": 6.661960845699007, + "learning_rate": 8.884506230209878e-06, + "loss": 0.9404, + "step": 3370 + }, + { + "epoch": 0.48, + "grad_norm": 8.985974370513707, + "learning_rate": 8.883779074749506e-06, + "loss": 1.0625, + "step": 3371 + }, + { + "epoch": 0.48, + "grad_norm": 9.813478580860519, + "learning_rate": 8.88305171213917e-06, + "loss": 1.0637, + "step": 3372 + }, + { + "epoch": 0.48, + "grad_norm": 8.887928550745224, + "learning_rate": 8.882324142417662e-06, + "loss": 1.0054, + "step": 3373 + }, + { + "epoch": 0.48, + "grad_norm": 8.771693830666532, + "learning_rate": 8.881596365623793e-06, + "loss": 1.0194, + "step": 3374 + }, + { + "epoch": 0.48, + "grad_norm": 7.766891097378689, + "learning_rate": 8.880868381796376e-06, + "loss": 0.9225, + "step": 3375 + }, + { + "epoch": 0.48, + "grad_norm": 7.639448009616552, + "learning_rate": 8.880140190974243e-06, + "loss": 1.0538, + "step": 3376 + }, + { + "epoch": 0.48, + "grad_norm": 7.343953398564601, + "learning_rate": 8.879411793196233e-06, + "loss": 0.9686, + "step": 3377 + }, + { + "epoch": 0.48, + "grad_norm": 9.797229675130088, + "learning_rate": 8.878683188501196e-06, + "loss": 1.0585, + "step": 3378 + }, + { + "epoch": 0.48, + "grad_norm": 7.424996840455508, + "learning_rate": 8.877954376927996e-06, + "loss": 1.0319, + "step": 3379 + }, + { + "epoch": 0.48, + "grad_norm": 8.410226094546232, + "learning_rate": 8.877225358515501e-06, + "loss": 0.9566, + "step": 3380 + }, + { + "epoch": 0.48, + "grad_norm": 8.515974725867498, + "learning_rate": 8.876496133302599e-06, + "loss": 1.0803, + "step": 3381 + }, + { + "epoch": 0.48, + "grad_norm": 8.9655288176897, + "learning_rate": 
8.875766701328184e-06, + "loss": 1.0397, + "step": 3382 + }, + { + "epoch": 0.48, + "grad_norm": 8.801999633911652, + "learning_rate": 8.87503706263116e-06, + "loss": 1.0896, + "step": 3383 + }, + { + "epoch": 0.48, + "grad_norm": 7.4437944305937345, + "learning_rate": 8.874307217250446e-06, + "loss": 0.9738, + "step": 3384 + }, + { + "epoch": 0.48, + "grad_norm": 8.184011997722447, + "learning_rate": 8.87357716522497e-06, + "loss": 1.0491, + "step": 3385 + }, + { + "epoch": 0.48, + "grad_norm": 8.699329358247702, + "learning_rate": 8.87284690659367e-06, + "loss": 1.0955, + "step": 3386 + }, + { + "epoch": 0.48, + "grad_norm": 8.16299575103495, + "learning_rate": 8.872116441395496e-06, + "loss": 0.904, + "step": 3387 + }, + { + "epoch": 0.48, + "grad_norm": 8.483695106126367, + "learning_rate": 8.871385769669408e-06, + "loss": 0.9961, + "step": 3388 + }, + { + "epoch": 0.48, + "grad_norm": 5.839774089616245, + "learning_rate": 8.87065489145438e-06, + "loss": 1.0863, + "step": 3389 + }, + { + "epoch": 0.48, + "grad_norm": 10.248816000900565, + "learning_rate": 8.869923806789395e-06, + "loss": 0.9781, + "step": 3390 + }, + { + "epoch": 0.48, + "grad_norm": 6.629217534551811, + "learning_rate": 8.869192515713445e-06, + "loss": 0.9975, + "step": 3391 + }, + { + "epoch": 0.48, + "grad_norm": 8.508227723648092, + "learning_rate": 8.868461018265537e-06, + "loss": 1.0917, + "step": 3392 + }, + { + "epoch": 0.48, + "grad_norm": 5.849819357787687, + "learning_rate": 8.867729314484686e-06, + "loss": 1.0662, + "step": 3393 + }, + { + "epoch": 0.48, + "grad_norm": 6.761323988311729, + "learning_rate": 8.866997404409918e-06, + "loss": 1.0483, + "step": 3394 + }, + { + "epoch": 0.48, + "grad_norm": 6.246254578473332, + "learning_rate": 8.866265288080274e-06, + "loss": 0.989, + "step": 3395 + }, + { + "epoch": 0.48, + "grad_norm": 7.411683414422332, + "learning_rate": 8.8655329655348e-06, + "loss": 1.0503, + "step": 3396 + }, + { + "epoch": 0.48, + "grad_norm": 9.612362791679743, 
+ "learning_rate": 8.864800436812557e-06, + "loss": 1.0772, + "step": 3397 + }, + { + "epoch": 0.48, + "grad_norm": 6.739698197130292, + "learning_rate": 8.864067701952619e-06, + "loss": 0.9886, + "step": 3398 + }, + { + "epoch": 0.48, + "grad_norm": 6.913134708089705, + "learning_rate": 8.863334760994063e-06, + "loss": 0.9588, + "step": 3399 + }, + { + "epoch": 0.48, + "grad_norm": 9.368412258650682, + "learning_rate": 8.862601613975986e-06, + "loss": 1.0364, + "step": 3400 + }, + { + "epoch": 0.49, + "grad_norm": 9.807097280885548, + "learning_rate": 8.86186826093749e-06, + "loss": 0.9041, + "step": 3401 + }, + { + "epoch": 0.49, + "grad_norm": 8.280945829662391, + "learning_rate": 8.861134701917691e-06, + "loss": 1.0348, + "step": 3402 + }, + { + "epoch": 0.49, + "grad_norm": 6.7394996184621565, + "learning_rate": 8.860400936955714e-06, + "loss": 1.074, + "step": 3403 + }, + { + "epoch": 0.49, + "grad_norm": 9.50890715848105, + "learning_rate": 8.859666966090697e-06, + "loss": 1.0376, + "step": 3404 + }, + { + "epoch": 0.49, + "grad_norm": 8.329292735729354, + "learning_rate": 8.858932789361786e-06, + "loss": 1.0133, + "step": 3405 + }, + { + "epoch": 0.49, + "grad_norm": 8.716694519100754, + "learning_rate": 8.858198406808143e-06, + "loss": 1.1233, + "step": 3406 + }, + { + "epoch": 0.49, + "grad_norm": 7.415468858321014, + "learning_rate": 8.857463818468936e-06, + "loss": 1.0875, + "step": 3407 + }, + { + "epoch": 0.49, + "grad_norm": 7.813752380028872, + "learning_rate": 8.856729024383346e-06, + "loss": 1.0006, + "step": 3408 + }, + { + "epoch": 0.49, + "grad_norm": 6.2620752989426505, + "learning_rate": 8.855994024590566e-06, + "loss": 1.0656, + "step": 3409 + }, + { + "epoch": 0.49, + "grad_norm": 8.259595151285872, + "learning_rate": 8.855258819129796e-06, + "loss": 0.9784, + "step": 3410 + }, + { + "epoch": 0.49, + "grad_norm": 9.56139148935337, + "learning_rate": 8.854523408040253e-06, + "loss": 1.0101, + "step": 3411 + }, + { + "epoch": 0.49, + 
"grad_norm": 7.128544359378788, + "learning_rate": 8.85378779136116e-06, + "loss": 0.9994, + "step": 3412 + }, + { + "epoch": 0.49, + "grad_norm": 7.522284658113894, + "learning_rate": 8.853051969131755e-06, + "loss": 1.0454, + "step": 3413 + }, + { + "epoch": 0.49, + "grad_norm": 8.412657658640136, + "learning_rate": 8.85231594139128e-06, + "loss": 1.0838, + "step": 3414 + }, + { + "epoch": 0.49, + "grad_norm": 7.668466286902625, + "learning_rate": 8.851579708178998e-06, + "loss": 1.005, + "step": 3415 + }, + { + "epoch": 0.49, + "grad_norm": 7.8452784371112, + "learning_rate": 8.850843269534176e-06, + "loss": 1.0582, + "step": 3416 + }, + { + "epoch": 0.49, + "grad_norm": 10.296342169112668, + "learning_rate": 8.850106625496091e-06, + "loss": 1.0641, + "step": 3417 + }, + { + "epoch": 0.49, + "grad_norm": 7.705139526121329, + "learning_rate": 8.849369776104037e-06, + "loss": 1.0482, + "step": 3418 + }, + { + "epoch": 0.49, + "grad_norm": 7.225334229564426, + "learning_rate": 8.848632721397312e-06, + "loss": 1.0104, + "step": 3419 + }, + { + "epoch": 0.49, + "grad_norm": 7.75673445821379, + "learning_rate": 8.847895461415232e-06, + "loss": 1.0143, + "step": 3420 + }, + { + "epoch": 0.49, + "grad_norm": 6.567571680400342, + "learning_rate": 8.847157996197118e-06, + "loss": 0.9762, + "step": 3421 + }, + { + "epoch": 0.49, + "grad_norm": 8.923684278903863, + "learning_rate": 8.846420325782306e-06, + "loss": 1.0991, + "step": 3422 + }, + { + "epoch": 0.49, + "grad_norm": 7.496528620810101, + "learning_rate": 8.84568245021014e-06, + "loss": 0.9698, + "step": 3423 + }, + { + "epoch": 0.49, + "grad_norm": 8.656861911864823, + "learning_rate": 8.844944369519976e-06, + "loss": 1.0179, + "step": 3424 + }, + { + "epoch": 0.49, + "grad_norm": 8.048500320476354, + "learning_rate": 8.844206083751183e-06, + "loss": 1.0839, + "step": 3425 + }, + { + "epoch": 0.49, + "grad_norm": 7.144052389854512, + "learning_rate": 8.843467592943135e-06, + "loss": 0.9627, + "step": 3426 + }, + { 
+ "epoch": 0.49, + "grad_norm": 8.200841135975871, + "learning_rate": 8.842728897135228e-06, + "loss": 1.0406, + "step": 3427 + }, + { + "epoch": 0.49, + "grad_norm": 6.626701047384742, + "learning_rate": 8.841989996366856e-06, + "loss": 0.9943, + "step": 3428 + }, + { + "epoch": 0.49, + "grad_norm": 9.2861981596898, + "learning_rate": 8.841250890677432e-06, + "loss": 1.0238, + "step": 3429 + }, + { + "epoch": 0.49, + "grad_norm": 8.00840046321076, + "learning_rate": 8.840511580106377e-06, + "loss": 1.0353, + "step": 3430 + }, + { + "epoch": 0.49, + "grad_norm": 9.455531331998708, + "learning_rate": 8.839772064693127e-06, + "loss": 1.1008, + "step": 3431 + }, + { + "epoch": 0.49, + "grad_norm": 7.163520672688195, + "learning_rate": 8.83903234447712e-06, + "loss": 1.0992, + "step": 3432 + }, + { + "epoch": 0.49, + "grad_norm": 7.319074143373137, + "learning_rate": 8.838292419497813e-06, + "loss": 0.9001, + "step": 3433 + }, + { + "epoch": 0.49, + "grad_norm": 10.3332367184454, + "learning_rate": 8.837552289794676e-06, + "loss": 0.9779, + "step": 3434 + }, + { + "epoch": 0.49, + "grad_norm": 11.375508520456432, + "learning_rate": 8.836811955407179e-06, + "loss": 1.0126, + "step": 3435 + }, + { + "epoch": 0.49, + "grad_norm": 9.463245855575048, + "learning_rate": 8.836071416374814e-06, + "loss": 1.0613, + "step": 3436 + }, + { + "epoch": 0.49, + "grad_norm": 8.764237533443533, + "learning_rate": 8.835330672737077e-06, + "loss": 1.0184, + "step": 3437 + }, + { + "epoch": 0.49, + "grad_norm": 8.99541443879492, + "learning_rate": 8.834589724533477e-06, + "loss": 1.0325, + "step": 3438 + }, + { + "epoch": 0.49, + "grad_norm": 9.348299795184776, + "learning_rate": 8.833848571803535e-06, + "loss": 1.028, + "step": 3439 + }, + { + "epoch": 0.49, + "grad_norm": 6.654876764866514, + "learning_rate": 8.83310721458678e-06, + "loss": 0.9938, + "step": 3440 + }, + { + "epoch": 0.49, + "grad_norm": 7.4742696903787715, + "learning_rate": 8.832365652922758e-06, + "loss": 0.9756, + 
"step": 3441 + }, + { + "epoch": 0.49, + "grad_norm": 7.300666780439302, + "learning_rate": 8.831623886851019e-06, + "loss": 0.9684, + "step": 3442 + }, + { + "epoch": 0.49, + "grad_norm": 6.508907666935793, + "learning_rate": 8.830881916411127e-06, + "loss": 1.0067, + "step": 3443 + }, + { + "epoch": 0.49, + "grad_norm": 10.373608384560727, + "learning_rate": 8.830139741642659e-06, + "loss": 1.0845, + "step": 3444 + }, + { + "epoch": 0.49, + "grad_norm": 7.140066196263205, + "learning_rate": 8.829397362585197e-06, + "loss": 1.0037, + "step": 3445 + }, + { + "epoch": 0.49, + "grad_norm": 7.890905767021604, + "learning_rate": 8.828654779278338e-06, + "loss": 1.0283, + "step": 3446 + }, + { + "epoch": 0.49, + "grad_norm": 10.733730054746392, + "learning_rate": 8.82791199176169e-06, + "loss": 0.9628, + "step": 3447 + }, + { + "epoch": 0.49, + "grad_norm": 7.8877410302591215, + "learning_rate": 8.827169000074872e-06, + "loss": 1.0634, + "step": 3448 + }, + { + "epoch": 0.49, + "grad_norm": 6.997904388695998, + "learning_rate": 8.826425804257513e-06, + "loss": 1.096, + "step": 3449 + }, + { + "epoch": 0.49, + "grad_norm": 9.13613720010061, + "learning_rate": 8.825682404349251e-06, + "loss": 1.0284, + "step": 3450 + }, + { + "epoch": 0.49, + "grad_norm": 9.983724418895, + "learning_rate": 8.824938800389741e-06, + "loss": 1.0297, + "step": 3451 + }, + { + "epoch": 0.49, + "grad_norm": 5.423617144310924, + "learning_rate": 8.824194992418639e-06, + "loss": 1.0443, + "step": 3452 + }, + { + "epoch": 0.49, + "grad_norm": 8.507969567421087, + "learning_rate": 8.823450980475623e-06, + "loss": 1.0426, + "step": 3453 + }, + { + "epoch": 0.49, + "grad_norm": 7.919032825255324, + "learning_rate": 8.822706764600374e-06, + "loss": 1.0221, + "step": 3454 + }, + { + "epoch": 0.49, + "grad_norm": 8.576481454810743, + "learning_rate": 8.821962344832587e-06, + "loss": 1.0242, + "step": 3455 + }, + { + "epoch": 0.49, + "grad_norm": 7.387304355668392, + "learning_rate": 
8.821217721211967e-06, + "loss": 1.0412, + "step": 3456 + }, + { + "epoch": 0.49, + "grad_norm": 5.605381842555912, + "learning_rate": 8.82047289377823e-06, + "loss": 1.02, + "step": 3457 + }, + { + "epoch": 0.49, + "grad_norm": 5.941196315146477, + "learning_rate": 8.819727862571104e-06, + "loss": 1.0377, + "step": 3458 + }, + { + "epoch": 0.49, + "grad_norm": 7.001809715030381, + "learning_rate": 8.818982627630323e-06, + "loss": 1.0316, + "step": 3459 + }, + { + "epoch": 0.49, + "grad_norm": 5.566534129365784, + "learning_rate": 8.818237188995642e-06, + "loss": 0.9539, + "step": 3460 + }, + { + "epoch": 0.49, + "grad_norm": 6.234782453700263, + "learning_rate": 8.817491546706818e-06, + "loss": 0.964, + "step": 3461 + }, + { + "epoch": 0.49, + "grad_norm": 7.533951229468654, + "learning_rate": 8.816745700803618e-06, + "loss": 0.9086, + "step": 3462 + }, + { + "epoch": 0.49, + "grad_norm": 6.9264436281071236, + "learning_rate": 8.815999651325828e-06, + "loss": 1.0232, + "step": 3463 + }, + { + "epoch": 0.49, + "grad_norm": 7.534711264024297, + "learning_rate": 8.815253398313239e-06, + "loss": 1.0894, + "step": 3464 + }, + { + "epoch": 0.49, + "grad_norm": 8.45191520608645, + "learning_rate": 8.814506941805653e-06, + "loss": 1.0699, + "step": 3465 + }, + { + "epoch": 0.49, + "grad_norm": 8.296679613989522, + "learning_rate": 8.813760281842885e-06, + "loss": 1.0054, + "step": 3466 + }, + { + "epoch": 0.49, + "grad_norm": 6.460042270026219, + "learning_rate": 8.813013418464758e-06, + "loss": 1.0173, + "step": 3467 + }, + { + "epoch": 0.49, + "grad_norm": 9.155595971307156, + "learning_rate": 8.81226635171111e-06, + "loss": 1.0965, + "step": 3468 + }, + { + "epoch": 0.49, + "grad_norm": 6.514855955731086, + "learning_rate": 8.811519081621784e-06, + "loss": 1.0005, + "step": 3469 + }, + { + "epoch": 0.49, + "grad_norm": 7.339318806194749, + "learning_rate": 8.810771608236642e-06, + "loss": 0.9886, + "step": 3470 + }, + { + "epoch": 0.5, + "grad_norm": 5.862858110719876, 
+ "learning_rate": 8.81002393159555e-06, + "loss": 0.9939, + "step": 3471 + }, + { + "epoch": 0.5, + "grad_norm": 6.724872850093236, + "learning_rate": 8.809276051738385e-06, + "loss": 1.0845, + "step": 3472 + }, + { + "epoch": 0.5, + "grad_norm": 8.57807040572409, + "learning_rate": 8.80852796870504e-06, + "loss": 0.9985, + "step": 3473 + }, + { + "epoch": 0.5, + "grad_norm": 7.597375088401118, + "learning_rate": 8.807779682535413e-06, + "loss": 1.0143, + "step": 3474 + }, + { + "epoch": 0.5, + "grad_norm": 5.672306561828834, + "learning_rate": 8.807031193269417e-06, + "loss": 0.9777, + "step": 3475 + }, + { + "epoch": 0.5, + "grad_norm": 8.36257061165318, + "learning_rate": 8.806282500946976e-06, + "loss": 0.9656, + "step": 3476 + }, + { + "epoch": 0.5, + "grad_norm": 7.523777858064298, + "learning_rate": 8.80553360560802e-06, + "loss": 1.0152, + "step": 3477 + }, + { + "epoch": 0.5, + "grad_norm": 7.615377347482875, + "learning_rate": 8.804784507292495e-06, + "loss": 1.006, + "step": 3478 + }, + { + "epoch": 0.5, + "grad_norm": 9.471944054518733, + "learning_rate": 8.804035206040353e-06, + "loss": 1.1152, + "step": 3479 + }, + { + "epoch": 0.5, + "grad_norm": 6.465243074705298, + "learning_rate": 8.803285701891563e-06, + "loss": 1.0015, + "step": 3480 + }, + { + "epoch": 0.5, + "grad_norm": 7.846028755009598, + "learning_rate": 8.8025359948861e-06, + "loss": 0.9903, + "step": 3481 + }, + { + "epoch": 0.5, + "grad_norm": 8.773902112073985, + "learning_rate": 8.801786085063952e-06, + "loss": 1.0761, + "step": 3482 + }, + { + "epoch": 0.5, + "grad_norm": 7.356584694350719, + "learning_rate": 8.801035972465115e-06, + "loss": 1.019, + "step": 3483 + }, + { + "epoch": 0.5, + "grad_norm": 7.931354906640308, + "learning_rate": 8.800285657129602e-06, + "loss": 0.99, + "step": 3484 + }, + { + "epoch": 0.5, + "grad_norm": 7.836798149637621, + "learning_rate": 8.799535139097428e-06, + "loss": 1.0779, + "step": 3485 + }, + { + "epoch": 0.5, + "grad_norm": 7.677258267751163, 
+ "learning_rate": 8.798784418408625e-06, + "loss": 0.9884, + "step": 3486 + }, + { + "epoch": 0.5, + "grad_norm": 8.220513624582559, + "learning_rate": 8.79803349510324e-06, + "loss": 1.0399, + "step": 3487 + }, + { + "epoch": 0.5, + "grad_norm": 6.116917650746934, + "learning_rate": 8.797282369221315e-06, + "loss": 1.0479, + "step": 3488 + }, + { + "epoch": 0.5, + "grad_norm": 6.5017299691331765, + "learning_rate": 8.79653104080292e-06, + "loss": 0.9657, + "step": 3489 + }, + { + "epoch": 0.5, + "grad_norm": 5.957756700469852, + "learning_rate": 8.795779509888127e-06, + "loss": 1.059, + "step": 3490 + }, + { + "epoch": 0.5, + "grad_norm": 8.472363376746593, + "learning_rate": 8.795027776517021e-06, + "loss": 1.0145, + "step": 3491 + }, + { + "epoch": 0.5, + "grad_norm": 9.472879338845093, + "learning_rate": 8.794275840729699e-06, + "loss": 1.098, + "step": 3492 + }, + { + "epoch": 0.5, + "grad_norm": 6.9570105021226665, + "learning_rate": 8.793523702566261e-06, + "loss": 1.0436, + "step": 3493 + }, + { + "epoch": 0.5, + "grad_norm": 7.546881080889942, + "learning_rate": 8.79277136206683e-06, + "loss": 0.9712, + "step": 3494 + }, + { + "epoch": 0.5, + "grad_norm": 6.901245784034133, + "learning_rate": 8.792018819271531e-06, + "loss": 0.9953, + "step": 3495 + }, + { + "epoch": 0.5, + "grad_norm": 8.50796013622224, + "learning_rate": 8.791266074220503e-06, + "loss": 0.997, + "step": 3496 + }, + { + "epoch": 0.5, + "grad_norm": 8.203729983774238, + "learning_rate": 8.790513126953898e-06, + "loss": 1.0402, + "step": 3497 + }, + { + "epoch": 0.5, + "grad_norm": 8.799484917660422, + "learning_rate": 8.789759977511872e-06, + "loss": 0.9693, + "step": 3498 + }, + { + "epoch": 0.5, + "grad_norm": 6.70996887848914, + "learning_rate": 8.789006625934598e-06, + "loss": 0.9057, + "step": 3499 + }, + { + "epoch": 0.5, + "grad_norm": 8.75559319235915, + "learning_rate": 8.788253072262258e-06, + "loss": 1.0919, + "step": 3500 + }, + { + "epoch": 0.5, + "grad_norm": 
7.895820168240641, + "learning_rate": 8.787499316535043e-06, + "loss": 0.927, + "step": 3501 + }, + { + "epoch": 0.5, + "grad_norm": 7.293653419298406, + "learning_rate": 8.786745358793158e-06, + "loss": 1.0107, + "step": 3502 + }, + { + "epoch": 0.5, + "grad_norm": 7.384087352740173, + "learning_rate": 8.785991199076816e-06, + "loss": 0.9626, + "step": 3503 + }, + { + "epoch": 0.5, + "grad_norm": 9.660988213369327, + "learning_rate": 8.78523683742624e-06, + "loss": 1.0693, + "step": 3504 + }, + { + "epoch": 0.5, + "grad_norm": 8.675316648532279, + "learning_rate": 8.784482273881669e-06, + "loss": 0.987, + "step": 3505 + }, + { + "epoch": 0.5, + "grad_norm": 9.357689379801723, + "learning_rate": 8.783727508483345e-06, + "loss": 1.0791, + "step": 3506 + }, + { + "epoch": 0.5, + "grad_norm": 7.153674635070023, + "learning_rate": 8.78297254127153e-06, + "loss": 0.9903, + "step": 3507 + }, + { + "epoch": 0.5, + "grad_norm": 7.243421530405142, + "learning_rate": 8.782217372286491e-06, + "loss": 1.0242, + "step": 3508 + }, + { + "epoch": 0.5, + "grad_norm": 7.542468921360885, + "learning_rate": 8.781462001568503e-06, + "loss": 0.9192, + "step": 3509 + }, + { + "epoch": 0.5, + "grad_norm": 7.562230306071103, + "learning_rate": 8.780706429157859e-06, + "loss": 0.9888, + "step": 3510 + }, + { + "epoch": 0.5, + "grad_norm": 5.633682210226835, + "learning_rate": 8.779950655094858e-06, + "loss": 1.0183, + "step": 3511 + }, + { + "epoch": 0.5, + "grad_norm": 6.508424793405674, + "learning_rate": 8.779194679419809e-06, + "loss": 1.076, + "step": 3512 + }, + { + "epoch": 0.5, + "grad_norm": 6.863003453810007, + "learning_rate": 8.778438502173037e-06, + "loss": 1.0417, + "step": 3513 + }, + { + "epoch": 0.5, + "grad_norm": 4.99816731876706, + "learning_rate": 8.77768212339487e-06, + "loss": 1.0397, + "step": 3514 + }, + { + "epoch": 0.5, + "grad_norm": 6.736501561344286, + "learning_rate": 8.776925543125656e-06, + "loss": 1.0794, + "step": 3515 + }, + { + "epoch": 0.5, + 
"grad_norm": 7.160483439750547, + "learning_rate": 8.776168761405746e-06, + "loss": 0.9978, + "step": 3516 + }, + { + "epoch": 0.5, + "grad_norm": 7.783285061458503, + "learning_rate": 8.775411778275506e-06, + "loss": 1.0654, + "step": 3517 + }, + { + "epoch": 0.5, + "grad_norm": 7.291784966753052, + "learning_rate": 8.77465459377531e-06, + "loss": 0.954, + "step": 3518 + }, + { + "epoch": 0.5, + "grad_norm": 7.115259339354624, + "learning_rate": 8.773897207945546e-06, + "loss": 0.9855, + "step": 3519 + }, + { + "epoch": 0.5, + "grad_norm": 9.059055013076085, + "learning_rate": 8.773139620826608e-06, + "loss": 1.029, + "step": 3520 + }, + { + "epoch": 0.5, + "grad_norm": 7.581748474204147, + "learning_rate": 8.772381832458906e-06, + "loss": 1.0015, + "step": 3521 + }, + { + "epoch": 0.5, + "grad_norm": 7.054618502228446, + "learning_rate": 8.771623842882857e-06, + "loss": 0.9987, + "step": 3522 + }, + { + "epoch": 0.5, + "grad_norm": 7.965909505715154, + "learning_rate": 8.770865652138891e-06, + "loss": 0.9835, + "step": 3523 + }, + { + "epoch": 0.5, + "grad_norm": 5.906532101051324, + "learning_rate": 8.770107260267446e-06, + "loss": 1.0813, + "step": 3524 + }, + { + "epoch": 0.5, + "grad_norm": 7.394173689078561, + "learning_rate": 8.769348667308977e-06, + "loss": 1.0933, + "step": 3525 + }, + { + "epoch": 0.5, + "grad_norm": 9.384056748364555, + "learning_rate": 8.76858987330394e-06, + "loss": 1.0399, + "step": 3526 + }, + { + "epoch": 0.5, + "grad_norm": 7.102612316615846, + "learning_rate": 8.76783087829281e-06, + "loss": 1.0386, + "step": 3527 + }, + { + "epoch": 0.5, + "grad_norm": 7.723456848548139, + "learning_rate": 8.767071682316068e-06, + "loss": 0.9364, + "step": 3528 + }, + { + "epoch": 0.5, + "grad_norm": 4.924069514383292, + "learning_rate": 8.766312285414208e-06, + "loss": 0.9481, + "step": 3529 + }, + { + "epoch": 0.5, + "grad_norm": 6.393281346791696, + "learning_rate": 8.765552687627735e-06, + "loss": 0.967, + "step": 3530 + }, + { + "epoch": 
0.5, + "grad_norm": 10.866361662084477, + "learning_rate": 8.764792888997163e-06, + "loss": 0.9908, + "step": 3531 + }, + { + "epoch": 0.5, + "grad_norm": 7.556827300004838, + "learning_rate": 8.764032889563017e-06, + "loss": 1.0524, + "step": 3532 + }, + { + "epoch": 0.5, + "grad_norm": 8.48656138498311, + "learning_rate": 8.763272689365834e-06, + "loss": 1.0038, + "step": 3533 + }, + { + "epoch": 0.5, + "grad_norm": 7.69512762348968, + "learning_rate": 8.762512288446163e-06, + "loss": 0.9647, + "step": 3534 + }, + { + "epoch": 0.5, + "grad_norm": 8.43155278400936, + "learning_rate": 8.761751686844557e-06, + "loss": 0.9918, + "step": 3535 + }, + { + "epoch": 0.5, + "grad_norm": 8.499716401416038, + "learning_rate": 8.760990884601588e-06, + "loss": 0.9283, + "step": 3536 + }, + { + "epoch": 0.5, + "grad_norm": 8.780641847048237, + "learning_rate": 8.760229881757834e-06, + "loss": 0.9895, + "step": 3537 + }, + { + "epoch": 0.5, + "grad_norm": 9.703368080389609, + "learning_rate": 8.759468678353883e-06, + "loss": 0.9708, + "step": 3538 + }, + { + "epoch": 0.5, + "grad_norm": 6.477510315285872, + "learning_rate": 8.75870727443034e-06, + "loss": 1.0379, + "step": 3539 + }, + { + "epoch": 0.5, + "grad_norm": 11.65380365403306, + "learning_rate": 8.757945670027813e-06, + "loss": 0.9717, + "step": 3540 + }, + { + "epoch": 0.5, + "grad_norm": 8.187458373777654, + "learning_rate": 8.757183865186923e-06, + "loss": 1.0116, + "step": 3541 + }, + { + "epoch": 0.51, + "grad_norm": 7.773795087827309, + "learning_rate": 8.756421859948305e-06, + "loss": 1.0681, + "step": 3542 + }, + { + "epoch": 0.51, + "grad_norm": 9.154447134068725, + "learning_rate": 8.755659654352599e-06, + "loss": 0.9558, + "step": 3543 + }, + { + "epoch": 0.51, + "grad_norm": 7.2779628407912496, + "learning_rate": 8.754897248440464e-06, + "loss": 1.0925, + "step": 3544 + }, + { + "epoch": 0.51, + "grad_norm": 8.668876437813818, + "learning_rate": 8.75413464225256e-06, + "loss": 0.9859, + "step": 3545 + }, + { 
+ "epoch": 0.51, + "grad_norm": 6.399367888416984, + "learning_rate": 8.753371835829563e-06, + "loss": 0.9961, + "step": 3546 + }, + { + "epoch": 0.51, + "grad_norm": 8.327540646873702, + "learning_rate": 8.752608829212162e-06, + "loss": 1.0877, + "step": 3547 + }, + { + "epoch": 0.51, + "grad_norm": 7.060129013431898, + "learning_rate": 8.75184562244105e-06, + "loss": 1.0476, + "step": 3548 + }, + { + "epoch": 0.51, + "grad_norm": 8.477916254608884, + "learning_rate": 8.751082215556936e-06, + "loss": 0.9732, + "step": 3549 + }, + { + "epoch": 0.51, + "grad_norm": 10.359790046044864, + "learning_rate": 8.750318608600538e-06, + "loss": 0.9854, + "step": 3550 + }, + { + "epoch": 0.51, + "grad_norm": 7.906979131238562, + "learning_rate": 8.749554801612585e-06, + "loss": 1.0423, + "step": 3551 + }, + { + "epoch": 0.51, + "grad_norm": 8.174199486066295, + "learning_rate": 8.748790794633815e-06, + "loss": 1.0447, + "step": 3552 + }, + { + "epoch": 0.51, + "grad_norm": 7.791923388077486, + "learning_rate": 8.748026587704979e-06, + "loss": 1.0221, + "step": 3553 + }, + { + "epoch": 0.51, + "grad_norm": 7.231555599835643, + "learning_rate": 8.747262180866838e-06, + "loss": 1.0443, + "step": 3554 + }, + { + "epoch": 0.51, + "grad_norm": 13.285763632682725, + "learning_rate": 8.746497574160162e-06, + "loss": 0.9958, + "step": 3555 + }, + { + "epoch": 0.51, + "grad_norm": 9.72325964053549, + "learning_rate": 8.745732767625734e-06, + "loss": 1.0181, + "step": 3556 + }, + { + "epoch": 0.51, + "grad_norm": 7.868594098186243, + "learning_rate": 8.744967761304348e-06, + "loss": 0.9199, + "step": 3557 + }, + { + "epoch": 0.51, + "grad_norm": 7.201328654678693, + "learning_rate": 8.744202555236804e-06, + "loss": 1.0472, + "step": 3558 + }, + { + "epoch": 0.51, + "grad_norm": 9.745085272209682, + "learning_rate": 8.743437149463918e-06, + "loss": 0.9801, + "step": 3559 + }, + { + "epoch": 0.51, + "grad_norm": 9.365438776115145, + "learning_rate": 8.742671544026515e-06, + "loss": 
1.0265, + "step": 3560 + }, + { + "epoch": 0.51, + "grad_norm": 9.321364490851787, + "learning_rate": 8.74190573896543e-06, + "loss": 1.0088, + "step": 3561 + }, + { + "epoch": 0.51, + "grad_norm": 10.142630792720505, + "learning_rate": 8.741139734321507e-06, + "loss": 1.0662, + "step": 3562 + }, + { + "epoch": 0.51, + "grad_norm": 6.486063753099658, + "learning_rate": 8.740373530135607e-06, + "loss": 1.0102, + "step": 3563 + }, + { + "epoch": 0.51, + "grad_norm": 7.594893894347019, + "learning_rate": 8.739607126448591e-06, + "loss": 0.9811, + "step": 3564 + }, + { + "epoch": 0.51, + "grad_norm": 6.975629955562345, + "learning_rate": 8.738840523301342e-06, + "loss": 0.9843, + "step": 3565 + }, + { + "epoch": 0.51, + "grad_norm": 6.889662017748051, + "learning_rate": 8.738073720734746e-06, + "loss": 1.0082, + "step": 3566 + }, + { + "epoch": 0.51, + "grad_norm": 8.580639520286537, + "learning_rate": 8.737306718789703e-06, + "loss": 0.9435, + "step": 3567 + }, + { + "epoch": 0.51, + "grad_norm": 5.70777602272724, + "learning_rate": 8.736539517507122e-06, + "loss": 1.0117, + "step": 3568 + }, + { + "epoch": 0.51, + "grad_norm": 9.681334074234316, + "learning_rate": 8.735772116927925e-06, + "loss": 1.0074, + "step": 3569 + }, + { + "epoch": 0.51, + "grad_norm": 7.618266293170204, + "learning_rate": 8.73500451709304e-06, + "loss": 1.0307, + "step": 3570 + }, + { + "epoch": 0.51, + "grad_norm": 8.62366487467074, + "learning_rate": 8.734236718043411e-06, + "loss": 0.9666, + "step": 3571 + }, + { + "epoch": 0.51, + "grad_norm": 6.923689853790025, + "learning_rate": 8.73346871981999e-06, + "loss": 1.0406, + "step": 3572 + }, + { + "epoch": 0.51, + "grad_norm": 8.120847858109604, + "learning_rate": 8.732700522463742e-06, + "loss": 1.0598, + "step": 3573 + }, + { + "epoch": 0.51, + "grad_norm": 7.216104022450375, + "learning_rate": 8.731932126015637e-06, + "loss": 0.9783, + "step": 3574 + }, + { + "epoch": 0.51, + "grad_norm": 6.552118066554922, + "learning_rate": 
8.731163530516658e-06, + "loss": 1.0361, + "step": 3575 + }, + { + "epoch": 0.51, + "grad_norm": 8.488780873281193, + "learning_rate": 8.730394736007803e-06, + "loss": 1.0515, + "step": 3576 + }, + { + "epoch": 0.51, + "grad_norm": 9.10509209824764, + "learning_rate": 8.729625742530078e-06, + "loss": 0.9935, + "step": 3577 + }, + { + "epoch": 0.51, + "grad_norm": 7.321372182585585, + "learning_rate": 8.728856550124496e-06, + "loss": 0.9567, + "step": 3578 + }, + { + "epoch": 0.51, + "grad_norm": 7.893922357411447, + "learning_rate": 8.728087158832086e-06, + "loss": 0.9507, + "step": 3579 + }, + { + "epoch": 0.51, + "grad_norm": 9.25987378767628, + "learning_rate": 8.727317568693882e-06, + "loss": 1.0258, + "step": 3580 + }, + { + "epoch": 0.51, + "grad_norm": 6.525528246815572, + "learning_rate": 8.726547779750937e-06, + "loss": 0.9571, + "step": 3581 + }, + { + "epoch": 0.51, + "grad_norm": 5.592292485633673, + "learning_rate": 8.725777792044304e-06, + "loss": 0.941, + "step": 3582 + }, + { + "epoch": 0.51, + "grad_norm": 6.9707348419638535, + "learning_rate": 8.725007605615055e-06, + "loss": 0.9888, + "step": 3583 + }, + { + "epoch": 0.51, + "grad_norm": 7.508265206995756, + "learning_rate": 8.72423722050427e-06, + "loss": 1.007, + "step": 3584 + }, + { + "epoch": 0.51, + "grad_norm": 8.444830767042284, + "learning_rate": 8.723466636753038e-06, + "loss": 1.0588, + "step": 3585 + }, + { + "epoch": 0.51, + "grad_norm": 5.74425163831789, + "learning_rate": 8.72269585440246e-06, + "loss": 0.9144, + "step": 3586 + }, + { + "epoch": 0.51, + "grad_norm": 6.6299275120752315, + "learning_rate": 8.721924873493644e-06, + "loss": 0.96, + "step": 3587 + }, + { + "epoch": 0.51, + "grad_norm": 6.923948670216932, + "learning_rate": 8.721153694067718e-06, + "loss": 1.068, + "step": 3588 + }, + { + "epoch": 0.51, + "grad_norm": 10.246435191222783, + "learning_rate": 8.720382316165811e-06, + "loss": 1.0494, + "step": 3589 + }, + { + "epoch": 0.51, + "grad_norm": 9.325132643957167, 
+ "learning_rate": 8.719610739829069e-06, + "loss": 0.9216, + "step": 3590 + }, + { + "epoch": 0.51, + "grad_norm": 10.520238073276293, + "learning_rate": 8.718838965098641e-06, + "loss": 0.9535, + "step": 3591 + }, + { + "epoch": 0.51, + "grad_norm": 10.022252158845326, + "learning_rate": 8.718066992015696e-06, + "loss": 1.0335, + "step": 3592 + }, + { + "epoch": 0.51, + "grad_norm": 7.256372903997218, + "learning_rate": 8.717294820621407e-06, + "loss": 0.9871, + "step": 3593 + }, + { + "epoch": 0.51, + "grad_norm": 6.369091693218785, + "learning_rate": 8.716522450956959e-06, + "loss": 1.0002, + "step": 3594 + }, + { + "epoch": 0.51, + "grad_norm": 7.457514814500723, + "learning_rate": 8.715749883063548e-06, + "loss": 0.9897, + "step": 3595 + }, + { + "epoch": 0.51, + "grad_norm": 6.654235808037986, + "learning_rate": 8.714977116982381e-06, + "loss": 1.0656, + "step": 3596 + }, + { + "epoch": 0.51, + "grad_norm": 7.4615731410980715, + "learning_rate": 8.714204152754677e-06, + "loss": 0.9801, + "step": 3597 + }, + { + "epoch": 0.51, + "grad_norm": 5.0480201616214515, + "learning_rate": 8.71343099042166e-06, + "loss": 1.0727, + "step": 3598 + }, + { + "epoch": 0.51, + "grad_norm": 10.603331991182765, + "learning_rate": 8.712657630024571e-06, + "loss": 1.1803, + "step": 3599 + }, + { + "epoch": 0.51, + "grad_norm": 7.964666856394975, + "learning_rate": 8.711884071604659e-06, + "loss": 1.0247, + "step": 3600 + }, + { + "epoch": 0.51, + "grad_norm": 7.778239829836285, + "learning_rate": 8.711110315203182e-06, + "loss": 1.0033, + "step": 3601 + }, + { + "epoch": 0.51, + "grad_norm": 7.295069570989958, + "learning_rate": 8.71033636086141e-06, + "loss": 1.0739, + "step": 3602 + }, + { + "epoch": 0.51, + "grad_norm": 6.438990810042817, + "learning_rate": 8.709562208620625e-06, + "loss": 0.9971, + "step": 3603 + }, + { + "epoch": 0.51, + "grad_norm": 8.857486828414185, + "learning_rate": 8.708787858522119e-06, + "loss": 0.9569, + "step": 3604 + }, + { + "epoch": 0.51, + 
"grad_norm": 8.278366044710635, + "learning_rate": 8.70801331060719e-06, + "loss": 0.9666, + "step": 3605 + }, + { + "epoch": 0.51, + "grad_norm": 6.041459577737785, + "learning_rate": 8.707238564917153e-06, + "loss": 1.008, + "step": 3606 + }, + { + "epoch": 0.51, + "grad_norm": 7.179580095745253, + "learning_rate": 8.70646362149333e-06, + "loss": 0.9499, + "step": 3607 + }, + { + "epoch": 0.51, + "grad_norm": 8.605704207392836, + "learning_rate": 8.705688480377054e-06, + "loss": 0.9474, + "step": 3608 + }, + { + "epoch": 0.51, + "grad_norm": 7.716965134256739, + "learning_rate": 8.70491314160967e-06, + "loss": 0.9897, + "step": 3609 + }, + { + "epoch": 0.51, + "grad_norm": 6.934707387222937, + "learning_rate": 8.704137605232532e-06, + "loss": 0.9715, + "step": 3610 + }, + { + "epoch": 0.51, + "grad_norm": 6.348995782088164, + "learning_rate": 8.703361871287004e-06, + "loss": 1.0333, + "step": 3611 + }, + { + "epoch": 0.52, + "grad_norm": 7.6256145998766725, + "learning_rate": 8.702585939814462e-06, + "loss": 0.9749, + "step": 3612 + }, + { + "epoch": 0.52, + "grad_norm": 8.232940917583738, + "learning_rate": 8.701809810856294e-06, + "loss": 1.1017, + "step": 3613 + }, + { + "epoch": 0.52, + "grad_norm": 6.307870081625176, + "learning_rate": 8.701033484453893e-06, + "loss": 0.9862, + "step": 3614 + }, + { + "epoch": 0.52, + "grad_norm": 7.346382253018145, + "learning_rate": 8.700256960648668e-06, + "loss": 0.9727, + "step": 3615 + }, + { + "epoch": 0.52, + "grad_norm": 9.612026825989554, + "learning_rate": 8.699480239482036e-06, + "loss": 0.9382, + "step": 3616 + }, + { + "epoch": 0.52, + "grad_norm": 6.686333918455443, + "learning_rate": 8.698703320995428e-06, + "loss": 1.0205, + "step": 3617 + }, + { + "epoch": 0.52, + "grad_norm": 7.764017181499691, + "learning_rate": 8.697926205230279e-06, + "loss": 1.0499, + "step": 3618 + }, + { + "epoch": 0.52, + "grad_norm": 5.548044111412729, + "learning_rate": 8.697148892228038e-06, + "loss": 1.0227, + "step": 3619 + }, 
+ { + "epoch": 0.52, + "grad_norm": 7.407807114159239, + "learning_rate": 8.696371382030166e-06, + "loss": 0.9455, + "step": 3620 + }, + { + "epoch": 0.52, + "grad_norm": 6.465839329396516, + "learning_rate": 8.695593674678135e-06, + "loss": 1.0242, + "step": 3621 + }, + { + "epoch": 0.52, + "grad_norm": 7.650068634480745, + "learning_rate": 8.694815770213425e-06, + "loss": 0.9811, + "step": 3622 + }, + { + "epoch": 0.52, + "grad_norm": 7.647630196293801, + "learning_rate": 8.694037668677524e-06, + "loss": 0.9538, + "step": 3623 + }, + { + "epoch": 0.52, + "grad_norm": 7.956070978485532, + "learning_rate": 8.693259370111937e-06, + "loss": 0.9971, + "step": 3624 + }, + { + "epoch": 0.52, + "grad_norm": 6.849387419521413, + "learning_rate": 8.692480874558176e-06, + "loss": 0.9583, + "step": 3625 + }, + { + "epoch": 0.52, + "grad_norm": 7.395779953367687, + "learning_rate": 8.691702182057762e-06, + "loss": 1.0496, + "step": 3626 + }, + { + "epoch": 0.52, + "grad_norm": 4.812894290114384, + "learning_rate": 8.690923292652231e-06, + "loss": 1.0024, + "step": 3627 + }, + { + "epoch": 0.52, + "grad_norm": 7.425641397781404, + "learning_rate": 8.690144206383127e-06, + "loss": 0.9561, + "step": 3628 + }, + { + "epoch": 0.52, + "grad_norm": 7.098554888351174, + "learning_rate": 8.689364923291998e-06, + "loss": 1.062, + "step": 3629 + }, + { + "epoch": 0.52, + "grad_norm": 7.03875035534627, + "learning_rate": 8.688585443420417e-06, + "loss": 1.005, + "step": 3630 + }, + { + "epoch": 0.52, + "grad_norm": 7.2223258329184885, + "learning_rate": 8.687805766809954e-06, + "loss": 0.9714, + "step": 3631 + }, + { + "epoch": 0.52, + "grad_norm": 6.788174301972642, + "learning_rate": 8.687025893502198e-06, + "loss": 1.0171, + "step": 3632 + }, + { + "epoch": 0.52, + "grad_norm": 8.762055408423322, + "learning_rate": 8.686245823538743e-06, + "loss": 0.9794, + "step": 3633 + }, + { + "epoch": 0.52, + "grad_norm": 9.354863563397293, + "learning_rate": 8.685465556961196e-06, + "loss": 
1.0228, + "step": 3634 + }, + { + "epoch": 0.52, + "grad_norm": 7.6113452925484175, + "learning_rate": 8.684685093811176e-06, + "loss": 1.0655, + "step": 3635 + }, + { + "epoch": 0.52, + "grad_norm": 6.3965260546997085, + "learning_rate": 8.683904434130307e-06, + "loss": 1.016, + "step": 3636 + }, + { + "epoch": 0.52, + "grad_norm": 7.081839822000326, + "learning_rate": 8.683123577960232e-06, + "loss": 1.0155, + "step": 3637 + }, + { + "epoch": 0.52, + "grad_norm": 7.189308211718365, + "learning_rate": 8.682342525342597e-06, + "loss": 1.0025, + "step": 3638 + }, + { + "epoch": 0.52, + "grad_norm": 8.197119727202825, + "learning_rate": 8.681561276319061e-06, + "loss": 1.0653, + "step": 3639 + }, + { + "epoch": 0.52, + "grad_norm": 8.312241268967261, + "learning_rate": 8.680779830931294e-06, + "loss": 0.9905, + "step": 3640 + }, + { + "epoch": 0.52, + "grad_norm": 8.066296426373842, + "learning_rate": 8.679998189220977e-06, + "loss": 0.9239, + "step": 3641 + }, + { + "epoch": 0.52, + "grad_norm": 8.489375145712216, + "learning_rate": 8.6792163512298e-06, + "loss": 0.9402, + "step": 3642 + }, + { + "epoch": 0.52, + "grad_norm": 7.306138181022572, + "learning_rate": 8.678434316999463e-06, + "loss": 1.0786, + "step": 3643 + }, + { + "epoch": 0.52, + "grad_norm": 6.520295412071188, + "learning_rate": 8.67765208657168e-06, + "loss": 0.9839, + "step": 3644 + }, + { + "epoch": 0.52, + "grad_norm": 7.562757659778358, + "learning_rate": 8.676869659988169e-06, + "loss": 1.0138, + "step": 3645 + }, + { + "epoch": 0.52, + "grad_norm": 6.195144125808651, + "learning_rate": 8.676087037290669e-06, + "loss": 1.0132, + "step": 3646 + }, + { + "epoch": 0.52, + "grad_norm": 6.9504765361639596, + "learning_rate": 8.675304218520916e-06, + "loss": 1.0788, + "step": 3647 + }, + { + "epoch": 0.52, + "grad_norm": 7.950164137093733, + "learning_rate": 8.674521203720667e-06, + "loss": 0.9858, + "step": 3648 + }, + { + "epoch": 0.52, + "grad_norm": 10.57645504323962, + "learning_rate": 
8.673737992931685e-06, + "loss": 1.0817, + "step": 3649 + }, + { + "epoch": 0.52, + "grad_norm": 5.868282645392339, + "learning_rate": 8.672954586195743e-06, + "loss": 0.9887, + "step": 3650 + }, + { + "epoch": 0.52, + "grad_norm": 9.226280520330187, + "learning_rate": 8.67217098355463e-06, + "loss": 0.9492, + "step": 3651 + }, + { + "epoch": 0.52, + "grad_norm": 7.520028761422789, + "learning_rate": 8.671387185050137e-06, + "loss": 0.9426, + "step": 3652 + }, + { + "epoch": 0.52, + "grad_norm": 6.422644031506743, + "learning_rate": 8.670603190724069e-06, + "loss": 0.9815, + "step": 3653 + }, + { + "epoch": 0.52, + "grad_norm": 6.555362307986619, + "learning_rate": 8.669819000618248e-06, + "loss": 1.0321, + "step": 3654 + }, + { + "epoch": 0.52, + "grad_norm": 8.967229604172513, + "learning_rate": 8.669034614774491e-06, + "loss": 1.0271, + "step": 3655 + }, + { + "epoch": 0.52, + "grad_norm": 7.571882013748855, + "learning_rate": 8.668250033234645e-06, + "loss": 1.0463, + "step": 3656 + }, + { + "epoch": 0.52, + "grad_norm": 9.232568628217564, + "learning_rate": 8.667465256040552e-06, + "loss": 0.9856, + "step": 3657 + }, + { + "epoch": 0.52, + "grad_norm": 6.9065375667694, + "learning_rate": 8.66668028323407e-06, + "loss": 1.0618, + "step": 3658 + }, + { + "epoch": 0.52, + "grad_norm": 10.12445074524835, + "learning_rate": 8.665895114857069e-06, + "loss": 0.9773, + "step": 3659 + }, + { + "epoch": 0.52, + "grad_norm": 8.364936839910454, + "learning_rate": 8.665109750951426e-06, + "loss": 1.0554, + "step": 3660 + }, + { + "epoch": 0.52, + "grad_norm": 7.345885030156748, + "learning_rate": 8.66432419155903e-06, + "loss": 0.967, + "step": 3661 + }, + { + "epoch": 0.52, + "grad_norm": 9.437490162841584, + "learning_rate": 8.663538436721782e-06, + "loss": 1.0284, + "step": 3662 + }, + { + "epoch": 0.52, + "grad_norm": 7.427872425816894, + "learning_rate": 8.662752486481591e-06, + "loss": 0.9123, + "step": 3663 + }, + { + "epoch": 0.52, + "grad_norm": 
7.4673076715870765, + "learning_rate": 8.66196634088038e-06, + "loss": 0.9395, + "step": 3664 + }, + { + "epoch": 0.52, + "grad_norm": 10.061434801074666, + "learning_rate": 8.661179999960074e-06, + "loss": 1.0142, + "step": 3665 + }, + { + "epoch": 0.52, + "grad_norm": 8.952912909124084, + "learning_rate": 8.660393463762619e-06, + "loss": 0.9683, + "step": 3666 + }, + { + "epoch": 0.52, + "grad_norm": 5.670757200066841, + "learning_rate": 8.659606732329965e-06, + "loss": 0.9509, + "step": 3667 + }, + { + "epoch": 0.52, + "grad_norm": 7.4128299690331625, + "learning_rate": 8.658819805704076e-06, + "loss": 0.9779, + "step": 3668 + }, + { + "epoch": 0.52, + "grad_norm": 7.170327135424686, + "learning_rate": 8.658032683926923e-06, + "loss": 1.056, + "step": 3669 + }, + { + "epoch": 0.52, + "grad_norm": 7.054099528928581, + "learning_rate": 8.657245367040488e-06, + "loss": 0.9352, + "step": 3670 + }, + { + "epoch": 0.52, + "grad_norm": 6.5503675935978745, + "learning_rate": 8.656457855086763e-06, + "loss": 1.0545, + "step": 3671 + }, + { + "epoch": 0.52, + "grad_norm": 6.845824798935914, + "learning_rate": 8.655670148107758e-06, + "loss": 0.919, + "step": 3672 + }, + { + "epoch": 0.52, + "grad_norm": 6.465918002325426, + "learning_rate": 8.65488224614548e-06, + "loss": 1.0441, + "step": 3673 + }, + { + "epoch": 0.52, + "grad_norm": 7.140377341952155, + "learning_rate": 8.654094149241958e-06, + "loss": 1.0112, + "step": 3674 + }, + { + "epoch": 0.52, + "grad_norm": 7.824515568202887, + "learning_rate": 8.653305857439224e-06, + "loss": 0.9935, + "step": 3675 + }, + { + "epoch": 0.52, + "grad_norm": 7.535939172386517, + "learning_rate": 8.652517370779325e-06, + "loss": 1.0795, + "step": 3676 + }, + { + "epoch": 0.52, + "grad_norm": 8.30617631533632, + "learning_rate": 8.651728689304317e-06, + "loss": 1.0853, + "step": 3677 + }, + { + "epoch": 0.52, + "grad_norm": 7.632466485455502, + "learning_rate": 8.650939813056266e-06, + "loss": 1.0713, + "step": 3678 + }, + { + 
"epoch": 0.52, + "grad_norm": 8.765209535135373, + "learning_rate": 8.650150742077246e-06, + "loss": 0.9933, + "step": 3679 + }, + { + "epoch": 0.52, + "grad_norm": 9.392565771337415, + "learning_rate": 8.649361476409347e-06, + "loss": 1.0666, + "step": 3680 + }, + { + "epoch": 0.52, + "grad_norm": 7.419659382656632, + "learning_rate": 8.648572016094663e-06, + "loss": 0.9619, + "step": 3681 + }, + { + "epoch": 0.53, + "grad_norm": 8.05433602949691, + "learning_rate": 8.647782361175306e-06, + "loss": 0.9702, + "step": 3682 + }, + { + "epoch": 0.53, + "grad_norm": 8.918773387259074, + "learning_rate": 8.646992511693391e-06, + "loss": 0.992, + "step": 3683 + }, + { + "epoch": 0.53, + "grad_norm": 6.013425680254025, + "learning_rate": 8.646202467691045e-06, + "loss": 1.0077, + "step": 3684 + }, + { + "epoch": 0.53, + "grad_norm": 7.63540702665708, + "learning_rate": 8.64541222921041e-06, + "loss": 1.0762, + "step": 3685 + }, + { + "epoch": 0.53, + "grad_norm": 8.007332747381769, + "learning_rate": 8.644621796293633e-06, + "loss": 0.9559, + "step": 3686 + }, + { + "epoch": 0.53, + "grad_norm": 6.347775001818078, + "learning_rate": 8.643831168982876e-06, + "loss": 0.9581, + "step": 3687 + }, + { + "epoch": 0.53, + "grad_norm": 7.225288204256018, + "learning_rate": 8.643040347320306e-06, + "loss": 1.074, + "step": 3688 + }, + { + "epoch": 0.53, + "grad_norm": 8.534231025032796, + "learning_rate": 8.642249331348105e-06, + "loss": 1.0014, + "step": 3689 + }, + { + "epoch": 0.53, + "grad_norm": 5.700226724255266, + "learning_rate": 8.641458121108463e-06, + "loss": 0.9781, + "step": 3690 + }, + { + "epoch": 0.53, + "grad_norm": 8.926791341578795, + "learning_rate": 8.640666716643579e-06, + "loss": 1.0018, + "step": 3691 + }, + { + "epoch": 0.53, + "grad_norm": 7.589433091785266, + "learning_rate": 8.639875117995669e-06, + "loss": 1.0655, + "step": 3692 + }, + { + "epoch": 0.53, + "grad_norm": 6.921304451329064, + "learning_rate": 8.63908332520695e-06, + "loss": 1.0266, + 
"step": 3693 + }, + { + "epoch": 0.53, + "grad_norm": 7.177617903027173, + "learning_rate": 8.638291338319655e-06, + "loss": 1.0352, + "step": 3694 + }, + { + "epoch": 0.53, + "grad_norm": 9.34377991706078, + "learning_rate": 8.637499157376029e-06, + "loss": 0.9546, + "step": 3695 + }, + { + "epoch": 0.53, + "grad_norm": 9.940089106788761, + "learning_rate": 8.636706782418323e-06, + "loss": 1.0396, + "step": 3696 + }, + { + "epoch": 0.53, + "grad_norm": 10.881923141056692, + "learning_rate": 8.6359142134888e-06, + "loss": 1.0681, + "step": 3697 + }, + { + "epoch": 0.53, + "grad_norm": 9.373642850006474, + "learning_rate": 8.635121450629733e-06, + "loss": 1.0129, + "step": 3698 + }, + { + "epoch": 0.53, + "grad_norm": 7.205714854876972, + "learning_rate": 8.634328493883407e-06, + "loss": 1.0139, + "step": 3699 + }, + { + "epoch": 0.53, + "grad_norm": 8.92684157180635, + "learning_rate": 8.633535343292113e-06, + "loss": 1.0375, + "step": 3700 + }, + { + "epoch": 0.53, + "grad_norm": 7.861269136616237, + "learning_rate": 8.63274199889816e-06, + "loss": 0.9566, + "step": 3701 + }, + { + "epoch": 0.53, + "grad_norm": 6.7027041073175395, + "learning_rate": 8.63194846074386e-06, + "loss": 0.9908, + "step": 3702 + }, + { + "epoch": 0.53, + "grad_norm": 7.487529354046191, + "learning_rate": 8.631154728871538e-06, + "loss": 1.0237, + "step": 3703 + }, + { + "epoch": 0.53, + "grad_norm": 9.089009428942687, + "learning_rate": 8.630360803323532e-06, + "loss": 1.0313, + "step": 3704 + }, + { + "epoch": 0.53, + "grad_norm": 8.566024058931315, + "learning_rate": 8.629566684142184e-06, + "loss": 1.0097, + "step": 3705 + }, + { + "epoch": 0.53, + "grad_norm": 5.194958819902575, + "learning_rate": 8.628772371369854e-06, + "loss": 0.9869, + "step": 3706 + }, + { + "epoch": 0.53, + "grad_norm": 6.7165761606516385, + "learning_rate": 8.627977865048904e-06, + "loss": 0.9191, + "step": 3707 + }, + { + "epoch": 0.53, + "grad_norm": 8.889516867041978, + "learning_rate": 
8.627183165221716e-06, + "loss": 1.0545, + "step": 3708 + }, + { + "epoch": 0.53, + "grad_norm": 6.436590735915177, + "learning_rate": 8.626388271930673e-06, + "loss": 1.0136, + "step": 3709 + }, + { + "epoch": 0.53, + "grad_norm": 7.079165558243544, + "learning_rate": 8.625593185218172e-06, + "loss": 1.005, + "step": 3710 + }, + { + "epoch": 0.53, + "grad_norm": 5.449734433200259, + "learning_rate": 8.624797905126625e-06, + "loss": 0.9772, + "step": 3711 + }, + { + "epoch": 0.53, + "grad_norm": 7.282183219323283, + "learning_rate": 8.624002431698446e-06, + "loss": 0.9806, + "step": 3712 + }, + { + "epoch": 0.53, + "grad_norm": 10.400393899902536, + "learning_rate": 8.623206764976065e-06, + "loss": 0.9906, + "step": 3713 + }, + { + "epoch": 0.53, + "grad_norm": 9.707667531233993, + "learning_rate": 8.622410905001922e-06, + "loss": 1.0196, + "step": 3714 + }, + { + "epoch": 0.53, + "grad_norm": 6.722144117645195, + "learning_rate": 8.621614851818462e-06, + "loss": 0.9974, + "step": 3715 + }, + { + "epoch": 0.53, + "grad_norm": 8.332186231204874, + "learning_rate": 8.62081860546815e-06, + "loss": 0.9734, + "step": 3716 + }, + { + "epoch": 0.53, + "grad_norm": 6.737647797034962, + "learning_rate": 8.620022165993447e-06, + "loss": 0.9743, + "step": 3717 + }, + { + "epoch": 0.53, + "grad_norm": 6.004171042652465, + "learning_rate": 8.619225533436841e-06, + "loss": 0.9768, + "step": 3718 + }, + { + "epoch": 0.53, + "grad_norm": 7.697845901456156, + "learning_rate": 8.61842870784082e-06, + "loss": 0.9789, + "step": 3719 + }, + { + "epoch": 0.53, + "grad_norm": 45.59298211931955, + "learning_rate": 8.617631689247882e-06, + "loss": 0.9752, + "step": 3720 + }, + { + "epoch": 0.53, + "grad_norm": 10.473969405502176, + "learning_rate": 8.616834477700541e-06, + "loss": 0.9707, + "step": 3721 + }, + { + "epoch": 0.53, + "grad_norm": 6.413320285053284, + "learning_rate": 8.616037073241314e-06, + "loss": 0.9879, + "step": 3722 + }, + { + "epoch": 0.53, + "grad_norm": 
5.430343619869026, + "learning_rate": 8.615239475912738e-06, + "loss": 1.1312, + "step": 3723 + }, + { + "epoch": 0.53, + "grad_norm": 7.175701678850598, + "learning_rate": 8.61444168575735e-06, + "loss": 1.0335, + "step": 3724 + }, + { + "epoch": 0.53, + "grad_norm": 9.500264612642496, + "learning_rate": 8.613643702817703e-06, + "loss": 1.0057, + "step": 3725 + }, + { + "epoch": 0.53, + "grad_norm": 10.131668236177623, + "learning_rate": 8.61284552713636e-06, + "loss": 0.9874, + "step": 3726 + }, + { + "epoch": 0.53, + "grad_norm": 8.145857416838217, + "learning_rate": 8.612047158755892e-06, + "loss": 1.0922, + "step": 3727 + }, + { + "epoch": 0.53, + "grad_norm": 10.824715815974661, + "learning_rate": 8.611248597718883e-06, + "loss": 0.9612, + "step": 3728 + }, + { + "epoch": 0.53, + "grad_norm": 7.879476932473115, + "learning_rate": 8.610449844067928e-06, + "loss": 1.0564, + "step": 3729 + }, + { + "epoch": 0.53, + "grad_norm": 7.890521233326047, + "learning_rate": 8.609650897845627e-06, + "loss": 0.9849, + "step": 3730 + }, + { + "epoch": 0.53, + "grad_norm": 7.732553957606645, + "learning_rate": 8.608851759094594e-06, + "loss": 1.0458, + "step": 3731 + }, + { + "epoch": 0.53, + "grad_norm": 8.100348474547006, + "learning_rate": 8.608052427857453e-06, + "loss": 0.9597, + "step": 3732 + }, + { + "epoch": 0.53, + "grad_norm": 6.788850026753545, + "learning_rate": 8.60725290417684e-06, + "loss": 1.0489, + "step": 3733 + }, + { + "epoch": 0.53, + "grad_norm": 8.661195355927282, + "learning_rate": 8.606453188095397e-06, + "loss": 1.0384, + "step": 3734 + }, + { + "epoch": 0.53, + "grad_norm": 7.291595687229815, + "learning_rate": 8.605653279655779e-06, + "loss": 1.0253, + "step": 3735 + }, + { + "epoch": 0.53, + "grad_norm": 6.295571741882843, + "learning_rate": 8.604853178900652e-06, + "loss": 0.9943, + "step": 3736 + }, + { + "epoch": 0.53, + "grad_norm": 8.304940793560863, + "learning_rate": 8.604052885872691e-06, + "loss": 1.0788, + "step": 3737 + }, + { + 
"epoch": 0.53, + "grad_norm": 9.578095463361343, + "learning_rate": 8.603252400614581e-06, + "loss": 1.0295, + "step": 3738 + }, + { + "epoch": 0.53, + "grad_norm": 7.331830544346416, + "learning_rate": 8.602451723169017e-06, + "loss": 1.0315, + "step": 3739 + }, + { + "epoch": 0.53, + "grad_norm": 6.804024164068163, + "learning_rate": 8.601650853578707e-06, + "loss": 0.9741, + "step": 3740 + }, + { + "epoch": 0.53, + "grad_norm": 6.888922072486771, + "learning_rate": 8.600849791886364e-06, + "loss": 1.042, + "step": 3741 + }, + { + "epoch": 0.53, + "grad_norm": 8.23240266678335, + "learning_rate": 8.600048538134718e-06, + "loss": 0.9966, + "step": 3742 + }, + { + "epoch": 0.53, + "grad_norm": 7.414318557885941, + "learning_rate": 8.599247092366503e-06, + "loss": 1.0678, + "step": 3743 + }, + { + "epoch": 0.53, + "grad_norm": 8.209246123667272, + "learning_rate": 8.598445454624465e-06, + "loss": 1.0119, + "step": 3744 + }, + { + "epoch": 0.53, + "grad_norm": 6.6975440169870595, + "learning_rate": 8.597643624951366e-06, + "loss": 1.0233, + "step": 3745 + }, + { + "epoch": 0.53, + "grad_norm": 8.120793031165356, + "learning_rate": 8.596841603389967e-06, + "loss": 1.0748, + "step": 3746 + }, + { + "epoch": 0.53, + "grad_norm": 7.666909023157772, + "learning_rate": 8.596039389983051e-06, + "loss": 0.985, + "step": 3747 + }, + { + "epoch": 0.53, + "grad_norm": 7.070256718764912, + "learning_rate": 8.595236984773402e-06, + "loss": 1.0718, + "step": 3748 + }, + { + "epoch": 0.53, + "grad_norm": 7.186366307158066, + "learning_rate": 8.594434387803821e-06, + "loss": 0.9979, + "step": 3749 + }, + { + "epoch": 0.53, + "grad_norm": 9.026775414175894, + "learning_rate": 8.593631599117113e-06, + "loss": 1.1202, + "step": 3750 + }, + { + "epoch": 0.53, + "grad_norm": 9.16207327300797, + "learning_rate": 8.592828618756101e-06, + "loss": 0.9872, + "step": 3751 + }, + { + "epoch": 0.54, + "grad_norm": 6.23879090497157, + "learning_rate": 8.592025446763608e-06, + "loss": 1.0752, + 
"step": 3752 + }, + { + "epoch": 0.54, + "grad_norm": 7.491171293150036, + "learning_rate": 8.59122208318248e-06, + "loss": 0.9921, + "step": 3753 + }, + { + "epoch": 0.54, + "grad_norm": 7.75327374484352, + "learning_rate": 8.59041852805556e-06, + "loss": 1.0059, + "step": 3754 + }, + { + "epoch": 0.54, + "grad_norm": 7.668303669542436, + "learning_rate": 8.589614781425709e-06, + "loss": 1.0801, + "step": 3755 + }, + { + "epoch": 0.54, + "grad_norm": 6.906287043883962, + "learning_rate": 8.5888108433358e-06, + "loss": 0.9599, + "step": 3756 + }, + { + "epoch": 0.54, + "grad_norm": 7.600346791792754, + "learning_rate": 8.588006713828706e-06, + "loss": 1.0327, + "step": 3757 + }, + { + "epoch": 0.54, + "grad_norm": 10.224439194893682, + "learning_rate": 8.587202392947324e-06, + "loss": 1.0075, + "step": 3758 + }, + { + "epoch": 0.54, + "grad_norm": 9.437901088148372, + "learning_rate": 8.586397880734552e-06, + "loss": 0.9325, + "step": 3759 + }, + { + "epoch": 0.54, + "grad_norm": 5.65868296240151, + "learning_rate": 8.5855931772333e-06, + "loss": 1.0289, + "step": 3760 + }, + { + "epoch": 0.54, + "grad_norm": 6.768906132455501, + "learning_rate": 8.584788282486487e-06, + "loss": 0.9906, + "step": 3761 + }, + { + "epoch": 0.54, + "grad_norm": 10.456273839396491, + "learning_rate": 8.583983196537044e-06, + "loss": 1.0861, + "step": 3762 + }, + { + "epoch": 0.54, + "grad_norm": 7.016428424466722, + "learning_rate": 8.583177919427916e-06, + "loss": 1.0175, + "step": 3763 + }, + { + "epoch": 0.54, + "grad_norm": 8.156509517263654, + "learning_rate": 8.582372451202052e-06, + "loss": 1.0267, + "step": 3764 + }, + { + "epoch": 0.54, + "grad_norm": 6.63993722034386, + "learning_rate": 8.581566791902412e-06, + "loss": 0.9858, + "step": 3765 + }, + { + "epoch": 0.54, + "grad_norm": 7.286640065928069, + "learning_rate": 8.580760941571968e-06, + "loss": 1.0576, + "step": 3766 + }, + { + "epoch": 0.54, + "grad_norm": 7.978041285665146, + "learning_rate": 8.579954900253701e-06, + 
"loss": 1.0029, + "step": 3767 + }, + { + "epoch": 0.54, + "grad_norm": 7.497296098365652, + "learning_rate": 8.579148667990608e-06, + "loss": 1.0459, + "step": 3768 + }, + { + "epoch": 0.54, + "grad_norm": 7.441532172726469, + "learning_rate": 8.578342244825685e-06, + "loss": 1.017, + "step": 3769 + }, + { + "epoch": 0.54, + "grad_norm": 8.672123617269781, + "learning_rate": 8.577535630801949e-06, + "loss": 1.0229, + "step": 3770 + }, + { + "epoch": 0.54, + "grad_norm": 6.880704995867788, + "learning_rate": 8.57672882596242e-06, + "loss": 0.9982, + "step": 3771 + }, + { + "epoch": 0.54, + "grad_norm": 6.269200515017189, + "learning_rate": 8.575921830350131e-06, + "loss": 1.0176, + "step": 3772 + }, + { + "epoch": 0.54, + "grad_norm": 7.57870689726068, + "learning_rate": 8.575114644008125e-06, + "loss": 1.0808, + "step": 3773 + }, + { + "epoch": 0.54, + "grad_norm": 5.935356213497507, + "learning_rate": 8.574307266979456e-06, + "loss": 1.0643, + "step": 3774 + }, + { + "epoch": 0.54, + "grad_norm": 8.759981236852976, + "learning_rate": 8.573499699307187e-06, + "loss": 0.9843, + "step": 3775 + }, + { + "epoch": 0.54, + "grad_norm": 7.316306598915712, + "learning_rate": 8.57269194103439e-06, + "loss": 1.0128, + "step": 3776 + }, + { + "epoch": 0.54, + "grad_norm": 7.845767309183834, + "learning_rate": 8.571883992204149e-06, + "loss": 1.0423, + "step": 3777 + }, + { + "epoch": 0.54, + "grad_norm": 9.092792355198222, + "learning_rate": 8.571075852859558e-06, + "loss": 1.0105, + "step": 3778 + }, + { + "epoch": 0.54, + "grad_norm": 7.464875557408208, + "learning_rate": 8.570267523043722e-06, + "loss": 1.0947, + "step": 3779 + }, + { + "epoch": 0.54, + "grad_norm": 8.271076378568356, + "learning_rate": 8.569459002799757e-06, + "loss": 0.9939, + "step": 3780 + }, + { + "epoch": 0.54, + "grad_norm": 8.869201895974278, + "learning_rate": 8.568650292170779e-06, + "loss": 1.0171, + "step": 3781 + }, + { + "epoch": 0.54, + "grad_norm": 5.424795356087079, + "learning_rate": 
8.567841391199932e-06, + "loss": 1.0397, + "step": 3782 + }, + { + "epoch": 0.54, + "grad_norm": 7.346771771844434, + "learning_rate": 8.567032299930354e-06, + "loss": 1.0505, + "step": 3783 + }, + { + "epoch": 0.54, + "grad_norm": 10.206110011581737, + "learning_rate": 8.566223018405203e-06, + "loss": 0.9961, + "step": 3784 + }, + { + "epoch": 0.54, + "grad_norm": 7.237061200818936, + "learning_rate": 8.565413546667642e-06, + "loss": 0.9763, + "step": 3785 + }, + { + "epoch": 0.54, + "grad_norm": 7.730058297814936, + "learning_rate": 8.56460388476085e-06, + "loss": 1.0807, + "step": 3786 + }, + { + "epoch": 0.54, + "grad_norm": 7.725140813599425, + "learning_rate": 8.563794032728006e-06, + "loss": 0.9661, + "step": 3787 + }, + { + "epoch": 0.54, + "grad_norm": 12.115279534505149, + "learning_rate": 8.562983990612309e-06, + "loss": 1.0338, + "step": 3788 + }, + { + "epoch": 0.54, + "grad_norm": 7.872659035811887, + "learning_rate": 8.562173758456962e-06, + "loss": 0.9111, + "step": 3789 + }, + { + "epoch": 0.54, + "grad_norm": 8.963694105466683, + "learning_rate": 8.561363336305183e-06, + "loss": 1.0176, + "step": 3790 + }, + { + "epoch": 0.54, + "grad_norm": 7.881113510180743, + "learning_rate": 8.560552724200198e-06, + "loss": 1.0, + "step": 3791 + }, + { + "epoch": 0.54, + "grad_norm": 10.667994915315708, + "learning_rate": 8.55974192218524e-06, + "loss": 1.0063, + "step": 3792 + }, + { + "epoch": 0.54, + "grad_norm": 9.69347884979855, + "learning_rate": 8.558930930303558e-06, + "loss": 0.9772, + "step": 3793 + }, + { + "epoch": 0.54, + "grad_norm": 7.3039743146863945, + "learning_rate": 8.558119748598405e-06, + "loss": 0.9879, + "step": 3794 + }, + { + "epoch": 0.54, + "grad_norm": 8.193236617589843, + "learning_rate": 8.55730837711305e-06, + "loss": 0.9907, + "step": 3795 + }, + { + "epoch": 0.54, + "grad_norm": 6.4149974015127, + "learning_rate": 8.556496815890767e-06, + "loss": 0.9858, + "step": 3796 + }, + { + "epoch": 0.54, + "grad_norm": 
9.089987833865626, + "learning_rate": 8.555685064974843e-06, + "loss": 0.9705, + "step": 3797 + }, + { + "epoch": 0.54, + "grad_norm": 9.636032243348044, + "learning_rate": 8.554873124408575e-06, + "loss": 0.9883, + "step": 3798 + }, + { + "epoch": 0.54, + "grad_norm": 6.572227540665724, + "learning_rate": 8.554060994235272e-06, + "loss": 1.0387, + "step": 3799 + }, + { + "epoch": 0.54, + "grad_norm": 6.38415278605002, + "learning_rate": 8.553248674498248e-06, + "loss": 0.9872, + "step": 3800 + }, + { + "epoch": 0.54, + "grad_norm": 5.882876339008534, + "learning_rate": 8.552436165240827e-06, + "loss": 1.0028, + "step": 3801 + }, + { + "epoch": 0.54, + "grad_norm": 9.734901669190458, + "learning_rate": 8.551623466506351e-06, + "loss": 1.0682, + "step": 3802 + }, + { + "epoch": 0.54, + "grad_norm": 7.580772640552674, + "learning_rate": 8.550810578338167e-06, + "loss": 1.0143, + "step": 3803 + }, + { + "epoch": 0.54, + "grad_norm": 8.903344821972034, + "learning_rate": 8.549997500779627e-06, + "loss": 0.9217, + "step": 3804 + }, + { + "epoch": 0.54, + "grad_norm": 8.957696165477051, + "learning_rate": 8.549184233874105e-06, + "loss": 1.0193, + "step": 3805 + }, + { + "epoch": 0.54, + "grad_norm": 6.713910065435993, + "learning_rate": 8.548370777664975e-06, + "loss": 1.0215, + "step": 3806 + }, + { + "epoch": 0.54, + "grad_norm": 7.384662236417772, + "learning_rate": 8.547557132195622e-06, + "loss": 0.985, + "step": 3807 + }, + { + "epoch": 0.54, + "grad_norm": 8.35251253919935, + "learning_rate": 8.546743297509448e-06, + "loss": 1.0329, + "step": 3808 + }, + { + "epoch": 0.54, + "grad_norm": 8.287307709765246, + "learning_rate": 8.54592927364986e-06, + "loss": 0.9506, + "step": 3809 + }, + { + "epoch": 0.54, + "grad_norm": 8.564093950753376, + "learning_rate": 8.545115060660273e-06, + "loss": 1.0772, + "step": 3810 + }, + { + "epoch": 0.54, + "grad_norm": 8.095262833727183, + "learning_rate": 8.544300658584117e-06, + "loss": 1.0193, + "step": 3811 + }, + { + "epoch": 
0.54, + "grad_norm": 7.035043490611317, + "learning_rate": 8.54348606746483e-06, + "loss": 0.9515, + "step": 3812 + }, + { + "epoch": 0.54, + "grad_norm": 6.747127721139274, + "learning_rate": 8.542671287345859e-06, + "loss": 0.9404, + "step": 3813 + }, + { + "epoch": 0.54, + "grad_norm": 8.201143981235063, + "learning_rate": 8.541856318270663e-06, + "loss": 1.0094, + "step": 3814 + }, + { + "epoch": 0.54, + "grad_norm": 9.720605600779027, + "learning_rate": 8.541041160282709e-06, + "loss": 1.0597, + "step": 3815 + }, + { + "epoch": 0.54, + "grad_norm": 8.814158664274233, + "learning_rate": 8.540225813425477e-06, + "loss": 0.9461, + "step": 3816 + }, + { + "epoch": 0.54, + "grad_norm": 8.460778869481063, + "learning_rate": 8.539410277742453e-06, + "loss": 0.9826, + "step": 3817 + }, + { + "epoch": 0.54, + "grad_norm": 7.685331377353647, + "learning_rate": 8.538594553277139e-06, + "loss": 0.9714, + "step": 3818 + }, + { + "epoch": 0.54, + "grad_norm": 8.471092504790825, + "learning_rate": 8.53777864007304e-06, + "loss": 1.0197, + "step": 3819 + }, + { + "epoch": 0.54, + "grad_norm": 7.5486473970350785, + "learning_rate": 8.536962538173676e-06, + "loss": 0.9459, + "step": 3820 + }, + { + "epoch": 0.54, + "grad_norm": 7.45989276026594, + "learning_rate": 8.536146247622574e-06, + "loss": 0.9573, + "step": 3821 + }, + { + "epoch": 0.55, + "grad_norm": 9.729650927965642, + "learning_rate": 8.535329768463276e-06, + "loss": 1.0033, + "step": 3822 + }, + { + "epoch": 0.55, + "grad_norm": 6.475169427818198, + "learning_rate": 8.53451310073933e-06, + "loss": 1.0398, + "step": 3823 + }, + { + "epoch": 0.55, + "grad_norm": 7.009750177531681, + "learning_rate": 8.53369624449429e-06, + "loss": 1.0441, + "step": 3824 + }, + { + "epoch": 0.55, + "grad_norm": 6.801296233196933, + "learning_rate": 8.532879199771731e-06, + "loss": 0.9347, + "step": 3825 + }, + { + "epoch": 0.55, + "grad_norm": 8.12284130004184, + "learning_rate": 8.532061966615228e-06, + "loss": 0.9855, + "step": 3826 
+ }, + { + "epoch": 0.55, + "grad_norm": 9.954299095741856, + "learning_rate": 8.531244545068372e-06, + "loss": 1.0219, + "step": 3827 + }, + { + "epoch": 0.55, + "grad_norm": 9.424177449495623, + "learning_rate": 8.530426935174762e-06, + "loss": 0.9211, + "step": 3828 + }, + { + "epoch": 0.55, + "grad_norm": 7.137457255277199, + "learning_rate": 8.529609136978006e-06, + "loss": 0.9331, + "step": 3829 + }, + { + "epoch": 0.55, + "grad_norm": 6.474494429883433, + "learning_rate": 8.528791150521723e-06, + "loss": 1.0206, + "step": 3830 + }, + { + "epoch": 0.55, + "grad_norm": 8.93172076442788, + "learning_rate": 8.52797297584954e-06, + "loss": 1.0686, + "step": 3831 + }, + { + "epoch": 0.55, + "grad_norm": 6.239041282485982, + "learning_rate": 8.527154613005102e-06, + "loss": 1.0296, + "step": 3832 + }, + { + "epoch": 0.55, + "grad_norm": 8.399217995573204, + "learning_rate": 8.526336062032053e-06, + "loss": 1.0239, + "step": 3833 + }, + { + "epoch": 0.55, + "grad_norm": 7.079640999602016, + "learning_rate": 8.525517322974056e-06, + "loss": 0.9757, + "step": 3834 + }, + { + "epoch": 0.55, + "grad_norm": 6.032484885467756, + "learning_rate": 8.524698395874776e-06, + "loss": 1.0095, + "step": 3835 + }, + { + "epoch": 0.55, + "grad_norm": 8.429826023522642, + "learning_rate": 8.523879280777896e-06, + "loss": 0.9494, + "step": 3836 + }, + { + "epoch": 0.55, + "grad_norm": 7.163078743609877, + "learning_rate": 8.523059977727105e-06, + "loss": 1.0167, + "step": 3837 + }, + { + "epoch": 0.55, + "grad_norm": 6.954648446783138, + "learning_rate": 8.5222404867661e-06, + "loss": 1.0302, + "step": 3838 + }, + { + "epoch": 0.55, + "grad_norm": 5.91300665695355, + "learning_rate": 8.521420807938591e-06, + "loss": 0.9869, + "step": 3839 + }, + { + "epoch": 0.55, + "grad_norm": 8.95222068052177, + "learning_rate": 8.520600941288298e-06, + "loss": 1.0397, + "step": 3840 + }, + { + "epoch": 0.55, + "grad_norm": 8.019726048917587, + "learning_rate": 8.51978088685895e-06, + "loss": 
0.9937, + "step": 3841 + }, + { + "epoch": 0.55, + "grad_norm": 7.5220950972389415, + "learning_rate": 8.51896064469429e-06, + "loss": 1.0453, + "step": 3842 + }, + { + "epoch": 0.55, + "grad_norm": 9.519193421204836, + "learning_rate": 8.518140214838062e-06, + "loss": 0.9188, + "step": 3843 + }, + { + "epoch": 0.55, + "grad_norm": 7.1111923939671335, + "learning_rate": 8.517319597334029e-06, + "loss": 0.9753, + "step": 3844 + }, + { + "epoch": 0.55, + "grad_norm": 10.08251342494295, + "learning_rate": 8.516498792225958e-06, + "loss": 1.0157, + "step": 3845 + }, + { + "epoch": 0.55, + "grad_norm": 9.83007905092667, + "learning_rate": 8.515677799557629e-06, + "loss": 0.9785, + "step": 3846 + }, + { + "epoch": 0.55, + "grad_norm": 6.748435782021481, + "learning_rate": 8.514856619372832e-06, + "loss": 1.0563, + "step": 3847 + }, + { + "epoch": 0.55, + "grad_norm": 8.037204822530315, + "learning_rate": 8.514035251715369e-06, + "loss": 0.9411, + "step": 3848 + }, + { + "epoch": 0.55, + "grad_norm": 7.440216763683607, + "learning_rate": 8.513213696629046e-06, + "loss": 0.977, + "step": 3849 + }, + { + "epoch": 0.55, + "grad_norm": 7.797372666499376, + "learning_rate": 8.512391954157682e-06, + "loss": 0.9675, + "step": 3850 + }, + { + "epoch": 0.55, + "grad_norm": 8.910770082616132, + "learning_rate": 8.51157002434511e-06, + "loss": 0.9638, + "step": 3851 + }, + { + "epoch": 0.55, + "grad_norm": 6.6109563345714495, + "learning_rate": 8.510747907235167e-06, + "loss": 1.0175, + "step": 3852 + }, + { + "epoch": 0.55, + "grad_norm": 7.143012021101079, + "learning_rate": 8.509925602871702e-06, + "loss": 1.0638, + "step": 3853 + }, + { + "epoch": 0.55, + "grad_norm": 7.45844570281024, + "learning_rate": 8.509103111298577e-06, + "loss": 0.9855, + "step": 3854 + }, + { + "epoch": 0.55, + "grad_norm": 7.682099944051118, + "learning_rate": 8.508280432559656e-06, + "loss": 1.0523, + "step": 3855 + }, + { + "epoch": 0.55, + "grad_norm": 8.691277077523996, + "learning_rate": 
8.507457566698827e-06, + "loss": 1.0868, + "step": 3856 + }, + { + "epoch": 0.55, + "grad_norm": 8.139772801487508, + "learning_rate": 8.50663451375997e-06, + "loss": 0.986, + "step": 3857 + }, + { + "epoch": 0.55, + "grad_norm": 8.099785071022376, + "learning_rate": 8.505811273786992e-06, + "loss": 0.9825, + "step": 3858 + }, + { + "epoch": 0.55, + "grad_norm": 7.092901030307633, + "learning_rate": 8.504987846823798e-06, + "loss": 1.0085, + "step": 3859 + }, + { + "epoch": 0.55, + "grad_norm": 8.28550592077301, + "learning_rate": 8.50416423291431e-06, + "loss": 1.0812, + "step": 3860 + }, + { + "epoch": 0.55, + "grad_norm": 8.67141192225834, + "learning_rate": 8.503340432102453e-06, + "loss": 0.9933, + "step": 3861 + }, + { + "epoch": 0.55, + "grad_norm": 7.664029294049827, + "learning_rate": 8.50251644443217e-06, + "loss": 1.0406, + "step": 3862 + }, + { + "epoch": 0.55, + "grad_norm": 5.659440036358569, + "learning_rate": 8.50169226994741e-06, + "loss": 1.0152, + "step": 3863 + }, + { + "epoch": 0.55, + "grad_norm": 6.461791838925811, + "learning_rate": 8.50086790869213e-06, + "loss": 1.071, + "step": 3864 + }, + { + "epoch": 0.55, + "grad_norm": 7.0377798230805775, + "learning_rate": 8.500043360710301e-06, + "loss": 0.9943, + "step": 3865 + }, + { + "epoch": 0.55, + "grad_norm": 7.545354989849371, + "learning_rate": 8.499218626045903e-06, + "loss": 1.0413, + "step": 3866 + }, + { + "epoch": 0.55, + "grad_norm": 7.721494946961592, + "learning_rate": 8.498393704742922e-06, + "loss": 1.0404, + "step": 3867 + }, + { + "epoch": 0.55, + "grad_norm": 9.443722172331887, + "learning_rate": 8.497568596845358e-06, + "loss": 1.0262, + "step": 3868 + }, + { + "epoch": 0.55, + "grad_norm": 9.164354716633266, + "learning_rate": 8.49674330239722e-06, + "loss": 1.0263, + "step": 3869 + }, + { + "epoch": 0.55, + "grad_norm": 9.202989150638595, + "learning_rate": 8.495917821442531e-06, + "loss": 0.9434, + "step": 3870 + }, + { + "epoch": 0.55, + "grad_norm": 8.778322744599922, + 
"learning_rate": 8.495092154025316e-06, + "loss": 1.0131, + "step": 3871 + }, + { + "epoch": 0.55, + "grad_norm": 7.180727695627745, + "learning_rate": 8.494266300189611e-06, + "loss": 1.0276, + "step": 3872 + }, + { + "epoch": 0.55, + "grad_norm": 6.4518053225614755, + "learning_rate": 8.49344025997947e-06, + "loss": 1.0941, + "step": 3873 + }, + { + "epoch": 0.55, + "grad_norm": 8.14517103452142, + "learning_rate": 8.492614033438949e-06, + "loss": 0.9914, + "step": 3874 + }, + { + "epoch": 0.55, + "grad_norm": 6.6681694874956925, + "learning_rate": 8.491787620612118e-06, + "loss": 1.0358, + "step": 3875 + }, + { + "epoch": 0.55, + "grad_norm": 5.780186193969148, + "learning_rate": 8.490961021543054e-06, + "loss": 1.1086, + "step": 3876 + }, + { + "epoch": 0.55, + "grad_norm": 8.535269453691395, + "learning_rate": 8.490134236275846e-06, + "loss": 0.9803, + "step": 3877 + }, + { + "epoch": 0.55, + "grad_norm": 8.342229969764816, + "learning_rate": 8.489307264854593e-06, + "loss": 1.1103, + "step": 3878 + }, + { + "epoch": 0.55, + "grad_norm": 7.0006254340646255, + "learning_rate": 8.488480107323403e-06, + "loss": 0.9804, + "step": 3879 + }, + { + "epoch": 0.55, + "grad_norm": 5.851072691243369, + "learning_rate": 8.487652763726395e-06, + "loss": 1.0171, + "step": 3880 + }, + { + "epoch": 0.55, + "grad_norm": 7.821041123230156, + "learning_rate": 8.486825234107696e-06, + "loss": 1.0593, + "step": 3881 + }, + { + "epoch": 0.55, + "grad_norm": 7.1153782478685255, + "learning_rate": 8.485997518511444e-06, + "loss": 0.985, + "step": 3882 + }, + { + "epoch": 0.55, + "grad_norm": 9.35595216870922, + "learning_rate": 8.485169616981787e-06, + "loss": 0.958, + "step": 3883 + }, + { + "epoch": 0.55, + "grad_norm": 6.571688875194124, + "learning_rate": 8.484341529562886e-06, + "loss": 1.0186, + "step": 3884 + }, + { + "epoch": 0.55, + "grad_norm": 7.8036802401226515, + "learning_rate": 8.483513256298907e-06, + "loss": 0.9896, + "step": 3885 + }, + { + "epoch": 0.55, + 
"grad_norm": 8.623956912824035, + "learning_rate": 8.482684797234024e-06, + "loss": 0.9993, + "step": 3886 + }, + { + "epoch": 0.55, + "grad_norm": 7.474042282894859, + "learning_rate": 8.48185615241243e-06, + "loss": 0.9611, + "step": 3887 + }, + { + "epoch": 0.55, + "grad_norm": 5.753292229114149, + "learning_rate": 8.481027321878322e-06, + "loss": 0.9793, + "step": 3888 + }, + { + "epoch": 0.55, + "grad_norm": 5.945504754297105, + "learning_rate": 8.480198305675903e-06, + "loss": 1.031, + "step": 3889 + }, + { + "epoch": 0.55, + "grad_norm": 6.774438935377229, + "learning_rate": 8.479369103849396e-06, + "loss": 0.9923, + "step": 3890 + }, + { + "epoch": 0.55, + "grad_norm": 6.877983954221773, + "learning_rate": 8.478539716443027e-06, + "loss": 0.9849, + "step": 3891 + }, + { + "epoch": 0.56, + "grad_norm": 7.61444206249251, + "learning_rate": 8.47771014350103e-06, + "loss": 0.9434, + "step": 3892 + }, + { + "epoch": 0.56, + "grad_norm": 8.088129574058595, + "learning_rate": 8.476880385067655e-06, + "loss": 1.0472, + "step": 3893 + }, + { + "epoch": 0.56, + "grad_norm": 7.325081918612979, + "learning_rate": 8.476050441187156e-06, + "loss": 1.0367, + "step": 3894 + }, + { + "epoch": 0.56, + "grad_norm": 7.026980565588253, + "learning_rate": 8.475220311903804e-06, + "loss": 0.9905, + "step": 3895 + }, + { + "epoch": 0.56, + "grad_norm": 4.709946710258868, + "learning_rate": 8.474389997261874e-06, + "loss": 0.9863, + "step": 3896 + }, + { + "epoch": 0.56, + "grad_norm": 5.991130325100333, + "learning_rate": 8.473559497305653e-06, + "loss": 1.0082, + "step": 3897 + }, + { + "epoch": 0.56, + "grad_norm": 6.384794268422774, + "learning_rate": 8.472728812079436e-06, + "loss": 1.0292, + "step": 3898 + }, + { + "epoch": 0.56, + "grad_norm": 6.222294516117738, + "learning_rate": 8.471897941627531e-06, + "loss": 1.0381, + "step": 3899 + }, + { + "epoch": 0.56, + "grad_norm": 6.258188078999031, + "learning_rate": 8.471066885994253e-06, + "loss": 0.9394, + "step": 3900 + }, + 
{ + "epoch": 0.56, + "grad_norm": 8.921453862452754, + "learning_rate": 8.47023564522393e-06, + "loss": 1.0192, + "step": 3901 + }, + { + "epoch": 0.56, + "grad_norm": 7.3199187877328935, + "learning_rate": 8.469404219360898e-06, + "loss": 1.0709, + "step": 3902 + }, + { + "epoch": 0.56, + "grad_norm": 7.474076062042372, + "learning_rate": 8.468572608449501e-06, + "loss": 0.9034, + "step": 3903 + }, + { + "epoch": 0.56, + "grad_norm": 9.030629333365983, + "learning_rate": 8.467740812534095e-06, + "loss": 1.0062, + "step": 3904 + }, + { + "epoch": 0.56, + "grad_norm": 7.686940811922775, + "learning_rate": 8.466908831659049e-06, + "loss": 0.9989, + "step": 3905 + }, + { + "epoch": 0.56, + "grad_norm": 8.875538471314485, + "learning_rate": 8.466076665868734e-06, + "loss": 1.0109, + "step": 3906 + }, + { + "epoch": 0.56, + "grad_norm": 6.398785452630272, + "learning_rate": 8.46524431520754e-06, + "loss": 0.9748, + "step": 3907 + }, + { + "epoch": 0.56, + "grad_norm": 7.2147550826408535, + "learning_rate": 8.464411779719857e-06, + "loss": 0.921, + "step": 3908 + }, + { + "epoch": 0.56, + "grad_norm": 7.758584490442917, + "learning_rate": 8.463579059450094e-06, + "loss": 0.9081, + "step": 3909 + }, + { + "epoch": 0.56, + "grad_norm": 10.527067401938856, + "learning_rate": 8.462746154442665e-06, + "loss": 0.9402, + "step": 3910 + }, + { + "epoch": 0.56, + "grad_norm": 7.721786967697753, + "learning_rate": 8.461913064741996e-06, + "loss": 1.0833, + "step": 3911 + }, + { + "epoch": 0.56, + "grad_norm": 7.75250683112846, + "learning_rate": 8.461079790392518e-06, + "loss": 1.0734, + "step": 3912 + }, + { + "epoch": 0.56, + "grad_norm": 5.52463185519099, + "learning_rate": 8.46024633143868e-06, + "loss": 1.0388, + "step": 3913 + }, + { + "epoch": 0.56, + "grad_norm": 8.23207438937991, + "learning_rate": 8.459412687924934e-06, + "loss": 1.0088, + "step": 3914 + }, + { + "epoch": 0.56, + "grad_norm": 7.942805820470846, + "learning_rate": 8.458578859895743e-06, + "loss": 0.9371, 
+ "step": 3915 + }, + { + "epoch": 0.56, + "grad_norm": 8.910681311966089, + "learning_rate": 8.457744847395583e-06, + "loss": 0.9338, + "step": 3916 + }, + { + "epoch": 0.56, + "grad_norm": 8.947103390661091, + "learning_rate": 8.45691065046894e-06, + "loss": 1.0678, + "step": 3917 + }, + { + "epoch": 0.56, + "grad_norm": 7.659525331474169, + "learning_rate": 8.456076269160302e-06, + "loss": 1.0129, + "step": 3918 + }, + { + "epoch": 0.56, + "grad_norm": 5.17091518197667, + "learning_rate": 8.455241703514179e-06, + "loss": 1.0286, + "step": 3919 + }, + { + "epoch": 0.56, + "grad_norm": 6.108016166019402, + "learning_rate": 8.454406953575079e-06, + "loss": 0.9518, + "step": 3920 + }, + { + "epoch": 0.56, + "grad_norm": 7.142065645217521, + "learning_rate": 8.453572019387529e-06, + "loss": 0.9382, + "step": 3921 + }, + { + "epoch": 0.56, + "grad_norm": 9.50886552308845, + "learning_rate": 8.452736900996057e-06, + "loss": 1.0223, + "step": 3922 + }, + { + "epoch": 0.56, + "grad_norm": 8.924981361118714, + "learning_rate": 8.451901598445214e-06, + "loss": 0.9744, + "step": 3923 + }, + { + "epoch": 0.56, + "grad_norm": 7.453539457300747, + "learning_rate": 8.451066111779546e-06, + "loss": 0.9612, + "step": 3924 + }, + { + "epoch": 0.56, + "grad_norm": 8.315958285414583, + "learning_rate": 8.45023044104362e-06, + "loss": 1.0865, + "step": 3925 + }, + { + "epoch": 0.56, + "grad_norm": 7.512422969781047, + "learning_rate": 8.449394586282002e-06, + "loss": 0.9818, + "step": 3926 + }, + { + "epoch": 0.56, + "grad_norm": 7.813683275005901, + "learning_rate": 8.448558547539281e-06, + "loss": 0.9253, + "step": 3927 + }, + { + "epoch": 0.56, + "grad_norm": 8.333744122779613, + "learning_rate": 8.447722324860045e-06, + "loss": 1.079, + "step": 3928 + }, + { + "epoch": 0.56, + "grad_norm": 7.804306037254503, + "learning_rate": 8.446885918288897e-06, + "loss": 0.9789, + "step": 3929 + }, + { + "epoch": 0.56, + "grad_norm": 6.6227740625430265, + "learning_rate": 
8.44604932787045e-06, + "loss": 1.0282, + "step": 3930 + }, + { + "epoch": 0.56, + "grad_norm": 8.51507259244182, + "learning_rate": 8.445212553649323e-06, + "loss": 1.0367, + "step": 3931 + }, + { + "epoch": 0.56, + "grad_norm": 10.15179405098759, + "learning_rate": 8.444375595670148e-06, + "loss": 1.0023, + "step": 3932 + }, + { + "epoch": 0.56, + "grad_norm": 6.8524264883799715, + "learning_rate": 8.443538453977566e-06, + "loss": 0.9759, + "step": 3933 + }, + { + "epoch": 0.56, + "grad_norm": 5.556820274413043, + "learning_rate": 8.442701128616228e-06, + "loss": 1.0072, + "step": 3934 + }, + { + "epoch": 0.56, + "grad_norm": 8.122887416263007, + "learning_rate": 8.441863619630796e-06, + "loss": 1.0566, + "step": 3935 + }, + { + "epoch": 0.56, + "grad_norm": 5.999962329775351, + "learning_rate": 8.441025927065937e-06, + "loss": 1.0353, + "step": 3936 + }, + { + "epoch": 0.56, + "grad_norm": 7.062577815660346, + "learning_rate": 8.440188050966336e-06, + "loss": 1.0135, + "step": 3937 + }, + { + "epoch": 0.56, + "grad_norm": 6.886069066043079, + "learning_rate": 8.439349991376679e-06, + "loss": 1.0234, + "step": 3938 + }, + { + "epoch": 0.56, + "grad_norm": 5.984394719495044, + "learning_rate": 8.438511748341666e-06, + "loss": 1.0288, + "step": 3939 + }, + { + "epoch": 0.56, + "grad_norm": 7.365066315004793, + "learning_rate": 8.437673321906008e-06, + "loss": 0.9522, + "step": 3940 + }, + { + "epoch": 0.56, + "grad_norm": 8.609541498995888, + "learning_rate": 8.436834712114424e-06, + "loss": 1.0396, + "step": 3941 + }, + { + "epoch": 0.56, + "grad_norm": 6.3527085436413415, + "learning_rate": 8.435995919011644e-06, + "loss": 1.0857, + "step": 3942 + }, + { + "epoch": 0.56, + "grad_norm": 7.586335696018958, + "learning_rate": 8.435156942642404e-06, + "loss": 1.0972, + "step": 3943 + }, + { + "epoch": 0.56, + "grad_norm": 6.463468056910693, + "learning_rate": 8.434317783051456e-06, + "loss": 1.1162, + "step": 3944 + }, + { + "epoch": 0.56, + "grad_norm": 
5.923862693136605, + "learning_rate": 8.433478440283555e-06, + "loss": 0.9657, + "step": 3945 + }, + { + "epoch": 0.56, + "grad_norm": 6.674405135378374, + "learning_rate": 8.432638914383471e-06, + "loss": 1.0546, + "step": 3946 + }, + { + "epoch": 0.56, + "grad_norm": 7.437887166121094, + "learning_rate": 8.431799205395984e-06, + "loss": 1.0001, + "step": 3947 + }, + { + "epoch": 0.56, + "grad_norm": 6.943715024585363, + "learning_rate": 8.430959313365879e-06, + "loss": 1.1676, + "step": 3948 + }, + { + "epoch": 0.56, + "grad_norm": 5.287468604724251, + "learning_rate": 8.430119238337956e-06, + "loss": 0.9803, + "step": 3949 + }, + { + "epoch": 0.56, + "grad_norm": 8.043565496755772, + "learning_rate": 8.429278980357018e-06, + "loss": 0.979, + "step": 3950 + }, + { + "epoch": 0.56, + "grad_norm": 6.728169489113093, + "learning_rate": 8.428438539467887e-06, + "loss": 1.0012, + "step": 3951 + }, + { + "epoch": 0.56, + "grad_norm": 7.208724072610038, + "learning_rate": 8.427597915715387e-06, + "loss": 0.9923, + "step": 3952 + }, + { + "epoch": 0.56, + "grad_norm": 7.873418394221004, + "learning_rate": 8.426757109144354e-06, + "loss": 0.9938, + "step": 3953 + }, + { + "epoch": 0.56, + "grad_norm": 6.9175609927217305, + "learning_rate": 8.425916119799634e-06, + "loss": 1.0265, + "step": 3954 + }, + { + "epoch": 0.56, + "grad_norm": 5.644234884275319, + "learning_rate": 8.425074947726087e-06, + "loss": 0.9414, + "step": 3955 + }, + { + "epoch": 0.56, + "grad_norm": 7.64161721294982, + "learning_rate": 8.424233592968574e-06, + "loss": 1.0775, + "step": 3956 + }, + { + "epoch": 0.56, + "grad_norm": 8.458546818908845, + "learning_rate": 8.423392055571974e-06, + "loss": 1.1373, + "step": 3957 + }, + { + "epoch": 0.56, + "grad_norm": 6.652395033666688, + "learning_rate": 8.42255033558117e-06, + "loss": 1.0474, + "step": 3958 + }, + { + "epoch": 0.56, + "grad_norm": 5.553254202399497, + "learning_rate": 8.421708433041059e-06, + "loss": 1.0317, + "step": 3959 + }, + { + 
"epoch": 0.56, + "grad_norm": 6.267208996309915, + "learning_rate": 8.420866347996542e-06, + "loss": 0.9868, + "step": 3960 + }, + { + "epoch": 0.56, + "grad_norm": 8.20924334460908, + "learning_rate": 8.42002408049254e-06, + "loss": 0.9885, + "step": 3961 + }, + { + "epoch": 0.57, + "grad_norm": 6.189148069675349, + "learning_rate": 8.41918163057397e-06, + "loss": 1.0253, + "step": 3962 + }, + { + "epoch": 0.57, + "grad_norm": 8.490453783971882, + "learning_rate": 8.41833899828577e-06, + "loss": 0.9452, + "step": 3963 + }, + { + "epoch": 0.57, + "grad_norm": 9.095814514974286, + "learning_rate": 8.417496183672882e-06, + "loss": 0.9416, + "step": 3964 + }, + { + "epoch": 0.57, + "grad_norm": 7.023584918665552, + "learning_rate": 8.416653186780262e-06, + "loss": 0.9434, + "step": 3965 + }, + { + "epoch": 0.57, + "grad_norm": 6.4497527113678075, + "learning_rate": 8.41581000765287e-06, + "loss": 1.0133, + "step": 3966 + }, + { + "epoch": 0.57, + "grad_norm": 8.3731495614178, + "learning_rate": 8.41496664633568e-06, + "loss": 0.9535, + "step": 3967 + }, + { + "epoch": 0.57, + "grad_norm": 7.779829799211507, + "learning_rate": 8.414123102873677e-06, + "loss": 0.9801, + "step": 3968 + }, + { + "epoch": 0.57, + "grad_norm": 7.108661403120785, + "learning_rate": 8.41327937731185e-06, + "loss": 0.958, + "step": 3969 + }, + { + "epoch": 0.57, + "grad_norm": 7.0599080942309875, + "learning_rate": 8.412435469695199e-06, + "loss": 0.9722, + "step": 3970 + }, + { + "epoch": 0.57, + "grad_norm": 8.404146432381964, + "learning_rate": 8.411591380068742e-06, + "loss": 0.9817, + "step": 3971 + }, + { + "epoch": 0.57, + "grad_norm": 7.792052392186766, + "learning_rate": 8.410747108477496e-06, + "loss": 1.0817, + "step": 3972 + }, + { + "epoch": 0.57, + "grad_norm": 5.340541119324229, + "learning_rate": 8.409902654966493e-06, + "loss": 0.9963, + "step": 3973 + }, + { + "epoch": 0.57, + "grad_norm": 7.308480113793131, + "learning_rate": 8.409058019580773e-06, + "loss": 1.0066, + 
"step": 3974 + }, + { + "epoch": 0.57, + "grad_norm": 7.5696242889053815, + "learning_rate": 8.40821320236539e-06, + "loss": 0.9941, + "step": 3975 + }, + { + "epoch": 0.57, + "grad_norm": 9.877670736237414, + "learning_rate": 8.4073682033654e-06, + "loss": 1.1093, + "step": 3976 + }, + { + "epoch": 0.57, + "grad_norm": 8.865694768776114, + "learning_rate": 8.406523022625874e-06, + "loss": 0.9612, + "step": 3977 + }, + { + "epoch": 0.57, + "grad_norm": 7.275807828549941, + "learning_rate": 8.405677660191892e-06, + "loss": 1.0051, + "step": 3978 + }, + { + "epoch": 0.57, + "grad_norm": 6.844502820439878, + "learning_rate": 8.404832116108546e-06, + "loss": 0.9769, + "step": 3979 + }, + { + "epoch": 0.57, + "grad_norm": 7.359918701742958, + "learning_rate": 8.403986390420929e-06, + "loss": 0.9605, + "step": 3980 + }, + { + "epoch": 0.57, + "grad_norm": 6.599040654932704, + "learning_rate": 8.403140483174155e-06, + "loss": 1.0077, + "step": 3981 + }, + { + "epoch": 0.57, + "grad_norm": 6.508979695886328, + "learning_rate": 8.40229439441334e-06, + "loss": 1.0152, + "step": 3982 + }, + { + "epoch": 0.57, + "grad_norm": 7.119680892257657, + "learning_rate": 8.40144812418361e-06, + "loss": 0.9823, + "step": 3983 + }, + { + "epoch": 0.57, + "grad_norm": 8.002267644484089, + "learning_rate": 8.40060167253011e-06, + "loss": 1.0121, + "step": 3984 + }, + { + "epoch": 0.57, + "grad_norm": 6.999665181992723, + "learning_rate": 8.399755039497979e-06, + "loss": 1.1202, + "step": 3985 + }, + { + "epoch": 0.57, + "grad_norm": 6.913708939695561, + "learning_rate": 8.398908225132378e-06, + "loss": 0.9704, + "step": 3986 + }, + { + "epoch": 0.57, + "grad_norm": 8.18528831792108, + "learning_rate": 8.398061229478473e-06, + "loss": 0.9819, + "step": 3987 + }, + { + "epoch": 0.57, + "grad_norm": 9.137528951657444, + "learning_rate": 8.39721405258144e-06, + "loss": 0.9924, + "step": 3988 + }, + { + "epoch": 0.57, + "grad_norm": 6.218056796495539, + "learning_rate": 8.396366694486466e-06, + 
"loss": 1.0265, + "step": 3989 + }, + { + "epoch": 0.57, + "grad_norm": 6.8474386160598595, + "learning_rate": 8.395519155238747e-06, + "loss": 0.9907, + "step": 3990 + }, + { + "epoch": 0.57, + "grad_norm": 10.006751289605994, + "learning_rate": 8.394671434883488e-06, + "loss": 0.9886, + "step": 3991 + }, + { + "epoch": 0.57, + "grad_norm": 8.887999310578103, + "learning_rate": 8.393823533465904e-06, + "loss": 1.1091, + "step": 3992 + }, + { + "epoch": 0.57, + "grad_norm": 7.0809965310396885, + "learning_rate": 8.392975451031218e-06, + "loss": 0.8872, + "step": 3993 + }, + { + "epoch": 0.57, + "grad_norm": 10.115798941028977, + "learning_rate": 8.392127187624665e-06, + "loss": 0.9694, + "step": 3994 + }, + { + "epoch": 0.57, + "grad_norm": 8.598139376748176, + "learning_rate": 8.391278743291493e-06, + "loss": 0.997, + "step": 3995 + }, + { + "epoch": 0.57, + "grad_norm": 5.957913525812976, + "learning_rate": 8.39043011807695e-06, + "loss": 0.992, + "step": 3996 + }, + { + "epoch": 0.57, + "grad_norm": 5.860411918382889, + "learning_rate": 8.389581312026306e-06, + "loss": 1.0154, + "step": 3997 + }, + { + "epoch": 0.57, + "grad_norm": 4.888111689466636, + "learning_rate": 8.388732325184824e-06, + "loss": 0.9898, + "step": 3998 + }, + { + "epoch": 0.57, + "grad_norm": 5.802834552710997, + "learning_rate": 8.387883157597796e-06, + "loss": 1.0086, + "step": 3999 + }, + { + "epoch": 0.57, + "grad_norm": 6.978157363546076, + "learning_rate": 8.38703380931051e-06, + "loss": 1.0506, + "step": 4000 + }, + { + "epoch": 0.57, + "grad_norm": 8.762136649855929, + "learning_rate": 8.386184280368269e-06, + "loss": 1.0964, + "step": 4001 + }, + { + "epoch": 0.57, + "grad_norm": 7.466527737264222, + "learning_rate": 8.385334570816381e-06, + "loss": 1.0618, + "step": 4002 + }, + { + "epoch": 0.57, + "grad_norm": 7.401643129691513, + "learning_rate": 8.384484680700173e-06, + "loss": 1.0713, + "step": 4003 + }, + { + "epoch": 0.57, + "grad_norm": 6.397195999176267, + "learning_rate": 
8.383634610064972e-06, + "loss": 0.9691, + "step": 4004 + }, + { + "epoch": 0.57, + "grad_norm": 8.78231316608418, + "learning_rate": 8.38278435895612e-06, + "loss": 0.9473, + "step": 4005 + }, + { + "epoch": 0.57, + "grad_norm": 7.322138736669466, + "learning_rate": 8.381933927418966e-06, + "loss": 0.9668, + "step": 4006 + }, + { + "epoch": 0.57, + "grad_norm": 7.64809507112372, + "learning_rate": 8.381083315498868e-06, + "loss": 1.0079, + "step": 4007 + }, + { + "epoch": 0.57, + "grad_norm": 6.411153484047498, + "learning_rate": 8.3802325232412e-06, + "loss": 1.0604, + "step": 4008 + }, + { + "epoch": 0.57, + "grad_norm": 7.2494840119675095, + "learning_rate": 8.379381550691338e-06, + "loss": 1.0441, + "step": 4009 + }, + { + "epoch": 0.57, + "grad_norm": 7.6059341010939265, + "learning_rate": 8.37853039789467e-06, + "loss": 1.0032, + "step": 4010 + }, + { + "epoch": 0.57, + "grad_norm": 7.966113572717607, + "learning_rate": 8.377679064896595e-06, + "loss": 0.9443, + "step": 4011 + }, + { + "epoch": 0.57, + "grad_norm": 8.343535358311229, + "learning_rate": 8.37682755174252e-06, + "loss": 1.05, + "step": 4012 + }, + { + "epoch": 0.57, + "grad_norm": 6.692626569861883, + "learning_rate": 8.375975858477862e-06, + "loss": 1.0431, + "step": 4013 + }, + { + "epoch": 0.57, + "grad_norm": 6.078149116073792, + "learning_rate": 8.375123985148051e-06, + "loss": 0.9739, + "step": 4014 + }, + { + "epoch": 0.57, + "grad_norm": 6.643884538416135, + "learning_rate": 8.374271931798519e-06, + "loss": 0.9928, + "step": 4015 + }, + { + "epoch": 0.57, + "grad_norm": 7.975127240009729, + "learning_rate": 8.373419698474716e-06, + "loss": 0.9981, + "step": 4016 + }, + { + "epoch": 0.57, + "grad_norm": 7.381209834057389, + "learning_rate": 8.372567285222097e-06, + "loss": 1.052, + "step": 4017 + }, + { + "epoch": 0.57, + "grad_norm": 6.450366340052989, + "learning_rate": 8.371714692086125e-06, + "loss": 1.0099, + "step": 4018 + }, + { + "epoch": 0.57, + "grad_norm": 7.3198736948115855, 
+ "learning_rate": 8.370861919112276e-06, + "loss": 0.9828, + "step": 4019 + }, + { + "epoch": 0.57, + "grad_norm": 7.720950304151397, + "learning_rate": 8.370008966346037e-06, + "loss": 1.1297, + "step": 4020 + }, + { + "epoch": 0.57, + "grad_norm": 9.707183199891077, + "learning_rate": 8.3691558338329e-06, + "loss": 1.0401, + "step": 4021 + }, + { + "epoch": 0.57, + "grad_norm": 8.92259054976028, + "learning_rate": 8.368302521618367e-06, + "loss": 0.9585, + "step": 4022 + }, + { + "epoch": 0.57, + "grad_norm": 6.605395983131485, + "learning_rate": 8.367449029747956e-06, + "loss": 0.9121, + "step": 4023 + }, + { + "epoch": 0.57, + "grad_norm": 8.240930997401772, + "learning_rate": 8.366595358267185e-06, + "loss": 1.0835, + "step": 4024 + }, + { + "epoch": 0.57, + "grad_norm": 9.388145587300041, + "learning_rate": 8.36574150722159e-06, + "loss": 0.9951, + "step": 4025 + }, + { + "epoch": 0.57, + "grad_norm": 7.729555101719971, + "learning_rate": 8.364887476656711e-06, + "loss": 1.0125, + "step": 4026 + }, + { + "epoch": 0.57, + "grad_norm": 7.835200000802362, + "learning_rate": 8.3640332666181e-06, + "loss": 0.9772, + "step": 4027 + }, + { + "epoch": 0.57, + "grad_norm": 7.171344663607135, + "learning_rate": 8.36317887715132e-06, + "loss": 1.0739, + "step": 4028 + }, + { + "epoch": 0.57, + "grad_norm": 6.180791669321408, + "learning_rate": 8.362324308301936e-06, + "loss": 0.9785, + "step": 4029 + }, + { + "epoch": 0.57, + "grad_norm": 6.758932611997683, + "learning_rate": 8.361469560115535e-06, + "loss": 1.033, + "step": 4030 + }, + { + "epoch": 0.57, + "grad_norm": 6.701716540035146, + "learning_rate": 8.360614632637705e-06, + "loss": 1.0607, + "step": 4031 + }, + { + "epoch": 0.58, + "grad_norm": 6.140003306710314, + "learning_rate": 8.359759525914045e-06, + "loss": 0.95, + "step": 4032 + }, + { + "epoch": 0.58, + "grad_norm": 11.790580489972147, + "learning_rate": 8.358904239990163e-06, + "loss": 1.017, + "step": 4033 + }, + { + "epoch": 0.58, + "grad_norm": 
7.855673138410925, + "learning_rate": 8.35804877491168e-06, + "loss": 0.9892, + "step": 4034 + }, + { + "epoch": 0.58, + "grad_norm": 6.342478241146532, + "learning_rate": 8.35719313072422e-06, + "loss": 0.9546, + "step": 4035 + }, + { + "epoch": 0.58, + "grad_norm": 9.45067237406434, + "learning_rate": 8.356337307473425e-06, + "loss": 1.0412, + "step": 4036 + }, + { + "epoch": 0.58, + "grad_norm": 9.014989382980168, + "learning_rate": 8.35548130520494e-06, + "loss": 1.119, + "step": 4037 + }, + { + "epoch": 0.58, + "grad_norm": 8.025851568250449, + "learning_rate": 8.354625123964422e-06, + "loss": 0.9355, + "step": 4038 + }, + { + "epoch": 0.58, + "grad_norm": 5.80589609694058, + "learning_rate": 8.353768763797537e-06, + "loss": 0.9247, + "step": 4039 + }, + { + "epoch": 0.58, + "grad_norm": 9.863921735671608, + "learning_rate": 8.352912224749963e-06, + "loss": 1.0142, + "step": 4040 + }, + { + "epoch": 0.58, + "grad_norm": 8.040924776186078, + "learning_rate": 8.352055506867383e-06, + "loss": 0.935, + "step": 4041 + }, + { + "epoch": 0.58, + "grad_norm": 7.244286937329163, + "learning_rate": 8.351198610195491e-06, + "loss": 1.0329, + "step": 4042 + }, + { + "epoch": 0.58, + "grad_norm": 10.307695289307567, + "learning_rate": 8.350341534779995e-06, + "loss": 0.9607, + "step": 4043 + }, + { + "epoch": 0.58, + "grad_norm": 8.775735089752322, + "learning_rate": 8.349484280666608e-06, + "loss": 0.991, + "step": 4044 + }, + { + "epoch": 0.58, + "grad_norm": 6.950300848317603, + "learning_rate": 8.34862684790105e-06, + "loss": 1.0777, + "step": 4045 + }, + { + "epoch": 0.58, + "grad_norm": 7.214186279326829, + "learning_rate": 8.34776923652906e-06, + "loss": 1.0062, + "step": 4046 + }, + { + "epoch": 0.58, + "grad_norm": 8.270861457398249, + "learning_rate": 8.346911446596378e-06, + "loss": 0.9594, + "step": 4047 + }, + { + "epoch": 0.58, + "grad_norm": 6.923732447778612, + "learning_rate": 8.346053478148753e-06, + "loss": 0.9635, + "step": 4048 + }, + { + "epoch": 
0.58, + "grad_norm": 5.786018492901258, + "learning_rate": 8.34519533123195e-06, + "loss": 1.0349, + "step": 4049 + }, + { + "epoch": 0.58, + "grad_norm": 6.935645168749514, + "learning_rate": 8.344337005891741e-06, + "loss": 1.0417, + "step": 4050 + }, + { + "epoch": 0.58, + "grad_norm": 7.322052316098953, + "learning_rate": 8.343478502173903e-06, + "loss": 0.991, + "step": 4051 + }, + { + "epoch": 0.58, + "grad_norm": 8.361716808818835, + "learning_rate": 8.342619820124229e-06, + "loss": 1.0361, + "step": 4052 + }, + { + "epoch": 0.58, + "grad_norm": 8.05680367848096, + "learning_rate": 8.341760959788519e-06, + "loss": 0.927, + "step": 4053 + }, + { + "epoch": 0.58, + "grad_norm": 6.865518879133373, + "learning_rate": 8.34090192121258e-06, + "loss": 1.0534, + "step": 4054 + }, + { + "epoch": 0.58, + "grad_norm": 6.851767914992381, + "learning_rate": 8.340042704442233e-06, + "loss": 1.001, + "step": 4055 + }, + { + "epoch": 0.58, + "grad_norm": 7.027688687910159, + "learning_rate": 8.339183309523305e-06, + "loss": 0.974, + "step": 4056 + }, + { + "epoch": 0.58, + "grad_norm": 7.61942106319158, + "learning_rate": 8.338323736501633e-06, + "loss": 0.9695, + "step": 4057 + }, + { + "epoch": 0.58, + "grad_norm": 8.265044508304317, + "learning_rate": 8.337463985423067e-06, + "loss": 1.0218, + "step": 4058 + }, + { + "epoch": 0.58, + "grad_norm": 5.776459297732997, + "learning_rate": 8.33660405633346e-06, + "loss": 1.0564, + "step": 4059 + }, + { + "epoch": 0.58, + "grad_norm": 7.992698339243954, + "learning_rate": 8.33574394927868e-06, + "loss": 1.0665, + "step": 4060 + }, + { + "epoch": 0.58, + "grad_norm": 6.177558962795771, + "learning_rate": 8.334883664304605e-06, + "loss": 1.009, + "step": 4061 + }, + { + "epoch": 0.58, + "grad_norm": 8.350621306686078, + "learning_rate": 8.334023201457117e-06, + "loss": 1.0007, + "step": 4062 + }, + { + "epoch": 0.58, + "grad_norm": 4.975894880062341, + "learning_rate": 8.333162560782111e-06, + "loss": 0.9497, + "step": 4063 + }, 
+ { + "epoch": 0.58, + "grad_norm": 8.29717547593432, + "learning_rate": 8.332301742325493e-06, + "loss": 1.0208, + "step": 4064 + }, + { + "epoch": 0.58, + "grad_norm": 7.884648003200207, + "learning_rate": 8.331440746133175e-06, + "loss": 0.9702, + "step": 4065 + }, + { + "epoch": 0.58, + "grad_norm": 6.440501796406945, + "learning_rate": 8.33057957225108e-06, + "loss": 1.0098, + "step": 4066 + }, + { + "epoch": 0.58, + "grad_norm": 10.065210448611397, + "learning_rate": 8.329718220725143e-06, + "loss": 0.9504, + "step": 4067 + }, + { + "epoch": 0.58, + "grad_norm": 5.103683242552639, + "learning_rate": 8.328856691601304e-06, + "loss": 1.027, + "step": 4068 + }, + { + "epoch": 0.58, + "grad_norm": 5.898627430517647, + "learning_rate": 8.327994984925514e-06, + "loss": 1.0368, + "step": 4069 + }, + { + "epoch": 0.58, + "grad_norm": 6.873534091826216, + "learning_rate": 8.327133100743734e-06, + "loss": 1.0503, + "step": 4070 + }, + { + "epoch": 0.58, + "grad_norm": 6.989820739327193, + "learning_rate": 8.326271039101939e-06, + "loss": 0.9282, + "step": 4071 + }, + { + "epoch": 0.58, + "grad_norm": 5.982890027319467, + "learning_rate": 8.325408800046104e-06, + "loss": 1.0244, + "step": 4072 + }, + { + "epoch": 0.58, + "grad_norm": 7.31355129398247, + "learning_rate": 8.324546383622218e-06, + "loss": 1.0225, + "step": 4073 + }, + { + "epoch": 0.58, + "grad_norm": 8.048706504737586, + "learning_rate": 8.323683789876285e-06, + "loss": 1.0313, + "step": 4074 + }, + { + "epoch": 0.58, + "grad_norm": 5.252334756748988, + "learning_rate": 8.322821018854308e-06, + "loss": 1.004, + "step": 4075 + }, + { + "epoch": 0.58, + "grad_norm": 8.335666953974517, + "learning_rate": 8.321958070602308e-06, + "loss": 1.0232, + "step": 4076 + }, + { + "epoch": 0.58, + "grad_norm": 8.093862859673102, + "learning_rate": 8.321094945166311e-06, + "loss": 1.026, + "step": 4077 + }, + { + "epoch": 0.58, + "grad_norm": 8.744171061705408, + "learning_rate": 8.320231642592354e-06, + "loss": 0.9316, 
+ "step": 4078 + }, + { + "epoch": 0.58, + "grad_norm": 7.796135615415381, + "learning_rate": 8.319368162926482e-06, + "loss": 0.9826, + "step": 4079 + }, + { + "epoch": 0.58, + "grad_norm": 7.279363265683464, + "learning_rate": 8.318504506214753e-06, + "loss": 0.9475, + "step": 4080 + }, + { + "epoch": 0.58, + "grad_norm": 6.233588474885111, + "learning_rate": 8.31764067250323e-06, + "loss": 1.0771, + "step": 4081 + }, + { + "epoch": 0.58, + "grad_norm": 6.455779993042077, + "learning_rate": 8.316776661837988e-06, + "loss": 0.9695, + "step": 4082 + }, + { + "epoch": 0.58, + "grad_norm": 5.794028807808728, + "learning_rate": 8.315912474265113e-06, + "loss": 0.9054, + "step": 4083 + }, + { + "epoch": 0.58, + "grad_norm": 8.29750537127702, + "learning_rate": 8.315048109830695e-06, + "loss": 0.9905, + "step": 4084 + }, + { + "epoch": 0.58, + "grad_norm": 8.257086396845853, + "learning_rate": 8.31418356858084e-06, + "loss": 0.9673, + "step": 4085 + }, + { + "epoch": 0.58, + "grad_norm": 8.303292084753872, + "learning_rate": 8.313318850561657e-06, + "loss": 0.9811, + "step": 4086 + }, + { + "epoch": 0.58, + "grad_norm": 7.472908553105587, + "learning_rate": 8.31245395581927e-06, + "loss": 1.0783, + "step": 4087 + }, + { + "epoch": 0.58, + "grad_norm": 9.094783855676289, + "learning_rate": 8.311588884399808e-06, + "loss": 1.0023, + "step": 4088 + }, + { + "epoch": 0.58, + "grad_norm": 9.243332449327117, + "learning_rate": 8.310723636349413e-06, + "loss": 0.9762, + "step": 4089 + }, + { + "epoch": 0.58, + "grad_norm": 8.636475869606654, + "learning_rate": 8.309858211714235e-06, + "loss": 1.0141, + "step": 4090 + }, + { + "epoch": 0.58, + "grad_norm": 5.710289279843413, + "learning_rate": 8.308992610540435e-06, + "loss": 1.0255, + "step": 4091 + }, + { + "epoch": 0.58, + "grad_norm": 5.6846413969788285, + "learning_rate": 8.308126832874177e-06, + "loss": 1.0514, + "step": 4092 + }, + { + "epoch": 0.58, + "grad_norm": 8.123125794075607, + "learning_rate": 
8.307260878761643e-06, + "loss": 0.9939, + "step": 4093 + }, + { + "epoch": 0.58, + "grad_norm": 8.269368550750233, + "learning_rate": 8.30639474824902e-06, + "loss": 1.0591, + "step": 4094 + }, + { + "epoch": 0.58, + "grad_norm": 10.472231695517804, + "learning_rate": 8.305528441382504e-06, + "loss": 1.032, + "step": 4095 + }, + { + "epoch": 0.58, + "grad_norm": 8.069017338505152, + "learning_rate": 8.304661958208302e-06, + "loss": 1.0246, + "step": 4096 + }, + { + "epoch": 0.58, + "grad_norm": 9.444337297756663, + "learning_rate": 8.30379529877263e-06, + "loss": 1.1124, + "step": 4097 + }, + { + "epoch": 0.58, + "grad_norm": 5.562808918591469, + "learning_rate": 8.302928463121713e-06, + "loss": 0.9696, + "step": 4098 + }, + { + "epoch": 0.58, + "grad_norm": 5.05745239392379, + "learning_rate": 8.302061451301783e-06, + "loss": 1.0235, + "step": 4099 + }, + { + "epoch": 0.58, + "grad_norm": 9.593733203124584, + "learning_rate": 8.30119426335909e-06, + "loss": 1.0568, + "step": 4100 + }, + { + "epoch": 0.58, + "grad_norm": 7.833879475532025, + "learning_rate": 8.300326899339883e-06, + "loss": 1.029, + "step": 4101 + }, + { + "epoch": 0.58, + "grad_norm": 8.162045265238879, + "learning_rate": 8.299459359290426e-06, + "loss": 0.9701, + "step": 4102 + }, + { + "epoch": 0.59, + "grad_norm": 8.573661272620905, + "learning_rate": 8.29859164325699e-06, + "loss": 0.9546, + "step": 4103 + }, + { + "epoch": 0.59, + "grad_norm": 7.434650999213497, + "learning_rate": 8.297723751285857e-06, + "loss": 1.0222, + "step": 4104 + }, + { + "epoch": 0.59, + "grad_norm": 6.181345392472013, + "learning_rate": 8.29685568342332e-06, + "loss": 0.9974, + "step": 4105 + }, + { + "epoch": 0.59, + "grad_norm": 6.851018534590985, + "learning_rate": 8.295987439715676e-06, + "loss": 1.0063, + "step": 4106 + }, + { + "epoch": 0.59, + "grad_norm": 10.14874744014263, + "learning_rate": 8.295119020209239e-06, + "loss": 1.029, + "step": 4107 + }, + { + "epoch": 0.59, + "grad_norm": 6.481326494630119, + 
"learning_rate": 8.294250424950324e-06, + "loss": 0.9903, + "step": 4108 + }, + { + "epoch": 0.59, + "grad_norm": 7.605726617650507, + "learning_rate": 8.293381653985259e-06, + "loss": 1.0129, + "step": 4109 + }, + { + "epoch": 0.59, + "grad_norm": 7.836777173700638, + "learning_rate": 8.292512707360386e-06, + "loss": 1.0331, + "step": 4110 + }, + { + "epoch": 0.59, + "grad_norm": 9.44442443127748, + "learning_rate": 8.291643585122049e-06, + "loss": 0.9956, + "step": 4111 + }, + { + "epoch": 0.59, + "grad_norm": 7.47114396300696, + "learning_rate": 8.290774287316606e-06, + "loss": 0.9432, + "step": 4112 + }, + { + "epoch": 0.59, + "grad_norm": 8.893724249991225, + "learning_rate": 8.289904813990423e-06, + "loss": 1.078, + "step": 4113 + }, + { + "epoch": 0.59, + "grad_norm": 6.695101519897837, + "learning_rate": 8.289035165189874e-06, + "loss": 0.9612, + "step": 4114 + }, + { + "epoch": 0.59, + "grad_norm": 7.229662204348345, + "learning_rate": 8.288165340961345e-06, + "loss": 0.9842, + "step": 4115 + }, + { + "epoch": 0.59, + "grad_norm": 8.34299800370855, + "learning_rate": 8.28729534135123e-06, + "loss": 0.9935, + "step": 4116 + }, + { + "epoch": 0.59, + "grad_norm": 7.70571503935664, + "learning_rate": 8.286425166405929e-06, + "loss": 1.0811, + "step": 4117 + }, + { + "epoch": 0.59, + "grad_norm": 6.220813632723488, + "learning_rate": 8.285554816171859e-06, + "loss": 1.0064, + "step": 4118 + }, + { + "epoch": 0.59, + "grad_norm": 7.707053643361515, + "learning_rate": 8.284684290695441e-06, + "loss": 0.9817, + "step": 4119 + }, + { + "epoch": 0.59, + "grad_norm": 6.019261560751377, + "learning_rate": 8.283813590023104e-06, + "loss": 1.0153, + "step": 4120 + }, + { + "epoch": 0.59, + "grad_norm": 6.186090719432447, + "learning_rate": 8.282942714201294e-06, + "loss": 1.0507, + "step": 4121 + }, + { + "epoch": 0.59, + "grad_norm": 8.207177806179299, + "learning_rate": 8.282071663276455e-06, + "loss": 1.0381, + "step": 4122 + }, + { + "epoch": 0.59, + "grad_norm": 
7.315708628020638, + "learning_rate": 8.28120043729505e-06, + "loss": 1.022, + "step": 4123 + }, + { + "epoch": 0.59, + "grad_norm": 7.586534090099759, + "learning_rate": 8.280329036303548e-06, + "loss": 0.9761, + "step": 4124 + }, + { + "epoch": 0.59, + "grad_norm": 9.479931062200723, + "learning_rate": 8.279457460348424e-06, + "loss": 1.0281, + "step": 4125 + }, + { + "epoch": 0.59, + "grad_norm": 6.828438133825945, + "learning_rate": 8.27858570947617e-06, + "loss": 0.9544, + "step": 4126 + }, + { + "epoch": 0.59, + "grad_norm": 5.899054790538135, + "learning_rate": 8.277713783733278e-06, + "loss": 0.9903, + "step": 4127 + }, + { + "epoch": 0.59, + "grad_norm": 9.022580728879872, + "learning_rate": 8.276841683166257e-06, + "loss": 1.0118, + "step": 4128 + }, + { + "epoch": 0.59, + "grad_norm": 6.125062911381626, + "learning_rate": 8.275969407821623e-06, + "loss": 0.9675, + "step": 4129 + }, + { + "epoch": 0.59, + "grad_norm": 7.101990400041588, + "learning_rate": 8.275096957745899e-06, + "loss": 0.8979, + "step": 4130 + }, + { + "epoch": 0.59, + "grad_norm": 8.00957208214338, + "learning_rate": 8.27422433298562e-06, + "loss": 1.0114, + "step": 4131 + }, + { + "epoch": 0.59, + "grad_norm": 7.144534086316114, + "learning_rate": 8.273351533587326e-06, + "loss": 1.0135, + "step": 4132 + }, + { + "epoch": 0.59, + "grad_norm": 5.490483624505848, + "learning_rate": 8.272478559597574e-06, + "loss": 0.9811, + "step": 4133 + }, + { + "epoch": 0.59, + "grad_norm": 7.993146802956993, + "learning_rate": 8.271605411062927e-06, + "loss": 1.0661, + "step": 4134 + }, + { + "epoch": 0.59, + "grad_norm": 4.937842692781526, + "learning_rate": 8.270732088029952e-06, + "loss": 1.0211, + "step": 4135 + }, + { + "epoch": 0.59, + "grad_norm": 6.4024612341045435, + "learning_rate": 8.269858590545232e-06, + "loss": 0.928, + "step": 4136 + }, + { + "epoch": 0.59, + "grad_norm": 5.672502084028606, + "learning_rate": 8.268984918655356e-06, + "loss": 0.9676, + "step": 4137 + }, + { + "epoch": 
0.59, + "grad_norm": 7.691973728520965, + "learning_rate": 8.268111072406924e-06, + "loss": 0.9403, + "step": 4138 + }, + { + "epoch": 0.59, + "grad_norm": 7.167446393352607, + "learning_rate": 8.267237051846543e-06, + "loss": 0.9988, + "step": 4139 + }, + { + "epoch": 0.59, + "grad_norm": 6.091415655240657, + "learning_rate": 8.266362857020835e-06, + "loss": 1.0264, + "step": 4140 + }, + { + "epoch": 0.59, + "grad_norm": 6.97017467680112, + "learning_rate": 8.265488487976422e-06, + "loss": 1.0286, + "step": 4141 + }, + { + "epoch": 0.59, + "grad_norm": 7.987262780515021, + "learning_rate": 8.264613944759942e-06, + "loss": 1.0204, + "step": 4142 + }, + { + "epoch": 0.59, + "grad_norm": 8.077444250216237, + "learning_rate": 8.263739227418043e-06, + "loss": 0.9131, + "step": 4143 + }, + { + "epoch": 0.59, + "grad_norm": 7.527102799115024, + "learning_rate": 8.262864335997379e-06, + "loss": 1.1057, + "step": 4144 + }, + { + "epoch": 0.59, + "grad_norm": 6.161308789238011, + "learning_rate": 8.261989270544612e-06, + "loss": 0.9828, + "step": 4145 + }, + { + "epoch": 0.59, + "grad_norm": 6.286193944113537, + "learning_rate": 8.261114031106417e-06, + "loss": 0.9586, + "step": 4146 + }, + { + "epoch": 0.59, + "grad_norm": 9.364251951407914, + "learning_rate": 8.260238617729477e-06, + "loss": 0.9746, + "step": 4147 + }, + { + "epoch": 0.59, + "grad_norm": 7.27247214079545, + "learning_rate": 8.259363030460485e-06, + "loss": 0.9423, + "step": 4148 + }, + { + "epoch": 0.59, + "grad_norm": 9.23098163342924, + "learning_rate": 8.25848726934614e-06, + "loss": 0.917, + "step": 4149 + }, + { + "epoch": 0.59, + "grad_norm": 8.022192650345671, + "learning_rate": 8.257611334433157e-06, + "loss": 1.0117, + "step": 4150 + }, + { + "epoch": 0.59, + "grad_norm": 8.067383228909973, + "learning_rate": 8.256735225768249e-06, + "loss": 1.0194, + "step": 4151 + }, + { + "epoch": 0.59, + "grad_norm": 8.166084835172454, + "learning_rate": 8.255858943398151e-06, + "loss": 1.0216, + "step": 4152 
+ }, + { + "epoch": 0.59, + "grad_norm": 7.567305134865203, + "learning_rate": 8.254982487369602e-06, + "loss": 1.0232, + "step": 4153 + }, + { + "epoch": 0.59, + "grad_norm": 6.519528773103338, + "learning_rate": 8.254105857729345e-06, + "loss": 0.9386, + "step": 4154 + }, + { + "epoch": 0.59, + "grad_norm": 7.235248272536717, + "learning_rate": 8.25322905452414e-06, + "loss": 1.0709, + "step": 4155 + }, + { + "epoch": 0.59, + "grad_norm": 5.7706851648527175, + "learning_rate": 8.252352077800752e-06, + "loss": 1.0357, + "step": 4156 + }, + { + "epoch": 0.59, + "grad_norm": 5.213925886748128, + "learning_rate": 8.251474927605958e-06, + "loss": 0.9452, + "step": 4157 + }, + { + "epoch": 0.59, + "grad_norm": 7.221405509722925, + "learning_rate": 8.25059760398654e-06, + "loss": 1.0182, + "step": 4158 + }, + { + "epoch": 0.59, + "grad_norm": 6.516459229573587, + "learning_rate": 8.249720106989297e-06, + "loss": 1.028, + "step": 4159 + }, + { + "epoch": 0.59, + "grad_norm": 7.082072307774522, + "learning_rate": 8.248842436661027e-06, + "loss": 0.9654, + "step": 4160 + }, + { + "epoch": 0.59, + "grad_norm": 7.19407428696021, + "learning_rate": 8.247964593048545e-06, + "loss": 0.9953, + "step": 4161 + }, + { + "epoch": 0.59, + "grad_norm": 6.579503062008561, + "learning_rate": 8.247086576198675e-06, + "loss": 0.9462, + "step": 4162 + }, + { + "epoch": 0.59, + "grad_norm": 9.344499627620122, + "learning_rate": 8.24620838615824e-06, + "loss": 0.9648, + "step": 4163 + }, + { + "epoch": 0.59, + "grad_norm": 11.447876860878004, + "learning_rate": 8.24533002297409e-06, + "loss": 1.0014, + "step": 4164 + }, + { + "epoch": 0.59, + "grad_norm": 7.572804708276304, + "learning_rate": 8.244451486693068e-06, + "loss": 0.9516, + "step": 4165 + }, + { + "epoch": 0.59, + "grad_norm": 6.427412786865537, + "learning_rate": 8.243572777362036e-06, + "loss": 0.9762, + "step": 4166 + }, + { + "epoch": 0.59, + "grad_norm": 8.694092331818762, + "learning_rate": 8.242693895027861e-06, + "loss": 
1.0037, + "step": 4167 + }, + { + "epoch": 0.59, + "grad_norm": 5.923182092382701, + "learning_rate": 8.241814839737418e-06, + "loss": 1.084, + "step": 4168 + }, + { + "epoch": 0.59, + "grad_norm": 7.343804598610717, + "learning_rate": 8.240935611537595e-06, + "loss": 1.05, + "step": 4169 + }, + { + "epoch": 0.59, + "grad_norm": 6.802734392172915, + "learning_rate": 8.240056210475289e-06, + "loss": 1.0649, + "step": 4170 + }, + { + "epoch": 0.59, + "grad_norm": 7.237922339252111, + "learning_rate": 8.239176636597403e-06, + "loss": 0.9713, + "step": 4171 + }, + { + "epoch": 0.59, + "grad_norm": 6.74390034751787, + "learning_rate": 8.238296889950853e-06, + "loss": 0.9174, + "step": 4172 + }, + { + "epoch": 0.6, + "grad_norm": 9.54064593057659, + "learning_rate": 8.237416970582558e-06, + "loss": 0.9948, + "step": 4173 + }, + { + "epoch": 0.6, + "grad_norm": 13.603050684054349, + "learning_rate": 8.236536878539454e-06, + "loss": 1.0439, + "step": 4174 + }, + { + "epoch": 0.6, + "grad_norm": 7.871995850374321, + "learning_rate": 8.235656613868481e-06, + "loss": 1.1628, + "step": 4175 + }, + { + "epoch": 0.6, + "grad_norm": 5.886609356830169, + "learning_rate": 8.234776176616592e-06, + "loss": 1.0397, + "step": 4176 + }, + { + "epoch": 0.6, + "grad_norm": 7.821229694634467, + "learning_rate": 8.233895566830747e-06, + "loss": 0.9752, + "step": 4177 + }, + { + "epoch": 0.6, + "grad_norm": 8.586559841079142, + "learning_rate": 8.233014784557914e-06, + "loss": 1.0164, + "step": 4178 + }, + { + "epoch": 0.6, + "grad_norm": 6.738474460661711, + "learning_rate": 8.232133829845069e-06, + "loss": 0.9869, + "step": 4179 + }, + { + "epoch": 0.6, + "grad_norm": 7.053534428113914, + "learning_rate": 8.231252702739203e-06, + "loss": 0.9836, + "step": 4180 + }, + { + "epoch": 0.6, + "grad_norm": 5.1482361633991385, + "learning_rate": 8.230371403287313e-06, + "loss": 1.002, + "step": 4181 + }, + { + "epoch": 0.6, + "grad_norm": 9.99171638113292, + "learning_rate": 8.229489931536405e-06, 
+ "loss": 0.9371, + "step": 4182 + }, + { + "epoch": 0.6, + "grad_norm": 8.293049839371431, + "learning_rate": 8.228608287533492e-06, + "loss": 0.9405, + "step": 4183 + }, + { + "epoch": 0.6, + "grad_norm": 7.903443499132066, + "learning_rate": 8.227726471325599e-06, + "loss": 0.9406, + "step": 4184 + }, + { + "epoch": 0.6, + "grad_norm": 7.362740596339203, + "learning_rate": 8.226844482959761e-06, + "loss": 0.9095, + "step": 4185 + }, + { + "epoch": 0.6, + "grad_norm": 11.311432135927415, + "learning_rate": 8.22596232248302e-06, + "loss": 1.0358, + "step": 4186 + }, + { + "epoch": 0.6, + "grad_norm": 7.264571627531192, + "learning_rate": 8.225079989942431e-06, + "loss": 0.9906, + "step": 4187 + }, + { + "epoch": 0.6, + "grad_norm": 7.445090733097506, + "learning_rate": 8.224197485385051e-06, + "loss": 1.0036, + "step": 4188 + }, + { + "epoch": 0.6, + "grad_norm": 8.357708347592112, + "learning_rate": 8.22331480885795e-06, + "loss": 1.0231, + "step": 4189 + }, + { + "epoch": 0.6, + "grad_norm": 5.758196372000047, + "learning_rate": 8.22243196040821e-06, + "loss": 1.0454, + "step": 4190 + }, + { + "epoch": 0.6, + "grad_norm": 6.666906124516462, + "learning_rate": 8.22154894008292e-06, + "loss": 0.9445, + "step": 4191 + }, + { + "epoch": 0.6, + "grad_norm": 6.326467992704289, + "learning_rate": 8.220665747929176e-06, + "loss": 0.9354, + "step": 4192 + }, + { + "epoch": 0.6, + "grad_norm": 7.264355664410155, + "learning_rate": 8.219782383994085e-06, + "loss": 1.0016, + "step": 4193 + }, + { + "epoch": 0.6, + "grad_norm": 7.9495113852597985, + "learning_rate": 8.218898848324765e-06, + "loss": 1.1016, + "step": 4194 + }, + { + "epoch": 0.6, + "grad_norm": 6.957914972940761, + "learning_rate": 8.21801514096834e-06, + "loss": 0.9705, + "step": 4195 + }, + { + "epoch": 0.6, + "grad_norm": 5.862116988863068, + "learning_rate": 8.217131261971947e-06, + "loss": 0.9483, + "step": 4196 + }, + { + "epoch": 0.6, + "grad_norm": 9.62271724773893, + "learning_rate": 
8.216247211382725e-06, + "loss": 0.9039, + "step": 4197 + }, + { + "epoch": 0.6, + "grad_norm": 8.345158880844728, + "learning_rate": 8.21536298924783e-06, + "loss": 1.0362, + "step": 4198 + }, + { + "epoch": 0.6, + "grad_norm": 4.5879476976656015, + "learning_rate": 8.214478595614423e-06, + "loss": 0.9812, + "step": 4199 + }, + { + "epoch": 0.6, + "grad_norm": 7.123664267066975, + "learning_rate": 8.213594030529675e-06, + "loss": 0.9855, + "step": 4200 + }, + { + "epoch": 0.6, + "grad_norm": 7.1393657305212965, + "learning_rate": 8.212709294040765e-06, + "loss": 1.003, + "step": 4201 + }, + { + "epoch": 0.6, + "grad_norm": 5.24831445546729, + "learning_rate": 8.211824386194886e-06, + "loss": 0.9579, + "step": 4202 + }, + { + "epoch": 0.6, + "grad_norm": 7.277465758917226, + "learning_rate": 8.210939307039235e-06, + "loss": 1.0885, + "step": 4203 + }, + { + "epoch": 0.6, + "grad_norm": 7.545174422380535, + "learning_rate": 8.210054056621017e-06, + "loss": 1.03, + "step": 4204 + }, + { + "epoch": 0.6, + "grad_norm": 9.914858701771834, + "learning_rate": 8.209168634987453e-06, + "loss": 0.9411, + "step": 4205 + }, + { + "epoch": 0.6, + "grad_norm": 7.045712204502242, + "learning_rate": 8.208283042185765e-06, + "loss": 1.0267, + "step": 4206 + }, + { + "epoch": 0.6, + "grad_norm": 8.280358423108344, + "learning_rate": 8.207397278263192e-06, + "loss": 1.0591, + "step": 4207 + }, + { + "epoch": 0.6, + "grad_norm": 6.761237976882123, + "learning_rate": 8.206511343266973e-06, + "loss": 0.9201, + "step": 4208 + }, + { + "epoch": 0.6, + "grad_norm": 7.352453047554453, + "learning_rate": 8.205625237244365e-06, + "loss": 1.0363, + "step": 4209 + }, + { + "epoch": 0.6, + "grad_norm": 5.429387156430917, + "learning_rate": 8.204738960242631e-06, + "loss": 1.0288, + "step": 4210 + }, + { + "epoch": 0.6, + "grad_norm": 7.608751576305575, + "learning_rate": 8.203852512309042e-06, + "loss": 0.9543, + "step": 4211 + }, + { + "epoch": 0.6, + "grad_norm": 7.962300043933783, + 
"learning_rate": 8.202965893490877e-06, + "loss": 0.9843, + "step": 4212 + }, + { + "epoch": 0.6, + "grad_norm": 6.390894010037997, + "learning_rate": 8.202079103835426e-06, + "loss": 1.029, + "step": 4213 + }, + { + "epoch": 0.6, + "grad_norm": 8.549351300369828, + "learning_rate": 8.20119214338999e-06, + "loss": 0.9986, + "step": 4214 + }, + { + "epoch": 0.6, + "grad_norm": 8.373886691212807, + "learning_rate": 8.200305012201874e-06, + "loss": 1.0968, + "step": 4215 + }, + { + "epoch": 0.6, + "grad_norm": 7.963583965472257, + "learning_rate": 8.199417710318399e-06, + "loss": 0.9857, + "step": 4216 + }, + { + "epoch": 0.6, + "grad_norm": 6.613909356917578, + "learning_rate": 8.198530237786888e-06, + "loss": 0.9341, + "step": 4217 + }, + { + "epoch": 0.6, + "grad_norm": 6.4669427526803975, + "learning_rate": 8.197642594654675e-06, + "loss": 1.0757, + "step": 4218 + }, + { + "epoch": 0.6, + "grad_norm": 5.934527472181257, + "learning_rate": 8.196754780969109e-06, + "loss": 1.0184, + "step": 4219 + }, + { + "epoch": 0.6, + "grad_norm": 8.375249091369174, + "learning_rate": 8.195866796777542e-06, + "loss": 1.0522, + "step": 4220 + }, + { + "epoch": 0.6, + "grad_norm": 6.660751099727059, + "learning_rate": 8.194978642127334e-06, + "loss": 1.016, + "step": 4221 + }, + { + "epoch": 0.6, + "grad_norm": 8.204326495816575, + "learning_rate": 8.19409031706586e-06, + "loss": 0.9859, + "step": 4222 + }, + { + "epoch": 0.6, + "grad_norm": 7.240026681043225, + "learning_rate": 8.193201821640498e-06, + "loss": 0.9903, + "step": 4223 + }, + { + "epoch": 0.6, + "grad_norm": 7.325859682474391, + "learning_rate": 8.19231315589864e-06, + "loss": 0.9578, + "step": 4224 + }, + { + "epoch": 0.6, + "grad_norm": 7.228541260049034, + "learning_rate": 8.191424319887684e-06, + "loss": 0.9213, + "step": 4225 + }, + { + "epoch": 0.6, + "grad_norm": 6.6637577910063195, + "learning_rate": 8.190535313655038e-06, + "loss": 1.0618, + "step": 4226 + }, + { + "epoch": 0.6, + "grad_norm": 
8.635057767330277, + "learning_rate": 8.189646137248119e-06, + "loss": 0.9849, + "step": 4227 + }, + { + "epoch": 0.6, + "grad_norm": 6.727189104059218, + "learning_rate": 8.188756790714353e-06, + "loss": 0.9893, + "step": 4228 + }, + { + "epoch": 0.6, + "grad_norm": 5.541338701329236, + "learning_rate": 8.187867274101178e-06, + "loss": 0.986, + "step": 4229 + }, + { + "epoch": 0.6, + "grad_norm": 7.152995348618623, + "learning_rate": 8.186977587456035e-06, + "loss": 0.9557, + "step": 4230 + }, + { + "epoch": 0.6, + "grad_norm": 7.947370537879317, + "learning_rate": 8.186087730826377e-06, + "loss": 0.9714, + "step": 4231 + }, + { + "epoch": 0.6, + "grad_norm": 5.814809890570026, + "learning_rate": 8.185197704259668e-06, + "loss": 1.0501, + "step": 4232 + }, + { + "epoch": 0.6, + "grad_norm": 7.191425066200165, + "learning_rate": 8.184307507803382e-06, + "loss": 0.9993, + "step": 4233 + }, + { + "epoch": 0.6, + "grad_norm": 7.670763831760544, + "learning_rate": 8.183417141504996e-06, + "loss": 0.9732, + "step": 4234 + }, + { + "epoch": 0.6, + "grad_norm": 8.615564181408553, + "learning_rate": 8.182526605412e-06, + "loss": 1.0776, + "step": 4235 + }, + { + "epoch": 0.6, + "grad_norm": 9.89030567757957, + "learning_rate": 8.181635899571894e-06, + "loss": 0.8834, + "step": 4236 + }, + { + "epoch": 0.6, + "grad_norm": 7.465180187783508, + "learning_rate": 8.180745024032186e-06, + "loss": 0.9758, + "step": 4237 + }, + { + "epoch": 0.6, + "grad_norm": 8.35956715971589, + "learning_rate": 8.17985397884039e-06, + "loss": 1.0227, + "step": 4238 + }, + { + "epoch": 0.6, + "grad_norm": 6.73731246190584, + "learning_rate": 8.178962764044036e-06, + "loss": 0.9659, + "step": 4239 + }, + { + "epoch": 0.6, + "grad_norm": 9.246858904374609, + "learning_rate": 8.178071379690656e-06, + "loss": 1.0367, + "step": 4240 + }, + { + "epoch": 0.6, + "grad_norm": 8.020006742917204, + "learning_rate": 8.177179825827795e-06, + "loss": 0.9595, + "step": 4241 + }, + { + "epoch": 0.6, + 
"grad_norm": 8.774340148415714, + "learning_rate": 8.176288102503005e-06, + "loss": 1.114, + "step": 4242 + }, + { + "epoch": 0.61, + "grad_norm": 6.833390531392376, + "learning_rate": 8.17539620976385e-06, + "loss": 0.9412, + "step": 4243 + }, + { + "epoch": 0.61, + "grad_norm": 6.646972603830489, + "learning_rate": 8.1745041476579e-06, + "loss": 1.0549, + "step": 4244 + }, + { + "epoch": 0.61, + "grad_norm": 8.923786936608225, + "learning_rate": 8.173611916232735e-06, + "loss": 1.0175, + "step": 4245 + }, + { + "epoch": 0.61, + "grad_norm": 6.596873548304152, + "learning_rate": 8.172719515535943e-06, + "loss": 0.991, + "step": 4246 + }, + { + "epoch": 0.61, + "grad_norm": 6.801027386076181, + "learning_rate": 8.171826945615124e-06, + "loss": 0.9697, + "step": 4247 + }, + { + "epoch": 0.61, + "grad_norm": 6.900280520550768, + "learning_rate": 8.170934206517886e-06, + "loss": 1.0556, + "step": 4248 + }, + { + "epoch": 0.61, + "grad_norm": 9.750842359781696, + "learning_rate": 8.170041298291841e-06, + "loss": 0.9676, + "step": 4249 + }, + { + "epoch": 0.61, + "grad_norm": 7.872163153392417, + "learning_rate": 8.16914822098462e-06, + "loss": 1.0542, + "step": 4250 + }, + { + "epoch": 0.61, + "grad_norm": 11.504246593017523, + "learning_rate": 8.168254974643851e-06, + "loss": 0.9579, + "step": 4251 + }, + { + "epoch": 0.61, + "grad_norm": 7.199855422770296, + "learning_rate": 8.167361559317184e-06, + "loss": 0.9963, + "step": 4252 + }, + { + "epoch": 0.61, + "grad_norm": 6.487382697119889, + "learning_rate": 8.166467975052265e-06, + "loss": 0.9059, + "step": 4253 + }, + { + "epoch": 0.61, + "grad_norm": 6.281532400625482, + "learning_rate": 8.165574221896761e-06, + "loss": 1.0346, + "step": 4254 + }, + { + "epoch": 0.61, + "grad_norm": 9.61640905338099, + "learning_rate": 8.164680299898337e-06, + "loss": 1.0265, + "step": 4255 + }, + { + "epoch": 0.61, + "grad_norm": 6.958803680746861, + "learning_rate": 8.163786209104675e-06, + "loss": 1.0454, + "step": 4256 + }, + { 
+ "epoch": 0.61, + "grad_norm": 7.166673478000275, + "learning_rate": 8.162891949563465e-06, + "loss": 1.023, + "step": 4257 + }, + { + "epoch": 0.61, + "grad_norm": 7.01909848282049, + "learning_rate": 8.1619975213224e-06, + "loss": 1.0163, + "step": 4258 + }, + { + "epoch": 0.61, + "grad_norm": 7.067229465974159, + "learning_rate": 8.161102924429191e-06, + "loss": 0.9755, + "step": 4259 + }, + { + "epoch": 0.61, + "grad_norm": 6.227459283924599, + "learning_rate": 8.16020815893155e-06, + "loss": 1.059, + "step": 4260 + }, + { + "epoch": 0.61, + "grad_norm": 8.293876525868518, + "learning_rate": 8.1593132248772e-06, + "loss": 1.035, + "step": 4261 + }, + { + "epoch": 0.61, + "grad_norm": 7.175379929351431, + "learning_rate": 8.15841812231388e-06, + "loss": 0.9569, + "step": 4262 + }, + { + "epoch": 0.61, + "grad_norm": 7.647973111632014, + "learning_rate": 8.157522851289328e-06, + "loss": 1.0771, + "step": 4263 + }, + { + "epoch": 0.61, + "grad_norm": 7.590276089832064, + "learning_rate": 8.156627411851295e-06, + "loss": 1.143, + "step": 4264 + }, + { + "epoch": 0.61, + "grad_norm": 7.1453627218600815, + "learning_rate": 8.155731804047543e-06, + "loss": 1.0782, + "step": 4265 + }, + { + "epoch": 0.61, + "grad_norm": 8.433282676000802, + "learning_rate": 8.154836027925841e-06, + "loss": 0.9503, + "step": 4266 + }, + { + "epoch": 0.61, + "grad_norm": 4.515483674982367, + "learning_rate": 8.153940083533967e-06, + "loss": 0.8907, + "step": 4267 + }, + { + "epoch": 0.61, + "grad_norm": 8.920331172180003, + "learning_rate": 8.153043970919707e-06, + "loss": 1.0348, + "step": 4268 + }, + { + "epoch": 0.61, + "grad_norm": 6.193960959259393, + "learning_rate": 8.152147690130858e-06, + "loss": 1.0518, + "step": 4269 + }, + { + "epoch": 0.61, + "grad_norm": 8.145880050200772, + "learning_rate": 8.151251241215226e-06, + "loss": 1.021, + "step": 4270 + }, + { + "epoch": 0.61, + "grad_norm": 7.884957344375402, + "learning_rate": 8.150354624220624e-06, + "loss": 0.8933, + "step": 
4271 + }, + { + "epoch": 0.61, + "grad_norm": 8.33104484469993, + "learning_rate": 8.149457839194877e-06, + "loss": 1.0246, + "step": 4272 + }, + { + "epoch": 0.61, + "grad_norm": 7.742154342597387, + "learning_rate": 8.148560886185813e-06, + "loss": 0.9646, + "step": 4273 + }, + { + "epoch": 0.61, + "grad_norm": 8.739327526098212, + "learning_rate": 8.147663765241278e-06, + "loss": 1.0037, + "step": 4274 + }, + { + "epoch": 0.61, + "grad_norm": 7.824827522794762, + "learning_rate": 8.146766476409116e-06, + "loss": 1.0551, + "step": 4275 + }, + { + "epoch": 0.61, + "grad_norm": 4.618538615729761, + "learning_rate": 8.145869019737192e-06, + "loss": 1.0405, + "step": 4276 + }, + { + "epoch": 0.61, + "grad_norm": 9.253130523155267, + "learning_rate": 8.14497139527337e-06, + "loss": 1.0304, + "step": 4277 + }, + { + "epoch": 0.61, + "grad_norm": 8.966535957616333, + "learning_rate": 8.144073603065527e-06, + "loss": 0.9979, + "step": 4278 + }, + { + "epoch": 0.61, + "grad_norm": 6.530299002405326, + "learning_rate": 8.143175643161551e-06, + "loss": 1.1254, + "step": 4279 + }, + { + "epoch": 0.61, + "grad_norm": 6.209576898035287, + "learning_rate": 8.142277515609335e-06, + "loss": 1.0137, + "step": 4280 + }, + { + "epoch": 0.61, + "grad_norm": 6.112203026042829, + "learning_rate": 8.141379220456783e-06, + "loss": 0.9499, + "step": 4281 + }, + { + "epoch": 0.61, + "grad_norm": 7.114574053954589, + "learning_rate": 8.140480757751807e-06, + "loss": 0.9344, + "step": 4282 + }, + { + "epoch": 0.61, + "grad_norm": 6.929542332417086, + "learning_rate": 8.13958212754233e-06, + "loss": 0.9572, + "step": 4283 + }, + { + "epoch": 0.61, + "grad_norm": 7.810754500079492, + "learning_rate": 8.13868332987628e-06, + "loss": 1.0938, + "step": 4284 + }, + { + "epoch": 0.61, + "grad_norm": 6.404913099938729, + "learning_rate": 8.137784364801598e-06, + "loss": 0.9414, + "step": 4285 + }, + { + "epoch": 0.61, + "grad_norm": 7.97717950637467, + "learning_rate": 8.136885232366232e-06, + 
"loss": 0.9832, + "step": 4286 + }, + { + "epoch": 0.61, + "grad_norm": 7.818826000970016, + "learning_rate": 8.13598593261814e-06, + "loss": 1.0149, + "step": 4287 + }, + { + "epoch": 0.61, + "grad_norm": 8.464875212763502, + "learning_rate": 8.135086465605288e-06, + "loss": 1.0595, + "step": 4288 + }, + { + "epoch": 0.61, + "grad_norm": 5.1400165222089, + "learning_rate": 8.13418683137565e-06, + "loss": 1.0108, + "step": 4289 + }, + { + "epoch": 0.61, + "grad_norm": 9.534517669473553, + "learning_rate": 8.13328702997721e-06, + "loss": 0.9868, + "step": 4290 + }, + { + "epoch": 0.61, + "grad_norm": 6.6939998828905285, + "learning_rate": 8.132387061457961e-06, + "loss": 1.1292, + "step": 4291 + }, + { + "epoch": 0.61, + "grad_norm": 8.749332388606344, + "learning_rate": 8.131486925865906e-06, + "loss": 1.0201, + "step": 4292 + }, + { + "epoch": 0.61, + "grad_norm": 8.045908517942042, + "learning_rate": 8.130586623249055e-06, + "loss": 1.018, + "step": 4293 + }, + { + "epoch": 0.61, + "grad_norm": 7.931859911955406, + "learning_rate": 8.129686153655427e-06, + "loss": 1.114, + "step": 4294 + }, + { + "epoch": 0.61, + "grad_norm": 6.71627828434491, + "learning_rate": 8.128785517133052e-06, + "loss": 0.9623, + "step": 4295 + }, + { + "epoch": 0.61, + "grad_norm": 7.975972778114865, + "learning_rate": 8.127884713729965e-06, + "loss": 1.008, + "step": 4296 + }, + { + "epoch": 0.61, + "grad_norm": 6.847692096693766, + "learning_rate": 8.126983743494215e-06, + "loss": 0.9789, + "step": 4297 + }, + { + "epoch": 0.61, + "grad_norm": 8.365745917113799, + "learning_rate": 8.126082606473856e-06, + "loss": 1.0201, + "step": 4298 + }, + { + "epoch": 0.61, + "grad_norm": 8.35970685468455, + "learning_rate": 8.12518130271695e-06, + "loss": 0.998, + "step": 4299 + }, + { + "epoch": 0.61, + "grad_norm": 9.234807081100186, + "learning_rate": 8.124279832271576e-06, + "loss": 0.9323, + "step": 4300 + }, + { + "epoch": 0.61, + "grad_norm": 6.251249794705171, + "learning_rate": 
8.123378195185811e-06, + "loss": 1.0311, + "step": 4301 + }, + { + "epoch": 0.61, + "grad_norm": 7.7543507690667886, + "learning_rate": 8.122476391507747e-06, + "loss": 1.0195, + "step": 4302 + }, + { + "epoch": 0.61, + "grad_norm": 9.4564061200934, + "learning_rate": 8.121574421285485e-06, + "loss": 0.9806, + "step": 4303 + }, + { + "epoch": 0.61, + "grad_norm": 7.501575562399604, + "learning_rate": 8.120672284567129e-06, + "loss": 1.0559, + "step": 4304 + }, + { + "epoch": 0.61, + "grad_norm": 7.321393010759918, + "learning_rate": 8.119769981400802e-06, + "loss": 1.0803, + "step": 4305 + }, + { + "epoch": 0.61, + "grad_norm": 9.837750932707259, + "learning_rate": 8.118867511834628e-06, + "loss": 0.9926, + "step": 4306 + }, + { + "epoch": 0.61, + "grad_norm": 7.771048120577073, + "learning_rate": 8.117964875916743e-06, + "loss": 0.9807, + "step": 4307 + }, + { + "epoch": 0.61, + "grad_norm": 7.520802268541189, + "learning_rate": 8.117062073695288e-06, + "loss": 0.9548, + "step": 4308 + }, + { + "epoch": 0.61, + "grad_norm": 9.521917632033086, + "learning_rate": 8.11615910521842e-06, + "loss": 1.0756, + "step": 4309 + }, + { + "epoch": 0.61, + "grad_norm": 9.842595173408192, + "learning_rate": 8.1152559705343e-06, + "loss": 0.9355, + "step": 4310 + }, + { + "epoch": 0.61, + "grad_norm": 5.2754031480291355, + "learning_rate": 8.114352669691097e-06, + "loss": 0.9986, + "step": 4311 + }, + { + "epoch": 0.61, + "grad_norm": 8.609646461221175, + "learning_rate": 8.113449202736992e-06, + "loss": 1.0581, + "step": 4312 + }, + { + "epoch": 0.62, + "grad_norm": 7.952626858217254, + "learning_rate": 8.112545569720171e-06, + "loss": 1.0289, + "step": 4313 + }, + { + "epoch": 0.62, + "grad_norm": 7.373758981583769, + "learning_rate": 8.111641770688837e-06, + "loss": 0.9447, + "step": 4314 + }, + { + "epoch": 0.62, + "grad_norm": 4.894986826889681, + "learning_rate": 8.110737805691188e-06, + "loss": 1.01, + "step": 4315 + }, + { + "epoch": 0.62, + "grad_norm": 
7.2334732515885705, + "learning_rate": 8.109833674775447e-06, + "loss": 0.989, + "step": 4316 + }, + { + "epoch": 0.62, + "grad_norm": 6.298884343757658, + "learning_rate": 8.108929377989832e-06, + "loss": 1.0601, + "step": 4317 + }, + { + "epoch": 0.62, + "grad_norm": 7.3171084564917495, + "learning_rate": 8.108024915382577e-06, + "loss": 1.0086, + "step": 4318 + }, + { + "epoch": 0.62, + "grad_norm": 7.330859701598954, + "learning_rate": 8.107120287001924e-06, + "loss": 1.0026, + "step": 4319 + }, + { + "epoch": 0.62, + "grad_norm": 6.678639923047041, + "learning_rate": 8.106215492896126e-06, + "loss": 0.9608, + "step": 4320 + }, + { + "epoch": 0.62, + "grad_norm": 10.540447691995382, + "learning_rate": 8.105310533113439e-06, + "loss": 0.97, + "step": 4321 + }, + { + "epoch": 0.62, + "grad_norm": 9.654190433540723, + "learning_rate": 8.104405407702132e-06, + "loss": 0.9828, + "step": 4322 + }, + { + "epoch": 0.62, + "grad_norm": 8.85311049258524, + "learning_rate": 8.10350011671048e-06, + "loss": 1.0474, + "step": 4323 + }, + { + "epoch": 0.62, + "grad_norm": 6.241684559692439, + "learning_rate": 8.10259466018677e-06, + "loss": 1.0007, + "step": 4324 + }, + { + "epoch": 0.62, + "grad_norm": 8.591604100020936, + "learning_rate": 8.101689038179301e-06, + "loss": 1.0152, + "step": 4325 + }, + { + "epoch": 0.62, + "grad_norm": 9.295446400424703, + "learning_rate": 8.10078325073637e-06, + "loss": 1.0132, + "step": 4326 + }, + { + "epoch": 0.62, + "grad_norm": 6.102665770243578, + "learning_rate": 8.09987729790629e-06, + "loss": 0.952, + "step": 4327 + }, + { + "epoch": 0.62, + "grad_norm": 7.126241120568903, + "learning_rate": 8.098971179737385e-06, + "loss": 1.0172, + "step": 4328 + }, + { + "epoch": 0.62, + "grad_norm": 7.432652252247937, + "learning_rate": 8.098064896277984e-06, + "loss": 0.8959, + "step": 4329 + }, + { + "epoch": 0.62, + "grad_norm": 5.63670043908674, + "learning_rate": 8.097158447576424e-06, + "loss": 0.9551, + "step": 4330 + }, + { + "epoch": 
0.62, + "grad_norm": 7.460691492447831, + "learning_rate": 8.096251833681053e-06, + "loss": 1.0072, + "step": 4331 + }, + { + "epoch": 0.62, + "grad_norm": 7.18760093510869, + "learning_rate": 8.095345054640229e-06, + "loss": 1.0385, + "step": 4332 + }, + { + "epoch": 0.62, + "grad_norm": 7.035528655391563, + "learning_rate": 8.094438110502315e-06, + "loss": 1.0607, + "step": 4333 + }, + { + "epoch": 0.62, + "grad_norm": 9.09448242500796, + "learning_rate": 8.093531001315684e-06, + "loss": 1.0779, + "step": 4334 + }, + { + "epoch": 0.62, + "grad_norm": 9.611569025605046, + "learning_rate": 8.092623727128724e-06, + "loss": 0.9824, + "step": 4335 + }, + { + "epoch": 0.62, + "grad_norm": 10.550682598759593, + "learning_rate": 8.09171628798982e-06, + "loss": 1.0048, + "step": 4336 + }, + { + "epoch": 0.62, + "grad_norm": 7.130239314977041, + "learning_rate": 8.090808683947376e-06, + "loss": 0.9641, + "step": 4337 + }, + { + "epoch": 0.62, + "grad_norm": 8.292307576187612, + "learning_rate": 8.089900915049798e-06, + "loss": 0.9654, + "step": 4338 + }, + { + "epoch": 0.62, + "grad_norm": 6.2945828339659755, + "learning_rate": 8.08899298134551e-06, + "loss": 1.0551, + "step": 4339 + }, + { + "epoch": 0.62, + "grad_norm": 9.135902433949825, + "learning_rate": 8.088084882882932e-06, + "loss": 1.0248, + "step": 4340 + }, + { + "epoch": 0.62, + "grad_norm": 6.087608688769553, + "learning_rate": 8.087176619710503e-06, + "loss": 0.9638, + "step": 4341 + }, + { + "epoch": 0.62, + "grad_norm": 8.216815671721895, + "learning_rate": 8.086268191876664e-06, + "loss": 1.0521, + "step": 4342 + }, + { + "epoch": 0.62, + "grad_norm": 8.237032063299633, + "learning_rate": 8.085359599429872e-06, + "loss": 1.0621, + "step": 4343 + }, + { + "epoch": 0.62, + "grad_norm": 5.964363989724386, + "learning_rate": 8.084450842418588e-06, + "loss": 0.9811, + "step": 4344 + }, + { + "epoch": 0.62, + "grad_norm": 7.26191259153686, + "learning_rate": 8.083541920891281e-06, + "loss": 1.01, + "step": 4345 
+ }, + { + "epoch": 0.62, + "grad_norm": 7.1078818351471345, + "learning_rate": 8.082632834896431e-06, + "loss": 0.9724, + "step": 4346 + }, + { + "epoch": 0.62, + "grad_norm": 9.460748100906226, + "learning_rate": 8.081723584482526e-06, + "loss": 0.9095, + "step": 4347 + }, + { + "epoch": 0.62, + "grad_norm": 7.6120897363217885, + "learning_rate": 8.080814169698064e-06, + "loss": 1.0143, + "step": 4348 + }, + { + "epoch": 0.62, + "grad_norm": 5.707880713587209, + "learning_rate": 8.079904590591549e-06, + "loss": 0.9323, + "step": 4349 + }, + { + "epoch": 0.62, + "grad_norm": 6.942281417081743, + "learning_rate": 8.078994847211497e-06, + "loss": 1.0039, + "step": 4350 + }, + { + "epoch": 0.62, + "grad_norm": 7.350841575443985, + "learning_rate": 8.078084939606428e-06, + "loss": 0.9949, + "step": 4351 + }, + { + "epoch": 0.62, + "grad_norm": 8.930699349950764, + "learning_rate": 8.077174867824877e-06, + "loss": 0.9539, + "step": 4352 + }, + { + "epoch": 0.62, + "grad_norm": 8.531741914683286, + "learning_rate": 8.076264631915385e-06, + "loss": 1.0484, + "step": 4353 + }, + { + "epoch": 0.62, + "grad_norm": 7.120794887298456, + "learning_rate": 8.0753542319265e-06, + "loss": 1.0305, + "step": 4354 + }, + { + "epoch": 0.62, + "grad_norm": 8.112336673533738, + "learning_rate": 8.07444366790678e-06, + "loss": 0.9015, + "step": 4355 + }, + { + "epoch": 0.62, + "grad_norm": 8.102788322172641, + "learning_rate": 8.073532939904793e-06, + "loss": 1.0113, + "step": 4356 + }, + { + "epoch": 0.62, + "grad_norm": 6.4698921350399345, + "learning_rate": 8.072622047969114e-06, + "loss": 0.9737, + "step": 4357 + }, + { + "epoch": 0.62, + "grad_norm": 6.033467859386554, + "learning_rate": 8.071710992148328e-06, + "loss": 0.9326, + "step": 4358 + }, + { + "epoch": 0.62, + "grad_norm": 5.921699091617098, + "learning_rate": 8.070799772491027e-06, + "loss": 0.9809, + "step": 4359 + }, + { + "epoch": 0.62, + "grad_norm": 7.6421891983763395, + "learning_rate": 8.069888389045815e-06, + 
"loss": 0.9402, + "step": 4360 + }, + { + "epoch": 0.62, + "grad_norm": 10.126651968387728, + "learning_rate": 8.0689768418613e-06, + "loss": 0.9894, + "step": 4361 + }, + { + "epoch": 0.62, + "grad_norm": 9.024802284820415, + "learning_rate": 8.068065130986104e-06, + "loss": 0.9744, + "step": 4362 + }, + { + "epoch": 0.62, + "grad_norm": 7.045843544766518, + "learning_rate": 8.067153256468853e-06, + "loss": 1.0478, + "step": 4363 + }, + { + "epoch": 0.62, + "grad_norm": 7.3428245857726795, + "learning_rate": 8.066241218358187e-06, + "loss": 0.9491, + "step": 4364 + }, + { + "epoch": 0.62, + "grad_norm": 7.2714035638767704, + "learning_rate": 8.065329016702748e-06, + "loss": 1.0075, + "step": 4365 + }, + { + "epoch": 0.62, + "grad_norm": 8.764169402141265, + "learning_rate": 8.06441665155119e-06, + "loss": 0.9277, + "step": 4366 + }, + { + "epoch": 0.62, + "grad_norm": 6.355042756649542, + "learning_rate": 8.06350412295218e-06, + "loss": 1.0461, + "step": 4367 + }, + { + "epoch": 0.62, + "grad_norm": 8.762582048893684, + "learning_rate": 8.062591430954387e-06, + "loss": 0.9734, + "step": 4368 + }, + { + "epoch": 0.62, + "grad_norm": 9.773321118316872, + "learning_rate": 8.06167857560649e-06, + "loss": 1.0135, + "step": 4369 + }, + { + "epoch": 0.62, + "grad_norm": 6.073046875551305, + "learning_rate": 8.060765556957182e-06, + "loss": 0.9547, + "step": 4370 + }, + { + "epoch": 0.62, + "grad_norm": 8.711562169240377, + "learning_rate": 8.059852375055157e-06, + "loss": 0.933, + "step": 4371 + }, + { + "epoch": 0.62, + "grad_norm": 8.014241228884423, + "learning_rate": 8.058939029949127e-06, + "loss": 1.0308, + "step": 4372 + }, + { + "epoch": 0.62, + "grad_norm": 7.266911560510229, + "learning_rate": 8.0580255216878e-06, + "loss": 1.0168, + "step": 4373 + }, + { + "epoch": 0.62, + "grad_norm": 8.082565847905206, + "learning_rate": 8.057111850319906e-06, + "loss": 0.9763, + "step": 4374 + }, + { + "epoch": 0.62, + "grad_norm": 8.268771181280261, + "learning_rate": 
8.056198015894174e-06, + "loss": 1.0083, + "step": 4375 + }, + { + "epoch": 0.62, + "grad_norm": 5.878077215789536, + "learning_rate": 8.055284018459347e-06, + "loss": 0.9436, + "step": 4376 + }, + { + "epoch": 0.62, + "grad_norm": 7.3791091488442175, + "learning_rate": 8.054369858064176e-06, + "loss": 1.0134, + "step": 4377 + }, + { + "epoch": 0.62, + "grad_norm": 9.410395057469868, + "learning_rate": 8.053455534757418e-06, + "loss": 0.9964, + "step": 4378 + }, + { + "epoch": 0.62, + "grad_norm": 8.906425523982469, + "learning_rate": 8.052541048587841e-06, + "loss": 1.0099, + "step": 4379 + }, + { + "epoch": 0.62, + "grad_norm": 6.2488723791544425, + "learning_rate": 8.051626399604222e-06, + "loss": 0.9853, + "step": 4380 + }, + { + "epoch": 0.62, + "grad_norm": 8.944124783532807, + "learning_rate": 8.050711587855344e-06, + "loss": 0.948, + "step": 4381 + }, + { + "epoch": 0.62, + "grad_norm": 7.530014100104422, + "learning_rate": 8.049796613390003e-06, + "loss": 0.963, + "step": 4382 + }, + { + "epoch": 0.63, + "grad_norm": 6.319705858640474, + "learning_rate": 8.048881476256999e-06, + "loss": 1.1088, + "step": 4383 + }, + { + "epoch": 0.63, + "grad_norm": 5.135870613757859, + "learning_rate": 8.047966176505142e-06, + "loss": 0.9734, + "step": 4384 + }, + { + "epoch": 0.63, + "grad_norm": 10.05179508899505, + "learning_rate": 8.047050714183256e-06, + "loss": 1.0794, + "step": 4385 + }, + { + "epoch": 0.63, + "grad_norm": 5.84393903128063, + "learning_rate": 8.046135089340165e-06, + "loss": 0.879, + "step": 4386 + }, + { + "epoch": 0.63, + "grad_norm": 8.649648437567436, + "learning_rate": 8.045219302024708e-06, + "loss": 0.9668, + "step": 4387 + }, + { + "epoch": 0.63, + "grad_norm": 8.537610284523518, + "learning_rate": 8.04430335228573e-06, + "loss": 0.9949, + "step": 4388 + }, + { + "epoch": 0.63, + "grad_norm": 6.83345224578353, + "learning_rate": 8.043387240172083e-06, + "loss": 0.9835, + "step": 4389 + }, + { + "epoch": 0.63, + "grad_norm": 
7.1317550938411385, + "learning_rate": 8.042470965732634e-06, + "loss": 0.9597, + "step": 4390 + }, + { + "epoch": 0.63, + "grad_norm": 7.173661079869763, + "learning_rate": 8.041554529016252e-06, + "loss": 1.0049, + "step": 4391 + }, + { + "epoch": 0.63, + "grad_norm": 8.498609077249386, + "learning_rate": 8.040637930071817e-06, + "loss": 1.0549, + "step": 4392 + }, + { + "epoch": 0.63, + "grad_norm": 7.09525249294798, + "learning_rate": 8.03972116894822e-06, + "loss": 1.0259, + "step": 4393 + }, + { + "epoch": 0.63, + "grad_norm": 8.21808170322937, + "learning_rate": 8.038804245694355e-06, + "loss": 0.9967, + "step": 4394 + }, + { + "epoch": 0.63, + "grad_norm": 9.412392629344447, + "learning_rate": 8.037887160359132e-06, + "loss": 1.0571, + "step": 4395 + }, + { + "epoch": 0.63, + "grad_norm": 6.078198573712498, + "learning_rate": 8.036969912991462e-06, + "loss": 0.9372, + "step": 4396 + }, + { + "epoch": 0.63, + "grad_norm": 7.342519845591778, + "learning_rate": 8.036052503640272e-06, + "loss": 0.8846, + "step": 4397 + }, + { + "epoch": 0.63, + "grad_norm": 5.9092261751181185, + "learning_rate": 8.035134932354491e-06, + "loss": 0.9201, + "step": 4398 + }, + { + "epoch": 0.63, + "grad_norm": 6.195099165572067, + "learning_rate": 8.034217199183062e-06, + "loss": 1.013, + "step": 4399 + }, + { + "epoch": 0.63, + "grad_norm": 6.586036144983698, + "learning_rate": 8.033299304174932e-06, + "loss": 0.9613, + "step": 4400 + }, + { + "epoch": 0.63, + "grad_norm": 6.997323933794569, + "learning_rate": 8.032381247379063e-06, + "loss": 0.9526, + "step": 4401 + }, + { + "epoch": 0.63, + "grad_norm": 6.524054325579481, + "learning_rate": 8.031463028844418e-06, + "loss": 1.0393, + "step": 4402 + }, + { + "epoch": 0.63, + "grad_norm": 5.002570455058564, + "learning_rate": 8.030544648619973e-06, + "loss": 0.9914, + "step": 4403 + }, + { + "epoch": 0.63, + "grad_norm": 9.366422888466698, + "learning_rate": 8.029626106754711e-06, + "loss": 1.0203, + "step": 4404 + }, + { + 
"epoch": 0.63, + "grad_norm": 9.248216130089116, + "learning_rate": 8.028707403297625e-06, + "loss": 1.0014, + "step": 4405 + }, + { + "epoch": 0.63, + "grad_norm": 4.221355370913504, + "learning_rate": 8.027788538297719e-06, + "loss": 0.9912, + "step": 4406 + }, + { + "epoch": 0.63, + "grad_norm": 6.758805651000877, + "learning_rate": 8.026869511803998e-06, + "loss": 0.8856, + "step": 4407 + }, + { + "epoch": 0.63, + "grad_norm": 6.14709473311514, + "learning_rate": 8.025950323865484e-06, + "loss": 0.9678, + "step": 4408 + }, + { + "epoch": 0.63, + "grad_norm": 8.250436454931489, + "learning_rate": 8.025030974531202e-06, + "loss": 0.8524, + "step": 4409 + }, + { + "epoch": 0.63, + "grad_norm": 6.699848403452825, + "learning_rate": 8.024111463850189e-06, + "loss": 0.9676, + "step": 4410 + }, + { + "epoch": 0.63, + "grad_norm": 7.020703611939719, + "learning_rate": 8.023191791871487e-06, + "loss": 1.0159, + "step": 4411 + }, + { + "epoch": 0.63, + "grad_norm": 6.89379316570464, + "learning_rate": 8.02227195864415e-06, + "loss": 1.0242, + "step": 4412 + }, + { + "epoch": 0.63, + "grad_norm": 7.582185552617987, + "learning_rate": 8.021351964217238e-06, + "loss": 0.973, + "step": 4413 + }, + { + "epoch": 0.63, + "grad_norm": 5.351567026644553, + "learning_rate": 8.020431808639825e-06, + "loss": 0.9992, + "step": 4414 + }, + { + "epoch": 0.63, + "grad_norm": 7.621905378279095, + "learning_rate": 8.019511491960985e-06, + "loss": 0.9964, + "step": 4415 + }, + { + "epoch": 0.63, + "grad_norm": 8.627478480131204, + "learning_rate": 8.018591014229808e-06, + "loss": 0.9877, + "step": 4416 + }, + { + "epoch": 0.63, + "grad_norm": 8.434929598430582, + "learning_rate": 8.017670375495388e-06, + "loss": 1.0455, + "step": 4417 + }, + { + "epoch": 0.63, + "grad_norm": 6.232218624255366, + "learning_rate": 8.01674957580683e-06, + "loss": 0.9686, + "step": 4418 + }, + { + "epoch": 0.63, + "grad_norm": 6.728793278079519, + "learning_rate": 8.015828615213245e-06, + "loss": 0.9753, + 
"step": 4419 + }, + { + "epoch": 0.63, + "grad_norm": 7.436808465808798, + "learning_rate": 8.01490749376376e-06, + "loss": 1.0639, + "step": 4420 + }, + { + "epoch": 0.63, + "grad_norm": 5.738403100492077, + "learning_rate": 8.013986211507498e-06, + "loss": 1.0253, + "step": 4421 + }, + { + "epoch": 0.63, + "grad_norm": 5.613530945900611, + "learning_rate": 8.013064768493604e-06, + "loss": 0.9404, + "step": 4422 + }, + { + "epoch": 0.63, + "grad_norm": 6.079251072186428, + "learning_rate": 8.01214316477122e-06, + "loss": 1.0731, + "step": 4423 + }, + { + "epoch": 0.63, + "grad_norm": 7.376904175765828, + "learning_rate": 8.011221400389504e-06, + "loss": 0.9584, + "step": 4424 + }, + { + "epoch": 0.63, + "grad_norm": 7.718802696675683, + "learning_rate": 8.01029947539762e-06, + "loss": 1.024, + "step": 4425 + }, + { + "epoch": 0.63, + "grad_norm": 7.860228534425378, + "learning_rate": 8.009377389844743e-06, + "loss": 1.065, + "step": 4426 + }, + { + "epoch": 0.63, + "grad_norm": 6.6555744645233785, + "learning_rate": 8.008455143780051e-06, + "loss": 1.0436, + "step": 4427 + }, + { + "epoch": 0.63, + "grad_norm": 7.40760137808683, + "learning_rate": 8.007532737252737e-06, + "loss": 0.9923, + "step": 4428 + }, + { + "epoch": 0.63, + "grad_norm": 5.229540789958909, + "learning_rate": 8.006610170311998e-06, + "loss": 0.9359, + "step": 4429 + }, + { + "epoch": 0.63, + "grad_norm": 7.307926155667543, + "learning_rate": 8.00568744300704e-06, + "loss": 1.0534, + "step": 4430 + }, + { + "epoch": 0.63, + "grad_norm": 8.895112668861872, + "learning_rate": 8.00476455538708e-06, + "loss": 0.9225, + "step": 4431 + }, + { + "epoch": 0.63, + "grad_norm": 7.33911374197293, + "learning_rate": 8.003841507501345e-06, + "loss": 0.9667, + "step": 4432 + }, + { + "epoch": 0.63, + "grad_norm": 7.838944621607795, + "learning_rate": 8.002918299399064e-06, + "loss": 0.9825, + "step": 4433 + }, + { + "epoch": 0.63, + "grad_norm": 7.609414154849799, + "learning_rate": 8.00199493112948e-06, + 
"loss": 1.0023, + "step": 4434 + }, + { + "epoch": 0.63, + "grad_norm": 5.563402885868941, + "learning_rate": 8.001071402741843e-06, + "loss": 1.0065, + "step": 4435 + }, + { + "epoch": 0.63, + "grad_norm": 7.69705725148762, + "learning_rate": 8.000147714285409e-06, + "loss": 0.9956, + "step": 4436 + }, + { + "epoch": 0.63, + "grad_norm": 8.52480487631482, + "learning_rate": 7.999223865809446e-06, + "loss": 1.0046, + "step": 4437 + }, + { + "epoch": 0.63, + "grad_norm": 7.867513460853983, + "learning_rate": 7.998299857363234e-06, + "loss": 0.9232, + "step": 4438 + }, + { + "epoch": 0.63, + "grad_norm": 6.769466632487087, + "learning_rate": 7.997375688996051e-06, + "loss": 1.038, + "step": 4439 + }, + { + "epoch": 0.63, + "grad_norm": 7.031235425730343, + "learning_rate": 7.996451360757193e-06, + "loss": 1.0591, + "step": 4440 + }, + { + "epoch": 0.63, + "grad_norm": 7.782887873321294, + "learning_rate": 7.99552687269596e-06, + "loss": 1.0499, + "step": 4441 + }, + { + "epoch": 0.63, + "grad_norm": 5.936219579069564, + "learning_rate": 7.99460222486166e-06, + "loss": 1.046, + "step": 4442 + }, + { + "epoch": 0.63, + "grad_norm": 6.180117080553916, + "learning_rate": 7.993677417303616e-06, + "loss": 1.0645, + "step": 4443 + }, + { + "epoch": 0.63, + "grad_norm": 7.023196025788807, + "learning_rate": 7.99275245007115e-06, + "loss": 0.9608, + "step": 4444 + }, + { + "epoch": 0.63, + "grad_norm": 6.305493194041208, + "learning_rate": 7.9918273232136e-06, + "loss": 0.9702, + "step": 4445 + }, + { + "epoch": 0.63, + "grad_norm": 7.80837929802629, + "learning_rate": 7.990902036780306e-06, + "loss": 0.9957, + "step": 4446 + }, + { + "epoch": 0.63, + "grad_norm": 7.6356576977344535, + "learning_rate": 7.989976590820623e-06, + "loss": 0.9471, + "step": 4447 + }, + { + "epoch": 0.63, + "grad_norm": 6.705407725271706, + "learning_rate": 7.989050985383912e-06, + "loss": 0.9969, + "step": 4448 + }, + { + "epoch": 0.63, + "grad_norm": 5.372537466827764, + "learning_rate": 
7.988125220519541e-06, + "loss": 0.9718, + "step": 4449 + }, + { + "epoch": 0.63, + "grad_norm": 7.538557942869383, + "learning_rate": 7.987199296276888e-06, + "loss": 0.9798, + "step": 4450 + }, + { + "epoch": 0.63, + "grad_norm": 9.577974252766172, + "learning_rate": 7.986273212705341e-06, + "loss": 1.0514, + "step": 4451 + }, + { + "epoch": 0.63, + "grad_norm": 6.749441954108607, + "learning_rate": 7.985346969854292e-06, + "loss": 1.0352, + "step": 4452 + }, + { + "epoch": 0.64, + "grad_norm": 7.374734836243993, + "learning_rate": 7.984420567773146e-06, + "loss": 0.985, + "step": 4453 + }, + { + "epoch": 0.64, + "grad_norm": 7.061517176355299, + "learning_rate": 7.983494006511314e-06, + "loss": 0.9966, + "step": 4454 + }, + { + "epoch": 0.64, + "grad_norm": 7.873040405579287, + "learning_rate": 7.982567286118216e-06, + "loss": 1.0184, + "step": 4455 + }, + { + "epoch": 0.64, + "grad_norm": 6.437582115200383, + "learning_rate": 7.98164040664328e-06, + "loss": 0.9787, + "step": 4456 + }, + { + "epoch": 0.64, + "grad_norm": 8.335261925919147, + "learning_rate": 7.980713368135945e-06, + "loss": 0.9944, + "step": 4457 + }, + { + "epoch": 0.64, + "grad_norm": 9.583765628085265, + "learning_rate": 7.979786170645657e-06, + "loss": 1.0642, + "step": 4458 + }, + { + "epoch": 0.64, + "grad_norm": 9.371439547327144, + "learning_rate": 7.978858814221868e-06, + "loss": 0.9527, + "step": 4459 + }, + { + "epoch": 0.64, + "grad_norm": 7.477546003989721, + "learning_rate": 7.977931298914041e-06, + "loss": 0.9581, + "step": 4460 + }, + { + "epoch": 0.64, + "grad_norm": 8.262881436118219, + "learning_rate": 7.97700362477165e-06, + "loss": 1.004, + "step": 4461 + }, + { + "epoch": 0.64, + "grad_norm": 7.950646340939108, + "learning_rate": 7.97607579184417e-06, + "loss": 1.0421, + "step": 4462 + }, + { + "epoch": 0.64, + "grad_norm": 7.520528643408285, + "learning_rate": 7.975147800181091e-06, + "loss": 1.0393, + "step": 4463 + }, + { + "epoch": 0.64, + "grad_norm": 
7.4147637222682174, + "learning_rate": 7.97421964983191e-06, + "loss": 0.9958, + "step": 4464 + }, + { + "epoch": 0.64, + "grad_norm": 10.156324558692187, + "learning_rate": 7.973291340846132e-06, + "loss": 0.9424, + "step": 4465 + }, + { + "epoch": 0.64, + "grad_norm": 7.644012275292732, + "learning_rate": 7.97236287327327e-06, + "loss": 1.0073, + "step": 4466 + }, + { + "epoch": 0.64, + "grad_norm": 5.1675030632596535, + "learning_rate": 7.971434247162846e-06, + "loss": 0.9344, + "step": 4467 + }, + { + "epoch": 0.64, + "grad_norm": 6.20087750187666, + "learning_rate": 7.97050546256439e-06, + "loss": 1.0851, + "step": 4468 + }, + { + "epoch": 0.64, + "grad_norm": 8.52197505497717, + "learning_rate": 7.969576519527441e-06, + "loss": 1.0598, + "step": 4469 + }, + { + "epoch": 0.64, + "grad_norm": 10.665902651399547, + "learning_rate": 7.968647418101548e-06, + "loss": 0.9433, + "step": 4470 + }, + { + "epoch": 0.64, + "grad_norm": 6.566585713019903, + "learning_rate": 7.967718158336264e-06, + "loss": 1.0178, + "step": 4471 + }, + { + "epoch": 0.64, + "grad_norm": 9.69476509500092, + "learning_rate": 7.966788740281153e-06, + "loss": 0.9904, + "step": 4472 + }, + { + "epoch": 0.64, + "grad_norm": 7.24400143416992, + "learning_rate": 7.96585916398579e-06, + "loss": 1.0683, + "step": 4473 + }, + { + "epoch": 0.64, + "grad_norm": 6.328486143279428, + "learning_rate": 7.964929429499755e-06, + "loss": 0.9471, + "step": 4474 + }, + { + "epoch": 0.64, + "grad_norm": 7.431493554542962, + "learning_rate": 7.963999536872636e-06, + "loss": 1.0426, + "step": 4475 + }, + { + "epoch": 0.64, + "grad_norm": 7.890382109362813, + "learning_rate": 7.963069486154032e-06, + "loss": 0.9932, + "step": 4476 + }, + { + "epoch": 0.64, + "grad_norm": 7.027003864458365, + "learning_rate": 7.96213927739355e-06, + "loss": 0.9641, + "step": 4477 + }, + { + "epoch": 0.64, + "grad_norm": 6.3617359818701456, + "learning_rate": 7.961208910640805e-06, + "loss": 0.9582, + "step": 4478 + }, + { + "epoch": 
0.64, + "grad_norm": 6.43208430994814, + "learning_rate": 7.960278385945418e-06, + "loss": 0.9029, + "step": 4479 + }, + { + "epoch": 0.64, + "grad_norm": 8.788079008084743, + "learning_rate": 7.95934770335702e-06, + "loss": 0.9723, + "step": 4480 + }, + { + "epoch": 0.64, + "grad_norm": 7.885382168888156, + "learning_rate": 7.958416862925256e-06, + "loss": 0.9875, + "step": 4481 + }, + { + "epoch": 0.64, + "grad_norm": 5.730458623372167, + "learning_rate": 7.957485864699773e-06, + "loss": 0.8887, + "step": 4482 + }, + { + "epoch": 0.64, + "grad_norm": 8.433819649309868, + "learning_rate": 7.956554708730223e-06, + "loss": 1.0018, + "step": 4483 + }, + { + "epoch": 0.64, + "grad_norm": 7.228205487746287, + "learning_rate": 7.955623395066274e-06, + "loss": 0.9554, + "step": 4484 + }, + { + "epoch": 0.64, + "grad_norm": 7.9044780460870685, + "learning_rate": 7.954691923757602e-06, + "loss": 0.973, + "step": 4485 + }, + { + "epoch": 0.64, + "grad_norm": 6.9910468764337175, + "learning_rate": 7.953760294853886e-06, + "loss": 1.0226, + "step": 4486 + }, + { + "epoch": 0.64, + "grad_norm": 7.461600901916325, + "learning_rate": 7.952828508404819e-06, + "loss": 1.0448, + "step": 4487 + }, + { + "epoch": 0.64, + "grad_norm": 8.468685864856946, + "learning_rate": 7.951896564460098e-06, + "loss": 1.0106, + "step": 4488 + }, + { + "epoch": 0.64, + "grad_norm": 9.671560644033084, + "learning_rate": 7.95096446306943e-06, + "loss": 1.0286, + "step": 4489 + }, + { + "epoch": 0.64, + "grad_norm": 7.6990098113913525, + "learning_rate": 7.950032204282532e-06, + "loss": 1.0061, + "step": 4490 + }, + { + "epoch": 0.64, + "grad_norm": 5.906423849495994, + "learning_rate": 7.949099788149128e-06, + "loss": 0.9938, + "step": 4491 + }, + { + "epoch": 0.64, + "grad_norm": 8.873828236110434, + "learning_rate": 7.94816721471895e-06, + "loss": 1.0082, + "step": 4492 + }, + { + "epoch": 0.64, + "grad_norm": 5.779393871771466, + "learning_rate": 7.94723448404174e-06, + "loss": 0.9677, + "step": 
4493 + }, + { + "epoch": 0.64, + "grad_norm": 5.905008571495095, + "learning_rate": 7.946301596167245e-06, + "loss": 1.0386, + "step": 4494 + }, + { + "epoch": 0.64, + "grad_norm": 7.995256233734595, + "learning_rate": 7.945368551145226e-06, + "loss": 0.9766, + "step": 4495 + }, + { + "epoch": 0.64, + "grad_norm": 9.157537211484254, + "learning_rate": 7.944435349025444e-06, + "loss": 0.9665, + "step": 4496 + }, + { + "epoch": 0.64, + "grad_norm": 8.344508440612529, + "learning_rate": 7.943501989857678e-06, + "loss": 0.9541, + "step": 4497 + }, + { + "epoch": 0.64, + "grad_norm": 5.181040942252074, + "learning_rate": 7.94256847369171e-06, + "loss": 0.959, + "step": 4498 + }, + { + "epoch": 0.64, + "grad_norm": 7.345319317693958, + "learning_rate": 7.941634800577329e-06, + "loss": 1.015, + "step": 4499 + }, + { + "epoch": 0.64, + "grad_norm": 7.394847055412412, + "learning_rate": 7.940700970564336e-06, + "loss": 0.9196, + "step": 4500 + }, + { + "epoch": 0.64, + "grad_norm": 7.326185226809631, + "learning_rate": 7.93976698370254e-06, + "loss": 0.8853, + "step": 4501 + }, + { + "epoch": 0.64, + "grad_norm": 6.197763816954727, + "learning_rate": 7.938832840041754e-06, + "loss": 0.9591, + "step": 4502 + }, + { + "epoch": 0.64, + "grad_norm": 6.406380305949381, + "learning_rate": 7.937898539631806e-06, + "loss": 0.9916, + "step": 4503 + }, + { + "epoch": 0.64, + "grad_norm": 6.466310934871315, + "learning_rate": 7.936964082522526e-06, + "loss": 0.9492, + "step": 4504 + }, + { + "epoch": 0.64, + "grad_norm": 9.990267729470478, + "learning_rate": 7.936029468763757e-06, + "loss": 0.9589, + "step": 4505 + }, + { + "epoch": 0.64, + "grad_norm": 7.3932450636464315, + "learning_rate": 7.93509469840535e-06, + "loss": 0.8945, + "step": 4506 + }, + { + "epoch": 0.64, + "grad_norm": 7.098249820546774, + "learning_rate": 7.934159771497162e-06, + "loss": 0.9782, + "step": 4507 + }, + { + "epoch": 0.64, + "grad_norm": 6.080114038769861, + "learning_rate": 7.933224688089059e-06, + 
"loss": 1.0175, + "step": 4508 + }, + { + "epoch": 0.64, + "grad_norm": 7.17711760717482, + "learning_rate": 7.932289448230915e-06, + "loss": 0.9866, + "step": 4509 + }, + { + "epoch": 0.64, + "grad_norm": 6.679166542601174, + "learning_rate": 7.931354051972613e-06, + "loss": 0.9565, + "step": 4510 + }, + { + "epoch": 0.64, + "grad_norm": 7.52200910279434, + "learning_rate": 7.930418499364048e-06, + "loss": 0.9772, + "step": 4511 + }, + { + "epoch": 0.64, + "grad_norm": 8.328142688833752, + "learning_rate": 7.929482790455114e-06, + "loss": 0.9564, + "step": 4512 + }, + { + "epoch": 0.64, + "grad_norm": 6.441944779644499, + "learning_rate": 7.928546925295725e-06, + "loss": 0.9794, + "step": 4513 + }, + { + "epoch": 0.64, + "grad_norm": 6.52940090947858, + "learning_rate": 7.927610903935793e-06, + "loss": 1.0029, + "step": 4514 + }, + { + "epoch": 0.64, + "grad_norm": 7.975149480782051, + "learning_rate": 7.926674726425246e-06, + "loss": 0.9655, + "step": 4515 + }, + { + "epoch": 0.64, + "grad_norm": 10.521571163469387, + "learning_rate": 7.925738392814013e-06, + "loss": 0.9706, + "step": 4516 + }, + { + "epoch": 0.64, + "grad_norm": 7.94172447932378, + "learning_rate": 7.92480190315204e-06, + "loss": 0.9683, + "step": 4517 + }, + { + "epoch": 0.64, + "grad_norm": 9.743585229961287, + "learning_rate": 7.923865257489274e-06, + "loss": 0.9872, + "step": 4518 + }, + { + "epoch": 0.64, + "grad_norm": 6.624349753319632, + "learning_rate": 7.922928455875673e-06, + "loss": 1.0027, + "step": 4519 + }, + { + "epoch": 0.64, + "grad_norm": 9.467461964478154, + "learning_rate": 7.921991498361206e-06, + "loss": 1.0358, + "step": 4520 + }, + { + "epoch": 0.64, + "grad_norm": 7.27172316303988, + "learning_rate": 7.921054384995845e-06, + "loss": 0.9706, + "step": 4521 + }, + { + "epoch": 0.64, + "grad_norm": 6.862389546643312, + "learning_rate": 7.920117115829576e-06, + "loss": 1.0037, + "step": 4522 + }, + { + "epoch": 0.65, + "grad_norm": 9.39915066862764, + "learning_rate": 
7.919179690912384e-06, + "loss": 0.986, + "step": 4523 + }, + { + "epoch": 0.65, + "grad_norm": 6.520844719769396, + "learning_rate": 7.918242110294276e-06, + "loss": 0.9909, + "step": 4524 + }, + { + "epoch": 0.65, + "grad_norm": 9.019596822982958, + "learning_rate": 7.917304374025257e-06, + "loss": 1.0044, + "step": 4525 + }, + { + "epoch": 0.65, + "grad_norm": 5.982359129884975, + "learning_rate": 7.91636648215534e-06, + "loss": 0.9956, + "step": 4526 + }, + { + "epoch": 0.65, + "grad_norm": 7.539223343047254, + "learning_rate": 7.915428434734556e-06, + "loss": 0.9609, + "step": 4527 + }, + { + "epoch": 0.65, + "grad_norm": 9.20789748114456, + "learning_rate": 7.914490231812932e-06, + "loss": 1.0302, + "step": 4528 + }, + { + "epoch": 0.65, + "grad_norm": 6.174674945537167, + "learning_rate": 7.913551873440513e-06, + "loss": 1.087, + "step": 4529 + }, + { + "epoch": 0.65, + "grad_norm": 7.247106452759768, + "learning_rate": 7.912613359667347e-06, + "loss": 0.8858, + "step": 4530 + }, + { + "epoch": 0.65, + "grad_norm": 7.065266751657735, + "learning_rate": 7.911674690543491e-06, + "loss": 0.9238, + "step": 4531 + }, + { + "epoch": 0.65, + "grad_norm": 7.254187439909379, + "learning_rate": 7.910735866119013e-06, + "loss": 0.9282, + "step": 4532 + }, + { + "epoch": 0.65, + "grad_norm": 8.221011063266427, + "learning_rate": 7.909796886443983e-06, + "loss": 1.0411, + "step": 4533 + }, + { + "epoch": 0.65, + "grad_norm": 6.759083755649278, + "learning_rate": 7.908857751568489e-06, + "loss": 0.9897, + "step": 4534 + }, + { + "epoch": 0.65, + "grad_norm": 6.182247962050824, + "learning_rate": 7.90791846154262e-06, + "loss": 0.9558, + "step": 4535 + }, + { + "epoch": 0.65, + "grad_norm": 5.753964760057804, + "learning_rate": 7.906979016416474e-06, + "loss": 0.9461, + "step": 4536 + }, + { + "epoch": 0.65, + "grad_norm": 6.562159211032356, + "learning_rate": 7.906039416240158e-06, + "loss": 1.0465, + "step": 4537 + }, + { + "epoch": 0.65, + "grad_norm": 
7.373228281809943, + "learning_rate": 7.90509966106379e-06, + "loss": 1.0075, + "step": 4538 + }, + { + "epoch": 0.65, + "grad_norm": 6.551344491154187, + "learning_rate": 7.90415975093749e-06, + "loss": 0.9467, + "step": 4539 + }, + { + "epoch": 0.65, + "grad_norm": 10.8752794236861, + "learning_rate": 7.903219685911395e-06, + "loss": 1.0577, + "step": 4540 + }, + { + "epoch": 0.65, + "grad_norm": 7.338217156061882, + "learning_rate": 7.90227946603564e-06, + "loss": 0.9676, + "step": 4541 + }, + { + "epoch": 0.65, + "grad_norm": 6.7088839469636525, + "learning_rate": 7.90133909136038e-06, + "loss": 0.9988, + "step": 4542 + }, + { + "epoch": 0.65, + "grad_norm": 6.365633995293936, + "learning_rate": 7.900398561935766e-06, + "loss": 0.9445, + "step": 4543 + }, + { + "epoch": 0.65, + "grad_norm": 7.672825297046636, + "learning_rate": 7.899457877811968e-06, + "loss": 1.0216, + "step": 4544 + }, + { + "epoch": 0.65, + "grad_norm": 8.839294107453368, + "learning_rate": 7.89851703903916e-06, + "loss": 1.0018, + "step": 4545 + }, + { + "epoch": 0.65, + "grad_norm": 11.194322313532792, + "learning_rate": 7.897576045667519e-06, + "loss": 0.9441, + "step": 4546 + }, + { + "epoch": 0.65, + "grad_norm": 7.189264072263572, + "learning_rate": 7.896634897747235e-06, + "loss": 0.9648, + "step": 4547 + }, + { + "epoch": 0.65, + "grad_norm": 7.4013401753319235, + "learning_rate": 7.89569359532851e-06, + "loss": 0.9771, + "step": 4548 + }, + { + "epoch": 0.65, + "grad_norm": 8.58373954530127, + "learning_rate": 7.89475213846155e-06, + "loss": 1.0243, + "step": 4549 + }, + { + "epoch": 0.65, + "grad_norm": 6.210124659674624, + "learning_rate": 7.893810527196565e-06, + "loss": 0.9778, + "step": 4550 + }, + { + "epoch": 0.65, + "grad_norm": 6.31413637780736, + "learning_rate": 7.892868761583787e-06, + "loss": 0.9789, + "step": 4551 + }, + { + "epoch": 0.65, + "grad_norm": 6.485092716800297, + "learning_rate": 7.891926841673437e-06, + "loss": 0.953, + "step": 4552 + }, + { + "epoch": 
0.65, + "grad_norm": 6.994496218045255, + "learning_rate": 7.890984767515762e-06, + "loss": 0.9111, + "step": 4553 + }, + { + "epoch": 0.65, + "grad_norm": 7.338991596281956, + "learning_rate": 7.890042539161005e-06, + "loss": 1.0644, + "step": 4554 + }, + { + "epoch": 0.65, + "grad_norm": 7.308438999038096, + "learning_rate": 7.889100156659423e-06, + "loss": 0.9655, + "step": 4555 + }, + { + "epoch": 0.65, + "grad_norm": 6.633500227599117, + "learning_rate": 7.888157620061281e-06, + "loss": 1.0027, + "step": 4556 + }, + { + "epoch": 0.65, + "grad_norm": 7.540989658255863, + "learning_rate": 7.88721492941685e-06, + "loss": 0.9133, + "step": 4557 + }, + { + "epoch": 0.65, + "grad_norm": 9.44008750350005, + "learning_rate": 7.886272084776413e-06, + "loss": 1.0069, + "step": 4558 + }, + { + "epoch": 0.65, + "grad_norm": 10.799988858164706, + "learning_rate": 7.885329086190256e-06, + "loss": 1.0425, + "step": 4559 + }, + { + "epoch": 0.65, + "grad_norm": 10.03836967716538, + "learning_rate": 7.884385933708678e-06, + "loss": 1.0213, + "step": 4560 + }, + { + "epoch": 0.65, + "grad_norm": 9.677336707188495, + "learning_rate": 7.883442627381983e-06, + "loss": 1.0156, + "step": 4561 + }, + { + "epoch": 0.65, + "grad_norm": 8.891107227682506, + "learning_rate": 7.882499167260483e-06, + "loss": 1.0214, + "step": 4562 + }, + { + "epoch": 0.65, + "grad_norm": 7.2908506625132725, + "learning_rate": 7.8815555533945e-06, + "loss": 1.0375, + "step": 4563 + }, + { + "epoch": 0.65, + "grad_norm": 7.132782040974059, + "learning_rate": 7.880611785834367e-06, + "loss": 1.0007, + "step": 4564 + }, + { + "epoch": 0.65, + "grad_norm": 8.659898359432384, + "learning_rate": 7.879667864630417e-06, + "loss": 0.9241, + "step": 4565 + }, + { + "epoch": 0.65, + "grad_norm": 8.511410808322742, + "learning_rate": 7.878723789833001e-06, + "loss": 1.0512, + "step": 4566 + }, + { + "epoch": 0.65, + "grad_norm": 7.976104872175839, + "learning_rate": 7.877779561492471e-06, + "loss": 0.9872, + "step": 
4567 + }, + { + "epoch": 0.65, + "grad_norm": 8.284343357557185, + "learning_rate": 7.87683517965919e-06, + "loss": 1.0707, + "step": 4568 + }, + { + "epoch": 0.65, + "grad_norm": 9.20138471945315, + "learning_rate": 7.875890644383526e-06, + "loss": 1.0108, + "step": 4569 + }, + { + "epoch": 0.65, + "grad_norm": 7.9369654419795745, + "learning_rate": 7.874945955715863e-06, + "loss": 0.9821, + "step": 4570 + }, + { + "epoch": 0.65, + "grad_norm": 8.336156369327533, + "learning_rate": 7.874001113706583e-06, + "loss": 1.0718, + "step": 4571 + }, + { + "epoch": 0.65, + "grad_norm": 8.605329156264283, + "learning_rate": 7.873056118406082e-06, + "loss": 1.1063, + "step": 4572 + }, + { + "epoch": 0.65, + "grad_norm": 9.761468166656835, + "learning_rate": 7.872110969864767e-06, + "loss": 0.9583, + "step": 4573 + }, + { + "epoch": 0.65, + "grad_norm": 7.5899739683590255, + "learning_rate": 7.871165668133047e-06, + "loss": 0.9794, + "step": 4574 + }, + { + "epoch": 0.65, + "grad_norm": 85.3306817941011, + "learning_rate": 7.870220213261344e-06, + "loss": 1.2982, + "step": 4575 + }, + { + "epoch": 0.65, + "grad_norm": 7.017938226993204, + "learning_rate": 7.869274605300081e-06, + "loss": 0.9697, + "step": 4576 + }, + { + "epoch": 0.65, + "grad_norm": 7.644363629180273, + "learning_rate": 7.8683288442997e-06, + "loss": 0.9452, + "step": 4577 + }, + { + "epoch": 0.65, + "grad_norm": 7.008635491580495, + "learning_rate": 7.86738293031064e-06, + "loss": 1.0344, + "step": 4578 + }, + { + "epoch": 0.65, + "grad_norm": 6.913684337334269, + "learning_rate": 7.866436863383358e-06, + "loss": 0.9766, + "step": 4579 + }, + { + "epoch": 0.65, + "grad_norm": 7.499251977770369, + "learning_rate": 7.865490643568312e-06, + "loss": 1.0615, + "step": 4580 + }, + { + "epoch": 0.65, + "grad_norm": 7.889940253050355, + "learning_rate": 7.864544270915971e-06, + "loss": 0.9977, + "step": 4581 + }, + { + "epoch": 0.65, + "grad_norm": 5.798521523450225, + "learning_rate": 7.863597745476813e-06, + 
"loss": 1.0479, + "step": 4582 + }, + { + "epoch": 0.65, + "grad_norm": 5.297268309492571, + "learning_rate": 7.86265106730132e-06, + "loss": 1.0203, + "step": 4583 + }, + { + "epoch": 0.65, + "grad_norm": 7.8769904827247785, + "learning_rate": 7.861704236439988e-06, + "loss": 1.0141, + "step": 4584 + }, + { + "epoch": 0.65, + "grad_norm": 8.074753559639904, + "learning_rate": 7.860757252943318e-06, + "loss": 0.9971, + "step": 4585 + }, + { + "epoch": 0.65, + "grad_norm": 7.5754333621114345, + "learning_rate": 7.859810116861818e-06, + "loss": 0.9694, + "step": 4586 + }, + { + "epoch": 0.65, + "grad_norm": 8.99918202134605, + "learning_rate": 7.85886282824601e-06, + "loss": 0.9171, + "step": 4587 + }, + { + "epoch": 0.65, + "grad_norm": 9.123158159753258, + "learning_rate": 7.857915387146415e-06, + "loss": 0.9575, + "step": 4588 + }, + { + "epoch": 0.65, + "grad_norm": 23.625829656912156, + "learning_rate": 7.856967793613567e-06, + "loss": 0.916, + "step": 4589 + }, + { + "epoch": 0.65, + "grad_norm": 6.58457142841031, + "learning_rate": 7.85602004769801e-06, + "loss": 0.9642, + "step": 4590 + }, + { + "epoch": 0.65, + "grad_norm": 8.009359018855537, + "learning_rate": 7.855072149450294e-06, + "loss": 1.0163, + "step": 4591 + }, + { + "epoch": 0.65, + "grad_norm": 8.049924302836498, + "learning_rate": 7.854124098920975e-06, + "loss": 0.9263, + "step": 4592 + }, + { + "epoch": 0.66, + "grad_norm": 6.463577879819033, + "learning_rate": 7.853175896160622e-06, + "loss": 0.9866, + "step": 4593 + }, + { + "epoch": 0.66, + "grad_norm": 8.350379234844352, + "learning_rate": 7.85222754121981e-06, + "loss": 1.0048, + "step": 4594 + }, + { + "epoch": 0.66, + "grad_norm": 10.088757587480865, + "learning_rate": 7.851279034149117e-06, + "loss": 1.0335, + "step": 4595 + }, + { + "epoch": 0.66, + "grad_norm": 8.497713750193656, + "learning_rate": 7.85033037499914e-06, + "loss": 1.0265, + "step": 4596 + }, + { + "epoch": 0.66, + "grad_norm": 6.528668447340777, + "learning_rate": 
7.849381563820472e-06, + "loss": 1.035, + "step": 4597 + }, + { + "epoch": 0.66, + "grad_norm": 8.249822878825714, + "learning_rate": 7.848432600663726e-06, + "loss": 1.0275, + "step": 4598 + }, + { + "epoch": 0.66, + "grad_norm": 8.657719266123742, + "learning_rate": 7.847483485579512e-06, + "loss": 0.997, + "step": 4599 + }, + { + "epoch": 0.66, + "grad_norm": 8.162424133279583, + "learning_rate": 7.846534218618455e-06, + "loss": 1.0588, + "step": 4600 + }, + { + "epoch": 0.66, + "grad_norm": 6.595116060660918, + "learning_rate": 7.845584799831185e-06, + "loss": 0.9383, + "step": 4601 + }, + { + "epoch": 0.66, + "grad_norm": 6.3834274430687294, + "learning_rate": 7.844635229268344e-06, + "loss": 1.079, + "step": 4602 + }, + { + "epoch": 0.66, + "grad_norm": 5.542510138057447, + "learning_rate": 7.84368550698058e-06, + "loss": 0.9571, + "step": 4603 + }, + { + "epoch": 0.66, + "grad_norm": 8.204048808693578, + "learning_rate": 7.842735633018543e-06, + "loss": 0.9317, + "step": 4604 + }, + { + "epoch": 0.66, + "grad_norm": 5.993717155931603, + "learning_rate": 7.8417856074329e-06, + "loss": 1.0031, + "step": 4605 + }, + { + "epoch": 0.66, + "grad_norm": 9.061043272865586, + "learning_rate": 7.840835430274325e-06, + "loss": 1.0701, + "step": 4606 + }, + { + "epoch": 0.66, + "grad_norm": 10.86577939261496, + "learning_rate": 7.839885101593494e-06, + "loss": 0.9329, + "step": 4607 + }, + { + "epoch": 0.66, + "grad_norm": 4.8951728539938815, + "learning_rate": 7.838934621441097e-06, + "loss": 0.9944, + "step": 4608 + }, + { + "epoch": 0.66, + "grad_norm": 8.15174089407226, + "learning_rate": 7.83798398986783e-06, + "loss": 0.9772, + "step": 4609 + }, + { + "epoch": 0.66, + "grad_norm": 7.946229355335363, + "learning_rate": 7.837033206924396e-06, + "loss": 0.9229, + "step": 4610 + }, + { + "epoch": 0.66, + "grad_norm": 8.320445998711982, + "learning_rate": 7.83608227266151e-06, + "loss": 1.0201, + "step": 4611 + }, + { + "epoch": 0.66, + "grad_norm": 8.80496516578323, + 
"learning_rate": 7.835131187129887e-06, + "loss": 0.9467, + "step": 4612 + }, + { + "epoch": 0.66, + "grad_norm": 8.960242274626735, + "learning_rate": 7.834179950380258e-06, + "loss": 1.0792, + "step": 4613 + }, + { + "epoch": 0.66, + "grad_norm": 7.7309076756080835, + "learning_rate": 7.833228562463362e-06, + "loss": 1.0159, + "step": 4614 + }, + { + "epoch": 0.66, + "grad_norm": 8.41981744847345, + "learning_rate": 7.832277023429938e-06, + "loss": 0.9539, + "step": 4615 + }, + { + "epoch": 0.66, + "grad_norm": 7.954609809506451, + "learning_rate": 7.831325333330744e-06, + "loss": 0.9755, + "step": 4616 + }, + { + "epoch": 0.66, + "grad_norm": 5.0122286581375555, + "learning_rate": 7.830373492216536e-06, + "loss": 0.9172, + "step": 4617 + }, + { + "epoch": 0.66, + "grad_norm": 7.937374258613269, + "learning_rate": 7.829421500138085e-06, + "loss": 0.9442, + "step": 4618 + }, + { + "epoch": 0.66, + "grad_norm": 6.142624916979091, + "learning_rate": 7.828469357146165e-06, + "loss": 1.0052, + "step": 4619 + }, + { + "epoch": 0.66, + "grad_norm": 8.245666212467915, + "learning_rate": 7.827517063291565e-06, + "loss": 1.0275, + "step": 4620 + }, + { + "epoch": 0.66, + "grad_norm": 10.896189510705568, + "learning_rate": 7.826564618625074e-06, + "loss": 1.0041, + "step": 4621 + }, + { + "epoch": 0.66, + "grad_norm": 6.598291602856675, + "learning_rate": 7.825612023197496e-06, + "loss": 0.9594, + "step": 4622 + }, + { + "epoch": 0.66, + "grad_norm": 7.266283151436521, + "learning_rate": 7.824659277059636e-06, + "loss": 0.9094, + "step": 4623 + }, + { + "epoch": 0.66, + "grad_norm": 7.348273114394593, + "learning_rate": 7.823706380262312e-06, + "loss": 0.9825, + "step": 4624 + }, + { + "epoch": 0.66, + "grad_norm": 4.592117039441434, + "learning_rate": 7.822753332856352e-06, + "loss": 0.987, + "step": 4625 + }, + { + "epoch": 0.66, + "grad_norm": 9.371011378093506, + "learning_rate": 7.821800134892584e-06, + "loss": 1.0232, + "step": 4626 + }, + { + "epoch": 0.66, + 
"grad_norm": 7.244126453929615, + "learning_rate": 7.820846786421854e-06, + "loss": 0.9663, + "step": 4627 + }, + { + "epoch": 0.66, + "grad_norm": 7.479910913981092, + "learning_rate": 7.819893287495005e-06, + "loss": 1.0131, + "step": 4628 + }, + { + "epoch": 0.66, + "grad_norm": 9.067427481054429, + "learning_rate": 7.818939638162899e-06, + "loss": 0.9329, + "step": 4629 + }, + { + "epoch": 0.66, + "grad_norm": 8.204744796399856, + "learning_rate": 7.8179858384764e-06, + "loss": 0.9344, + "step": 4630 + }, + { + "epoch": 0.66, + "grad_norm": 6.164009717802479, + "learning_rate": 7.817031888486379e-06, + "loss": 0.9654, + "step": 4631 + }, + { + "epoch": 0.66, + "grad_norm": 9.366483851863016, + "learning_rate": 7.81607778824372e-06, + "loss": 0.9662, + "step": 4632 + }, + { + "epoch": 0.66, + "grad_norm": 7.34671456768611, + "learning_rate": 7.815123537799308e-06, + "loss": 0.9906, + "step": 4633 + }, + { + "epoch": 0.66, + "grad_norm": 6.828070173199264, + "learning_rate": 7.814169137204042e-06, + "loss": 1.0374, + "step": 4634 + }, + { + "epoch": 0.66, + "grad_norm": 7.8702199183044455, + "learning_rate": 7.81321458650883e-06, + "loss": 1.0179, + "step": 4635 + }, + { + "epoch": 0.66, + "grad_norm": 6.327861432678508, + "learning_rate": 7.812259885764582e-06, + "loss": 0.9778, + "step": 4636 + }, + { + "epoch": 0.66, + "grad_norm": 7.456962644059171, + "learning_rate": 7.811305035022218e-06, + "loss": 0.9656, + "step": 4637 + }, + { + "epoch": 0.66, + "grad_norm": 7.554641243333256, + "learning_rate": 7.810350034332672e-06, + "loss": 0.9472, + "step": 4638 + }, + { + "epoch": 0.66, + "grad_norm": 9.06930882881521, + "learning_rate": 7.809394883746874e-06, + "loss": 0.9707, + "step": 4639 + }, + { + "epoch": 0.66, + "grad_norm": 8.306487172440692, + "learning_rate": 7.808439583315776e-06, + "loss": 1.0855, + "step": 4640 + }, + { + "epoch": 0.66, + "grad_norm": 10.25005154269168, + "learning_rate": 7.807484133090327e-06, + "loss": 1.0084, + "step": 4641 + }, + 
{ + "epoch": 0.66, + "grad_norm": 6.899751069248867, + "learning_rate": 7.806528533121489e-06, + "loss": 0.9426, + "step": 4642 + }, + { + "epoch": 0.66, + "grad_norm": 6.216740768068875, + "learning_rate": 7.80557278346023e-06, + "loss": 1.0587, + "step": 4643 + }, + { + "epoch": 0.66, + "grad_norm": 6.191376728773811, + "learning_rate": 7.804616884157529e-06, + "loss": 1.0154, + "step": 4644 + }, + { + "epoch": 0.66, + "grad_norm": 8.90018583903665, + "learning_rate": 7.80366083526437e-06, + "loss": 1.1308, + "step": 4645 + }, + { + "epoch": 0.66, + "grad_norm": 8.659804688573015, + "learning_rate": 7.802704636831747e-06, + "loss": 1.0449, + "step": 4646 + }, + { + "epoch": 0.66, + "grad_norm": 9.28319757510741, + "learning_rate": 7.80174828891066e-06, + "loss": 1.0052, + "step": 4647 + }, + { + "epoch": 0.66, + "grad_norm": 8.318862181305345, + "learning_rate": 7.800791791552115e-06, + "loss": 1.024, + "step": 4648 + }, + { + "epoch": 0.66, + "grad_norm": 6.673528343949623, + "learning_rate": 7.799835144807135e-06, + "loss": 0.9448, + "step": 4649 + }, + { + "epoch": 0.66, + "grad_norm": 7.607182731017398, + "learning_rate": 7.798878348726741e-06, + "loss": 1.024, + "step": 4650 + }, + { + "epoch": 0.66, + "grad_norm": 7.25591492234534, + "learning_rate": 7.797921403361968e-06, + "loss": 1.0771, + "step": 4651 + }, + { + "epoch": 0.66, + "grad_norm": 7.6098297292262185, + "learning_rate": 7.796964308763853e-06, + "loss": 1.0753, + "step": 4652 + }, + { + "epoch": 0.66, + "grad_norm": 8.72077111386434, + "learning_rate": 7.796007064983448e-06, + "loss": 0.9658, + "step": 4653 + }, + { + "epoch": 0.66, + "grad_norm": 5.422363158433836, + "learning_rate": 7.795049672071809e-06, + "loss": 0.9608, + "step": 4654 + }, + { + "epoch": 0.66, + "grad_norm": 7.424362478854948, + "learning_rate": 7.79409213008e-06, + "loss": 0.9629, + "step": 4655 + }, + { + "epoch": 0.66, + "grad_norm": 6.387411589137708, + "learning_rate": 7.793134439059096e-06, + "loss": 1.0263, + 
"step": 4656 + }, + { + "epoch": 0.66, + "grad_norm": 6.816566346972849, + "learning_rate": 7.792176599060173e-06, + "loss": 1.0479, + "step": 4657 + }, + { + "epoch": 0.66, + "grad_norm": 7.400479509401143, + "learning_rate": 7.791218610134324e-06, + "loss": 1.0137, + "step": 4658 + }, + { + "epoch": 0.66, + "grad_norm": 8.26291975359119, + "learning_rate": 7.790260472332645e-06, + "loss": 0.9782, + "step": 4659 + }, + { + "epoch": 0.66, + "grad_norm": 6.717018661452121, + "learning_rate": 7.789302185706236e-06, + "loss": 0.9017, + "step": 4660 + }, + { + "epoch": 0.66, + "grad_norm": 9.551900564240636, + "learning_rate": 7.788343750306215e-06, + "loss": 0.9528, + "step": 4661 + }, + { + "epoch": 0.66, + "grad_norm": 6.796114616125482, + "learning_rate": 7.787385166183699e-06, + "loss": 0.9237, + "step": 4662 + }, + { + "epoch": 0.67, + "grad_norm": 7.00329944613631, + "learning_rate": 7.786426433389817e-06, + "loss": 0.9729, + "step": 4663 + }, + { + "epoch": 0.67, + "grad_norm": 7.60917659674419, + "learning_rate": 7.785467551975705e-06, + "loss": 0.9733, + "step": 4664 + }, + { + "epoch": 0.67, + "grad_norm": 8.813055410136869, + "learning_rate": 7.784508521992507e-06, + "loss": 0.9594, + "step": 4665 + }, + { + "epoch": 0.67, + "grad_norm": 4.779506186802516, + "learning_rate": 7.783549343491376e-06, + "loss": 0.9947, + "step": 4666 + }, + { + "epoch": 0.67, + "grad_norm": 7.661069120208598, + "learning_rate": 7.78259001652347e-06, + "loss": 0.9889, + "step": 4667 + }, + { + "epoch": 0.67, + "grad_norm": 8.011544235790955, + "learning_rate": 7.781630541139959e-06, + "loss": 1.0377, + "step": 4668 + }, + { + "epoch": 0.67, + "grad_norm": 6.042539335450581, + "learning_rate": 7.780670917392016e-06, + "loss": 0.9975, + "step": 4669 + }, + { + "epoch": 0.67, + "grad_norm": 5.948827277167801, + "learning_rate": 7.779711145330828e-06, + "loss": 0.9959, + "step": 4670 + }, + { + "epoch": 0.67, + "grad_norm": 6.504635451831871, + "learning_rate": 
7.778751225007584e-06, + "loss": 1.0042, + "step": 4671 + }, + { + "epoch": 0.67, + "grad_norm": 7.9671001864136075, + "learning_rate": 7.777791156473485e-06, + "loss": 0.8729, + "step": 4672 + }, + { + "epoch": 0.67, + "grad_norm": 8.879848778582467, + "learning_rate": 7.776830939779737e-06, + "loss": 1.0701, + "step": 4673 + }, + { + "epoch": 0.67, + "grad_norm": 6.307972590442289, + "learning_rate": 7.775870574977556e-06, + "loss": 0.9233, + "step": 4674 + }, + { + "epoch": 0.67, + "grad_norm": 7.0906822306627735, + "learning_rate": 7.774910062118165e-06, + "loss": 0.9963, + "step": 4675 + }, + { + "epoch": 0.67, + "grad_norm": 7.309176180227625, + "learning_rate": 7.773949401252796e-06, + "loss": 0.9285, + "step": 4676 + }, + { + "epoch": 0.67, + "grad_norm": 8.3799757004443, + "learning_rate": 7.772988592432687e-06, + "loss": 0.9819, + "step": 4677 + }, + { + "epoch": 0.67, + "grad_norm": 5.199446662082755, + "learning_rate": 7.772027635709086e-06, + "loss": 1.0382, + "step": 4678 + }, + { + "epoch": 0.67, + "grad_norm": 6.051826782462201, + "learning_rate": 7.771066531133247e-06, + "loss": 0.9511, + "step": 4679 + }, + { + "epoch": 0.67, + "grad_norm": 8.014538046025299, + "learning_rate": 7.770105278756433e-06, + "loss": 0.9656, + "step": 4680 + }, + { + "epoch": 0.67, + "grad_norm": 5.733754331762763, + "learning_rate": 7.769143878629911e-06, + "loss": 1.0132, + "step": 4681 + }, + { + "epoch": 0.67, + "grad_norm": 6.484748182425313, + "learning_rate": 7.768182330804964e-06, + "loss": 1.0561, + "step": 4682 + }, + { + "epoch": 0.67, + "grad_norm": 7.728558983446488, + "learning_rate": 7.767220635332877e-06, + "loss": 0.9211, + "step": 4683 + }, + { + "epoch": 0.67, + "grad_norm": 6.0583639187387295, + "learning_rate": 7.766258792264944e-06, + "loss": 0.8561, + "step": 4684 + }, + { + "epoch": 0.67, + "grad_norm": 7.145812755941397, + "learning_rate": 7.765296801652468e-06, + "loss": 1.0451, + "step": 4685 + }, + { + "epoch": 0.67, + "grad_norm": 
7.455710328861717, + "learning_rate": 7.764334663546757e-06, + "loss": 0.962, + "step": 4686 + }, + { + "epoch": 0.67, + "grad_norm": 7.119065833655487, + "learning_rate": 7.76337237799913e-06, + "loss": 1.0345, + "step": 4687 + }, + { + "epoch": 0.67, + "grad_norm": 6.281988619578241, + "learning_rate": 7.762409945060912e-06, + "loss": 0.915, + "step": 4688 + }, + { + "epoch": 0.67, + "grad_norm": 6.411904031060032, + "learning_rate": 7.761447364783436e-06, + "loss": 0.9768, + "step": 4689 + }, + { + "epoch": 0.67, + "grad_norm": 7.524410468219156, + "learning_rate": 7.760484637218044e-06, + "loss": 1.0134, + "step": 4690 + }, + { + "epoch": 0.67, + "grad_norm": 7.6149675545130435, + "learning_rate": 7.759521762416085e-06, + "loss": 1.0047, + "step": 4691 + }, + { + "epoch": 0.67, + "grad_norm": 7.751241424036564, + "learning_rate": 7.758558740428918e-06, + "loss": 1.0279, + "step": 4692 + }, + { + "epoch": 0.67, + "grad_norm": 8.110658874580334, + "learning_rate": 7.757595571307905e-06, + "loss": 1.0004, + "step": 4693 + }, + { + "epoch": 0.67, + "grad_norm": 6.301084317844451, + "learning_rate": 7.756632255104422e-06, + "loss": 1.0458, + "step": 4694 + }, + { + "epoch": 0.67, + "grad_norm": 6.811290862546265, + "learning_rate": 7.755668791869845e-06, + "loss": 0.9373, + "step": 4695 + }, + { + "epoch": 0.67, + "grad_norm": 6.353022733607814, + "learning_rate": 7.754705181655566e-06, + "loss": 1.035, + "step": 4696 + }, + { + "epoch": 0.67, + "grad_norm": 9.56665477285909, + "learning_rate": 7.753741424512981e-06, + "loss": 1.0567, + "step": 4697 + }, + { + "epoch": 0.67, + "grad_norm": 6.525012927203768, + "learning_rate": 7.752777520493493e-06, + "loss": 1.0366, + "step": 4698 + }, + { + "epoch": 0.67, + "grad_norm": 5.848317579720327, + "learning_rate": 7.751813469648515e-06, + "loss": 0.967, + "step": 4699 + }, + { + "epoch": 0.67, + "grad_norm": 6.937286701264727, + "learning_rate": 7.750849272029465e-06, + "loss": 0.9811, + "step": 4700 + }, + { + "epoch": 
0.67, + "grad_norm": 9.115306491615822, + "learning_rate": 7.749884927687775e-06, + "loss": 0.9967, + "step": 4701 + }, + { + "epoch": 0.67, + "grad_norm": 6.691723844403473, + "learning_rate": 7.748920436674874e-06, + "loss": 0.9779, + "step": 4702 + }, + { + "epoch": 0.67, + "grad_norm": 7.202379456298293, + "learning_rate": 7.74795579904221e-06, + "loss": 1.021, + "step": 4703 + }, + { + "epoch": 0.67, + "grad_norm": 6.815740121584342, + "learning_rate": 7.746991014841231e-06, + "loss": 1.0755, + "step": 4704 + }, + { + "epoch": 0.67, + "grad_norm": 8.257389026223082, + "learning_rate": 7.746026084123399e-06, + "loss": 1.0002, + "step": 4705 + }, + { + "epoch": 0.67, + "grad_norm": 8.291844903198948, + "learning_rate": 7.74506100694018e-06, + "loss": 1.0476, + "step": 4706 + }, + { + "epoch": 0.67, + "grad_norm": 5.764273454341538, + "learning_rate": 7.744095783343048e-06, + "loss": 1.0225, + "step": 4707 + }, + { + "epoch": 0.67, + "grad_norm": 7.12681065988312, + "learning_rate": 7.743130413383484e-06, + "loss": 0.8511, + "step": 4708 + }, + { + "epoch": 0.67, + "grad_norm": 6.601597762234104, + "learning_rate": 7.74216489711298e-06, + "loss": 0.97, + "step": 4709 + }, + { + "epoch": 0.67, + "grad_norm": 7.8465402162728335, + "learning_rate": 7.741199234583032e-06, + "loss": 0.9168, + "step": 4710 + }, + { + "epoch": 0.67, + "grad_norm": 7.893374457507197, + "learning_rate": 7.740233425845148e-06, + "loss": 1.0849, + "step": 4711 + }, + { + "epoch": 0.67, + "grad_norm": 8.762087108880074, + "learning_rate": 7.739267470950839e-06, + "loss": 0.984, + "step": 4712 + }, + { + "epoch": 0.67, + "grad_norm": 6.059573417963636, + "learning_rate": 7.73830136995163e-06, + "loss": 1.041, + "step": 4713 + }, + { + "epoch": 0.67, + "grad_norm": 7.6273001266326, + "learning_rate": 7.737335122899048e-06, + "loss": 0.9158, + "step": 4714 + }, + { + "epoch": 0.67, + "grad_norm": 6.409117510190439, + "learning_rate": 7.73636872984463e-06, + "loss": 0.986, + "step": 4715 + }, + 
{ + "epoch": 0.67, + "grad_norm": 9.566412812976594, + "learning_rate": 7.735402190839919e-06, + "loss": 0.9872, + "step": 4716 + }, + { + "epoch": 0.67, + "grad_norm": 8.830322961158062, + "learning_rate": 7.73443550593647e-06, + "loss": 1.0632, + "step": 4717 + }, + { + "epoch": 0.67, + "grad_norm": 5.520793410484077, + "learning_rate": 7.733468675185843e-06, + "loss": 1.0094, + "step": 4718 + }, + { + "epoch": 0.67, + "grad_norm": 6.650666355946062, + "learning_rate": 7.732501698639606e-06, + "loss": 0.9565, + "step": 4719 + }, + { + "epoch": 0.67, + "grad_norm": 5.6817243819810015, + "learning_rate": 7.73153457634933e-06, + "loss": 0.9534, + "step": 4720 + }, + { + "epoch": 0.67, + "grad_norm": 6.754755486050004, + "learning_rate": 7.730567308366608e-06, + "loss": 0.9622, + "step": 4721 + }, + { + "epoch": 0.67, + "grad_norm": 7.790562209329557, + "learning_rate": 7.729599894743025e-06, + "loss": 0.9891, + "step": 4722 + }, + { + "epoch": 0.67, + "grad_norm": 8.400349533740101, + "learning_rate": 7.728632335530183e-06, + "loss": 0.9205, + "step": 4723 + }, + { + "epoch": 0.67, + "grad_norm": 7.990892272959046, + "learning_rate": 7.727664630779685e-06, + "loss": 0.9616, + "step": 4724 + }, + { + "epoch": 0.67, + "grad_norm": 7.997882403014004, + "learning_rate": 7.726696780543149e-06, + "loss": 0.937, + "step": 4725 + }, + { + "epoch": 0.67, + "grad_norm": 7.343573159299395, + "learning_rate": 7.725728784872196e-06, + "loss": 0.9192, + "step": 4726 + }, + { + "epoch": 0.67, + "grad_norm": 7.592051539478343, + "learning_rate": 7.724760643818456e-06, + "loss": 1.0331, + "step": 4727 + }, + { + "epoch": 0.67, + "grad_norm": 6.0513819894765675, + "learning_rate": 7.72379235743357e-06, + "loss": 0.9685, + "step": 4728 + }, + { + "epoch": 0.67, + "grad_norm": 6.555255963138798, + "learning_rate": 7.722823925769179e-06, + "loss": 1.0027, + "step": 4729 + }, + { + "epoch": 0.67, + "grad_norm": 6.329176258248736, + "learning_rate": 7.721855348876942e-06, + "loss": 
0.9443, + "step": 4730 + }, + { + "epoch": 0.67, + "grad_norm": 5.215847294206348, + "learning_rate": 7.720886626808514e-06, + "loss": 0.9913, + "step": 4731 + }, + { + "epoch": 0.67, + "grad_norm": 7.3822681049997545, + "learning_rate": 7.719917759615568e-06, + "loss": 0.9885, + "step": 4732 + }, + { + "epoch": 0.67, + "grad_norm": 7.553409260093253, + "learning_rate": 7.71894874734978e-06, + "loss": 0.9235, + "step": 4733 + }, + { + "epoch": 0.68, + "grad_norm": 10.756524312497973, + "learning_rate": 7.717979590062833e-06, + "loss": 1.0419, + "step": 4734 + }, + { + "epoch": 0.68, + "grad_norm": 7.178917616456327, + "learning_rate": 7.717010287806422e-06, + "loss": 0.9879, + "step": 4735 + }, + { + "epoch": 0.68, + "grad_norm": 7.344622820481451, + "learning_rate": 7.716040840632244e-06, + "loss": 0.9304, + "step": 4736 + }, + { + "epoch": 0.68, + "grad_norm": 7.815672058861285, + "learning_rate": 7.715071248592007e-06, + "loss": 1.0302, + "step": 4737 + }, + { + "epoch": 0.68, + "grad_norm": 8.94443603831699, + "learning_rate": 7.714101511737427e-06, + "loss": 0.9784, + "step": 4738 + }, + { + "epoch": 0.68, + "grad_norm": 5.298757224526076, + "learning_rate": 7.71313163012023e-06, + "loss": 0.967, + "step": 4739 + }, + { + "epoch": 0.68, + "grad_norm": 6.348887408768124, + "learning_rate": 7.712161603792141e-06, + "loss": 0.9782, + "step": 4740 + }, + { + "epoch": 0.68, + "grad_norm": 8.443248453129728, + "learning_rate": 7.7111914328049e-06, + "loss": 0.913, + "step": 4741 + }, + { + "epoch": 0.68, + "grad_norm": 6.014426796355997, + "learning_rate": 7.71022111721026e-06, + "loss": 0.9702, + "step": 4742 + }, + { + "epoch": 0.68, + "grad_norm": 8.93536947919815, + "learning_rate": 7.709250657059965e-06, + "loss": 1.0619, + "step": 4743 + }, + { + "epoch": 0.68, + "grad_norm": 8.778794689138016, + "learning_rate": 7.708280052405782e-06, + "loss": 0.9881, + "step": 4744 + }, + { + "epoch": 0.68, + "grad_norm": 6.54828641715693, + "learning_rate": 
7.70730930329948e-06, + "loss": 0.942, + "step": 4745 + }, + { + "epoch": 0.68, + "grad_norm": 6.9369127963016854, + "learning_rate": 7.706338409792836e-06, + "loss": 0.9946, + "step": 4746 + }, + { + "epoch": 0.68, + "grad_norm": 6.22844449578632, + "learning_rate": 7.705367371937636e-06, + "loss": 0.9754, + "step": 4747 + }, + { + "epoch": 0.68, + "grad_norm": 7.369701440018602, + "learning_rate": 7.704396189785668e-06, + "loss": 1.0185, + "step": 4748 + }, + { + "epoch": 0.68, + "grad_norm": 7.987432167576434, + "learning_rate": 7.703424863388735e-06, + "loss": 0.999, + "step": 4749 + }, + { + "epoch": 0.68, + "grad_norm": 8.892566348722005, + "learning_rate": 7.702453392798645e-06, + "loss": 1.0286, + "step": 4750 + }, + { + "epoch": 0.68, + "grad_norm": 8.405746838394847, + "learning_rate": 7.701481778067215e-06, + "loss": 0.9593, + "step": 4751 + }, + { + "epoch": 0.68, + "grad_norm": 9.245004926635893, + "learning_rate": 7.700510019246265e-06, + "loss": 0.9978, + "step": 4752 + }, + { + "epoch": 0.68, + "grad_norm": 7.087529440121963, + "learning_rate": 7.69953811638763e-06, + "loss": 0.9788, + "step": 4753 + }, + { + "epoch": 0.68, + "grad_norm": 7.834650246337617, + "learning_rate": 7.698566069543143e-06, + "loss": 0.9598, + "step": 4754 + }, + { + "epoch": 0.68, + "grad_norm": 7.92961753749168, + "learning_rate": 7.697593878764655e-06, + "loss": 1.0171, + "step": 4755 + }, + { + "epoch": 0.68, + "grad_norm": 9.184456567608786, + "learning_rate": 7.696621544104019e-06, + "loss": 0.9835, + "step": 4756 + }, + { + "epoch": 0.68, + "grad_norm": 8.570848090055726, + "learning_rate": 7.695649065613095e-06, + "loss": 0.9652, + "step": 4757 + }, + { + "epoch": 0.68, + "grad_norm": 8.932692739830907, + "learning_rate": 7.694676443343752e-06, + "loss": 0.9435, + "step": 4758 + }, + { + "epoch": 0.68, + "grad_norm": 8.322119893053337, + "learning_rate": 7.693703677347869e-06, + "loss": 0.9897, + "step": 4759 + }, + { + "epoch": 0.68, + "grad_norm": 
6.337647739335185, + "learning_rate": 7.692730767677331e-06, + "loss": 1.0045, + "step": 4760 + }, + { + "epoch": 0.68, + "grad_norm": 7.201865342371166, + "learning_rate": 7.691757714384029e-06, + "loss": 0.9996, + "step": 4761 + }, + { + "epoch": 0.68, + "grad_norm": 6.3815979488266095, + "learning_rate": 7.69078451751986e-06, + "loss": 1.0019, + "step": 4762 + }, + { + "epoch": 0.68, + "grad_norm": 5.789367964016041, + "learning_rate": 7.68981117713674e-06, + "loss": 0.9738, + "step": 4763 + }, + { + "epoch": 0.68, + "grad_norm": 8.633881508289996, + "learning_rate": 7.688837693286575e-06, + "loss": 0.9266, + "step": 4764 + }, + { + "epoch": 0.68, + "grad_norm": 7.447628721688679, + "learning_rate": 7.687864066021291e-06, + "loss": 1.048, + "step": 4765 + }, + { + "epoch": 0.68, + "grad_norm": 6.565188480111765, + "learning_rate": 7.686890295392822e-06, + "loss": 1.08, + "step": 4766 + }, + { + "epoch": 0.68, + "grad_norm": 6.828620813704514, + "learning_rate": 7.685916381453103e-06, + "loss": 1.0159, + "step": 4767 + }, + { + "epoch": 0.68, + "grad_norm": 8.084926473630722, + "learning_rate": 7.684942324254082e-06, + "loss": 0.8996, + "step": 4768 + }, + { + "epoch": 0.68, + "grad_norm": 6.9186739169407465, + "learning_rate": 7.683968123847706e-06, + "loss": 0.9899, + "step": 4769 + }, + { + "epoch": 0.68, + "grad_norm": 7.275734497622789, + "learning_rate": 7.682993780285948e-06, + "loss": 0.9996, + "step": 4770 + }, + { + "epoch": 0.68, + "grad_norm": 7.226720309338683, + "learning_rate": 7.682019293620767e-06, + "loss": 1.0031, + "step": 4771 + }, + { + "epoch": 0.68, + "grad_norm": 6.920827717712095, + "learning_rate": 7.68104466390414e-06, + "loss": 0.9506, + "step": 4772 + }, + { + "epoch": 0.68, + "grad_norm": 7.773422576652452, + "learning_rate": 7.680069891188056e-06, + "loss": 0.9224, + "step": 4773 + }, + { + "epoch": 0.68, + "grad_norm": 8.648589727489442, + "learning_rate": 7.679094975524502e-06, + "loss": 1.0017, + "step": 4774 + }, + { + "epoch": 
0.68, + "grad_norm": 8.379046393457077, + "learning_rate": 7.67811991696548e-06, + "loss": 0.9799, + "step": 4775 + }, + { + "epoch": 0.68, + "grad_norm": 7.226129348909192, + "learning_rate": 7.677144715562998e-06, + "loss": 0.9522, + "step": 4776 + }, + { + "epoch": 0.68, + "grad_norm": 8.044170944002758, + "learning_rate": 7.676169371369066e-06, + "loss": 0.9998, + "step": 4777 + }, + { + "epoch": 0.68, + "grad_norm": 6.981820732061937, + "learning_rate": 7.675193884435713e-06, + "loss": 0.9, + "step": 4778 + }, + { + "epoch": 0.68, + "grad_norm": 7.886412448927113, + "learning_rate": 7.674218254814962e-06, + "loss": 1.0085, + "step": 4779 + }, + { + "epoch": 0.68, + "grad_norm": 8.25362617872057, + "learning_rate": 7.673242482558852e-06, + "loss": 0.9788, + "step": 4780 + }, + { + "epoch": 0.68, + "grad_norm": 6.564693613702882, + "learning_rate": 7.67226656771943e-06, + "loss": 0.905, + "step": 4781 + }, + { + "epoch": 0.68, + "grad_norm": 8.097728183803381, + "learning_rate": 7.671290510348748e-06, + "loss": 0.9709, + "step": 4782 + }, + { + "epoch": 0.68, + "grad_norm": 7.699509745724496, + "learning_rate": 7.670314310498865e-06, + "loss": 1.0487, + "step": 4783 + }, + { + "epoch": 0.68, + "grad_norm": 7.611715651977757, + "learning_rate": 7.66933796822185e-06, + "loss": 1.0427, + "step": 4784 + }, + { + "epoch": 0.68, + "grad_norm": 9.250709588883765, + "learning_rate": 7.668361483569779e-06, + "loss": 1.0123, + "step": 4785 + }, + { + "epoch": 0.68, + "grad_norm": 7.824299895292029, + "learning_rate": 7.667384856594733e-06, + "loss": 1.0507, + "step": 4786 + }, + { + "epoch": 0.68, + "grad_norm": 8.792848899686547, + "learning_rate": 7.666408087348802e-06, + "loss": 1.0403, + "step": 4787 + }, + { + "epoch": 0.68, + "grad_norm": 9.330753624441842, + "learning_rate": 7.665431175884087e-06, + "loss": 0.9326, + "step": 4788 + }, + { + "epoch": 0.68, + "grad_norm": 6.938938688112201, + "learning_rate": 7.664454122252694e-06, + "loss": 0.9405, + "step": 4789 + 
}, + { + "epoch": 0.68, + "grad_norm": 7.449697402314023, + "learning_rate": 7.663476926506731e-06, + "loss": 1.0218, + "step": 4790 + }, + { + "epoch": 0.68, + "grad_norm": 6.590110976609736, + "learning_rate": 7.662499588698324e-06, + "loss": 1.0391, + "step": 4791 + }, + { + "epoch": 0.68, + "grad_norm": 7.698670959540552, + "learning_rate": 7.661522108879602e-06, + "loss": 0.9564, + "step": 4792 + }, + { + "epoch": 0.68, + "grad_norm": 6.537709735544556, + "learning_rate": 7.660544487102698e-06, + "loss": 1.0044, + "step": 4793 + }, + { + "epoch": 0.68, + "grad_norm": 7.806708700186498, + "learning_rate": 7.659566723419756e-06, + "loss": 0.9778, + "step": 4794 + }, + { + "epoch": 0.68, + "grad_norm": 8.58728946723827, + "learning_rate": 7.65858881788293e-06, + "loss": 0.9644, + "step": 4795 + }, + { + "epoch": 0.68, + "grad_norm": 7.809957727700305, + "learning_rate": 7.657610770544373e-06, + "loss": 0.9694, + "step": 4796 + }, + { + "epoch": 0.68, + "grad_norm": 7.469279888121918, + "learning_rate": 7.656632581456258e-06, + "loss": 0.9402, + "step": 4797 + }, + { + "epoch": 0.68, + "grad_norm": 6.672872985672934, + "learning_rate": 7.655654250670755e-06, + "loss": 1.0364, + "step": 4798 + }, + { + "epoch": 0.68, + "grad_norm": 7.672758421138964, + "learning_rate": 7.654675778240046e-06, + "loss": 0.9361, + "step": 4799 + }, + { + "epoch": 0.68, + "grad_norm": 5.603673538300709, + "learning_rate": 7.65369716421632e-06, + "loss": 0.9256, + "step": 4800 + }, + { + "epoch": 0.68, + "grad_norm": 7.479068548108766, + "learning_rate": 7.652718408651775e-06, + "loss": 0.933, + "step": 4801 + }, + { + "epoch": 0.68, + "grad_norm": 8.830592164798837, + "learning_rate": 7.651739511598615e-06, + "loss": 1.0201, + "step": 4802 + }, + { + "epoch": 0.68, + "grad_norm": 6.961682359086335, + "learning_rate": 7.650760473109048e-06, + "loss": 0.9447, + "step": 4803 + }, + { + "epoch": 0.69, + "grad_norm": 6.746537522839159, + "learning_rate": 7.649781293235297e-06, + "loss": 
0.9488, + "step": 4804 + }, + { + "epoch": 0.69, + "grad_norm": 7.2195419059648875, + "learning_rate": 7.648801972029587e-06, + "loss": 0.9342, + "step": 4805 + }, + { + "epoch": 0.69, + "grad_norm": 6.390508382712611, + "learning_rate": 7.647822509544154e-06, + "loss": 0.9351, + "step": 4806 + }, + { + "epoch": 0.69, + "grad_norm": 5.961149270135042, + "learning_rate": 7.646842905831238e-06, + "loss": 1.0081, + "step": 4807 + }, + { + "epoch": 0.69, + "grad_norm": 6.545588340889751, + "learning_rate": 7.645863160943088e-06, + "loss": 0.961, + "step": 4808 + }, + { + "epoch": 0.69, + "grad_norm": 7.780424116508093, + "learning_rate": 7.644883274931964e-06, + "loss": 0.9659, + "step": 4809 + }, + { + "epoch": 0.69, + "grad_norm": 7.132000600608468, + "learning_rate": 7.643903247850126e-06, + "loss": 0.9603, + "step": 4810 + }, + { + "epoch": 0.69, + "grad_norm": 7.344078842559287, + "learning_rate": 7.642923079749848e-06, + "loss": 0.9992, + "step": 4811 + }, + { + "epoch": 0.69, + "grad_norm": 5.232153993522179, + "learning_rate": 7.641942770683412e-06, + "loss": 1.0372, + "step": 4812 + }, + { + "epoch": 0.69, + "grad_norm": 7.323030406042943, + "learning_rate": 7.6409623207031e-06, + "loss": 1.0408, + "step": 4813 + }, + { + "epoch": 0.69, + "grad_norm": 7.871353431025631, + "learning_rate": 7.63998172986121e-06, + "loss": 1.0089, + "step": 4814 + }, + { + "epoch": 0.69, + "grad_norm": 8.264493021382131, + "learning_rate": 7.639000998210042e-06, + "loss": 0.9563, + "step": 4815 + }, + { + "epoch": 0.69, + "grad_norm": 7.48927247605988, + "learning_rate": 7.638020125801908e-06, + "loss": 0.9952, + "step": 4816 + }, + { + "epoch": 0.69, + "grad_norm": 6.933611467709873, + "learning_rate": 7.637039112689124e-06, + "loss": 1.0315, + "step": 4817 + }, + { + "epoch": 0.69, + "grad_norm": 6.711100129953235, + "learning_rate": 7.636057958924013e-06, + "loss": 1.0194, + "step": 4818 + }, + { + "epoch": 0.69, + "grad_norm": 6.539348991763201, + "learning_rate": 
7.635076664558906e-06, + "loss": 0.9587, + "step": 4819 + }, + { + "epoch": 0.69, + "grad_norm": 6.892147855745927, + "learning_rate": 7.634095229646148e-06, + "loss": 0.9047, + "step": 4820 + }, + { + "epoch": 0.69, + "grad_norm": 6.966479867427437, + "learning_rate": 7.63311365423808e-06, + "loss": 1.1291, + "step": 4821 + }, + { + "epoch": 0.69, + "grad_norm": 8.327515549709238, + "learning_rate": 7.632131938387059e-06, + "loss": 0.9289, + "step": 4822 + }, + { + "epoch": 0.69, + "grad_norm": 8.972306442684301, + "learning_rate": 7.631150082145449e-06, + "loss": 0.9596, + "step": 4823 + }, + { + "epoch": 0.69, + "grad_norm": 6.585121241193435, + "learning_rate": 7.630168085565618e-06, + "loss": 1.0432, + "step": 4824 + }, + { + "epoch": 0.69, + "grad_norm": 5.003451944922498, + "learning_rate": 7.629185948699939e-06, + "loss": 1.05, + "step": 4825 + }, + { + "epoch": 0.69, + "grad_norm": 7.322676385986951, + "learning_rate": 7.628203671600801e-06, + "loss": 0.9052, + "step": 4826 + }, + { + "epoch": 0.69, + "grad_norm": 7.763636439432991, + "learning_rate": 7.627221254320596e-06, + "loss": 0.9685, + "step": 4827 + }, + { + "epoch": 0.69, + "grad_norm": 8.24797821273928, + "learning_rate": 7.626238696911722e-06, + "loss": 1.0242, + "step": 4828 + }, + { + "epoch": 0.69, + "grad_norm": 6.668521091650746, + "learning_rate": 7.625255999426585e-06, + "loss": 1.0015, + "step": 4829 + }, + { + "epoch": 0.69, + "grad_norm": 8.743837016255048, + "learning_rate": 7.624273161917601e-06, + "loss": 0.9385, + "step": 4830 + }, + { + "epoch": 0.69, + "grad_norm": 5.856995236743033, + "learning_rate": 7.623290184437192e-06, + "loss": 1.0117, + "step": 4831 + }, + { + "epoch": 0.69, + "grad_norm": 8.070044284695784, + "learning_rate": 7.622307067037786e-06, + "loss": 1.0174, + "step": 4832 + }, + { + "epoch": 0.69, + "grad_norm": 8.172608921501016, + "learning_rate": 7.62132380977182e-06, + "loss": 0.9777, + "step": 4833 + }, + { + "epoch": 0.69, + "grad_norm": 8.12955523553369, 
+ "learning_rate": 7.620340412691739e-06, + "loss": 1.0127, + "step": 4834 + }, + { + "epoch": 0.69, + "grad_norm": 7.896326915325467, + "learning_rate": 7.619356875849994e-06, + "loss": 0.9294, + "step": 4835 + }, + { + "epoch": 0.69, + "grad_norm": 5.883690692219771, + "learning_rate": 7.6183731992990455e-06, + "loss": 0.9459, + "step": 4836 + }, + { + "epoch": 0.69, + "grad_norm": 7.461905281843726, + "learning_rate": 7.617389383091357e-06, + "loss": 0.9466, + "step": 4837 + }, + { + "epoch": 0.69, + "grad_norm": 6.16453084940568, + "learning_rate": 7.616405427279406e-06, + "loss": 0.9839, + "step": 4838 + }, + { + "epoch": 0.69, + "grad_norm": 9.41477815392956, + "learning_rate": 7.61542133191567e-06, + "loss": 1.0105, + "step": 4839 + }, + { + "epoch": 0.69, + "grad_norm": 15.975707550755955, + "learning_rate": 7.61443709705264e-06, + "loss": 0.9982, + "step": 4840 + }, + { + "epoch": 0.69, + "grad_norm": 7.785303230930052, + "learning_rate": 7.613452722742816e-06, + "loss": 1.0135, + "step": 4841 + }, + { + "epoch": 0.69, + "grad_norm": 9.219027472809278, + "learning_rate": 7.612468209038695e-06, + "loss": 0.9727, + "step": 4842 + }, + { + "epoch": 0.69, + "grad_norm": 5.311314952733289, + "learning_rate": 7.611483555992793e-06, + "loss": 1.0202, + "step": 4843 + }, + { + "epoch": 0.69, + "grad_norm": 7.715928636659752, + "learning_rate": 7.610498763657628e-06, + "loss": 0.9606, + "step": 4844 + }, + { + "epoch": 0.69, + "grad_norm": 7.860374861439033, + "learning_rate": 7.609513832085724e-06, + "loss": 1.04, + "step": 4845 + }, + { + "epoch": 0.69, + "grad_norm": 6.952281171063028, + "learning_rate": 7.608528761329617e-06, + "loss": 1.065, + "step": 4846 + }, + { + "epoch": 0.69, + "grad_norm": 7.024199883040537, + "learning_rate": 7.607543551441846e-06, + "loss": 1.0405, + "step": 4847 + }, + { + "epoch": 0.69, + "grad_norm": 7.337341318034106, + "learning_rate": 7.606558202474958e-06, + "loss": 0.9526, + "step": 4848 + }, + { + "epoch": 0.69, + 
"grad_norm": 7.147401664079356, + "learning_rate": 7.605572714481515e-06, + "loss": 0.965, + "step": 4849 + }, + { + "epoch": 0.69, + "grad_norm": 7.67432048292295, + "learning_rate": 7.604587087514075e-06, + "loss": 1.0052, + "step": 4850 + }, + { + "epoch": 0.69, + "grad_norm": 7.9135915663563114, + "learning_rate": 7.603601321625208e-06, + "loss": 1.0243, + "step": 4851 + }, + { + "epoch": 0.69, + "grad_norm": 8.310529853114728, + "learning_rate": 7.602615416867494e-06, + "loss": 0.9696, + "step": 4852 + }, + { + "epoch": 0.69, + "grad_norm": 7.28749789472409, + "learning_rate": 7.60162937329352e-06, + "loss": 0.9638, + "step": 4853 + }, + { + "epoch": 0.69, + "grad_norm": 7.722082496595776, + "learning_rate": 7.600643190955874e-06, + "loss": 0.9575, + "step": 4854 + }, + { + "epoch": 0.69, + "grad_norm": 10.386414043907726, + "learning_rate": 7.599656869907162e-06, + "loss": 0.9135, + "step": 4855 + }, + { + "epoch": 0.69, + "grad_norm": 4.6471729148359096, + "learning_rate": 7.598670410199988e-06, + "loss": 0.987, + "step": 4856 + }, + { + "epoch": 0.69, + "grad_norm": 6.998517380195275, + "learning_rate": 7.597683811886967e-06, + "loss": 0.9841, + "step": 4857 + }, + { + "epoch": 0.69, + "grad_norm": 6.006021130901474, + "learning_rate": 7.596697075020723e-06, + "loss": 0.9931, + "step": 4858 + }, + { + "epoch": 0.69, + "grad_norm": 6.910547013968678, + "learning_rate": 7.5957101996538826e-06, + "loss": 1.0536, + "step": 4859 + }, + { + "epoch": 0.69, + "grad_norm": 6.048364319050816, + "learning_rate": 7.594723185839088e-06, + "loss": 1.0658, + "step": 4860 + }, + { + "epoch": 0.69, + "grad_norm": 6.02737017548557, + "learning_rate": 7.59373603362898e-06, + "loss": 1.016, + "step": 4861 + }, + { + "epoch": 0.69, + "grad_norm": 8.83148145991693, + "learning_rate": 7.592748743076212e-06, + "loss": 0.9266, + "step": 4862 + }, + { + "epoch": 0.69, + "grad_norm": 8.01018154279606, + "learning_rate": 7.591761314233442e-06, + "loss": 1.0269, + "step": 4863 + }, + { 
+ "epoch": 0.69, + "grad_norm": 7.120530887012537, + "learning_rate": 7.590773747153339e-06, + "loss": 1.002, + "step": 4864 + }, + { + "epoch": 0.69, + "grad_norm": 7.165252104043175, + "learning_rate": 7.589786041888576e-06, + "loss": 0.9738, + "step": 4865 + }, + { + "epoch": 0.69, + "grad_norm": 5.24332242254464, + "learning_rate": 7.588798198491834e-06, + "loss": 0.9797, + "step": 4866 + }, + { + "epoch": 0.69, + "grad_norm": 7.47533994253746, + "learning_rate": 7.587810217015801e-06, + "loss": 0.9562, + "step": 4867 + }, + { + "epoch": 0.69, + "grad_norm": 6.9593911624106335, + "learning_rate": 7.586822097513176e-06, + "loss": 0.9614, + "step": 4868 + }, + { + "epoch": 0.69, + "grad_norm": 6.785101824701652, + "learning_rate": 7.585833840036658e-06, + "loss": 1.0877, + "step": 4869 + }, + { + "epoch": 0.69, + "grad_norm": 7.6982916588899775, + "learning_rate": 7.584845444638964e-06, + "loss": 1.0363, + "step": 4870 + }, + { + "epoch": 0.69, + "grad_norm": 6.231101196458952, + "learning_rate": 7.583856911372807e-06, + "loss": 0.9853, + "step": 4871 + }, + { + "epoch": 0.69, + "grad_norm": 7.9932430316447345, + "learning_rate": 7.582868240290915e-06, + "loss": 1.0342, + "step": 4872 + }, + { + "epoch": 0.69, + "grad_norm": 5.813607000640293, + "learning_rate": 7.581879431446019e-06, + "loss": 1.0644, + "step": 4873 + }, + { + "epoch": 0.7, + "grad_norm": 6.595476945031503, + "learning_rate": 7.580890484890864e-06, + "loss": 1.0577, + "step": 4874 + }, + { + "epoch": 0.7, + "grad_norm": 7.254879848979563, + "learning_rate": 7.579901400678193e-06, + "loss": 0.9651, + "step": 4875 + }, + { + "epoch": 0.7, + "grad_norm": 7.4504405568333265, + "learning_rate": 7.578912178860762e-06, + "loss": 0.8805, + "step": 4876 + }, + { + "epoch": 0.7, + "grad_norm": 8.789966142751872, + "learning_rate": 7.577922819491333e-06, + "loss": 0.9891, + "step": 4877 + }, + { + "epoch": 0.7, + "grad_norm": 7.066944085363992, + "learning_rate": 7.576933322622679e-06, + "loss": 0.9812, + 
"step": 4878 + }, + { + "epoch": 0.7, + "grad_norm": 6.62311740978725, + "learning_rate": 7.575943688307572e-06, + "loss": 0.9682, + "step": 4879 + }, + { + "epoch": 0.7, + "grad_norm": 7.59763317437149, + "learning_rate": 7.5749539165988e-06, + "loss": 0.9219, + "step": 4880 + }, + { + "epoch": 0.7, + "grad_norm": 7.4076284457576245, + "learning_rate": 7.5739640075491546e-06, + "loss": 0.932, + "step": 4881 + }, + { + "epoch": 0.7, + "grad_norm": 5.482802724855872, + "learning_rate": 7.572973961211431e-06, + "loss": 0.9208, + "step": 4882 + }, + { + "epoch": 0.7, + "grad_norm": 8.381846579682774, + "learning_rate": 7.5719837776384405e-06, + "loss": 0.926, + "step": 4883 + }, + { + "epoch": 0.7, + "grad_norm": 7.536355329264909, + "learning_rate": 7.570993456882995e-06, + "loss": 1.0095, + "step": 4884 + }, + { + "epoch": 0.7, + "grad_norm": 7.925807098764958, + "learning_rate": 7.570002998997914e-06, + "loss": 0.9244, + "step": 4885 + }, + { + "epoch": 0.7, + "grad_norm": 7.430831875582823, + "learning_rate": 7.569012404036027e-06, + "loss": 1.0071, + "step": 4886 + }, + { + "epoch": 0.7, + "grad_norm": 7.228498494426027, + "learning_rate": 7.568021672050167e-06, + "loss": 1.0389, + "step": 4887 + }, + { + "epoch": 0.7, + "grad_norm": 7.4223418423917265, + "learning_rate": 7.567030803093182e-06, + "loss": 0.9947, + "step": 4888 + }, + { + "epoch": 0.7, + "grad_norm": 6.597948164418447, + "learning_rate": 7.566039797217918e-06, + "loss": 0.961, + "step": 4889 + }, + { + "epoch": 0.7, + "grad_norm": 7.562027451465641, + "learning_rate": 7.565048654477234e-06, + "loss": 0.9328, + "step": 4890 + }, + { + "epoch": 0.7, + "grad_norm": 7.440232608144066, + "learning_rate": 7.564057374923994e-06, + "loss": 1.0456, + "step": 4891 + }, + { + "epoch": 0.7, + "grad_norm": 6.634542330724047, + "learning_rate": 7.563065958611071e-06, + "loss": 0.9644, + "step": 4892 + }, + { + "epoch": 0.7, + "grad_norm": 6.019016557563045, + "learning_rate": 7.562074405591345e-06, + "loss": 
1.0025, + "step": 4893 + }, + { + "epoch": 0.7, + "grad_norm": 7.438352891967681, + "learning_rate": 7.561082715917702e-06, + "loss": 0.9804, + "step": 4894 + }, + { + "epoch": 0.7, + "grad_norm": 8.720790339868929, + "learning_rate": 7.5600908896430345e-06, + "loss": 1.0104, + "step": 4895 + }, + { + "epoch": 0.7, + "grad_norm": 7.214361710828092, + "learning_rate": 7.559098926820245e-06, + "loss": 0.9349, + "step": 4896 + }, + { + "epoch": 0.7, + "grad_norm": 5.969249418098108, + "learning_rate": 7.558106827502244e-06, + "loss": 0.9363, + "step": 4897 + }, + { + "epoch": 0.7, + "grad_norm": 6.261839699586587, + "learning_rate": 7.557114591741943e-06, + "loss": 0.9974, + "step": 4898 + }, + { + "epoch": 0.7, + "grad_norm": 5.6313595550569016, + "learning_rate": 7.556122219592269e-06, + "loss": 1.0245, + "step": 4899 + }, + { + "epoch": 0.7, + "grad_norm": 7.807422041757912, + "learning_rate": 7.555129711106151e-06, + "loss": 0.9778, + "step": 4900 + }, + { + "epoch": 0.7, + "grad_norm": 6.935725693662138, + "learning_rate": 7.554137066336524e-06, + "loss": 1.0016, + "step": 4901 + }, + { + "epoch": 0.7, + "grad_norm": 5.396566929659907, + "learning_rate": 7.553144285336338e-06, + "loss": 0.9072, + "step": 4902 + }, + { + "epoch": 0.7, + "grad_norm": 6.944557813054517, + "learning_rate": 7.552151368158543e-06, + "loss": 0.9509, + "step": 4903 + }, + { + "epoch": 0.7, + "grad_norm": 6.978223004116168, + "learning_rate": 7.551158314856095e-06, + "loss": 0.9665, + "step": 4904 + }, + { + "epoch": 0.7, + "grad_norm": 7.579432342321029, + "learning_rate": 7.5501651254819666e-06, + "loss": 0.993, + "step": 4905 + }, + { + "epoch": 0.7, + "grad_norm": 5.297855384359432, + "learning_rate": 7.549171800089126e-06, + "loss": 0.9969, + "step": 4906 + }, + { + "epoch": 0.7, + "grad_norm": 6.586444814904642, + "learning_rate": 7.54817833873056e-06, + "loss": 0.9287, + "step": 4907 + }, + { + "epoch": 0.7, + "grad_norm": 7.9915408065124325, + "learning_rate": 
7.547184741459253e-06, + "loss": 0.9351, + "step": 4908 + }, + { + "epoch": 0.7, + "grad_norm": 6.0668477175478, + "learning_rate": 7.546191008328202e-06, + "loss": 1.0216, + "step": 4909 + }, + { + "epoch": 0.7, + "grad_norm": 7.936729872521081, + "learning_rate": 7.54519713939041e-06, + "loss": 0.9599, + "step": 4910 + }, + { + "epoch": 0.7, + "grad_norm": 7.079963314383375, + "learning_rate": 7.544203134698886e-06, + "loss": 1.0242, + "step": 4911 + }, + { + "epoch": 0.7, + "grad_norm": 5.973425057565387, + "learning_rate": 7.543208994306651e-06, + "loss": 1.0215, + "step": 4912 + }, + { + "epoch": 0.7, + "grad_norm": 7.4647106439375035, + "learning_rate": 7.542214718266727e-06, + "loss": 0.9116, + "step": 4913 + }, + { + "epoch": 0.7, + "grad_norm": 8.879957469733903, + "learning_rate": 7.5412203066321454e-06, + "loss": 0.9399, + "step": 4914 + }, + { + "epoch": 0.7, + "grad_norm": 9.456731572387575, + "learning_rate": 7.540225759455946e-06, + "loss": 1.0221, + "step": 4915 + }, + { + "epoch": 0.7, + "grad_norm": 8.723108433941832, + "learning_rate": 7.539231076791176e-06, + "loss": 1.0262, + "step": 4916 + }, + { + "epoch": 0.7, + "grad_norm": 5.564370485569488, + "learning_rate": 7.538236258690888e-06, + "loss": 0.9524, + "step": 4917 + }, + { + "epoch": 0.7, + "grad_norm": 6.802006786385862, + "learning_rate": 7.537241305208143e-06, + "loss": 1.0259, + "step": 4918 + }, + { + "epoch": 0.7, + "grad_norm": 8.684850938255652, + "learning_rate": 7.536246216396011e-06, + "loss": 1.0579, + "step": 4919 + }, + { + "epoch": 0.7, + "grad_norm": 7.021909847229621, + "learning_rate": 7.535250992307561e-06, + "loss": 1.0236, + "step": 4920 + }, + { + "epoch": 0.7, + "grad_norm": 6.176624445700488, + "learning_rate": 7.534255632995884e-06, + "loss": 0.9614, + "step": 4921 + }, + { + "epoch": 0.7, + "grad_norm": 8.042879819671452, + "learning_rate": 7.533260138514065e-06, + "loss": 0.9337, + "step": 4922 + }, + { + "epoch": 0.7, + "grad_norm": 8.422033354129239, + 
"learning_rate": 7.532264508915202e-06, + "loss": 1.0173, + "step": 4923 + }, + { + "epoch": 0.7, + "grad_norm": 7.864671190326659, + "learning_rate": 7.531268744252397e-06, + "loss": 0.9027, + "step": 4924 + }, + { + "epoch": 0.7, + "grad_norm": 5.678801047671293, + "learning_rate": 7.530272844578763e-06, + "loss": 1.0003, + "step": 4925 + }, + { + "epoch": 0.7, + "grad_norm": 8.4957865233888, + "learning_rate": 7.52927680994742e-06, + "loss": 1.0036, + "step": 4926 + }, + { + "epoch": 0.7, + "grad_norm": 8.357800126075778, + "learning_rate": 7.528280640411492e-06, + "loss": 1.0808, + "step": 4927 + }, + { + "epoch": 0.7, + "grad_norm": 9.925755202480959, + "learning_rate": 7.527284336024112e-06, + "loss": 1.0904, + "step": 4928 + }, + { + "epoch": 0.7, + "grad_norm": 7.957180443387081, + "learning_rate": 7.52628789683842e-06, + "loss": 0.9934, + "step": 4929 + }, + { + "epoch": 0.7, + "grad_norm": 6.846820863790172, + "learning_rate": 7.525291322907563e-06, + "loss": 0.9837, + "step": 4930 + }, + { + "epoch": 0.7, + "grad_norm": 8.280008488098467, + "learning_rate": 7.524294614284697e-06, + "loss": 0.9122, + "step": 4931 + }, + { + "epoch": 0.7, + "grad_norm": 8.501537919873837, + "learning_rate": 7.5232977710229835e-06, + "loss": 0.9405, + "step": 4932 + }, + { + "epoch": 0.7, + "grad_norm": 6.546899697694547, + "learning_rate": 7.522300793175589e-06, + "loss": 0.9807, + "step": 4933 + }, + { + "epoch": 0.7, + "grad_norm": 6.442681909994828, + "learning_rate": 7.521303680795691e-06, + "loss": 0.9327, + "step": 4934 + }, + { + "epoch": 0.7, + "grad_norm": 6.9900924200925605, + "learning_rate": 7.520306433936473e-06, + "loss": 0.9215, + "step": 4935 + }, + { + "epoch": 0.7, + "grad_norm": 5.90251935239503, + "learning_rate": 7.519309052651126e-06, + "loss": 0.9878, + "step": 4936 + }, + { + "epoch": 0.7, + "grad_norm": 7.001337218272011, + "learning_rate": 7.518311536992846e-06, + "loss": 0.9158, + "step": 4937 + }, + { + "epoch": 0.7, + "grad_norm": 
7.1832057859769005, + "learning_rate": 7.517313887014838e-06, + "loss": 0.9384, + "step": 4938 + }, + { + "epoch": 0.7, + "grad_norm": 6.451131633799201, + "learning_rate": 7.516316102770314e-06, + "loss": 0.9539, + "step": 4939 + }, + { + "epoch": 0.7, + "grad_norm": 6.527282317386549, + "learning_rate": 7.515318184312495e-06, + "loss": 0.998, + "step": 4940 + }, + { + "epoch": 0.7, + "grad_norm": 7.422494273975946, + "learning_rate": 7.514320131694603e-06, + "loss": 0.9219, + "step": 4941 + }, + { + "epoch": 0.7, + "grad_norm": 7.98308969246632, + "learning_rate": 7.513321944969878e-06, + "loss": 0.8993, + "step": 4942 + }, + { + "epoch": 0.7, + "grad_norm": 7.481886022526117, + "learning_rate": 7.512323624191553e-06, + "loss": 0.9764, + "step": 4943 + }, + { + "epoch": 0.71, + "grad_norm": 6.572115523273703, + "learning_rate": 7.51132516941288e-06, + "loss": 0.9876, + "step": 4944 + }, + { + "epoch": 0.71, + "grad_norm": 6.586520796218872, + "learning_rate": 7.510326580687112e-06, + "loss": 0.9643, + "step": 4945 + }, + { + "epoch": 0.71, + "grad_norm": 7.468697874114449, + "learning_rate": 7.509327858067512e-06, + "loss": 0.9411, + "step": 4946 + }, + { + "epoch": 0.71, + "grad_norm": 6.4522372009708375, + "learning_rate": 7.508329001607348e-06, + "loss": 1.0043, + "step": 4947 + }, + { + "epoch": 0.71, + "grad_norm": 7.171033213593852, + "learning_rate": 7.507330011359899e-06, + "loss": 0.954, + "step": 4948 + }, + { + "epoch": 0.71, + "grad_norm": 8.0984387603722, + "learning_rate": 7.5063308873784445e-06, + "loss": 1.0169, + "step": 4949 + }, + { + "epoch": 0.71, + "grad_norm": 4.807920346972667, + "learning_rate": 7.505331629716278e-06, + "loss": 1.0208, + "step": 4950 + }, + { + "epoch": 0.71, + "grad_norm": 9.4554831324357, + "learning_rate": 7.504332238426697e-06, + "loss": 0.9856, + "step": 4951 + }, + { + "epoch": 0.71, + "grad_norm": 6.975406631006061, + "learning_rate": 7.503332713563003e-06, + "loss": 1.0154, + "step": 4952 + }, + { + "epoch": 0.71, 
+ "grad_norm": 9.846216909340425, + "learning_rate": 7.50233305517851e-06, + "loss": 0.9838, + "step": 4953 + }, + { + "epoch": 0.71, + "grad_norm": 8.571319235700726, + "learning_rate": 7.50133326332654e-06, + "loss": 1.0222, + "step": 4954 + }, + { + "epoch": 0.71, + "grad_norm": 6.587613907355194, + "learning_rate": 7.500333338060415e-06, + "loss": 0.9891, + "step": 4955 + }, + { + "epoch": 0.71, + "grad_norm": 6.148531781508516, + "learning_rate": 7.499333279433469e-06, + "loss": 0.9922, + "step": 4956 + }, + { + "epoch": 0.71, + "grad_norm": 6.3098581810718475, + "learning_rate": 7.498333087499042e-06, + "loss": 1.0252, + "step": 4957 + }, + { + "epoch": 0.71, + "grad_norm": 6.061930352107393, + "learning_rate": 7.497332762310483e-06, + "loss": 0.9428, + "step": 4958 + }, + { + "epoch": 0.71, + "grad_norm": 5.6939570005112925, + "learning_rate": 7.4963323039211455e-06, + "loss": 0.9265, + "step": 4959 + }, + { + "epoch": 0.71, + "grad_norm": 6.645096661385085, + "learning_rate": 7.495331712384393e-06, + "loss": 0.9765, + "step": 4960 + }, + { + "epoch": 0.71, + "grad_norm": 6.182718713801711, + "learning_rate": 7.494330987753592e-06, + "loss": 0.9844, + "step": 4961 + }, + { + "epoch": 0.71, + "grad_norm": 6.129309122241607, + "learning_rate": 7.4933301300821196e-06, + "loss": 0.9247, + "step": 4962 + }, + { + "epoch": 0.71, + "grad_norm": 7.376166291062702, + "learning_rate": 7.492329139423356e-06, + "loss": 0.9975, + "step": 4963 + }, + { + "epoch": 0.71, + "grad_norm": 8.191449208037794, + "learning_rate": 7.491328015830696e-06, + "loss": 0.9606, + "step": 4964 + }, + { + "epoch": 0.71, + "grad_norm": 10.887772672889929, + "learning_rate": 7.490326759357534e-06, + "loss": 0.9409, + "step": 4965 + }, + { + "epoch": 0.71, + "grad_norm": 6.269886532014895, + "learning_rate": 7.489325370057272e-06, + "loss": 1.059, + "step": 4966 + }, + { + "epoch": 0.71, + "grad_norm": 7.467857165067246, + "learning_rate": 7.488323847983328e-06, + "loss": 0.9293, + "step": 
4967 + }, + { + "epoch": 0.71, + "grad_norm": 5.792405730836307, + "learning_rate": 7.487322193189113e-06, + "loss": 0.9093, + "step": 4968 + }, + { + "epoch": 0.71, + "grad_norm": 6.033733568635005, + "learning_rate": 7.486320405728056e-06, + "loss": 0.9508, + "step": 4969 + }, + { + "epoch": 0.71, + "grad_norm": 8.721456009609584, + "learning_rate": 7.485318485653591e-06, + "loss": 1.0407, + "step": 4970 + }, + { + "epoch": 0.71, + "grad_norm": 7.819767691078626, + "learning_rate": 7.484316433019154e-06, + "loss": 0.9859, + "step": 4971 + }, + { + "epoch": 0.71, + "grad_norm": 7.393726679980697, + "learning_rate": 7.483314247878195e-06, + "loss": 0.9289, + "step": 4972 + }, + { + "epoch": 0.71, + "grad_norm": 5.536998552719983, + "learning_rate": 7.482311930284165e-06, + "loss": 1.0268, + "step": 4973 + }, + { + "epoch": 0.71, + "grad_norm": 7.725541820157247, + "learning_rate": 7.481309480290528e-06, + "loss": 0.9093, + "step": 4974 + }, + { + "epoch": 0.71, + "grad_norm": 6.031436475353824, + "learning_rate": 7.480306897950749e-06, + "loss": 0.9817, + "step": 4975 + }, + { + "epoch": 0.71, + "grad_norm": 8.482776956317213, + "learning_rate": 7.479304183318303e-06, + "loss": 0.9594, + "step": 4976 + }, + { + "epoch": 0.71, + "grad_norm": 7.085436879620729, + "learning_rate": 7.478301336446675e-06, + "loss": 0.8972, + "step": 4977 + }, + { + "epoch": 0.71, + "grad_norm": 6.489081283332494, + "learning_rate": 7.477298357389351e-06, + "loss": 1.0225, + "step": 4978 + }, + { + "epoch": 0.71, + "grad_norm": 6.618781394396903, + "learning_rate": 7.476295246199829e-06, + "loss": 1.0133, + "step": 4979 + }, + { + "epoch": 0.71, + "grad_norm": 5.938421023865355, + "learning_rate": 7.4752920029316115e-06, + "loss": 0.9667, + "step": 4980 + }, + { + "epoch": 0.71, + "grad_norm": 8.516044737170535, + "learning_rate": 7.474288627638209e-06, + "loss": 1.0274, + "step": 4981 + }, + { + "epoch": 0.71, + "grad_norm": 8.150381955540514, + "learning_rate": 7.473285120373136e-06, + 
"loss": 1.0715, + "step": 4982 + }, + { + "epoch": 0.71, + "grad_norm": 7.286037849235431, + "learning_rate": 7.472281481189921e-06, + "loss": 0.9852, + "step": 4983 + }, + { + "epoch": 0.71, + "grad_norm": 6.098809192863694, + "learning_rate": 7.471277710142093e-06, + "loss": 1.0444, + "step": 4984 + }, + { + "epoch": 0.71, + "grad_norm": 6.587640370013144, + "learning_rate": 7.47027380728319e-06, + "loss": 0.9824, + "step": 4985 + }, + { + "epoch": 0.71, + "grad_norm": 8.645081272129659, + "learning_rate": 7.4692697726667604e-06, + "loss": 0.9836, + "step": 4986 + }, + { + "epoch": 0.71, + "grad_norm": 6.732087004881079, + "learning_rate": 7.468265606346352e-06, + "loss": 0.9674, + "step": 4987 + }, + { + "epoch": 0.71, + "grad_norm": 6.784765853532626, + "learning_rate": 7.467261308375527e-06, + "loss": 1.0228, + "step": 4988 + }, + { + "epoch": 0.71, + "grad_norm": 6.508403480722828, + "learning_rate": 7.466256878807854e-06, + "loss": 0.9599, + "step": 4989 + }, + { + "epoch": 0.71, + "grad_norm": 7.51372058881807, + "learning_rate": 7.465252317696901e-06, + "loss": 0.9838, + "step": 4990 + }, + { + "epoch": 0.71, + "grad_norm": 6.437119840206399, + "learning_rate": 7.4642476250962525e-06, + "loss": 0.9783, + "step": 4991 + }, + { + "epoch": 0.71, + "grad_norm": 6.41795002377501, + "learning_rate": 7.4632428010594944e-06, + "loss": 0.9633, + "step": 4992 + }, + { + "epoch": 0.71, + "grad_norm": 6.367077287063999, + "learning_rate": 7.4622378456402225e-06, + "loss": 0.9258, + "step": 4993 + }, + { + "epoch": 0.71, + "grad_norm": 6.50965741317348, + "learning_rate": 7.461232758892038e-06, + "loss": 0.9622, + "step": 4994 + }, + { + "epoch": 0.71, + "grad_norm": 6.392047756640183, + "learning_rate": 7.460227540868547e-06, + "loss": 0.8866, + "step": 4995 + }, + { + "epoch": 0.71, + "grad_norm": 8.586749195290057, + "learning_rate": 7.459222191623369e-06, + "loss": 1.0405, + "step": 4996 + }, + { + "epoch": 0.71, + "grad_norm": 7.716667761667832, + "learning_rate": 
7.4582167112101245e-06, + "loss": 0.8982, + "step": 4997 + }, + { + "epoch": 0.71, + "grad_norm": 6.763904928467839, + "learning_rate": 7.457211099682442e-06, + "loss": 0.9486, + "step": 4998 + }, + { + "epoch": 0.71, + "grad_norm": 6.3237282983784135, + "learning_rate": 7.456205357093961e-06, + "loss": 0.98, + "step": 4999 + }, + { + "epoch": 0.71, + "grad_norm": 9.043965669558228, + "learning_rate": 7.455199483498321e-06, + "loss": 0.9537, + "step": 5000 + }, + { + "epoch": 0.71, + "grad_norm": 4.976429901724347, + "learning_rate": 7.454193478949174e-06, + "loss": 0.9715, + "step": 5001 + }, + { + "epoch": 0.71, + "grad_norm": 8.721248472840339, + "learning_rate": 7.453187343500181e-06, + "loss": 1.0065, + "step": 5002 + }, + { + "epoch": 0.71, + "grad_norm": 5.806312880373095, + "learning_rate": 7.4521810772050026e-06, + "loss": 0.982, + "step": 5003 + }, + { + "epoch": 0.71, + "grad_norm": 6.7541875139288665, + "learning_rate": 7.451174680117311e-06, + "loss": 0.9702, + "step": 5004 + }, + { + "epoch": 0.71, + "grad_norm": 7.1010933785825, + "learning_rate": 7.4501681522907855e-06, + "loss": 0.9723, + "step": 5005 + }, + { + "epoch": 0.71, + "grad_norm": 9.26168356516178, + "learning_rate": 7.449161493779109e-06, + "loss": 0.981, + "step": 5006 + }, + { + "epoch": 0.71, + "grad_norm": 9.58697457075623, + "learning_rate": 7.448154704635977e-06, + "loss": 0.8984, + "step": 5007 + }, + { + "epoch": 0.71, + "grad_norm": 9.579026944188197, + "learning_rate": 7.447147784915088e-06, + "loss": 0.9669, + "step": 5008 + }, + { + "epoch": 0.71, + "grad_norm": 5.300644459144054, + "learning_rate": 7.446140734670147e-06, + "loss": 0.9996, + "step": 5009 + }, + { + "epoch": 0.71, + "grad_norm": 8.074598980847522, + "learning_rate": 7.445133553954869e-06, + "loss": 1.055, + "step": 5010 + }, + { + "epoch": 0.71, + "grad_norm": 6.374898437122017, + "learning_rate": 7.444126242822972e-06, + "loss": 0.9549, + "step": 5011 + }, + { + "epoch": 0.71, + "grad_norm": 
6.800102716527954, + "learning_rate": 7.443118801328185e-06, + "loss": 1.0141, + "step": 5012 + }, + { + "epoch": 0.71, + "grad_norm": 7.5555752637842275, + "learning_rate": 7.442111229524242e-06, + "loss": 0.9339, + "step": 5013 + }, + { + "epoch": 0.72, + "grad_norm": 6.381911753282211, + "learning_rate": 7.4411035274648834e-06, + "loss": 0.97, + "step": 5014 + }, + { + "epoch": 0.72, + "grad_norm": 7.884471227438748, + "learning_rate": 7.440095695203857e-06, + "loss": 0.9153, + "step": 5015 + }, + { + "epoch": 0.72, + "grad_norm": 7.659580496118459, + "learning_rate": 7.439087732794919e-06, + "loss": 1.0182, + "step": 5016 + }, + { + "epoch": 0.72, + "grad_norm": 6.072802991044516, + "learning_rate": 7.43807964029183e-06, + "loss": 0.8844, + "step": 5017 + }, + { + "epoch": 0.72, + "grad_norm": 7.388216940415243, + "learning_rate": 7.437071417748361e-06, + "loss": 0.9993, + "step": 5018 + }, + { + "epoch": 0.72, + "grad_norm": 7.6652470396713435, + "learning_rate": 7.436063065218284e-06, + "loss": 0.8955, + "step": 5019 + }, + { + "epoch": 0.72, + "grad_norm": 7.186301243217022, + "learning_rate": 7.435054582755385e-06, + "loss": 0.9689, + "step": 5020 + }, + { + "epoch": 0.72, + "grad_norm": 6.099694204301689, + "learning_rate": 7.434045970413453e-06, + "loss": 0.9633, + "step": 5021 + }, + { + "epoch": 0.72, + "grad_norm": 7.031187343616266, + "learning_rate": 7.433037228246282e-06, + "loss": 1.0232, + "step": 5022 + }, + { + "epoch": 0.72, + "grad_norm": 7.616396367432516, + "learning_rate": 7.43202835630768e-06, + "loss": 1.0099, + "step": 5023 + }, + { + "epoch": 0.72, + "grad_norm": 9.282085040755408, + "learning_rate": 7.431019354651454e-06, + "loss": 1.0017, + "step": 5024 + }, + { + "epoch": 0.72, + "grad_norm": 5.049092055677103, + "learning_rate": 7.430010223331423e-06, + "loss": 1.0148, + "step": 5025 + }, + { + "epoch": 0.72, + "grad_norm": 7.984770532144419, + "learning_rate": 7.42900096240141e-06, + "loss": 0.9866, + "step": 5026 + }, + { + 
"epoch": 0.72, + "grad_norm": 6.618980178607788, + "learning_rate": 7.427991571915249e-06, + "loss": 1.0231, + "step": 5027 + }, + { + "epoch": 0.72, + "grad_norm": 6.835267605374465, + "learning_rate": 7.426982051926775e-06, + "loss": 1.0244, + "step": 5028 + }, + { + "epoch": 0.72, + "grad_norm": 7.006700783873202, + "learning_rate": 7.425972402489832e-06, + "loss": 0.9928, + "step": 5029 + }, + { + "epoch": 0.72, + "grad_norm": 5.508339027982207, + "learning_rate": 7.424962623658274e-06, + "loss": 0.8827, + "step": 5030 + }, + { + "epoch": 0.72, + "grad_norm": 6.81910073920903, + "learning_rate": 7.423952715485961e-06, + "loss": 0.9472, + "step": 5031 + }, + { + "epoch": 0.72, + "grad_norm": 6.779559789561745, + "learning_rate": 7.422942678026757e-06, + "loss": 1.0208, + "step": 5032 + }, + { + "epoch": 0.72, + "grad_norm": 8.147708207806716, + "learning_rate": 7.421932511334534e-06, + "loss": 0.9117, + "step": 5033 + }, + { + "epoch": 0.72, + "grad_norm": 7.81116964513179, + "learning_rate": 7.420922215463173e-06, + "loss": 0.969, + "step": 5034 + }, + { + "epoch": 0.72, + "grad_norm": 7.690656257834082, + "learning_rate": 7.41991179046656e-06, + "loss": 0.88, + "step": 5035 + }, + { + "epoch": 0.72, + "grad_norm": 5.203461697534309, + "learning_rate": 7.418901236398587e-06, + "loss": 0.9777, + "step": 5036 + }, + { + "epoch": 0.72, + "grad_norm": 7.871434175491308, + "learning_rate": 7.417890553313156e-06, + "loss": 0.9944, + "step": 5037 + }, + { + "epoch": 0.72, + "grad_norm": 8.63113991973118, + "learning_rate": 7.416879741264174e-06, + "loss": 0.9211, + "step": 5038 + }, + { + "epoch": 0.72, + "grad_norm": 6.6932416349025035, + "learning_rate": 7.415868800305551e-06, + "loss": 1.0157, + "step": 5039 + }, + { + "epoch": 0.72, + "grad_norm": 6.931046407410018, + "learning_rate": 7.414857730491214e-06, + "loss": 1.0224, + "step": 5040 + }, + { + "epoch": 0.72, + "grad_norm": 7.218012491586003, + "learning_rate": 7.413846531875086e-06, + "loss": 0.9067, + 
"step": 5041 + }, + { + "epoch": 0.72, + "grad_norm": 7.311691037471317, + "learning_rate": 7.412835204511103e-06, + "loss": 0.9476, + "step": 5042 + }, + { + "epoch": 0.72, + "grad_norm": 7.767166024085261, + "learning_rate": 7.411823748453206e-06, + "loss": 1.0397, + "step": 5043 + }, + { + "epoch": 0.72, + "grad_norm": 4.647110623065421, + "learning_rate": 7.410812163755345e-06, + "loss": 0.9916, + "step": 5044 + }, + { + "epoch": 0.72, + "grad_norm": 7.588928709153854, + "learning_rate": 7.409800450471472e-06, + "loss": 1.008, + "step": 5045 + }, + { + "epoch": 0.72, + "grad_norm": 6.094485402521973, + "learning_rate": 7.408788608655551e-06, + "loss": 1.0202, + "step": 5046 + }, + { + "epoch": 0.72, + "grad_norm": 6.780973722946557, + "learning_rate": 7.407776638361552e-06, + "loss": 0.9505, + "step": 5047 + }, + { + "epoch": 0.72, + "grad_norm": 6.176563614235075, + "learning_rate": 7.406764539643449e-06, + "loss": 1.0544, + "step": 5048 + }, + { + "epoch": 0.72, + "grad_norm": 8.48031060236094, + "learning_rate": 7.4057523125552234e-06, + "loss": 0.9824, + "step": 5049 + }, + { + "epoch": 0.72, + "grad_norm": 5.567378836593804, + "learning_rate": 7.404739957150866e-06, + "loss": 0.9832, + "step": 5050 + }, + { + "epoch": 0.72, + "grad_norm": 6.004490847618534, + "learning_rate": 7.403727473484373e-06, + "loss": 0.9518, + "step": 5051 + }, + { + "epoch": 0.72, + "grad_norm": 8.329470670099568, + "learning_rate": 7.4027148616097476e-06, + "loss": 1.0226, + "step": 5052 + }, + { + "epoch": 0.72, + "grad_norm": 6.284119507694862, + "learning_rate": 7.401702121581001e-06, + "loss": 0.9473, + "step": 5053 + }, + { + "epoch": 0.72, + "grad_norm": 7.030291658539295, + "learning_rate": 7.400689253452147e-06, + "loss": 0.9943, + "step": 5054 + }, + { + "epoch": 0.72, + "grad_norm": 10.235359557230746, + "learning_rate": 7.3996762572772095e-06, + "loss": 0.9705, + "step": 5055 + }, + { + "epoch": 0.72, + "grad_norm": 8.19196786229207, + "learning_rate": 
7.398663133110222e-06, + "loss": 0.9593, + "step": 5056 + }, + { + "epoch": 0.72, + "grad_norm": 8.836065939032531, + "learning_rate": 7.39764988100522e-06, + "loss": 0.9595, + "step": 5057 + }, + { + "epoch": 0.72, + "grad_norm": 7.832985756491334, + "learning_rate": 7.396636501016246e-06, + "loss": 1.0847, + "step": 5058 + }, + { + "epoch": 0.72, + "grad_norm": 9.273616431310666, + "learning_rate": 7.395622993197351e-06, + "loss": 1.0222, + "step": 5059 + }, + { + "epoch": 0.72, + "grad_norm": 7.42883619821808, + "learning_rate": 7.394609357602596e-06, + "loss": 0.944, + "step": 5060 + }, + { + "epoch": 0.72, + "grad_norm": 8.253340925725006, + "learning_rate": 7.393595594286042e-06, + "loss": 1.0134, + "step": 5061 + }, + { + "epoch": 0.72, + "grad_norm": 6.865300694410859, + "learning_rate": 7.392581703301762e-06, + "loss": 1.0022, + "step": 5062 + }, + { + "epoch": 0.72, + "grad_norm": 8.70631472590493, + "learning_rate": 7.391567684703833e-06, + "loss": 0.9257, + "step": 5063 + }, + { + "epoch": 0.72, + "grad_norm": 8.12433926109111, + "learning_rate": 7.390553538546341e-06, + "loss": 0.9831, + "step": 5064 + }, + { + "epoch": 0.72, + "grad_norm": 8.625832485857693, + "learning_rate": 7.3895392648833786e-06, + "loss": 0.9257, + "step": 5065 + }, + { + "epoch": 0.72, + "grad_norm": 6.831332802070288, + "learning_rate": 7.388524863769043e-06, + "loss": 0.9591, + "step": 5066 + }, + { + "epoch": 0.72, + "grad_norm": 5.209180554797904, + "learning_rate": 7.387510335257439e-06, + "loss": 0.9736, + "step": 5067 + }, + { + "epoch": 0.72, + "grad_norm": 6.4764483401823885, + "learning_rate": 7.386495679402677e-06, + "loss": 0.9728, + "step": 5068 + }, + { + "epoch": 0.72, + "grad_norm": 8.001073466530631, + "learning_rate": 7.385480896258883e-06, + "loss": 0.949, + "step": 5069 + }, + { + "epoch": 0.72, + "grad_norm": 8.053707355875831, + "learning_rate": 7.384465985880176e-06, + "loss": 0.9412, + "step": 5070 + }, + { + "epoch": 0.72, + "grad_norm": 
6.821966545535429, + "learning_rate": 7.383450948320689e-06, + "loss": 1.0155, + "step": 5071 + }, + { + "epoch": 0.72, + "grad_norm": 8.48921800674091, + "learning_rate": 7.382435783634565e-06, + "loss": 1.0119, + "step": 5072 + }, + { + "epoch": 0.72, + "grad_norm": 6.20834693948912, + "learning_rate": 7.381420491875946e-06, + "loss": 0.9503, + "step": 5073 + }, + { + "epoch": 0.72, + "grad_norm": 6.421205082803415, + "learning_rate": 7.380405073098988e-06, + "loss": 1.0374, + "step": 5074 + }, + { + "epoch": 0.72, + "grad_norm": 7.356715025753856, + "learning_rate": 7.37938952735785e-06, + "loss": 1.0014, + "step": 5075 + }, + { + "epoch": 0.72, + "grad_norm": 8.360493151880572, + "learning_rate": 7.378373854706696e-06, + "loss": 0.9684, + "step": 5076 + }, + { + "epoch": 0.72, + "grad_norm": 6.475389606342985, + "learning_rate": 7.3773580551997014e-06, + "loss": 0.9835, + "step": 5077 + }, + { + "epoch": 0.72, + "grad_norm": 4.862316730508392, + "learning_rate": 7.376342128891045e-06, + "loss": 0.9938, + "step": 5078 + }, + { + "epoch": 0.72, + "grad_norm": 6.744732901921937, + "learning_rate": 7.3753260758349155e-06, + "loss": 0.9503, + "step": 5079 + }, + { + "epoch": 0.72, + "grad_norm": 7.991545307157114, + "learning_rate": 7.374309896085503e-06, + "loss": 1.0597, + "step": 5080 + }, + { + "epoch": 0.72, + "grad_norm": 5.875820873442094, + "learning_rate": 7.37329358969701e-06, + "loss": 1.0436, + "step": 5081 + }, + { + "epoch": 0.72, + "grad_norm": 8.242456578665012, + "learning_rate": 7.372277156723644e-06, + "loss": 0.9689, + "step": 5082 + }, + { + "epoch": 0.72, + "grad_norm": 8.519201839395796, + "learning_rate": 7.371260597219618e-06, + "loss": 0.9307, + "step": 5083 + }, + { + "epoch": 0.73, + "grad_norm": 7.22607105098335, + "learning_rate": 7.370243911239151e-06, + "loss": 0.9761, + "step": 5084 + }, + { + "epoch": 0.73, + "grad_norm": 6.431053379800716, + "learning_rate": 7.369227098836473e-06, + "loss": 1.0119, + "step": 5085 + }, + { + 
"epoch": 0.73, + "grad_norm": 5.300031654348625, + "learning_rate": 7.368210160065815e-06, + "loss": 0.9838, + "step": 5086 + }, + { + "epoch": 0.73, + "grad_norm": 7.132312908801512, + "learning_rate": 7.367193094981418e-06, + "loss": 1.0007, + "step": 5087 + }, + { + "epoch": 0.73, + "grad_norm": 7.042014382429488, + "learning_rate": 7.366175903637533e-06, + "loss": 1.0338, + "step": 5088 + }, + { + "epoch": 0.73, + "grad_norm": 7.7580791184592846, + "learning_rate": 7.3651585860884105e-06, + "loss": 1.0709, + "step": 5089 + }, + { + "epoch": 0.73, + "grad_norm": 6.312442334077767, + "learning_rate": 7.364141142388313e-06, + "loss": 1.0478, + "step": 5090 + }, + { + "epoch": 0.73, + "grad_norm": 7.56153935788155, + "learning_rate": 7.363123572591507e-06, + "loss": 0.9775, + "step": 5091 + }, + { + "epoch": 0.73, + "grad_norm": 6.9983495424462605, + "learning_rate": 7.362105876752266e-06, + "loss": 0.9949, + "step": 5092 + }, + { + "epoch": 0.73, + "grad_norm": 6.412976324569986, + "learning_rate": 7.361088054924874e-06, + "loss": 0.9898, + "step": 5093 + }, + { + "epoch": 0.73, + "grad_norm": 5.504021643530689, + "learning_rate": 7.360070107163618e-06, + "loss": 0.947, + "step": 5094 + }, + { + "epoch": 0.73, + "grad_norm": 9.90225587733911, + "learning_rate": 7.359052033522791e-06, + "loss": 0.9975, + "step": 5095 + }, + { + "epoch": 0.73, + "grad_norm": 7.997919996817606, + "learning_rate": 7.358033834056695e-06, + "loss": 0.9783, + "step": 5096 + }, + { + "epoch": 0.73, + "grad_norm": 7.55096005626686, + "learning_rate": 7.357015508819638e-06, + "loss": 0.9561, + "step": 5097 + }, + { + "epoch": 0.73, + "grad_norm": 7.329440001906897, + "learning_rate": 7.355997057865934e-06, + "loss": 0.9578, + "step": 5098 + }, + { + "epoch": 0.73, + "grad_norm": 7.722045913875027, + "learning_rate": 7.3549784812499044e-06, + "loss": 0.9918, + "step": 5099 + }, + { + "epoch": 0.73, + "grad_norm": 6.351844655852868, + "learning_rate": 7.353959779025879e-06, + "loss": 1.0122, 
+ "step": 5100 + }, + { + "epoch": 0.73, + "grad_norm": 6.544550572514428, + "learning_rate": 7.352940951248192e-06, + "loss": 1.0179, + "step": 5101 + }, + { + "epoch": 0.73, + "grad_norm": 7.692941202785795, + "learning_rate": 7.351921997971182e-06, + "loss": 0.9986, + "step": 5102 + }, + { + "epoch": 0.73, + "grad_norm": 6.057579138227496, + "learning_rate": 7.3509029192492e-06, + "loss": 0.9628, + "step": 5103 + }, + { + "epoch": 0.73, + "grad_norm": 7.91233535792061, + "learning_rate": 7.349883715136601e-06, + "loss": 1.0291, + "step": 5104 + }, + { + "epoch": 0.73, + "grad_norm": 6.113402607146651, + "learning_rate": 7.348864385687745e-06, + "loss": 1.0063, + "step": 5105 + }, + { + "epoch": 0.73, + "grad_norm": 8.635808852156382, + "learning_rate": 7.347844930957e-06, + "loss": 1.0155, + "step": 5106 + }, + { + "epoch": 0.73, + "grad_norm": 7.5252647771170995, + "learning_rate": 7.346825350998743e-06, + "loss": 1.0408, + "step": 5107 + }, + { + "epoch": 0.73, + "grad_norm": 7.673831714635416, + "learning_rate": 7.345805645867354e-06, + "loss": 0.9912, + "step": 5108 + }, + { + "epoch": 0.73, + "grad_norm": 7.799881551015944, + "learning_rate": 7.3447858156172205e-06, + "loss": 0.9256, + "step": 5109 + }, + { + "epoch": 0.73, + "grad_norm": 7.428693687902044, + "learning_rate": 7.343765860302738e-06, + "loss": 1.0108, + "step": 5110 + }, + { + "epoch": 0.73, + "grad_norm": 6.198329462312868, + "learning_rate": 7.34274577997831e-06, + "loss": 0.9751, + "step": 5111 + }, + { + "epoch": 0.73, + "grad_norm": 8.885667198841357, + "learning_rate": 7.3417255746983416e-06, + "loss": 0.9588, + "step": 5112 + }, + { + "epoch": 0.73, + "grad_norm": 7.1733810072787545, + "learning_rate": 7.340705244517252e-06, + "loss": 0.9379, + "step": 5113 + }, + { + "epoch": 0.73, + "grad_norm": 8.450129961364839, + "learning_rate": 7.339684789489459e-06, + "loss": 0.9974, + "step": 5114 + }, + { + "epoch": 0.73, + "grad_norm": 6.763145649845672, + "learning_rate": 
7.3386642096693905e-06, + "loss": 0.993, + "step": 5115 + }, + { + "epoch": 0.73, + "grad_norm": 8.747137069667005, + "learning_rate": 7.337643505111483e-06, + "loss": 0.933, + "step": 5116 + }, + { + "epoch": 0.73, + "grad_norm": 8.571009318403599, + "learning_rate": 7.336622675870178e-06, + "loss": 0.9613, + "step": 5117 + }, + { + "epoch": 0.73, + "grad_norm": 6.5717189179980116, + "learning_rate": 7.3356017219999236e-06, + "loss": 1.0327, + "step": 5118 + }, + { + "epoch": 0.73, + "grad_norm": 5.149069752021121, + "learning_rate": 7.3345806435551744e-06, + "loss": 0.9623, + "step": 5119 + }, + { + "epoch": 0.73, + "grad_norm": 7.4522925574036725, + "learning_rate": 7.333559440590393e-06, + "loss": 1.0039, + "step": 5120 + }, + { + "epoch": 0.73, + "grad_norm": 6.58356656302088, + "learning_rate": 7.3325381131600445e-06, + "loss": 1.0211, + "step": 5121 + }, + { + "epoch": 0.73, + "grad_norm": 6.397885703719799, + "learning_rate": 7.3315166613186074e-06, + "loss": 0.8997, + "step": 5122 + }, + { + "epoch": 0.73, + "grad_norm": 6.692962486645053, + "learning_rate": 7.330495085120562e-06, + "loss": 1.0359, + "step": 5123 + }, + { + "epoch": 0.73, + "grad_norm": 6.159932067342932, + "learning_rate": 7.329473384620395e-06, + "loss": 0.9287, + "step": 5124 + }, + { + "epoch": 0.73, + "grad_norm": 6.579117067545824, + "learning_rate": 7.3284515598726e-06, + "loss": 1.0031, + "step": 5125 + }, + { + "epoch": 0.73, + "grad_norm": 8.128664830676115, + "learning_rate": 7.327429610931682e-06, + "loss": 0.9877, + "step": 5126 + }, + { + "epoch": 0.73, + "grad_norm": 6.646687377981033, + "learning_rate": 7.326407537852147e-06, + "loss": 0.9612, + "step": 5127 + }, + { + "epoch": 0.73, + "grad_norm": 5.7021958882346375, + "learning_rate": 7.325385340688509e-06, + "loss": 0.9269, + "step": 5128 + }, + { + "epoch": 0.73, + "grad_norm": 7.886314592594175, + "learning_rate": 7.324363019495289e-06, + "loss": 0.9928, + "step": 5129 + }, + { + "epoch": 0.73, + "grad_norm": 
7.234668425333972, + "learning_rate": 7.323340574327017e-06, + "loss": 0.9322, + "step": 5130 + }, + { + "epoch": 0.73, + "grad_norm": 6.724689124849343, + "learning_rate": 7.322318005238225e-06, + "loss": 0.9952, + "step": 5131 + }, + { + "epoch": 0.73, + "grad_norm": 8.303024589661488, + "learning_rate": 7.321295312283455e-06, + "loss": 0.9685, + "step": 5132 + }, + { + "epoch": 0.73, + "grad_norm": 8.733064011702721, + "learning_rate": 7.320272495517256e-06, + "loss": 0.9613, + "step": 5133 + }, + { + "epoch": 0.73, + "grad_norm": 7.647833470654301, + "learning_rate": 7.319249554994178e-06, + "loss": 0.9353, + "step": 5134 + }, + { + "epoch": 0.73, + "grad_norm": 6.766141086886544, + "learning_rate": 7.318226490768785e-06, + "loss": 0.9682, + "step": 5135 + }, + { + "epoch": 0.73, + "grad_norm": 10.203588845101926, + "learning_rate": 7.3172033028956445e-06, + "loss": 0.9117, + "step": 5136 + }, + { + "epoch": 0.73, + "grad_norm": 8.000991382443143, + "learning_rate": 7.316179991429328e-06, + "loss": 1.04, + "step": 5137 + }, + { + "epoch": 0.73, + "grad_norm": 6.944007512700868, + "learning_rate": 7.3151565564244205e-06, + "loss": 0.9908, + "step": 5138 + }, + { + "epoch": 0.73, + "grad_norm": 6.722846843946971, + "learning_rate": 7.314132997935506e-06, + "loss": 0.9236, + "step": 5139 + }, + { + "epoch": 0.73, + "grad_norm": 6.3286885189535, + "learning_rate": 7.313109316017178e-06, + "loss": 0.9202, + "step": 5140 + }, + { + "epoch": 0.73, + "grad_norm": 9.817473931481661, + "learning_rate": 7.312085510724037e-06, + "loss": 0.9754, + "step": 5141 + }, + { + "epoch": 0.73, + "grad_norm": 8.07306275187653, + "learning_rate": 7.311061582110692e-06, + "loss": 1.0637, + "step": 5142 + }, + { + "epoch": 0.73, + "grad_norm": 9.2522969541072, + "learning_rate": 7.310037530231755e-06, + "loss": 0.8888, + "step": 5143 + }, + { + "epoch": 0.73, + "grad_norm": 7.512268087807185, + "learning_rate": 7.309013355141846e-06, + "loss": 0.9498, + "step": 5144 + }, + { + "epoch": 
0.73, + "grad_norm": 7.19858781257226, + "learning_rate": 7.307989056895591e-06, + "loss": 0.9402, + "step": 5145 + }, + { + "epoch": 0.73, + "grad_norm": 6.97390509295416, + "learning_rate": 7.306964635547626e-06, + "loss": 0.906, + "step": 5146 + }, + { + "epoch": 0.73, + "grad_norm": 6.562265783723796, + "learning_rate": 7.305940091152586e-06, + "loss": 0.9439, + "step": 5147 + }, + { + "epoch": 0.73, + "grad_norm": 6.974693526269365, + "learning_rate": 7.304915423765122e-06, + "loss": 0.9039, + "step": 5148 + }, + { + "epoch": 0.73, + "grad_norm": 8.260746262972745, + "learning_rate": 7.303890633439885e-06, + "loss": 0.9287, + "step": 5149 + }, + { + "epoch": 0.73, + "grad_norm": 5.258330109243745, + "learning_rate": 7.302865720231533e-06, + "loss": 1.0083, + "step": 5150 + }, + { + "epoch": 0.73, + "grad_norm": 7.850271286697441, + "learning_rate": 7.301840684194735e-06, + "loss": 0.9773, + "step": 5151 + }, + { + "epoch": 0.73, + "grad_norm": 7.904517003618885, + "learning_rate": 7.300815525384162e-06, + "loss": 0.9746, + "step": 5152 + }, + { + "epoch": 0.73, + "grad_norm": 9.839344280374135, + "learning_rate": 7.299790243854495e-06, + "loss": 1.0006, + "step": 5153 + }, + { + "epoch": 0.74, + "grad_norm": 7.684064799235096, + "learning_rate": 7.298764839660414e-06, + "loss": 1.0574, + "step": 5154 + }, + { + "epoch": 0.74, + "grad_norm": 8.617095931729343, + "learning_rate": 7.297739312856618e-06, + "loss": 0.9539, + "step": 5155 + }, + { + "epoch": 0.74, + "grad_norm": 7.879511334943859, + "learning_rate": 7.296713663497802e-06, + "loss": 0.9587, + "step": 5156 + }, + { + "epoch": 0.74, + "grad_norm": 6.105969728556171, + "learning_rate": 7.295687891638672e-06, + "loss": 1.0384, + "step": 5157 + }, + { + "epoch": 0.74, + "grad_norm": 5.956527798153953, + "learning_rate": 7.294661997333941e-06, + "loss": 0.9112, + "step": 5158 + }, + { + "epoch": 0.74, + "grad_norm": 7.909835507621771, + "learning_rate": 7.293635980638325e-06, + "loss": 0.9609, + "step": 
5159 + }, + { + "epoch": 0.74, + "grad_norm": 7.929108144840347, + "learning_rate": 7.292609841606551e-06, + "loss": 0.8645, + "step": 5160 + }, + { + "epoch": 0.74, + "grad_norm": 7.993621610529737, + "learning_rate": 7.291583580293349e-06, + "loss": 0.9691, + "step": 5161 + }, + { + "epoch": 0.74, + "grad_norm": 9.137918137986139, + "learning_rate": 7.290557196753457e-06, + "loss": 1.0079, + "step": 5162 + }, + { + "epoch": 0.74, + "grad_norm": 7.042014696545216, + "learning_rate": 7.2895306910416205e-06, + "loss": 0.9506, + "step": 5163 + }, + { + "epoch": 0.74, + "grad_norm": 8.202063405288733, + "learning_rate": 7.288504063212591e-06, + "loss": 1.0198, + "step": 5164 + }, + { + "epoch": 0.74, + "grad_norm": 5.990268715086674, + "learning_rate": 7.287477313321123e-06, + "loss": 1.0707, + "step": 5165 + }, + { + "epoch": 0.74, + "grad_norm": 7.2644306126536, + "learning_rate": 7.286450441421983e-06, + "loss": 1.0623, + "step": 5166 + }, + { + "epoch": 0.74, + "grad_norm": 5.7471626812023215, + "learning_rate": 7.285423447569941e-06, + "loss": 0.9596, + "step": 5167 + }, + { + "epoch": 0.74, + "grad_norm": 9.160705396854546, + "learning_rate": 7.284396331819774e-06, + "loss": 0.9231, + "step": 5168 + }, + { + "epoch": 0.74, + "grad_norm": 6.153392461186663, + "learning_rate": 7.283369094226265e-06, + "loss": 0.957, + "step": 5169 + }, + { + "epoch": 0.74, + "grad_norm": 7.431061143290104, + "learning_rate": 7.2823417348442036e-06, + "loss": 0.9258, + "step": 5170 + }, + { + "epoch": 0.74, + "grad_norm": 8.012887614913444, + "learning_rate": 7.281314253728389e-06, + "loss": 0.9808, + "step": 5171 + }, + { + "epoch": 0.74, + "grad_norm": 8.984354775428036, + "learning_rate": 7.280286650933621e-06, + "loss": 1.0409, + "step": 5172 + }, + { + "epoch": 0.74, + "grad_norm": 5.2617714009375645, + "learning_rate": 7.27925892651471e-06, + "loss": 0.9967, + "step": 5173 + }, + { + "epoch": 0.74, + "grad_norm": 7.085609980646861, + "learning_rate": 7.2782310805264735e-06, + 
"loss": 1.0126, + "step": 5174 + }, + { + "epoch": 0.74, + "grad_norm": 6.6428856403396805, + "learning_rate": 7.277203113023731e-06, + "loss": 1.0181, + "step": 5175 + }, + { + "epoch": 0.74, + "grad_norm": 8.721821470160526, + "learning_rate": 7.276175024061314e-06, + "loss": 1.0066, + "step": 5176 + }, + { + "epoch": 0.74, + "grad_norm": 7.303802210983056, + "learning_rate": 7.2751468136940574e-06, + "loss": 1.0364, + "step": 5177 + }, + { + "epoch": 0.74, + "grad_norm": 7.5779940820368035, + "learning_rate": 7.274118481976801e-06, + "loss": 0.9476, + "step": 5178 + }, + { + "epoch": 0.74, + "grad_norm": 10.852807210062819, + "learning_rate": 7.273090028964396e-06, + "loss": 1.038, + "step": 5179 + }, + { + "epoch": 0.74, + "grad_norm": 8.169625286251652, + "learning_rate": 7.272061454711696e-06, + "loss": 0.9768, + "step": 5180 + }, + { + "epoch": 0.74, + "grad_norm": 5.769054518781717, + "learning_rate": 7.271032759273563e-06, + "loss": 1.0, + "step": 5181 + }, + { + "epoch": 0.74, + "grad_norm": 7.76782636093529, + "learning_rate": 7.270003942704863e-06, + "loss": 0.983, + "step": 5182 + }, + { + "epoch": 0.74, + "grad_norm": 6.432415461660093, + "learning_rate": 7.2689750050604714e-06, + "loss": 1.0065, + "step": 5183 + }, + { + "epoch": 0.74, + "grad_norm": 7.12706389185767, + "learning_rate": 7.2679459463952695e-06, + "loss": 0.9725, + "step": 5184 + }, + { + "epoch": 0.74, + "grad_norm": 10.276767823545221, + "learning_rate": 7.266916766764143e-06, + "loss": 0.9869, + "step": 5185 + }, + { + "epoch": 0.74, + "grad_norm": 4.703050290702282, + "learning_rate": 7.2658874662219856e-06, + "loss": 1.0072, + "step": 5186 + }, + { + "epoch": 0.74, + "grad_norm": 6.205005136830527, + "learning_rate": 7.264858044823699e-06, + "loss": 1.0104, + "step": 5187 + }, + { + "epoch": 0.74, + "grad_norm": 6.881354065372525, + "learning_rate": 7.263828502624188e-06, + "loss": 0.9494, + "step": 5188 + }, + { + "epoch": 0.74, + "grad_norm": 5.5509624656000325, + 
"learning_rate": 7.2627988396783665e-06, + "loss": 0.9046, + "step": 5189 + }, + { + "epoch": 0.74, + "grad_norm": 7.993148994890376, + "learning_rate": 7.261769056041153e-06, + "loss": 0.9514, + "step": 5190 + }, + { + "epoch": 0.74, + "grad_norm": 7.8817770702013314, + "learning_rate": 7.260739151767475e-06, + "loss": 0.9797, + "step": 5191 + }, + { + "epoch": 0.74, + "grad_norm": 8.495921420591323, + "learning_rate": 7.259709126912261e-06, + "loss": 0.9299, + "step": 5192 + }, + { + "epoch": 0.74, + "grad_norm": 7.134802471295353, + "learning_rate": 7.258678981530455e-06, + "loss": 0.994, + "step": 5193 + }, + { + "epoch": 0.74, + "grad_norm": 7.142779215967749, + "learning_rate": 7.257648715676997e-06, + "loss": 0.9214, + "step": 5194 + }, + { + "epoch": 0.74, + "grad_norm": 6.902699740043039, + "learning_rate": 7.256618329406842e-06, + "loss": 1.0127, + "step": 5195 + }, + { + "epoch": 0.74, + "grad_norm": 9.174297654265075, + "learning_rate": 7.255587822774947e-06, + "loss": 1.0208, + "step": 5196 + }, + { + "epoch": 0.74, + "grad_norm": 9.317307401836368, + "learning_rate": 7.254557195836274e-06, + "loss": 0.9759, + "step": 5197 + }, + { + "epoch": 0.74, + "grad_norm": 7.182193978528865, + "learning_rate": 7.253526448645796e-06, + "loss": 0.919, + "step": 5198 + }, + { + "epoch": 0.74, + "grad_norm": 9.808951923568117, + "learning_rate": 7.252495581258492e-06, + "loss": 0.9912, + "step": 5199 + }, + { + "epoch": 0.74, + "grad_norm": 6.978545617690855, + "learning_rate": 7.2514645937293425e-06, + "loss": 0.957, + "step": 5200 + }, + { + "epoch": 0.74, + "grad_norm": 8.252719186057712, + "learning_rate": 7.250433486113337e-06, + "loss": 1.0461, + "step": 5201 + }, + { + "epoch": 0.74, + "grad_norm": 6.971305343331817, + "learning_rate": 7.249402258465476e-06, + "loss": 0.9733, + "step": 5202 + }, + { + "epoch": 0.74, + "grad_norm": 7.744216858597143, + "learning_rate": 7.248370910840759e-06, + "loss": 0.9452, + "step": 5203 + }, + { + "epoch": 0.74, + 
"grad_norm": 6.824533779939633, + "learning_rate": 7.247339443294195e-06, + "loss": 1.0417, + "step": 5204 + }, + { + "epoch": 0.74, + "grad_norm": 8.583187884245355, + "learning_rate": 7.2463078558808005e-06, + "loss": 0.9334, + "step": 5205 + }, + { + "epoch": 0.74, + "grad_norm": 9.067359737121398, + "learning_rate": 7.2452761486556e-06, + "loss": 1.0003, + "step": 5206 + }, + { + "epoch": 0.74, + "grad_norm": 7.917243224035139, + "learning_rate": 7.244244321673617e-06, + "loss": 0.9626, + "step": 5207 + }, + { + "epoch": 0.74, + "grad_norm": 4.825608684851852, + "learning_rate": 7.24321237498989e-06, + "loss": 0.9544, + "step": 5208 + }, + { + "epoch": 0.74, + "grad_norm": 6.553840020642859, + "learning_rate": 7.242180308659459e-06, + "loss": 0.9566, + "step": 5209 + }, + { + "epoch": 0.74, + "grad_norm": 8.334224058781087, + "learning_rate": 7.241148122737372e-06, + "loss": 0.9058, + "step": 5210 + }, + { + "epoch": 0.74, + "grad_norm": 10.030837078421403, + "learning_rate": 7.24011581727868e-06, + "loss": 0.9317, + "step": 5211 + }, + { + "epoch": 0.74, + "grad_norm": 7.461647077786068, + "learning_rate": 7.239083392338449e-06, + "loss": 0.9912, + "step": 5212 + }, + { + "epoch": 0.74, + "grad_norm": 7.431270927668253, + "learning_rate": 7.238050847971739e-06, + "loss": 0.9516, + "step": 5213 + }, + { + "epoch": 0.74, + "grad_norm": 7.898488127837302, + "learning_rate": 7.237018184233629e-06, + "loss": 1.0258, + "step": 5214 + }, + { + "epoch": 0.74, + "grad_norm": 6.089033345038987, + "learning_rate": 7.235985401179194e-06, + "loss": 0.9397, + "step": 5215 + }, + { + "epoch": 0.74, + "grad_norm": 8.35092495200252, + "learning_rate": 7.2349524988635234e-06, + "loss": 1.0725, + "step": 5216 + }, + { + "epoch": 0.74, + "grad_norm": 9.631280349489012, + "learning_rate": 7.233919477341706e-06, + "loss": 1.0456, + "step": 5217 + }, + { + "epoch": 0.74, + "grad_norm": 6.704967177579743, + "learning_rate": 7.232886336668843e-06, + "loss": 1.0104, + "step": 5218 + }, 
+ { + "epoch": 0.74, + "grad_norm": 6.310844042972244, + "learning_rate": 7.2318530769000375e-06, + "loss": 0.9144, + "step": 5219 + }, + { + "epoch": 0.74, + "grad_norm": 5.64228877690135, + "learning_rate": 7.230819698090401e-06, + "loss": 0.975, + "step": 5220 + }, + { + "epoch": 0.74, + "grad_norm": 9.692506689870282, + "learning_rate": 7.229786200295053e-06, + "loss": 0.8651, + "step": 5221 + }, + { + "epoch": 0.74, + "grad_norm": 7.756578484505453, + "learning_rate": 7.2287525835691144e-06, + "loss": 0.9916, + "step": 5222 + }, + { + "epoch": 0.74, + "grad_norm": 7.793087733175763, + "learning_rate": 7.227718847967715e-06, + "loss": 1.0304, + "step": 5223 + }, + { + "epoch": 0.75, + "grad_norm": 5.361536335246088, + "learning_rate": 7.226684993545996e-06, + "loss": 0.9662, + "step": 5224 + }, + { + "epoch": 0.75, + "grad_norm": 7.314007565555797, + "learning_rate": 7.225651020359097e-06, + "loss": 1.0011, + "step": 5225 + }, + { + "epoch": 0.75, + "grad_norm": 8.136354442833897, + "learning_rate": 7.2246169284621656e-06, + "loss": 0.9103, + "step": 5226 + }, + { + "epoch": 0.75, + "grad_norm": 9.98468728989215, + "learning_rate": 7.223582717910361e-06, + "loss": 0.9669, + "step": 5227 + }, + { + "epoch": 0.75, + "grad_norm": 7.053350000372401, + "learning_rate": 7.2225483887588455e-06, + "loss": 0.9451, + "step": 5228 + }, + { + "epoch": 0.75, + "grad_norm": 7.679568265075883, + "learning_rate": 7.221513941062784e-06, + "loss": 0.9591, + "step": 5229 + }, + { + "epoch": 0.75, + "grad_norm": 6.59298441721765, + "learning_rate": 7.2204793748773505e-06, + "loss": 0.9652, + "step": 5230 + }, + { + "epoch": 0.75, + "grad_norm": 7.5375136165375185, + "learning_rate": 7.21944469025773e-06, + "loss": 0.944, + "step": 5231 + }, + { + "epoch": 0.75, + "grad_norm": 7.13251725348601, + "learning_rate": 7.218409887259107e-06, + "loss": 0.9774, + "step": 5232 + }, + { + "epoch": 0.75, + "grad_norm": 6.808768959146838, + "learning_rate": 7.2173749659366756e-06, + "loss": 
0.9546, + "step": 5233 + }, + { + "epoch": 0.75, + "grad_norm": 7.223098248532225, + "learning_rate": 7.216339926345634e-06, + "loss": 1.0215, + "step": 5234 + }, + { + "epoch": 0.75, + "grad_norm": 8.40694031365585, + "learning_rate": 7.215304768541193e-06, + "loss": 0.9459, + "step": 5235 + }, + { + "epoch": 0.75, + "grad_norm": 7.921849464592782, + "learning_rate": 7.21426949257856e-06, + "loss": 0.9241, + "step": 5236 + }, + { + "epoch": 0.75, + "grad_norm": 7.308779562792478, + "learning_rate": 7.213234098512957e-06, + "loss": 0.9626, + "step": 5237 + }, + { + "epoch": 0.75, + "grad_norm": 5.958194961609394, + "learning_rate": 7.212198586399608e-06, + "loss": 0.9751, + "step": 5238 + }, + { + "epoch": 0.75, + "grad_norm": 8.964938135464438, + "learning_rate": 7.211162956293742e-06, + "loss": 0.9658, + "step": 5239 + }, + { + "epoch": 0.75, + "grad_norm": 6.460327705824191, + "learning_rate": 7.210127208250599e-06, + "loss": 1.089, + "step": 5240 + }, + { + "epoch": 0.75, + "grad_norm": 6.505448324519133, + "learning_rate": 7.209091342325425e-06, + "loss": 0.9849, + "step": 5241 + }, + { + "epoch": 0.75, + "grad_norm": 7.380156226812919, + "learning_rate": 7.208055358573467e-06, + "loss": 0.9879, + "step": 5242 + }, + { + "epoch": 0.75, + "grad_norm": 6.330831541907995, + "learning_rate": 7.2070192570499805e-06, + "loss": 0.9589, + "step": 5243 + }, + { + "epoch": 0.75, + "grad_norm": 5.019827579509281, + "learning_rate": 7.205983037810234e-06, + "loss": 1.0048, + "step": 5244 + }, + { + "epoch": 0.75, + "grad_norm": 6.082473781237862, + "learning_rate": 7.204946700909488e-06, + "loss": 0.9342, + "step": 5245 + }, + { + "epoch": 0.75, + "grad_norm": 8.23693462988769, + "learning_rate": 7.203910246403024e-06, + "loss": 1.0263, + "step": 5246 + }, + { + "epoch": 0.75, + "grad_norm": 7.541100369739318, + "learning_rate": 7.202873674346124e-06, + "loss": 0.9665, + "step": 5247 + }, + { + "epoch": 0.75, + "grad_norm": 9.19676539431951, + "learning_rate": 
7.201836984794073e-06, + "loss": 0.9437, + "step": 5248 + }, + { + "epoch": 0.75, + "grad_norm": 6.182599082828607, + "learning_rate": 7.2008001778021655e-06, + "loss": 0.9851, + "step": 5249 + }, + { + "epoch": 0.75, + "grad_norm": 6.3386606754191375, + "learning_rate": 7.199763253425702e-06, + "loss": 0.919, + "step": 5250 + }, + { + "epoch": 0.75, + "grad_norm": 6.140606777805519, + "learning_rate": 7.19872621171999e-06, + "loss": 0.9917, + "step": 5251 + }, + { + "epoch": 0.75, + "grad_norm": 7.499337964932697, + "learning_rate": 7.197689052740342e-06, + "loss": 0.9776, + "step": 5252 + }, + { + "epoch": 0.75, + "grad_norm": 8.80413373406232, + "learning_rate": 7.196651776542079e-06, + "loss": 0.8539, + "step": 5253 + }, + { + "epoch": 0.75, + "grad_norm": 7.373745848250473, + "learning_rate": 7.195614383180523e-06, + "loss": 0.9683, + "step": 5254 + }, + { + "epoch": 0.75, + "grad_norm": 8.109180283686541, + "learning_rate": 7.194576872711007e-06, + "loss": 0.9114, + "step": 5255 + }, + { + "epoch": 0.75, + "grad_norm": 9.057098182758965, + "learning_rate": 7.1935392451888695e-06, + "loss": 0.9735, + "step": 5256 + }, + { + "epoch": 0.75, + "grad_norm": 7.071234647003536, + "learning_rate": 7.192501500669455e-06, + "loss": 0.932, + "step": 5257 + }, + { + "epoch": 0.75, + "grad_norm": 6.048405670680638, + "learning_rate": 7.191463639208114e-06, + "loss": 0.9201, + "step": 5258 + }, + { + "epoch": 0.75, + "grad_norm": 7.080451146830782, + "learning_rate": 7.190425660860199e-06, + "loss": 0.987, + "step": 5259 + }, + { + "epoch": 0.75, + "grad_norm": 6.49926882544068, + "learning_rate": 7.18938756568108e-06, + "loss": 1.0195, + "step": 5260 + }, + { + "epoch": 0.75, + "grad_norm": 8.044369270048726, + "learning_rate": 7.188349353726122e-06, + "loss": 0.9938, + "step": 5261 + }, + { + "epoch": 0.75, + "grad_norm": 8.420905488118937, + "learning_rate": 7.1873110250507e-06, + "loss": 0.928, + "step": 5262 + }, + { + "epoch": 0.75, + "grad_norm": 9.234099452075235, 
+ "learning_rate": 7.186272579710197e-06, + "loss": 1.0094, + "step": 5263 + }, + { + "epoch": 0.75, + "grad_norm": 5.213435953048231, + "learning_rate": 7.185234017759999e-06, + "loss": 1.0182, + "step": 5264 + }, + { + "epoch": 0.75, + "grad_norm": 6.498470813772127, + "learning_rate": 7.184195339255502e-06, + "loss": 1.0256, + "step": 5265 + }, + { + "epoch": 0.75, + "grad_norm": 10.02493646741638, + "learning_rate": 7.183156544252106e-06, + "loss": 1.0366, + "step": 5266 + }, + { + "epoch": 0.75, + "grad_norm": 6.876681390227978, + "learning_rate": 7.182117632805216e-06, + "loss": 0.9913, + "step": 5267 + }, + { + "epoch": 0.75, + "grad_norm": 8.461782346793791, + "learning_rate": 7.1810786049702445e-06, + "loss": 0.927, + "step": 5268 + }, + { + "epoch": 0.75, + "grad_norm": 6.637043601940303, + "learning_rate": 7.180039460802612e-06, + "loss": 0.9602, + "step": 5269 + }, + { + "epoch": 0.75, + "grad_norm": 5.796767531246894, + "learning_rate": 7.179000200357743e-06, + "loss": 0.9384, + "step": 5270 + }, + { + "epoch": 0.75, + "grad_norm": 7.039994124431365, + "learning_rate": 7.177960823691068e-06, + "loss": 0.9551, + "step": 5271 + }, + { + "epoch": 0.75, + "grad_norm": 8.175276828879918, + "learning_rate": 7.176921330858026e-06, + "loss": 0.9872, + "step": 5272 + }, + { + "epoch": 0.75, + "grad_norm": 8.030926547917804, + "learning_rate": 7.17588172191406e-06, + "loss": 1.0015, + "step": 5273 + }, + { + "epoch": 0.75, + "grad_norm": 6.068842250376304, + "learning_rate": 7.1748419969146175e-06, + "loss": 0.8902, + "step": 5274 + }, + { + "epoch": 0.75, + "grad_norm": 7.796872081596582, + "learning_rate": 7.173802155915158e-06, + "loss": 1.0482, + "step": 5275 + }, + { + "epoch": 0.75, + "grad_norm": 9.514256247580187, + "learning_rate": 7.172762198971144e-06, + "loss": 0.985, + "step": 5276 + }, + { + "epoch": 0.75, + "grad_norm": 6.162410804015551, + "learning_rate": 7.1717221261380396e-06, + "loss": 0.9612, + "step": 5277 + }, + { + "epoch": 0.75, + 
"grad_norm": 8.926141611807884, + "learning_rate": 7.170681937471322e-06, + "loss": 0.9567, + "step": 5278 + }, + { + "epoch": 0.75, + "grad_norm": 6.432072123690851, + "learning_rate": 7.169641633026474e-06, + "loss": 0.959, + "step": 5279 + }, + { + "epoch": 0.75, + "grad_norm": 6.376681274113382, + "learning_rate": 7.168601212858979e-06, + "loss": 1.0275, + "step": 5280 + }, + { + "epoch": 0.75, + "grad_norm": 5.31120221735899, + "learning_rate": 7.167560677024332e-06, + "loss": 0.9784, + "step": 5281 + }, + { + "epoch": 0.75, + "grad_norm": 7.587565436921968, + "learning_rate": 7.166520025578033e-06, + "loss": 0.9777, + "step": 5282 + }, + { + "epoch": 0.75, + "grad_norm": 7.3749460233136555, + "learning_rate": 7.165479258575585e-06, + "loss": 1.0177, + "step": 5283 + }, + { + "epoch": 0.75, + "grad_norm": 7.5375435187672215, + "learning_rate": 7.1644383760725e-06, + "loss": 1.0323, + "step": 5284 + }, + { + "epoch": 0.75, + "grad_norm": 6.007258016378798, + "learning_rate": 7.163397378124299e-06, + "loss": 0.9708, + "step": 5285 + }, + { + "epoch": 0.75, + "grad_norm": 7.785563347218403, + "learning_rate": 7.162356264786503e-06, + "loss": 0.9964, + "step": 5286 + }, + { + "epoch": 0.75, + "grad_norm": 9.187726933247008, + "learning_rate": 7.161315036114644e-06, + "loss": 0.9913, + "step": 5287 + }, + { + "epoch": 0.75, + "grad_norm": 6.584115082696944, + "learning_rate": 7.160273692164255e-06, + "loss": 1.0296, + "step": 5288 + }, + { + "epoch": 0.75, + "grad_norm": 5.640947101155822, + "learning_rate": 7.15923223299088e-06, + "loss": 0.9166, + "step": 5289 + }, + { + "epoch": 0.75, + "grad_norm": 6.861858587868959, + "learning_rate": 7.158190658650071e-06, + "loss": 0.9704, + "step": 5290 + }, + { + "epoch": 0.75, + "grad_norm": 7.561752065136707, + "learning_rate": 7.157148969197376e-06, + "loss": 0.962, + "step": 5291 + }, + { + "epoch": 0.75, + "grad_norm": 6.169404107311299, + "learning_rate": 7.156107164688363e-06, + "loss": 0.9442, + "step": 5292 + }, + 
{ + "epoch": 0.75, + "grad_norm": 4.960836403179896, + "learning_rate": 7.155065245178592e-06, + "loss": 0.8748, + "step": 5293 + }, + { + "epoch": 0.75, + "grad_norm": 7.840480042013341, + "learning_rate": 7.154023210723642e-06, + "loss": 0.9163, + "step": 5294 + }, + { + "epoch": 0.76, + "grad_norm": 7.18102248944582, + "learning_rate": 7.152981061379089e-06, + "loss": 0.9976, + "step": 5295 + }, + { + "epoch": 0.76, + "grad_norm": 8.547350629161372, + "learning_rate": 7.151938797200519e-06, + "loss": 1.0275, + "step": 5296 + }, + { + "epoch": 0.76, + "grad_norm": 6.802561525811485, + "learning_rate": 7.150896418243523e-06, + "loss": 1.0156, + "step": 5297 + }, + { + "epoch": 0.76, + "grad_norm": 7.433889937103519, + "learning_rate": 7.149853924563699e-06, + "loss": 0.9707, + "step": 5298 + }, + { + "epoch": 0.76, + "grad_norm": 7.502361984822533, + "learning_rate": 7.148811316216652e-06, + "loss": 0.9625, + "step": 5299 + }, + { + "epoch": 0.76, + "grad_norm": 5.4947899827480935, + "learning_rate": 7.14776859325799e-06, + "loss": 0.9159, + "step": 5300 + }, + { + "epoch": 0.76, + "grad_norm": 7.098104030620923, + "learning_rate": 7.146725755743329e-06, + "loss": 0.9581, + "step": 5301 + }, + { + "epoch": 0.76, + "grad_norm": 6.222229647908838, + "learning_rate": 7.145682803728292e-06, + "loss": 1.0825, + "step": 5302 + }, + { + "epoch": 0.76, + "grad_norm": 6.294462035583321, + "learning_rate": 7.1446397372685075e-06, + "loss": 0.9161, + "step": 5303 + }, + { + "epoch": 0.76, + "grad_norm": 5.944894597030549, + "learning_rate": 7.1435965564196086e-06, + "loss": 0.8829, + "step": 5304 + }, + { + "epoch": 0.76, + "grad_norm": 7.383544592331844, + "learning_rate": 7.142553261237236e-06, + "loss": 0.9373, + "step": 5305 + }, + { + "epoch": 0.76, + "grad_norm": 7.676155959474763, + "learning_rate": 7.141509851777036e-06, + "loss": 0.9217, + "step": 5306 + }, + { + "epoch": 0.76, + "grad_norm": 6.275981505265855, + "learning_rate": 7.140466328094662e-06, + "loss": 
0.9502, + "step": 5307 + }, + { + "epoch": 0.76, + "grad_norm": 7.469553136459658, + "learning_rate": 7.139422690245772e-06, + "loss": 0.9915, + "step": 5308 + }, + { + "epoch": 0.76, + "grad_norm": 6.062127354642803, + "learning_rate": 7.13837893828603e-06, + "loss": 1.0291, + "step": 5309 + }, + { + "epoch": 0.76, + "grad_norm": 7.473372238110019, + "learning_rate": 7.1373350722711086e-06, + "loss": 0.9753, + "step": 5310 + }, + { + "epoch": 0.76, + "grad_norm": 6.277202803406059, + "learning_rate": 7.136291092256684e-06, + "loss": 0.973, + "step": 5311 + }, + { + "epoch": 0.76, + "grad_norm": 6.976030371262752, + "learning_rate": 7.135246998298437e-06, + "loss": 0.9199, + "step": 5312 + }, + { + "epoch": 0.76, + "grad_norm": 5.421441552405296, + "learning_rate": 7.13420279045206e-06, + "loss": 1.0141, + "step": 5313 + }, + { + "epoch": 0.76, + "grad_norm": 8.549715784085583, + "learning_rate": 7.133158468773249e-06, + "loss": 0.9727, + "step": 5314 + }, + { + "epoch": 0.76, + "grad_norm": 7.126453287766076, + "learning_rate": 7.132114033317701e-06, + "loss": 0.9563, + "step": 5315 + }, + { + "epoch": 0.76, + "grad_norm": 7.749590506323495, + "learning_rate": 7.131069484141124e-06, + "loss": 0.9109, + "step": 5316 + }, + { + "epoch": 0.76, + "grad_norm": 5.977650466233566, + "learning_rate": 7.130024821299234e-06, + "loss": 0.9585, + "step": 5317 + }, + { + "epoch": 0.76, + "grad_norm": 8.850404131449816, + "learning_rate": 7.1289800448477486e-06, + "loss": 0.9545, + "step": 5318 + }, + { + "epoch": 0.76, + "grad_norm": 10.77388930927226, + "learning_rate": 7.127935154842394e-06, + "loss": 1.0499, + "step": 5319 + }, + { + "epoch": 0.76, + "grad_norm": 8.032149479584568, + "learning_rate": 7.1268901513389e-06, + "loss": 1.0082, + "step": 5320 + }, + { + "epoch": 0.76, + "grad_norm": 8.250581346592728, + "learning_rate": 7.1258450343930065e-06, + "loss": 0.9735, + "step": 5321 + }, + { + "epoch": 0.76, + "grad_norm": 7.532272230098838, + "learning_rate": 
7.124799804060456e-06, + "loss": 0.9279, + "step": 5322 + }, + { + "epoch": 0.76, + "grad_norm": 7.072994824985116, + "learning_rate": 7.123754460396999e-06, + "loss": 0.9579, + "step": 5323 + }, + { + "epoch": 0.76, + "grad_norm": 8.483744195962224, + "learning_rate": 7.1227090034583915e-06, + "loss": 1.0432, + "step": 5324 + }, + { + "epoch": 0.76, + "grad_norm": 6.76378325345129, + "learning_rate": 7.121663433300393e-06, + "loss": 1.012, + "step": 5325 + }, + { + "epoch": 0.76, + "grad_norm": 8.175392221846126, + "learning_rate": 7.120617749978771e-06, + "loss": 1.018, + "step": 5326 + }, + { + "epoch": 0.76, + "grad_norm": 9.872956799454462, + "learning_rate": 7.119571953549305e-06, + "loss": 1.0523, + "step": 5327 + }, + { + "epoch": 0.76, + "grad_norm": 7.060819129308175, + "learning_rate": 7.118526044067768e-06, + "loss": 0.9338, + "step": 5328 + }, + { + "epoch": 0.76, + "grad_norm": 9.527679702186903, + "learning_rate": 7.117480021589949e-06, + "loss": 0.9342, + "step": 5329 + }, + { + "epoch": 0.76, + "grad_norm": 7.899582392719355, + "learning_rate": 7.116433886171641e-06, + "loss": 0.9909, + "step": 5330 + }, + { + "epoch": 0.76, + "grad_norm": 7.787786325600489, + "learning_rate": 7.1153876378686395e-06, + "loss": 0.9248, + "step": 5331 + }, + { + "epoch": 0.76, + "grad_norm": 5.087802974619553, + "learning_rate": 7.114341276736749e-06, + "loss": 0.9718, + "step": 5332 + }, + { + "epoch": 0.76, + "grad_norm": 9.172711141766012, + "learning_rate": 7.113294802831783e-06, + "loss": 0.9782, + "step": 5333 + }, + { + "epoch": 0.76, + "grad_norm": 8.221697199285632, + "learning_rate": 7.1122482162095516e-06, + "loss": 0.9495, + "step": 5334 + }, + { + "epoch": 0.76, + "grad_norm": 6.913748240889032, + "learning_rate": 7.1112015169258805e-06, + "loss": 0.9531, + "step": 5335 + }, + { + "epoch": 0.76, + "grad_norm": 6.323254243717704, + "learning_rate": 7.110154705036598e-06, + "loss": 0.9067, + "step": 5336 + }, + { + "epoch": 0.76, + "grad_norm": 
6.293134506040214, + "learning_rate": 7.109107780597536e-06, + "loss": 0.9861, + "step": 5337 + }, + { + "epoch": 0.76, + "grad_norm": 5.923817042131854, + "learning_rate": 7.1080607436645365e-06, + "loss": 0.9988, + "step": 5338 + }, + { + "epoch": 0.76, + "grad_norm": 8.71614637518961, + "learning_rate": 7.107013594293443e-06, + "loss": 0.9349, + "step": 5339 + }, + { + "epoch": 0.76, + "grad_norm": 7.3063894199574655, + "learning_rate": 7.105966332540112e-06, + "loss": 0.9464, + "step": 5340 + }, + { + "epoch": 0.76, + "grad_norm": 9.217577358468372, + "learning_rate": 7.104918958460397e-06, + "loss": 0.9456, + "step": 5341 + }, + { + "epoch": 0.76, + "grad_norm": 6.202339509394603, + "learning_rate": 7.103871472110166e-06, + "loss": 0.9732, + "step": 5342 + }, + { + "epoch": 0.76, + "grad_norm": 6.119081724407259, + "learning_rate": 7.102823873545286e-06, + "loss": 0.9908, + "step": 5343 + }, + { + "epoch": 0.76, + "grad_norm": 6.7831735931683, + "learning_rate": 7.101776162821632e-06, + "loss": 0.9904, + "step": 5344 + }, + { + "epoch": 0.76, + "grad_norm": 5.917772466141522, + "learning_rate": 7.100728339995088e-06, + "loss": 1.0446, + "step": 5345 + }, + { + "epoch": 0.76, + "grad_norm": 7.862513392314825, + "learning_rate": 7.099680405121544e-06, + "loss": 0.9957, + "step": 5346 + }, + { + "epoch": 0.76, + "grad_norm": 5.1762557194392995, + "learning_rate": 7.098632358256891e-06, + "loss": 0.9568, + "step": 5347 + }, + { + "epoch": 0.76, + "grad_norm": 6.186531589217788, + "learning_rate": 7.097584199457028e-06, + "loss": 0.9032, + "step": 5348 + }, + { + "epoch": 0.76, + "grad_norm": 6.825380748840588, + "learning_rate": 7.096535928777865e-06, + "loss": 0.9358, + "step": 5349 + }, + { + "epoch": 0.76, + "grad_norm": 6.3396785880971365, + "learning_rate": 7.095487546275308e-06, + "loss": 0.9958, + "step": 5350 + }, + { + "epoch": 0.76, + "grad_norm": 9.269514241191155, + "learning_rate": 7.0944390520052796e-06, + "loss": 0.9878, + "step": 5351 + }, + { + 
"epoch": 0.76, + "grad_norm": 7.756607197157042, + "learning_rate": 7.093390446023704e-06, + "loss": 1.0324, + "step": 5352 + }, + { + "epoch": 0.76, + "grad_norm": 7.469521639652851, + "learning_rate": 7.092341728386507e-06, + "loss": 1.0241, + "step": 5353 + }, + { + "epoch": 0.76, + "grad_norm": 5.9216390364582, + "learning_rate": 7.091292899149627e-06, + "loss": 0.946, + "step": 5354 + }, + { + "epoch": 0.76, + "grad_norm": 6.963465967106221, + "learning_rate": 7.0902439583690056e-06, + "loss": 1.0505, + "step": 5355 + }, + { + "epoch": 0.76, + "grad_norm": 6.988782064022119, + "learning_rate": 7.0891949061005875e-06, + "loss": 0.9744, + "step": 5356 + }, + { + "epoch": 0.76, + "grad_norm": 7.393076785434408, + "learning_rate": 7.08814574240033e-06, + "loss": 0.9779, + "step": 5357 + }, + { + "epoch": 0.76, + "grad_norm": 7.667403472697997, + "learning_rate": 7.08709646732419e-06, + "loss": 0.9608, + "step": 5358 + }, + { + "epoch": 0.76, + "grad_norm": 8.966986097847778, + "learning_rate": 7.086047080928136e-06, + "loss": 0.9553, + "step": 5359 + }, + { + "epoch": 0.76, + "grad_norm": 6.939474358136635, + "learning_rate": 7.0849975832681335e-06, + "loss": 1.0294, + "step": 5360 + }, + { + "epoch": 0.76, + "grad_norm": 7.0463529680378025, + "learning_rate": 7.083947974400167e-06, + "loss": 0.9396, + "step": 5361 + }, + { + "epoch": 0.76, + "grad_norm": 7.858291524313225, + "learning_rate": 7.082898254380215e-06, + "loss": 0.9571, + "step": 5362 + }, + { + "epoch": 0.76, + "grad_norm": 8.15869275321712, + "learning_rate": 7.081848423264268e-06, + "loss": 0.9733, + "step": 5363 + }, + { + "epoch": 0.76, + "grad_norm": 7.466413754375678, + "learning_rate": 7.080798481108319e-06, + "loss": 0.9622, + "step": 5364 + }, + { + "epoch": 0.77, + "grad_norm": 6.456203909479334, + "learning_rate": 7.079748427968374e-06, + "loss": 0.8926, + "step": 5365 + }, + { + "epoch": 0.77, + "grad_norm": 7.665751740171941, + "learning_rate": 7.0786982639004365e-06, + "loss": 0.9728, + 
"step": 5366 + }, + { + "epoch": 0.77, + "grad_norm": 6.936683276543965, + "learning_rate": 7.077647988960519e-06, + "loss": 0.8846, + "step": 5367 + }, + { + "epoch": 0.77, + "grad_norm": 6.209577794522399, + "learning_rate": 7.076597603204642e-06, + "loss": 1.0429, + "step": 5368 + }, + { + "epoch": 0.77, + "grad_norm": 7.0138666932875005, + "learning_rate": 7.075547106688827e-06, + "loss": 1.0655, + "step": 5369 + }, + { + "epoch": 0.77, + "grad_norm": 6.64300380852187, + "learning_rate": 7.074496499469108e-06, + "loss": 1.0101, + "step": 5370 + }, + { + "epoch": 0.77, + "grad_norm": 5.411486542682962, + "learning_rate": 7.07344578160152e-06, + "loss": 0.954, + "step": 5371 + }, + { + "epoch": 0.77, + "grad_norm": 5.872358217495784, + "learning_rate": 7.072394953142105e-06, + "loss": 1.0143, + "step": 5372 + }, + { + "epoch": 0.77, + "grad_norm": 6.165120652702906, + "learning_rate": 7.071344014146912e-06, + "loss": 0.9092, + "step": 5373 + }, + { + "epoch": 0.77, + "grad_norm": 5.509150902870479, + "learning_rate": 7.0702929646719955e-06, + "loss": 0.9811, + "step": 5374 + }, + { + "epoch": 0.77, + "grad_norm": 8.384137938417533, + "learning_rate": 7.0692418047734135e-06, + "loss": 0.9596, + "step": 5375 + }, + { + "epoch": 0.77, + "grad_norm": 7.731882506347967, + "learning_rate": 7.068190534507234e-06, + "loss": 1.1076, + "step": 5376 + }, + { + "epoch": 0.77, + "grad_norm": 8.35404317832, + "learning_rate": 7.0671391539295286e-06, + "loss": 0.9822, + "step": 5377 + }, + { + "epoch": 0.77, + "grad_norm": 7.867162885211278, + "learning_rate": 7.066087663096376e-06, + "loss": 0.9428, + "step": 5378 + }, + { + "epoch": 0.77, + "grad_norm": 9.009599642040131, + "learning_rate": 7.065036062063856e-06, + "loss": 1.0778, + "step": 5379 + }, + { + "epoch": 0.77, + "grad_norm": 8.434492049036864, + "learning_rate": 7.063984350888061e-06, + "loss": 0.9274, + "step": 5380 + }, + { + "epoch": 0.77, + "grad_norm": 6.027653986335806, + "learning_rate": 
7.062932529625087e-06, + "loss": 0.9738, + "step": 5381 + }, + { + "epoch": 0.77, + "grad_norm": 6.211592039881816, + "learning_rate": 7.061880598331034e-06, + "loss": 0.9739, + "step": 5382 + }, + { + "epoch": 0.77, + "grad_norm": 6.663938462671696, + "learning_rate": 7.060828557062007e-06, + "loss": 1.0256, + "step": 5383 + }, + { + "epoch": 0.77, + "grad_norm": 8.285662397330482, + "learning_rate": 7.059776405874123e-06, + "loss": 0.9327, + "step": 5384 + }, + { + "epoch": 0.77, + "grad_norm": 6.740670431481212, + "learning_rate": 7.0587241448235e-06, + "loss": 1.0001, + "step": 5385 + }, + { + "epoch": 0.77, + "grad_norm": 6.861543078912723, + "learning_rate": 7.057671773966261e-06, + "loss": 0.9633, + "step": 5386 + }, + { + "epoch": 0.77, + "grad_norm": 5.595002805240967, + "learning_rate": 7.056619293358535e-06, + "loss": 0.9753, + "step": 5387 + }, + { + "epoch": 0.77, + "grad_norm": 6.399926570647988, + "learning_rate": 7.055566703056462e-06, + "loss": 0.9076, + "step": 5388 + }, + { + "epoch": 0.77, + "grad_norm": 7.139020963008766, + "learning_rate": 7.054514003116183e-06, + "loss": 0.9467, + "step": 5389 + }, + { + "epoch": 0.77, + "grad_norm": 7.554641893933928, + "learning_rate": 7.053461193593847e-06, + "loss": 0.9197, + "step": 5390 + }, + { + "epoch": 0.77, + "grad_norm": 6.1242036048539115, + "learning_rate": 7.052408274545605e-06, + "loss": 0.9641, + "step": 5391 + }, + { + "epoch": 0.77, + "grad_norm": 7.89775189299878, + "learning_rate": 7.051355246027619e-06, + "loss": 1.0346, + "step": 5392 + }, + { + "epoch": 0.77, + "grad_norm": 8.838863675428469, + "learning_rate": 7.050302108096054e-06, + "loss": 1.0095, + "step": 5393 + }, + { + "epoch": 0.77, + "grad_norm": 6.928036356754177, + "learning_rate": 7.049248860807082e-06, + "loss": 0.9716, + "step": 5394 + }, + { + "epoch": 0.77, + "grad_norm": 5.692940118015242, + "learning_rate": 7.048195504216879e-06, + "loss": 0.9036, + "step": 5395 + }, + { + "epoch": 0.77, + "grad_norm": 
9.871722910084362, + "learning_rate": 7.047142038381629e-06, + "loss": 0.9842, + "step": 5396 + }, + { + "epoch": 0.77, + "grad_norm": 6.376838736974116, + "learning_rate": 7.046088463357524e-06, + "loss": 0.982, + "step": 5397 + }, + { + "epoch": 0.77, + "grad_norm": 7.478336801370089, + "learning_rate": 7.045034779200751e-06, + "loss": 0.9614, + "step": 5398 + }, + { + "epoch": 0.77, + "grad_norm": 8.771581297576057, + "learning_rate": 7.043980985967518e-06, + "loss": 0.9935, + "step": 5399 + }, + { + "epoch": 0.77, + "grad_norm": 7.501361538912795, + "learning_rate": 7.0429270837140285e-06, + "loss": 0.9247, + "step": 5400 + }, + { + "epoch": 0.77, + "grad_norm": 5.719788262693256, + "learning_rate": 7.041873072496494e-06, + "loss": 0.9961, + "step": 5401 + }, + { + "epoch": 0.77, + "grad_norm": 7.686290895347385, + "learning_rate": 7.040818952371135e-06, + "loss": 1.0634, + "step": 5402 + }, + { + "epoch": 0.77, + "grad_norm": 6.894692198442357, + "learning_rate": 7.039764723394173e-06, + "loss": 0.9417, + "step": 5403 + }, + { + "epoch": 0.77, + "grad_norm": 6.599735514739129, + "learning_rate": 7.038710385621837e-06, + "loss": 0.9884, + "step": 5404 + }, + { + "epoch": 0.77, + "grad_norm": 7.432990919627359, + "learning_rate": 7.037655939110365e-06, + "loss": 0.9521, + "step": 5405 + }, + { + "epoch": 0.77, + "grad_norm": 8.839562894444281, + "learning_rate": 7.036601383915998e-06, + "loss": 0.9336, + "step": 5406 + }, + { + "epoch": 0.77, + "grad_norm": 7.545557757595587, + "learning_rate": 7.03554672009498e-06, + "loss": 1.0068, + "step": 5407 + }, + { + "epoch": 0.77, + "grad_norm": 7.999182600242062, + "learning_rate": 7.034491947703567e-06, + "loss": 0.9232, + "step": 5408 + }, + { + "epoch": 0.77, + "grad_norm": 8.117063637548322, + "learning_rate": 7.033437066798017e-06, + "loss": 0.9598, + "step": 5409 + }, + { + "epoch": 0.77, + "grad_norm": 7.346923211721083, + "learning_rate": 7.032382077434594e-06, + "loss": 1.0046, + "step": 5410 + }, + { + 
"epoch": 0.77, + "grad_norm": 11.472256597102293, + "learning_rate": 7.031326979669567e-06, + "loss": 0.9279, + "step": 5411 + }, + { + "epoch": 0.77, + "grad_norm": 7.534451932772178, + "learning_rate": 7.030271773559214e-06, + "loss": 0.9926, + "step": 5412 + }, + { + "epoch": 0.77, + "grad_norm": 8.611443809889758, + "learning_rate": 7.029216459159816e-06, + "loss": 0.971, + "step": 5413 + }, + { + "epoch": 0.77, + "grad_norm": 7.673654812942903, + "learning_rate": 7.02816103652766e-06, + "loss": 0.9176, + "step": 5414 + }, + { + "epoch": 0.77, + "grad_norm": 9.296972895195621, + "learning_rate": 7.0271055057190395e-06, + "loss": 0.9439, + "step": 5415 + }, + { + "epoch": 0.77, + "grad_norm": 7.992546645019718, + "learning_rate": 7.026049866790255e-06, + "loss": 1.0294, + "step": 5416 + }, + { + "epoch": 0.77, + "grad_norm": 9.473624442280121, + "learning_rate": 7.024994119797608e-06, + "loss": 0.8856, + "step": 5417 + }, + { + "epoch": 0.77, + "grad_norm": 7.331221331792384, + "learning_rate": 7.0239382647974134e-06, + "loss": 0.9881, + "step": 5418 + }, + { + "epoch": 0.77, + "grad_norm": 7.207473513038486, + "learning_rate": 7.022882301845985e-06, + "loss": 0.9477, + "step": 5419 + }, + { + "epoch": 0.77, + "grad_norm": 9.277423384299565, + "learning_rate": 7.021826230999645e-06, + "loss": 0.9893, + "step": 5420 + }, + { + "epoch": 0.77, + "grad_norm": 7.56625494445813, + "learning_rate": 7.020770052314722e-06, + "loss": 0.9816, + "step": 5421 + }, + { + "epoch": 0.77, + "grad_norm": 9.152756381529883, + "learning_rate": 7.0197137658475485e-06, + "loss": 0.9448, + "step": 5422 + }, + { + "epoch": 0.77, + "grad_norm": 8.414156104879792, + "learning_rate": 7.018657371654465e-06, + "loss": 1.0429, + "step": 5423 + }, + { + "epoch": 0.77, + "grad_norm": 6.086081029897861, + "learning_rate": 7.017600869791817e-06, + "loss": 0.9907, + "step": 5424 + }, + { + "epoch": 0.77, + "grad_norm": 6.802364110273725, + "learning_rate": 7.016544260315953e-06, + "loss": 0.9039, 
+ "step": 5425 + }, + { + "epoch": 0.77, + "grad_norm": 10.46254318692802, + "learning_rate": 7.015487543283233e-06, + "loss": 0.9319, + "step": 5426 + }, + { + "epoch": 0.77, + "grad_norm": 6.771519726066665, + "learning_rate": 7.014430718750016e-06, + "loss": 0.9801, + "step": 5427 + }, + { + "epoch": 0.77, + "grad_norm": 6.679569065531717, + "learning_rate": 7.013373786772673e-06, + "loss": 1.0227, + "step": 5428 + }, + { + "epoch": 0.77, + "grad_norm": 7.183986987873537, + "learning_rate": 7.012316747407576e-06, + "loss": 0.9162, + "step": 5429 + }, + { + "epoch": 0.77, + "grad_norm": 6.721879755397369, + "learning_rate": 7.011259600711105e-06, + "loss": 1.0092, + "step": 5430 + }, + { + "epoch": 0.77, + "grad_norm": 6.329860292804056, + "learning_rate": 7.0102023467396445e-06, + "loss": 0.9162, + "step": 5431 + }, + { + "epoch": 0.77, + "grad_norm": 8.870598577620969, + "learning_rate": 7.0091449855495876e-06, + "loss": 0.9899, + "step": 5432 + }, + { + "epoch": 0.77, + "grad_norm": 6.2191871749309735, + "learning_rate": 7.008087517197328e-06, + "loss": 0.9722, + "step": 5433 + }, + { + "epoch": 0.77, + "grad_norm": 7.044436884518901, + "learning_rate": 7.007029941739272e-06, + "loss": 0.9397, + "step": 5434 + }, + { + "epoch": 0.78, + "grad_norm": 6.189355504564887, + "learning_rate": 7.005972259231825e-06, + "loss": 1.0332, + "step": 5435 + }, + { + "epoch": 0.78, + "grad_norm": 8.124403562290983, + "learning_rate": 7.004914469731399e-06, + "loss": 0.968, + "step": 5436 + }, + { + "epoch": 0.78, + "grad_norm": 5.320241638215744, + "learning_rate": 7.0038565732944175e-06, + "loss": 0.9826, + "step": 5437 + }, + { + "epoch": 0.78, + "grad_norm": 8.334212041120267, + "learning_rate": 7.002798569977305e-06, + "loss": 0.99, + "step": 5438 + }, + { + "epoch": 0.78, + "grad_norm": 7.0493000052802905, + "learning_rate": 7.001740459836491e-06, + "loss": 0.9758, + "step": 5439 + }, + { + "epoch": 0.78, + "grad_norm": 5.704633114088716, + "learning_rate": 
7.000682242928413e-06, + "loss": 1.0088, + "step": 5440 + }, + { + "epoch": 0.78, + "grad_norm": 10.419118341738047, + "learning_rate": 6.9996239193095125e-06, + "loss": 0.9896, + "step": 5441 + }, + { + "epoch": 0.78, + "grad_norm": 6.12925612958258, + "learning_rate": 6.998565489036238e-06, + "loss": 1.0047, + "step": 5442 + }, + { + "epoch": 0.78, + "grad_norm": 8.152521427589205, + "learning_rate": 6.997506952165043e-06, + "loss": 0.9885, + "step": 5443 + }, + { + "epoch": 0.78, + "grad_norm": 8.768713420046451, + "learning_rate": 6.996448308752388e-06, + "loss": 0.9334, + "step": 5444 + }, + { + "epoch": 0.78, + "grad_norm": 6.785072880965307, + "learning_rate": 6.995389558854738e-06, + "loss": 0.9558, + "step": 5445 + }, + { + "epoch": 0.78, + "grad_norm": 8.312303384730509, + "learning_rate": 6.994330702528562e-06, + "loss": 0.9845, + "step": 5446 + }, + { + "epoch": 0.78, + "grad_norm": 6.5860682791139595, + "learning_rate": 6.9932717398303385e-06, + "loss": 0.9891, + "step": 5447 + }, + { + "epoch": 0.78, + "grad_norm": 9.652449985362619, + "learning_rate": 6.992212670816548e-06, + "loss": 0.9018, + "step": 5448 + }, + { + "epoch": 0.78, + "grad_norm": 6.640812689472643, + "learning_rate": 6.99115349554368e-06, + "loss": 1.0289, + "step": 5449 + }, + { + "epoch": 0.78, + "grad_norm": 8.979798477120926, + "learning_rate": 6.990094214068225e-06, + "loss": 0.9965, + "step": 5450 + }, + { + "epoch": 0.78, + "grad_norm": 6.779811206121267, + "learning_rate": 6.989034826446686e-06, + "loss": 1.0484, + "step": 5451 + }, + { + "epoch": 0.78, + "grad_norm": 7.891738010602515, + "learning_rate": 6.987975332735565e-06, + "loss": 1.0036, + "step": 5452 + }, + { + "epoch": 0.78, + "grad_norm": 9.875456443082227, + "learning_rate": 6.986915732991374e-06, + "loss": 0.8836, + "step": 5453 + }, + { + "epoch": 0.78, + "grad_norm": 4.354993466151924, + "learning_rate": 6.985856027270629e-06, + "loss": 0.9655, + "step": 5454 + }, + { + "epoch": 0.78, + "grad_norm": 
6.633265926687557, + "learning_rate": 6.9847962156298486e-06, + "loss": 1.0338, + "step": 5455 + }, + { + "epoch": 0.78, + "grad_norm": 6.335035662962314, + "learning_rate": 6.983736298125564e-06, + "loss": 0.9971, + "step": 5456 + }, + { + "epoch": 0.78, + "grad_norm": 7.416630345970066, + "learning_rate": 6.982676274814308e-06, + "loss": 0.9108, + "step": 5457 + }, + { + "epoch": 0.78, + "grad_norm": 6.810950174627227, + "learning_rate": 6.981616145752617e-06, + "loss": 0.9347, + "step": 5458 + }, + { + "epoch": 0.78, + "grad_norm": 8.068817376232898, + "learning_rate": 6.9805559109970375e-06, + "loss": 1.0369, + "step": 5459 + }, + { + "epoch": 0.78, + "grad_norm": 4.223499492748916, + "learning_rate": 6.979495570604119e-06, + "loss": 0.9741, + "step": 5460 + }, + { + "epoch": 0.78, + "grad_norm": 6.0440317443183105, + "learning_rate": 6.978435124630416e-06, + "loss": 0.9651, + "step": 5461 + }, + { + "epoch": 0.78, + "grad_norm": 5.955509908275162, + "learning_rate": 6.977374573132491e-06, + "loss": 0.8791, + "step": 5462 + }, + { + "epoch": 0.78, + "grad_norm": 6.303098205925374, + "learning_rate": 6.976313916166909e-06, + "loss": 0.9679, + "step": 5463 + }, + { + "epoch": 0.78, + "grad_norm": 9.354850345862468, + "learning_rate": 6.975253153790246e-06, + "loss": 0.9688, + "step": 5464 + }, + { + "epoch": 0.78, + "grad_norm": 7.166681397733545, + "learning_rate": 6.974192286059075e-06, + "loss": 0.9778, + "step": 5465 + }, + { + "epoch": 0.78, + "grad_norm": 6.327391939057936, + "learning_rate": 6.973131313029984e-06, + "loss": 0.9338, + "step": 5466 + }, + { + "epoch": 0.78, + "grad_norm": 7.3031262811002815, + "learning_rate": 6.972070234759562e-06, + "loss": 0.8682, + "step": 5467 + }, + { + "epoch": 0.78, + "grad_norm": 8.929283450136081, + "learning_rate": 6.971009051304403e-06, + "loss": 1.0421, + "step": 5468 + }, + { + "epoch": 0.78, + "grad_norm": 7.3920536801821655, + "learning_rate": 6.969947762721106e-06, + "loss": 0.9332, + "step": 5469 + }, + { + 
"epoch": 0.78, + "grad_norm": 7.303175163057232, + "learning_rate": 6.96888636906628e-06, + "loss": 1.021, + "step": 5470 + }, + { + "epoch": 0.78, + "grad_norm": 6.617627762117369, + "learning_rate": 6.967824870396535e-06, + "loss": 0.966, + "step": 5471 + }, + { + "epoch": 0.78, + "grad_norm": 6.897927212192305, + "learning_rate": 6.96676326676849e-06, + "loss": 0.9905, + "step": 5472 + }, + { + "epoch": 0.78, + "grad_norm": 6.937533141563245, + "learning_rate": 6.965701558238764e-06, + "loss": 0.9553, + "step": 5473 + }, + { + "epoch": 0.78, + "grad_norm": 8.309927443927704, + "learning_rate": 6.964639744863991e-06, + "loss": 0.877, + "step": 5474 + }, + { + "epoch": 0.78, + "grad_norm": 7.787315079191766, + "learning_rate": 6.9635778267008015e-06, + "loss": 0.996, + "step": 5475 + }, + { + "epoch": 0.78, + "grad_norm": 6.649693970761387, + "learning_rate": 6.962515803805838e-06, + "loss": 0.9471, + "step": 5476 + }, + { + "epoch": 0.78, + "grad_norm": 6.529331708169945, + "learning_rate": 6.961453676235743e-06, + "loss": 0.9433, + "step": 5477 + }, + { + "epoch": 0.78, + "grad_norm": 6.884980632488631, + "learning_rate": 6.960391444047169e-06, + "loss": 0.9678, + "step": 5478 + }, + { + "epoch": 0.78, + "grad_norm": 6.488592217491617, + "learning_rate": 6.959329107296771e-06, + "loss": 0.9376, + "step": 5479 + }, + { + "epoch": 0.78, + "grad_norm": 5.951450975694265, + "learning_rate": 6.958266666041213e-06, + "loss": 0.9572, + "step": 5480 + }, + { + "epoch": 0.78, + "grad_norm": 6.158811133394107, + "learning_rate": 6.957204120337162e-06, + "loss": 0.9687, + "step": 5481 + }, + { + "epoch": 0.78, + "grad_norm": 6.6281121917925265, + "learning_rate": 6.956141470241291e-06, + "loss": 1.0105, + "step": 5482 + }, + { + "epoch": 0.78, + "grad_norm": 5.572621934927534, + "learning_rate": 6.95507871581028e-06, + "loss": 0.9137, + "step": 5483 + }, + { + "epoch": 0.78, + "grad_norm": 11.068944345951845, + "learning_rate": 6.9540158571008105e-06, + "loss": 0.9318, + 
"step": 5484 + }, + { + "epoch": 0.78, + "grad_norm": 8.79637747292161, + "learning_rate": 6.952952894169575e-06, + "loss": 1.0126, + "step": 5485 + }, + { + "epoch": 0.78, + "grad_norm": 7.697965605955572, + "learning_rate": 6.951889827073269e-06, + "loss": 0.9978, + "step": 5486 + }, + { + "epoch": 0.78, + "grad_norm": 8.623632794072279, + "learning_rate": 6.950826655868592e-06, + "loss": 0.8735, + "step": 5487 + }, + { + "epoch": 0.78, + "grad_norm": 8.581341703545432, + "learning_rate": 6.949763380612251e-06, + "loss": 0.9348, + "step": 5488 + }, + { + "epoch": 0.78, + "grad_norm": 6.581399841632216, + "learning_rate": 6.948700001360959e-06, + "loss": 1.0233, + "step": 5489 + }, + { + "epoch": 0.78, + "grad_norm": 7.626378946695561, + "learning_rate": 6.9476365181714334e-06, + "loss": 0.9285, + "step": 5490 + }, + { + "epoch": 0.78, + "grad_norm": 7.955991142776142, + "learning_rate": 6.946572931100396e-06, + "loss": 0.9321, + "step": 5491 + }, + { + "epoch": 0.78, + "grad_norm": 8.211303464505114, + "learning_rate": 6.945509240204577e-06, + "loss": 0.9353, + "step": 5492 + }, + { + "epoch": 0.78, + "grad_norm": 6.2585044841583475, + "learning_rate": 6.9444454455407115e-06, + "loss": 0.9363, + "step": 5493 + }, + { + "epoch": 0.78, + "grad_norm": 7.952368209634528, + "learning_rate": 6.9433815471655386e-06, + "loss": 0.998, + "step": 5494 + }, + { + "epoch": 0.78, + "grad_norm": 8.876621083260796, + "learning_rate": 6.942317545135801e-06, + "loss": 1.0041, + "step": 5495 + }, + { + "epoch": 0.78, + "grad_norm": 5.965855999603505, + "learning_rate": 6.9412534395082555e-06, + "loss": 0.9472, + "step": 5496 + }, + { + "epoch": 0.78, + "grad_norm": 8.018375700753658, + "learning_rate": 6.940189230339654e-06, + "loss": 0.9718, + "step": 5497 + }, + { + "epoch": 0.78, + "grad_norm": 8.412804474463195, + "learning_rate": 6.939124917686758e-06, + "loss": 0.9089, + "step": 5498 + }, + { + "epoch": 0.78, + "grad_norm": 7.830323724569946, + "learning_rate": 
6.938060501606337e-06, + "loss": 0.9803, + "step": 5499 + }, + { + "epoch": 0.78, + "grad_norm": 7.309367327859509, + "learning_rate": 6.9369959821551644e-06, + "loss": 1.0629, + "step": 5500 + }, + { + "epoch": 0.78, + "grad_norm": 6.854684344194536, + "learning_rate": 6.935931359390017e-06, + "loss": 0.9483, + "step": 5501 + }, + { + "epoch": 0.78, + "grad_norm": 9.877212086513516, + "learning_rate": 6.934866633367681e-06, + "loss": 0.9494, + "step": 5502 + }, + { + "epoch": 0.78, + "grad_norm": 7.856516554486037, + "learning_rate": 6.9338018041449415e-06, + "loss": 0.972, + "step": 5503 + }, + { + "epoch": 0.78, + "grad_norm": 6.627237802877937, + "learning_rate": 6.932736871778598e-06, + "loss": 0.9278, + "step": 5504 + }, + { + "epoch": 0.79, + "grad_norm": 6.606261908189149, + "learning_rate": 6.931671836325451e-06, + "loss": 0.9517, + "step": 5505 + }, + { + "epoch": 0.79, + "grad_norm": 6.474127573044516, + "learning_rate": 6.930606697842303e-06, + "loss": 0.9846, + "step": 5506 + }, + { + "epoch": 0.79, + "grad_norm": 6.243834836176334, + "learning_rate": 6.92954145638597e-06, + "loss": 0.9746, + "step": 5507 + }, + { + "epoch": 0.79, + "grad_norm": 8.989645577340873, + "learning_rate": 6.928476112013265e-06, + "loss": 0.9417, + "step": 5508 + }, + { + "epoch": 0.79, + "grad_norm": 8.982672138864949, + "learning_rate": 6.9274106647810115e-06, + "loss": 1.0097, + "step": 5509 + }, + { + "epoch": 0.79, + "grad_norm": 7.778582804082835, + "learning_rate": 6.9263451147460396e-06, + "loss": 1.0409, + "step": 5510 + }, + { + "epoch": 0.79, + "grad_norm": 7.157214297717614, + "learning_rate": 6.92527946196518e-06, + "loss": 0.9157, + "step": 5511 + }, + { + "epoch": 0.79, + "grad_norm": 7.025475073635729, + "learning_rate": 6.924213706495274e-06, + "loss": 0.9591, + "step": 5512 + }, + { + "epoch": 0.79, + "grad_norm": 9.04034040260603, + "learning_rate": 6.9231478483931645e-06, + "loss": 1.0589, + "step": 5513 + }, + { + "epoch": 0.79, + "grad_norm": 
11.057180275979869, + "learning_rate": 6.9220818877157016e-06, + "loss": 1.001, + "step": 5514 + }, + { + "epoch": 0.79, + "grad_norm": 6.771672732427428, + "learning_rate": 6.921015824519742e-06, + "loss": 0.9959, + "step": 5515 + }, + { + "epoch": 0.79, + "grad_norm": 8.572698068659637, + "learning_rate": 6.919949658862145e-06, + "loss": 0.9268, + "step": 5516 + }, + { + "epoch": 0.79, + "grad_norm": 7.235586158507896, + "learning_rate": 6.918883390799776e-06, + "loss": 0.9582, + "step": 5517 + }, + { + "epoch": 0.79, + "grad_norm": 4.421580481127712, + "learning_rate": 6.917817020389511e-06, + "loss": 1.0181, + "step": 5518 + }, + { + "epoch": 0.79, + "grad_norm": 10.065245382861296, + "learning_rate": 6.916750547688223e-06, + "loss": 0.9811, + "step": 5519 + }, + { + "epoch": 0.79, + "grad_norm": 7.239533028230214, + "learning_rate": 6.915683972752796e-06, + "loss": 1.0967, + "step": 5520 + }, + { + "epoch": 0.79, + "grad_norm": 7.220616638779322, + "learning_rate": 6.9146172956401185e-06, + "loss": 1.0023, + "step": 5521 + }, + { + "epoch": 0.79, + "grad_norm": 7.413345549743327, + "learning_rate": 6.913550516407083e-06, + "loss": 0.9463, + "step": 5522 + }, + { + "epoch": 0.79, + "grad_norm": 6.784731419444359, + "learning_rate": 6.91248363511059e-06, + "loss": 0.9682, + "step": 5523 + }, + { + "epoch": 0.79, + "grad_norm": 9.51742863288774, + "learning_rate": 6.911416651807543e-06, + "loss": 0.9699, + "step": 5524 + }, + { + "epoch": 0.79, + "grad_norm": 6.175274674918302, + "learning_rate": 6.9103495665548525e-06, + "loss": 0.9685, + "step": 5525 + }, + { + "epoch": 0.79, + "grad_norm": 5.896819994859719, + "learning_rate": 6.909282379409434e-06, + "loss": 0.9802, + "step": 5526 + }, + { + "epoch": 0.79, + "grad_norm": 6.734565590760859, + "learning_rate": 6.908215090428208e-06, + "loss": 0.9289, + "step": 5527 + }, + { + "epoch": 0.79, + "grad_norm": 7.360131761598158, + "learning_rate": 6.9071476996681e-06, + "loss": 1.0143, + "step": 5528 + }, + { + 
"epoch": 0.79, + "grad_norm": 9.591981685940844, + "learning_rate": 6.906080207186041e-06, + "loss": 0.9699, + "step": 5529 + }, + { + "epoch": 0.79, + "grad_norm": 8.393145834880228, + "learning_rate": 6.90501261303897e-06, + "loss": 0.9584, + "step": 5530 + }, + { + "epoch": 0.79, + "grad_norm": 8.036386925290776, + "learning_rate": 6.903944917283832e-06, + "loss": 0.8745, + "step": 5531 + }, + { + "epoch": 0.79, + "grad_norm": 8.313940886848998, + "learning_rate": 6.902877119977568e-06, + "loss": 1.0286, + "step": 5532 + }, + { + "epoch": 0.79, + "grad_norm": 5.368085774660072, + "learning_rate": 6.901809221177136e-06, + "loss": 0.9722, + "step": 5533 + }, + { + "epoch": 0.79, + "grad_norm": 9.550453937350664, + "learning_rate": 6.900741220939495e-06, + "loss": 1.0255, + "step": 5534 + }, + { + "epoch": 0.79, + "grad_norm": 6.192137541931764, + "learning_rate": 6.899673119321608e-06, + "loss": 1.0173, + "step": 5535 + }, + { + "epoch": 0.79, + "grad_norm": 8.235492757038418, + "learning_rate": 6.898604916380442e-06, + "loss": 0.954, + "step": 5536 + }, + { + "epoch": 0.79, + "grad_norm": 7.749350974656434, + "learning_rate": 6.897536612172978e-06, + "loss": 0.8846, + "step": 5537 + }, + { + "epoch": 0.79, + "grad_norm": 7.008481999111594, + "learning_rate": 6.896468206756192e-06, + "loss": 0.9746, + "step": 5538 + }, + { + "epoch": 0.79, + "grad_norm": 9.617271398007485, + "learning_rate": 6.89539970018707e-06, + "loss": 0.9995, + "step": 5539 + }, + { + "epoch": 0.79, + "grad_norm": 6.440568676132799, + "learning_rate": 6.894331092522605e-06, + "loss": 0.9999, + "step": 5540 + }, + { + "epoch": 0.79, + "grad_norm": 6.208728230614665, + "learning_rate": 6.89326238381979e-06, + "loss": 0.9632, + "step": 5541 + }, + { + "epoch": 0.79, + "grad_norm": 6.922972664668123, + "learning_rate": 6.892193574135632e-06, + "loss": 0.9796, + "step": 5542 + }, + { + "epoch": 0.79, + "grad_norm": 7.132321196601675, + "learning_rate": 6.891124663527134e-06, + "loss": 0.9549, + 
"step": 5543 + }, + { + "epoch": 0.79, + "grad_norm": 6.594137947425417, + "learning_rate": 6.8900556520513115e-06, + "loss": 0.9224, + "step": 5544 + }, + { + "epoch": 0.79, + "grad_norm": 5.049324321078926, + "learning_rate": 6.888986539765181e-06, + "loss": 0.9362, + "step": 5545 + }, + { + "epoch": 0.79, + "grad_norm": 7.7271978165134705, + "learning_rate": 6.887917326725766e-06, + "loss": 0.9543, + "step": 5546 + }, + { + "epoch": 0.79, + "grad_norm": 5.068069531972335, + "learning_rate": 6.886848012990098e-06, + "loss": 1.058, + "step": 5547 + }, + { + "epoch": 0.79, + "grad_norm": 5.982112818772734, + "learning_rate": 6.885778598615206e-06, + "loss": 0.9057, + "step": 5548 + }, + { + "epoch": 0.79, + "grad_norm": 6.436104694557236, + "learning_rate": 6.884709083658134e-06, + "loss": 1.0036, + "step": 5549 + }, + { + "epoch": 0.79, + "grad_norm": 7.421997725842984, + "learning_rate": 6.883639468175926e-06, + "loss": 0.9503, + "step": 5550 + }, + { + "epoch": 0.79, + "grad_norm": 6.622079913095185, + "learning_rate": 6.882569752225632e-06, + "loss": 0.9562, + "step": 5551 + }, + { + "epoch": 0.79, + "grad_norm": 5.452879682226505, + "learning_rate": 6.881499935864306e-06, + "loss": 0.9609, + "step": 5552 + }, + { + "epoch": 0.79, + "grad_norm": 7.131824842457692, + "learning_rate": 6.880430019149013e-06, + "loss": 0.9518, + "step": 5553 + }, + { + "epoch": 0.79, + "grad_norm": 7.196547201137955, + "learning_rate": 6.879360002136817e-06, + "loss": 1.0358, + "step": 5554 + }, + { + "epoch": 0.79, + "grad_norm": 9.509996188016007, + "learning_rate": 6.878289884884787e-06, + "loss": 0.9671, + "step": 5555 + }, + { + "epoch": 0.79, + "grad_norm": 7.633462962480643, + "learning_rate": 6.877219667450004e-06, + "loss": 0.9761, + "step": 5556 + }, + { + "epoch": 0.79, + "grad_norm": 8.168807854211302, + "learning_rate": 6.87614934988955e-06, + "loss": 0.9503, + "step": 5557 + }, + { + "epoch": 0.79, + "grad_norm": 6.295972766780642, + "learning_rate": 
6.875078932260512e-06, + "loss": 0.9397, + "step": 5558 + }, + { + "epoch": 0.79, + "grad_norm": 4.638720600060492, + "learning_rate": 6.874008414619984e-06, + "loss": 0.9381, + "step": 5559 + }, + { + "epoch": 0.79, + "grad_norm": 7.005150699884335, + "learning_rate": 6.87293779702506e-06, + "loss": 0.9611, + "step": 5560 + }, + { + "epoch": 0.79, + "grad_norm": 7.2023632492019, + "learning_rate": 6.8718670795328506e-06, + "loss": 0.9709, + "step": 5561 + }, + { + "epoch": 0.79, + "grad_norm": 6.792385802100498, + "learning_rate": 6.87079626220046e-06, + "loss": 0.9194, + "step": 5562 + }, + { + "epoch": 0.79, + "grad_norm": 8.002698113660484, + "learning_rate": 6.869725345085004e-06, + "loss": 0.9815, + "step": 5563 + }, + { + "epoch": 0.79, + "grad_norm": 8.01840310209126, + "learning_rate": 6.868654328243604e-06, + "loss": 0.971, + "step": 5564 + }, + { + "epoch": 0.79, + "grad_norm": 7.191194267562053, + "learning_rate": 6.867583211733382e-06, + "loss": 0.9088, + "step": 5565 + }, + { + "epoch": 0.79, + "grad_norm": 7.170869596460632, + "learning_rate": 6.86651199561147e-06, + "loss": 0.9789, + "step": 5566 + }, + { + "epoch": 0.79, + "grad_norm": 5.803178846836227, + "learning_rate": 6.865440679935005e-06, + "loss": 1.0051, + "step": 5567 + }, + { + "epoch": 0.79, + "grad_norm": 7.474871113958879, + "learning_rate": 6.8643692647611245e-06, + "loss": 0.9549, + "step": 5568 + }, + { + "epoch": 0.79, + "grad_norm": 8.07421797506749, + "learning_rate": 6.86329775014698e-06, + "loss": 0.9277, + "step": 5569 + }, + { + "epoch": 0.79, + "grad_norm": 6.375220079677526, + "learning_rate": 6.8622261361497165e-06, + "loss": 1.0782, + "step": 5570 + }, + { + "epoch": 0.79, + "grad_norm": 8.247214899558747, + "learning_rate": 6.861154422826496e-06, + "loss": 0.952, + "step": 5571 + }, + { + "epoch": 0.79, + "grad_norm": 8.150026577938023, + "learning_rate": 6.860082610234481e-06, + "loss": 0.9692, + "step": 5572 + }, + { + "epoch": 0.79, + "grad_norm": 7.781847181611085, 
+ "learning_rate": 6.859010698430834e-06, + "loss": 1.0318, + "step": 5573 + }, + { + "epoch": 0.79, + "grad_norm": 5.700949123139139, + "learning_rate": 6.857938687472734e-06, + "loss": 0.925, + "step": 5574 + }, + { + "epoch": 0.8, + "grad_norm": 8.18224112275429, + "learning_rate": 6.856866577417354e-06, + "loss": 0.9429, + "step": 5575 + }, + { + "epoch": 0.8, + "grad_norm": 6.9953226087756954, + "learning_rate": 6.85579436832188e-06, + "loss": 0.9751, + "step": 5576 + }, + { + "epoch": 0.8, + "grad_norm": 5.649773448850296, + "learning_rate": 6.8547220602435e-06, + "loss": 0.9974, + "step": 5577 + }, + { + "epoch": 0.8, + "grad_norm": 7.021742951344898, + "learning_rate": 6.853649653239407e-06, + "loss": 0.9608, + "step": 5578 + }, + { + "epoch": 0.8, + "grad_norm": 6.723560575411955, + "learning_rate": 6.852577147366801e-06, + "loss": 0.9661, + "step": 5579 + }, + { + "epoch": 0.8, + "grad_norm": 6.582761768434448, + "learning_rate": 6.851504542682887e-06, + "loss": 0.9083, + "step": 5580 + }, + { + "epoch": 0.8, + "grad_norm": 7.851508223897004, + "learning_rate": 6.850431839244877e-06, + "loss": 0.9977, + "step": 5581 + }, + { + "epoch": 0.8, + "grad_norm": 7.189619431309463, + "learning_rate": 6.849359037109982e-06, + "loss": 0.903, + "step": 5582 + }, + { + "epoch": 0.8, + "grad_norm": 5.76417808958089, + "learning_rate": 6.848286136335422e-06, + "loss": 0.9933, + "step": 5583 + }, + { + "epoch": 0.8, + "grad_norm": 8.295510337654258, + "learning_rate": 6.847213136978425e-06, + "loss": 0.9836, + "step": 5584 + }, + { + "epoch": 0.8, + "grad_norm": 8.793285675583252, + "learning_rate": 6.846140039096222e-06, + "loss": 0.986, + "step": 5585 + }, + { + "epoch": 0.8, + "grad_norm": 5.456554276531459, + "learning_rate": 6.845066842746047e-06, + "loss": 0.9539, + "step": 5586 + }, + { + "epoch": 0.8, + "grad_norm": 7.578974055464715, + "learning_rate": 6.843993547985143e-06, + "loss": 0.8986, + "step": 5587 + }, + { + "epoch": 0.8, + "grad_norm": 
6.925672917282224, + "learning_rate": 6.842920154870756e-06, + "loss": 1.0329, + "step": 5588 + }, + { + "epoch": 0.8, + "grad_norm": 6.568816114670494, + "learning_rate": 6.841846663460137e-06, + "loss": 1.0408, + "step": 5589 + }, + { + "epoch": 0.8, + "grad_norm": 6.406740244328062, + "learning_rate": 6.8407730738105435e-06, + "loss": 0.9257, + "step": 5590 + }, + { + "epoch": 0.8, + "grad_norm": 5.839331567283385, + "learning_rate": 6.839699385979239e-06, + "loss": 0.9448, + "step": 5591 + }, + { + "epoch": 0.8, + "grad_norm": 7.469783301604887, + "learning_rate": 6.8386256000234905e-06, + "loss": 1.0072, + "step": 5592 + }, + { + "epoch": 0.8, + "grad_norm": 9.465919340816344, + "learning_rate": 6.83755171600057e-06, + "loss": 0.9614, + "step": 5593 + }, + { + "epoch": 0.8, + "grad_norm": 8.784956346965302, + "learning_rate": 6.836477733967756e-06, + "loss": 1.0632, + "step": 5594 + }, + { + "epoch": 0.8, + "grad_norm": 6.288587376122565, + "learning_rate": 6.835403653982331e-06, + "loss": 0.9647, + "step": 5595 + }, + { + "epoch": 0.8, + "grad_norm": 7.9185936895655775, + "learning_rate": 6.8343294761015835e-06, + "loss": 0.973, + "step": 5596 + }, + { + "epoch": 0.8, + "grad_norm": 7.667747457298487, + "learning_rate": 6.833255200382808e-06, + "loss": 0.9654, + "step": 5597 + }, + { + "epoch": 0.8, + "grad_norm": 6.654520699657947, + "learning_rate": 6.832180826883303e-06, + "loss": 0.9517, + "step": 5598 + }, + { + "epoch": 0.8, + "grad_norm": 7.289369555396754, + "learning_rate": 6.831106355660373e-06, + "loss": 0.9071, + "step": 5599 + }, + { + "epoch": 0.8, + "grad_norm": 8.487249526956303, + "learning_rate": 6.830031786771327e-06, + "loss": 0.966, + "step": 5600 + }, + { + "epoch": 0.8, + "grad_norm": 5.767227890805195, + "learning_rate": 6.8289571202734795e-06, + "loss": 1.0174, + "step": 5601 + }, + { + "epoch": 0.8, + "grad_norm": 8.359550089129117, + "learning_rate": 6.827882356224151e-06, + "loss": 1.0342, + "step": 5602 + }, + { + "epoch": 0.8, + 
"grad_norm": 7.295109574178521, + "learning_rate": 6.826807494680663e-06, + "loss": 0.9629, + "step": 5603 + }, + { + "epoch": 0.8, + "grad_norm": 5.643214307226811, + "learning_rate": 6.82573253570035e-06, + "loss": 0.9396, + "step": 5604 + }, + { + "epoch": 0.8, + "grad_norm": 8.599499663646336, + "learning_rate": 6.824657479340545e-06, + "loss": 0.9024, + "step": 5605 + }, + { + "epoch": 0.8, + "grad_norm": 7.6223610958839085, + "learning_rate": 6.823582325658589e-06, + "loss": 0.9508, + "step": 5606 + }, + { + "epoch": 0.8, + "grad_norm": 6.7168436176871875, + "learning_rate": 6.822507074711828e-06, + "loss": 1.0692, + "step": 5607 + }, + { + "epoch": 0.8, + "grad_norm": 7.28847360927653, + "learning_rate": 6.8214317265576125e-06, + "loss": 0.9647, + "step": 5608 + }, + { + "epoch": 0.8, + "grad_norm": 6.9241827891517, + "learning_rate": 6.820356281253298e-06, + "loss": 0.9449, + "step": 5609 + }, + { + "epoch": 0.8, + "grad_norm": 6.265615135442104, + "learning_rate": 6.819280738856249e-06, + "loss": 0.9908, + "step": 5610 + }, + { + "epoch": 0.8, + "grad_norm": 6.292634842608825, + "learning_rate": 6.818205099423827e-06, + "loss": 1.0041, + "step": 5611 + }, + { + "epoch": 0.8, + "grad_norm": 7.568977787540161, + "learning_rate": 6.817129363013406e-06, + "loss": 0.9695, + "step": 5612 + }, + { + "epoch": 0.8, + "grad_norm": 4.866243394483254, + "learning_rate": 6.816053529682362e-06, + "loss": 0.9487, + "step": 5613 + }, + { + "epoch": 0.8, + "grad_norm": 5.863700926850534, + "learning_rate": 6.8149775994880796e-06, + "loss": 1.0229, + "step": 5614 + }, + { + "epoch": 0.8, + "grad_norm": 6.9549452799420255, + "learning_rate": 6.813901572487945e-06, + "loss": 0.9998, + "step": 5615 + }, + { + "epoch": 0.8, + "grad_norm": 8.614292871798275, + "learning_rate": 6.8128254487393465e-06, + "loss": 0.8619, + "step": 5616 + }, + { + "epoch": 0.8, + "grad_norm": 8.109249627824045, + "learning_rate": 6.811749228299688e-06, + "loss": 0.9055, + "step": 5617 + }, + { + 
"epoch": 0.8, + "grad_norm": 8.717130508579126, + "learning_rate": 6.810672911226366e-06, + "loss": 1.0065, + "step": 5618 + }, + { + "epoch": 0.8, + "grad_norm": 5.851128032508564, + "learning_rate": 6.809596497576792e-06, + "loss": 0.9605, + "step": 5619 + }, + { + "epoch": 0.8, + "grad_norm": 6.005307433790987, + "learning_rate": 6.808519987408377e-06, + "loss": 0.8999, + "step": 5620 + }, + { + "epoch": 0.8, + "grad_norm": 6.269317793823961, + "learning_rate": 6.8074433807785415e-06, + "loss": 1.0223, + "step": 5621 + }, + { + "epoch": 0.8, + "grad_norm": 9.584416708377736, + "learning_rate": 6.8063666777447045e-06, + "loss": 0.9362, + "step": 5622 + }, + { + "epoch": 0.8, + "grad_norm": 8.913880940228582, + "learning_rate": 6.805289878364299e-06, + "loss": 1.0411, + "step": 5623 + }, + { + "epoch": 0.8, + "grad_norm": 7.154020022486819, + "learning_rate": 6.804212982694757e-06, + "loss": 1.0083, + "step": 5624 + }, + { + "epoch": 0.8, + "grad_norm": 5.719382455114771, + "learning_rate": 6.803135990793515e-06, + "loss": 0.9964, + "step": 5625 + }, + { + "epoch": 0.8, + "grad_norm": 7.944595873829221, + "learning_rate": 6.80205890271802e-06, + "loss": 0.9017, + "step": 5626 + }, + { + "epoch": 0.8, + "grad_norm": 5.590973387511235, + "learning_rate": 6.8009817185257175e-06, + "loss": 0.9743, + "step": 5627 + }, + { + "epoch": 0.8, + "grad_norm": 5.015858673568466, + "learning_rate": 6.799904438274064e-06, + "loss": 0.9437, + "step": 5628 + }, + { + "epoch": 0.8, + "grad_norm": 7.995900981254151, + "learning_rate": 6.798827062020519e-06, + "loss": 0.9224, + "step": 5629 + }, + { + "epoch": 0.8, + "grad_norm": 5.670156581078309, + "learning_rate": 6.797749589822544e-06, + "loss": 0.9373, + "step": 5630 + }, + { + "epoch": 0.8, + "grad_norm": 6.256727382536281, + "learning_rate": 6.79667202173761e-06, + "loss": 1.027, + "step": 5631 + }, + { + "epoch": 0.8, + "grad_norm": 8.953690442197038, + "learning_rate": 6.795594357823192e-06, + "loss": 0.9609, + "step": 5632 
+ }, + { + "epoch": 0.8, + "grad_norm": 7.262483088924975, + "learning_rate": 6.794516598136768e-06, + "loss": 0.9606, + "step": 5633 + }, + { + "epoch": 0.8, + "grad_norm": 6.395763886784205, + "learning_rate": 6.793438742735825e-06, + "loss": 0.9746, + "step": 5634 + }, + { + "epoch": 0.8, + "grad_norm": 12.690790526123639, + "learning_rate": 6.79236079167785e-06, + "loss": 0.9724, + "step": 5635 + }, + { + "epoch": 0.8, + "grad_norm": 7.892482281963516, + "learning_rate": 6.791282745020342e-06, + "loss": 0.9098, + "step": 5636 + }, + { + "epoch": 0.8, + "grad_norm": 7.147065644266215, + "learning_rate": 6.7902046028207954e-06, + "loss": 0.9702, + "step": 5637 + }, + { + "epoch": 0.8, + "grad_norm": 7.692286502777142, + "learning_rate": 6.78912636513672e-06, + "loss": 0.8869, + "step": 5638 + }, + { + "epoch": 0.8, + "grad_norm": 6.729327469497074, + "learning_rate": 6.788048032025625e-06, + "loss": 0.9645, + "step": 5639 + }, + { + "epoch": 0.8, + "grad_norm": 9.004219390782872, + "learning_rate": 6.786969603545024e-06, + "loss": 0.9667, + "step": 5640 + }, + { + "epoch": 0.8, + "grad_norm": 7.699075752630919, + "learning_rate": 6.785891079752438e-06, + "loss": 1.0388, + "step": 5641 + }, + { + "epoch": 0.8, + "grad_norm": 6.35147451833447, + "learning_rate": 6.784812460705394e-06, + "loss": 0.9317, + "step": 5642 + }, + { + "epoch": 0.8, + "grad_norm": 9.020600217455309, + "learning_rate": 6.7837337464614205e-06, + "loss": 0.9801, + "step": 5643 + }, + { + "epoch": 0.8, + "grad_norm": 8.882444277486401, + "learning_rate": 6.782654937078055e-06, + "loss": 1.0576, + "step": 5644 + }, + { + "epoch": 0.81, + "grad_norm": 8.850890565000542, + "learning_rate": 6.7815760326128375e-06, + "loss": 0.9366, + "step": 5645 + }, + { + "epoch": 0.81, + "grad_norm": 5.832123604769494, + "learning_rate": 6.780497033123311e-06, + "loss": 0.9867, + "step": 5646 + }, + { + "epoch": 0.81, + "grad_norm": 8.535456335895551, + "learning_rate": 6.77941793866703e-06, + "loss": 1.0149, + 
"step": 5647 + }, + { + "epoch": 0.81, + "grad_norm": 7.211771678390694, + "learning_rate": 6.778338749301552e-06, + "loss": 0.9615, + "step": 5648 + }, + { + "epoch": 0.81, + "grad_norm": 5.423576120752528, + "learning_rate": 6.7772594650844315e-06, + "loss": 0.9314, + "step": 5649 + }, + { + "epoch": 0.81, + "grad_norm": 6.363317092488088, + "learning_rate": 6.77618008607324e-06, + "loss": 0.8853, + "step": 5650 + }, + { + "epoch": 0.81, + "grad_norm": 6.972215500334706, + "learning_rate": 6.775100612325546e-06, + "loss": 1.0053, + "step": 5651 + }, + { + "epoch": 0.81, + "grad_norm": 6.767934513668027, + "learning_rate": 6.7740210438989265e-06, + "loss": 1.0399, + "step": 5652 + }, + { + "epoch": 0.81, + "grad_norm": 11.046232052569346, + "learning_rate": 6.772941380850963e-06, + "loss": 0.9391, + "step": 5653 + }, + { + "epoch": 0.81, + "grad_norm": 6.80008973802106, + "learning_rate": 6.77186162323924e-06, + "loss": 0.9231, + "step": 5654 + }, + { + "epoch": 0.81, + "grad_norm": 8.444340203018783, + "learning_rate": 6.7707817711213515e-06, + "loss": 0.9526, + "step": 5655 + }, + { + "epoch": 0.81, + "grad_norm": 6.383157915483554, + "learning_rate": 6.769701824554891e-06, + "loss": 0.9719, + "step": 5656 + }, + { + "epoch": 0.81, + "grad_norm": 6.700233538819096, + "learning_rate": 6.768621783597462e-06, + "loss": 1.0506, + "step": 5657 + }, + { + "epoch": 0.81, + "grad_norm": 6.0257770847084435, + "learning_rate": 6.76754164830667e-06, + "loss": 0.9958, + "step": 5658 + }, + { + "epoch": 0.81, + "grad_norm": 6.6445759412266385, + "learning_rate": 6.766461418740126e-06, + "loss": 0.9718, + "step": 5659 + }, + { + "epoch": 0.81, + "grad_norm": 8.013051900797867, + "learning_rate": 6.765381094955448e-06, + "loss": 0.9475, + "step": 5660 + }, + { + "epoch": 0.81, + "grad_norm": 9.594974939726548, + "learning_rate": 6.764300677010256e-06, + "loss": 0.9381, + "step": 5661 + }, + { + "epoch": 0.81, + "grad_norm": 7.899434326815306, + "learning_rate": 
6.763220164962177e-06, + "loss": 0.9899, + "step": 5662 + }, + { + "epoch": 0.81, + "grad_norm": 6.689465671765959, + "learning_rate": 6.762139558868841e-06, + "loss": 0.9624, + "step": 5663 + }, + { + "epoch": 0.81, + "grad_norm": 7.156386517003815, + "learning_rate": 6.761058858787886e-06, + "loss": 1.0239, + "step": 5664 + }, + { + "epoch": 0.81, + "grad_norm": 7.136051907207528, + "learning_rate": 6.759978064776954e-06, + "loss": 0.9877, + "step": 5665 + }, + { + "epoch": 0.81, + "grad_norm": 6.993648284848543, + "learning_rate": 6.7588971768936915e-06, + "loss": 0.9187, + "step": 5666 + }, + { + "epoch": 0.81, + "grad_norm": 8.465051696836007, + "learning_rate": 6.75781619519575e-06, + "loss": 0.9909, + "step": 5667 + }, + { + "epoch": 0.81, + "grad_norm": 6.351744636966941, + "learning_rate": 6.756735119740785e-06, + "loss": 0.9033, + "step": 5668 + }, + { + "epoch": 0.81, + "grad_norm": 7.709025540338908, + "learning_rate": 6.755653950586458e-06, + "loss": 0.9293, + "step": 5669 + }, + { + "epoch": 0.81, + "grad_norm": 7.57496492799287, + "learning_rate": 6.754572687790436e-06, + "loss": 0.9106, + "step": 5670 + }, + { + "epoch": 0.81, + "grad_norm": 9.29699874831759, + "learning_rate": 6.7534913314103915e-06, + "loss": 0.8927, + "step": 5671 + }, + { + "epoch": 0.81, + "grad_norm": 9.68556062309784, + "learning_rate": 6.7524098815039995e-06, + "loss": 0.9415, + "step": 5672 + }, + { + "epoch": 0.81, + "grad_norm": 7.784440011252466, + "learning_rate": 6.751328338128943e-06, + "loss": 0.9915, + "step": 5673 + }, + { + "epoch": 0.81, + "grad_norm": 8.338070223798105, + "learning_rate": 6.750246701342909e-06, + "loss": 0.9972, + "step": 5674 + }, + { + "epoch": 0.81, + "grad_norm": 8.442485124945447, + "learning_rate": 6.749164971203584e-06, + "loss": 0.9192, + "step": 5675 + }, + { + "epoch": 0.81, + "grad_norm": 9.454532444250512, + "learning_rate": 6.748083147768671e-06, + "loss": 0.9845, + "step": 5676 + }, + { + "epoch": 0.81, + "grad_norm": 
7.122916709973557, + "learning_rate": 6.747001231095869e-06, + "loss": 0.9215, + "step": 5677 + }, + { + "epoch": 0.81, + "grad_norm": 5.512645584643239, + "learning_rate": 6.7459192212428835e-06, + "loss": 1.0037, + "step": 5678 + }, + { + "epoch": 0.81, + "grad_norm": 7.285197100280322, + "learning_rate": 6.7448371182674265e-06, + "loss": 1.0315, + "step": 5679 + }, + { + "epoch": 0.81, + "grad_norm": 9.1523634992578, + "learning_rate": 6.743754922227214e-06, + "loss": 1.0516, + "step": 5680 + }, + { + "epoch": 0.81, + "grad_norm": 8.88694957049564, + "learning_rate": 6.742672633179968e-06, + "loss": 1.0075, + "step": 5681 + }, + { + "epoch": 0.81, + "grad_norm": 7.147274861731059, + "learning_rate": 6.741590251183415e-06, + "loss": 1.0056, + "step": 5682 + }, + { + "epoch": 0.81, + "grad_norm": 6.573524443206866, + "learning_rate": 6.740507776295285e-06, + "loss": 0.9264, + "step": 5683 + }, + { + "epoch": 0.81, + "grad_norm": 7.641412852694816, + "learning_rate": 6.739425208573316e-06, + "loss": 1.0784, + "step": 5684 + }, + { + "epoch": 0.81, + "grad_norm": 10.237242546282308, + "learning_rate": 6.738342548075247e-06, + "loss": 0.9312, + "step": 5685 + }, + { + "epoch": 0.81, + "grad_norm": 7.994350725728525, + "learning_rate": 6.737259794858826e-06, + "loss": 0.9581, + "step": 5686 + }, + { + "epoch": 0.81, + "grad_norm": 6.196640216212505, + "learning_rate": 6.736176948981803e-06, + "loss": 1.0519, + "step": 5687 + }, + { + "epoch": 0.81, + "grad_norm": 6.369501862094214, + "learning_rate": 6.735094010501935e-06, + "loss": 0.9236, + "step": 5688 + }, + { + "epoch": 0.81, + "grad_norm": 6.296218328352348, + "learning_rate": 6.734010979476981e-06, + "loss": 0.9853, + "step": 5689 + }, + { + "epoch": 0.81, + "grad_norm": 8.97878848595811, + "learning_rate": 6.732927855964708e-06, + "loss": 0.982, + "step": 5690 + }, + { + "epoch": 0.81, + "grad_norm": 6.677979236947285, + "learning_rate": 6.731844640022887e-06, + "loss": 1.022, + "step": 5691 + }, + { + 
"epoch": 0.81, + "grad_norm": 6.312191224786253, + "learning_rate": 6.730761331709293e-06, + "loss": 0.9238, + "step": 5692 + }, + { + "epoch": 0.81, + "grad_norm": 6.566834969829797, + "learning_rate": 6.729677931081709e-06, + "loss": 0.9189, + "step": 5693 + }, + { + "epoch": 0.81, + "grad_norm": 7.646514511954878, + "learning_rate": 6.728594438197917e-06, + "loss": 0.9192, + "step": 5694 + }, + { + "epoch": 0.81, + "grad_norm": 7.190845936295405, + "learning_rate": 6.727510853115709e-06, + "loss": 0.9425, + "step": 5695 + }, + { + "epoch": 0.81, + "grad_norm": 5.823505577657227, + "learning_rate": 6.7264271758928825e-06, + "loss": 1.0132, + "step": 5696 + }, + { + "epoch": 0.81, + "grad_norm": 8.0203863470463, + "learning_rate": 6.725343406587234e-06, + "loss": 0.9667, + "step": 5697 + }, + { + "epoch": 0.81, + "grad_norm": 7.328375982044403, + "learning_rate": 6.724259545256572e-06, + "loss": 0.9376, + "step": 5698 + }, + { + "epoch": 0.81, + "grad_norm": 6.922633456226782, + "learning_rate": 6.723175591958705e-06, + "loss": 0.9915, + "step": 5699 + }, + { + "epoch": 0.81, + "grad_norm": 8.784710940158176, + "learning_rate": 6.722091546751448e-06, + "loss": 0.9746, + "step": 5700 + }, + { + "epoch": 0.81, + "grad_norm": 8.401080430029623, + "learning_rate": 6.721007409692621e-06, + "loss": 0.9861, + "step": 5701 + }, + { + "epoch": 0.81, + "grad_norm": 7.843094144009563, + "learning_rate": 6.719923180840051e-06, + "loss": 0.9704, + "step": 5702 + }, + { + "epoch": 0.81, + "grad_norm": 7.745766354410197, + "learning_rate": 6.718838860251565e-06, + "loss": 0.9784, + "step": 5703 + }, + { + "epoch": 0.81, + "grad_norm": 5.865821971304988, + "learning_rate": 6.717754447984998e-06, + "loss": 1.0447, + "step": 5704 + }, + { + "epoch": 0.81, + "grad_norm": 7.673073040149736, + "learning_rate": 6.716669944098191e-06, + "loss": 0.9954, + "step": 5705 + }, + { + "epoch": 0.81, + "grad_norm": 8.130253778777593, + "learning_rate": 6.715585348648988e-06, + "loss": 0.9834, + 
"step": 5706 + }, + { + "epoch": 0.81, + "grad_norm": 6.082641851421377, + "learning_rate": 6.714500661695237e-06, + "loss": 1.0168, + "step": 5707 + }, + { + "epoch": 0.81, + "grad_norm": 8.09836241230188, + "learning_rate": 6.713415883294791e-06, + "loss": 0.9634, + "step": 5708 + }, + { + "epoch": 0.81, + "grad_norm": 7.266681286539247, + "learning_rate": 6.712331013505515e-06, + "loss": 0.9391, + "step": 5709 + }, + { + "epoch": 0.81, + "grad_norm": 9.25732853939566, + "learning_rate": 6.711246052385267e-06, + "loss": 1.036, + "step": 5710 + }, + { + "epoch": 0.81, + "grad_norm": 10.274446976272626, + "learning_rate": 6.710160999991918e-06, + "loss": 0.9097, + "step": 5711 + }, + { + "epoch": 0.81, + "grad_norm": 7.883272576938785, + "learning_rate": 6.709075856383342e-06, + "loss": 0.9437, + "step": 5712 + }, + { + "epoch": 0.81, + "grad_norm": 6.949681061064515, + "learning_rate": 6.707990621617417e-06, + "loss": 0.9518, + "step": 5713 + }, + { + "epoch": 0.81, + "grad_norm": 8.899073378185314, + "learning_rate": 6.706905295752027e-06, + "loss": 1.0232, + "step": 5714 + }, + { + "epoch": 0.82, + "grad_norm": 6.753069858266805, + "learning_rate": 6.705819878845059e-06, + "loss": 0.9919, + "step": 5715 + }, + { + "epoch": 0.82, + "grad_norm": 6.334897779168186, + "learning_rate": 6.7047343709544066e-06, + "loss": 0.9938, + "step": 5716 + }, + { + "epoch": 0.82, + "grad_norm": 7.151373613778034, + "learning_rate": 6.703648772137968e-06, + "loss": 0.9685, + "step": 5717 + }, + { + "epoch": 0.82, + "grad_norm": 6.936494914533551, + "learning_rate": 6.702563082453646e-06, + "loss": 1.0769, + "step": 5718 + }, + { + "epoch": 0.82, + "grad_norm": 6.533762416577729, + "learning_rate": 6.701477301959349e-06, + "loss": 0.9432, + "step": 5719 + }, + { + "epoch": 0.82, + "grad_norm": 9.370048782245297, + "learning_rate": 6.700391430712988e-06, + "loss": 1.0776, + "step": 5720 + }, + { + "epoch": 0.82, + "grad_norm": 7.4209073294424925, + "learning_rate": 
6.699305468772481e-06, + "loss": 0.9932, + "step": 5721 + }, + { + "epoch": 0.82, + "grad_norm": 4.710080887125568, + "learning_rate": 6.6982194161957525e-06, + "loss": 0.9833, + "step": 5722 + }, + { + "epoch": 0.82, + "grad_norm": 6.63098944125086, + "learning_rate": 6.697133273040724e-06, + "loss": 1.0212, + "step": 5723 + }, + { + "epoch": 0.82, + "grad_norm": 6.775581071543158, + "learning_rate": 6.696047039365333e-06, + "loss": 0.8892, + "step": 5724 + }, + { + "epoch": 0.82, + "grad_norm": 6.273688390324919, + "learning_rate": 6.694960715227514e-06, + "loss": 0.9049, + "step": 5725 + }, + { + "epoch": 0.82, + "grad_norm": 8.80086201412812, + "learning_rate": 6.693874300685208e-06, + "loss": 1.0081, + "step": 5726 + }, + { + "epoch": 0.82, + "grad_norm": 7.380660522267481, + "learning_rate": 6.6927877957963605e-06, + "loss": 1.0887, + "step": 5727 + }, + { + "epoch": 0.82, + "grad_norm": 9.190663533629957, + "learning_rate": 6.691701200618925e-06, + "loss": 0.9604, + "step": 5728 + }, + { + "epoch": 0.82, + "grad_norm": 9.609883485443758, + "learning_rate": 6.690614515210857e-06, + "loss": 0.9293, + "step": 5729 + }, + { + "epoch": 0.82, + "grad_norm": 8.007361407433207, + "learning_rate": 6.6895277396301154e-06, + "loss": 0.9401, + "step": 5730 + }, + { + "epoch": 0.82, + "grad_norm": 6.3390499864236824, + "learning_rate": 6.688440873934668e-06, + "loss": 0.9534, + "step": 5731 + }, + { + "epoch": 0.82, + "grad_norm": 7.441108193202663, + "learning_rate": 6.6873539181824835e-06, + "loss": 0.9692, + "step": 5732 + }, + { + "epoch": 0.82, + "grad_norm": 10.654214480230744, + "learning_rate": 6.686266872431537e-06, + "loss": 0.9032, + "step": 5733 + }, + { + "epoch": 0.82, + "grad_norm": 6.820312880346582, + "learning_rate": 6.685179736739811e-06, + "loss": 0.9518, + "step": 5734 + }, + { + "epoch": 0.82, + "grad_norm": 8.1258692049934, + "learning_rate": 6.684092511165288e-06, + "loss": 1.0054, + "step": 5735 + }, + { + "epoch": 0.82, + "grad_norm": 
6.991932059304235, + "learning_rate": 6.683005195765958e-06, + "loss": 0.9616, + "step": 5736 + }, + { + "epoch": 0.82, + "grad_norm": 6.751399074327416, + "learning_rate": 6.681917790599815e-06, + "loss": 0.9002, + "step": 5737 + }, + { + "epoch": 0.82, + "grad_norm": 8.10644821916834, + "learning_rate": 6.680830295724859e-06, + "loss": 0.9636, + "step": 5738 + }, + { + "epoch": 0.82, + "grad_norm": 8.589002949820623, + "learning_rate": 6.6797427111990945e-06, + "loss": 1.056, + "step": 5739 + }, + { + "epoch": 0.82, + "grad_norm": 6.450858898766111, + "learning_rate": 6.678655037080528e-06, + "loss": 0.9679, + "step": 5740 + }, + { + "epoch": 0.82, + "grad_norm": 10.499365612559615, + "learning_rate": 6.677567273427177e-06, + "loss": 1.058, + "step": 5741 + }, + { + "epoch": 0.82, + "grad_norm": 8.088633195061796, + "learning_rate": 6.676479420297053e-06, + "loss": 0.9544, + "step": 5742 + }, + { + "epoch": 0.82, + "grad_norm": 8.430754223388057, + "learning_rate": 6.675391477748185e-06, + "loss": 0.9045, + "step": 5743 + }, + { + "epoch": 0.82, + "grad_norm": 7.290599671549821, + "learning_rate": 6.674303445838601e-06, + "loss": 0.9613, + "step": 5744 + }, + { + "epoch": 0.82, + "grad_norm": 6.906879495290908, + "learning_rate": 6.67321532462633e-06, + "loss": 0.9408, + "step": 5745 + }, + { + "epoch": 0.82, + "grad_norm": 7.8059925803113, + "learning_rate": 6.672127114169411e-06, + "loss": 0.8873, + "step": 5746 + }, + { + "epoch": 0.82, + "grad_norm": 6.579439392262897, + "learning_rate": 6.671038814525886e-06, + "loss": 0.9981, + "step": 5747 + }, + { + "epoch": 0.82, + "grad_norm": 6.342075379884625, + "learning_rate": 6.669950425753803e-06, + "loss": 1.0101, + "step": 5748 + }, + { + "epoch": 0.82, + "grad_norm": 7.930944543765223, + "learning_rate": 6.668861947911212e-06, + "loss": 0.981, + "step": 5749 + }, + { + "epoch": 0.82, + "grad_norm": 7.240976140378858, + "learning_rate": 6.66777338105617e-06, + "loss": 0.9372, + "step": 5750 + }, + { + "epoch": 
0.82, + "grad_norm": 6.310977017311938, + "learning_rate": 6.666684725246738e-06, + "loss": 1.0309, + "step": 5751 + }, + { + "epoch": 0.82, + "grad_norm": 7.228731745489389, + "learning_rate": 6.665595980540982e-06, + "loss": 0.9834, + "step": 5752 + }, + { + "epoch": 0.82, + "grad_norm": 7.46585353297545, + "learning_rate": 6.664507146996974e-06, + "loss": 0.9514, + "step": 5753 + }, + { + "epoch": 0.82, + "grad_norm": 7.023191458714461, + "learning_rate": 6.663418224672786e-06, + "loss": 1.0492, + "step": 5754 + }, + { + "epoch": 0.82, + "grad_norm": 6.261583390543906, + "learning_rate": 6.662329213626501e-06, + "loss": 0.9506, + "step": 5755 + }, + { + "epoch": 0.82, + "grad_norm": 13.706490043545294, + "learning_rate": 6.661240113916203e-06, + "loss": 1.0467, + "step": 5756 + }, + { + "epoch": 0.82, + "grad_norm": 6.935953415280247, + "learning_rate": 6.6601509255999805e-06, + "loss": 0.913, + "step": 5757 + }, + { + "epoch": 0.82, + "grad_norm": 7.408896293128245, + "learning_rate": 6.6590616487359295e-06, + "loss": 1.0207, + "step": 5758 + }, + { + "epoch": 0.82, + "grad_norm": 5.858012756959656, + "learning_rate": 6.657972283382148e-06, + "loss": 0.8432, + "step": 5759 + }, + { + "epoch": 0.82, + "grad_norm": 7.538106345611639, + "learning_rate": 6.656882829596741e-06, + "loss": 1.0429, + "step": 5760 + }, + { + "epoch": 0.82, + "grad_norm": 6.88883947220644, + "learning_rate": 6.6557932874378125e-06, + "loss": 1.0015, + "step": 5761 + }, + { + "epoch": 0.82, + "grad_norm": 7.553402514663336, + "learning_rate": 6.654703656963482e-06, + "loss": 0.9559, + "step": 5762 + }, + { + "epoch": 0.82, + "grad_norm": 7.599343952551496, + "learning_rate": 6.653613938231864e-06, + "loss": 0.9852, + "step": 5763 + }, + { + "epoch": 0.82, + "grad_norm": 57.96839187651543, + "learning_rate": 6.65252413130108e-06, + "loss": 1.0, + "step": 5764 + }, + { + "epoch": 0.82, + "grad_norm": 11.174882365698046, + "learning_rate": 6.651434236229259e-06, + "loss": 0.9927, + "step": 
5765 + }, + { + "epoch": 0.82, + "grad_norm": 9.139278319208868, + "learning_rate": 6.650344253074535e-06, + "loss": 0.9799, + "step": 5766 + }, + { + "epoch": 0.82, + "grad_norm": 7.940564846094357, + "learning_rate": 6.64925418189504e-06, + "loss": 1.0609, + "step": 5767 + }, + { + "epoch": 0.82, + "grad_norm": 8.07138570308956, + "learning_rate": 6.648164022748917e-06, + "loss": 1.0521, + "step": 5768 + }, + { + "epoch": 0.82, + "grad_norm": 9.041104690762173, + "learning_rate": 6.647073775694315e-06, + "loss": 1.0295, + "step": 5769 + }, + { + "epoch": 0.82, + "grad_norm": 7.218209688050021, + "learning_rate": 6.645983440789381e-06, + "loss": 1.101, + "step": 5770 + }, + { + "epoch": 0.82, + "grad_norm": 7.438354423418402, + "learning_rate": 6.644893018092271e-06, + "loss": 0.9873, + "step": 5771 + }, + { + "epoch": 0.82, + "grad_norm": 9.26132489631194, + "learning_rate": 6.643802507661148e-06, + "loss": 0.9921, + "step": 5772 + }, + { + "epoch": 0.82, + "grad_norm": 7.983339019599481, + "learning_rate": 6.6427119095541745e-06, + "loss": 0.9597, + "step": 5773 + }, + { + "epoch": 0.82, + "grad_norm": 8.143805162879238, + "learning_rate": 6.641621223829519e-06, + "loss": 0.9815, + "step": 5774 + }, + { + "epoch": 0.82, + "grad_norm": 8.059653094606503, + "learning_rate": 6.640530450545358e-06, + "loss": 1.0015, + "step": 5775 + }, + { + "epoch": 0.82, + "grad_norm": 9.371859044271934, + "learning_rate": 6.63943958975987e-06, + "loss": 0.9591, + "step": 5776 + }, + { + "epoch": 0.82, + "grad_norm": 8.341388482355923, + "learning_rate": 6.6383486415312355e-06, + "loss": 0.9799, + "step": 5777 + }, + { + "epoch": 0.82, + "grad_norm": 6.899189246754117, + "learning_rate": 6.637257605917646e-06, + "loss": 0.9573, + "step": 5778 + }, + { + "epoch": 0.82, + "grad_norm": 8.17676151522683, + "learning_rate": 6.636166482977294e-06, + "loss": 0.9326, + "step": 5779 + }, + { + "epoch": 0.82, + "grad_norm": 8.404854159640353, + "learning_rate": 6.635075272768374e-06, + 
"loss": 0.9692, + "step": 5780 + }, + { + "epoch": 0.82, + "grad_norm": 8.97186206391505, + "learning_rate": 6.633983975349092e-06, + "loss": 1.0378, + "step": 5781 + }, + { + "epoch": 0.82, + "grad_norm": 8.38489872543333, + "learning_rate": 6.632892590777653e-06, + "loss": 0.945, + "step": 5782 + }, + { + "epoch": 0.82, + "grad_norm": 7.408199851803845, + "learning_rate": 6.6318011191122676e-06, + "loss": 0.9639, + "step": 5783 + }, + { + "epoch": 0.82, + "grad_norm": 6.861757068074461, + "learning_rate": 6.630709560411153e-06, + "loss": 1.0664, + "step": 5784 + }, + { + "epoch": 0.83, + "grad_norm": 8.591128180821789, + "learning_rate": 6.629617914732529e-06, + "loss": 1.0078, + "step": 5785 + }, + { + "epoch": 0.83, + "grad_norm": 6.568843978146606, + "learning_rate": 6.628526182134623e-06, + "loss": 1.0149, + "step": 5786 + }, + { + "epoch": 0.83, + "grad_norm": 8.383702284603213, + "learning_rate": 6.627434362675661e-06, + "loss": 0.9962, + "step": 5787 + }, + { + "epoch": 0.83, + "grad_norm": 6.065991206942987, + "learning_rate": 6.626342456413881e-06, + "loss": 0.9372, + "step": 5788 + }, + { + "epoch": 0.83, + "grad_norm": 6.738232621046829, + "learning_rate": 6.625250463407523e-06, + "loss": 0.9071, + "step": 5789 + }, + { + "epoch": 0.83, + "grad_norm": 6.760057069370876, + "learning_rate": 6.624158383714827e-06, + "loss": 0.9291, + "step": 5790 + }, + { + "epoch": 0.83, + "grad_norm": 7.265183986565752, + "learning_rate": 6.6230662173940456e-06, + "loss": 1.0192, + "step": 5791 + }, + { + "epoch": 0.83, + "grad_norm": 6.9595450582998994, + "learning_rate": 6.62197396450343e-06, + "loss": 0.9929, + "step": 5792 + }, + { + "epoch": 0.83, + "grad_norm": 6.661880765343488, + "learning_rate": 6.620881625101237e-06, + "loss": 0.9917, + "step": 5793 + }, + { + "epoch": 0.83, + "grad_norm": 7.097938040454151, + "learning_rate": 6.619789199245728e-06, + "loss": 1.0048, + "step": 5794 + }, + { + "epoch": 0.83, + "grad_norm": 7.184740727466115, + "learning_rate": 
6.618696686995174e-06, + "loss": 0.9564, + "step": 5795 + }, + { + "epoch": 0.83, + "grad_norm": 6.892196074763151, + "learning_rate": 6.617604088407845e-06, + "loss": 0.9614, + "step": 5796 + }, + { + "epoch": 0.83, + "grad_norm": 8.988360108263427, + "learning_rate": 6.616511403542016e-06, + "loss": 1.0329, + "step": 5797 + }, + { + "epoch": 0.83, + "grad_norm": 5.614652532846628, + "learning_rate": 6.615418632455968e-06, + "loss": 0.9271, + "step": 5798 + }, + { + "epoch": 0.83, + "grad_norm": 8.708124704471853, + "learning_rate": 6.614325775207986e-06, + "loss": 0.9199, + "step": 5799 + }, + { + "epoch": 0.83, + "grad_norm": 5.645557125060041, + "learning_rate": 6.613232831856361e-06, + "loss": 0.9847, + "step": 5800 + }, + { + "epoch": 0.83, + "grad_norm": 7.084530492525872, + "learning_rate": 6.612139802459388e-06, + "loss": 1.0679, + "step": 5801 + }, + { + "epoch": 0.83, + "grad_norm": 6.548694518777035, + "learning_rate": 6.611046687075365e-06, + "loss": 0.8891, + "step": 5802 + }, + { + "epoch": 0.83, + "grad_norm": 6.439045223856554, + "learning_rate": 6.609953485762596e-06, + "loss": 0.9378, + "step": 5803 + }, + { + "epoch": 0.83, + "grad_norm": 6.115758020589994, + "learning_rate": 6.60886019857939e-06, + "loss": 1.0448, + "step": 5804 + }, + { + "epoch": 0.83, + "grad_norm": 6.8230510396367166, + "learning_rate": 6.607766825584058e-06, + "loss": 0.9684, + "step": 5805 + }, + { + "epoch": 0.83, + "grad_norm": 7.935763086656515, + "learning_rate": 6.6066733668349195e-06, + "loss": 0.9618, + "step": 5806 + }, + { + "epoch": 0.83, + "grad_norm": 9.045334443256174, + "learning_rate": 6.605579822390296e-06, + "loss": 0.9497, + "step": 5807 + }, + { + "epoch": 0.83, + "grad_norm": 7.768242963129872, + "learning_rate": 6.604486192308515e-06, + "loss": 0.9899, + "step": 5808 + }, + { + "epoch": 0.83, + "grad_norm": 8.10665826062694, + "learning_rate": 6.603392476647903e-06, + "loss": 0.9231, + "step": 5809 + }, + { + "epoch": 0.83, + "grad_norm": 
6.865506433670824, + "learning_rate": 6.602298675466803e-06, + "loss": 0.925, + "step": 5810 + }, + { + "epoch": 0.83, + "grad_norm": 6.0551606119955, + "learning_rate": 6.601204788823551e-06, + "loss": 0.9767, + "step": 5811 + }, + { + "epoch": 0.83, + "grad_norm": 7.051355774751378, + "learning_rate": 6.600110816776491e-06, + "loss": 1.0023, + "step": 5812 + }, + { + "epoch": 0.83, + "grad_norm": 8.716271330493234, + "learning_rate": 6.599016759383974e-06, + "loss": 0.9831, + "step": 5813 + }, + { + "epoch": 0.83, + "grad_norm": 8.077349852883778, + "learning_rate": 6.597922616704355e-06, + "loss": 0.9173, + "step": 5814 + }, + { + "epoch": 0.83, + "grad_norm": 9.816565465451859, + "learning_rate": 6.596828388795993e-06, + "loss": 0.9575, + "step": 5815 + }, + { + "epoch": 0.83, + "grad_norm": 7.862232944753829, + "learning_rate": 6.5957340757172474e-06, + "loss": 0.9752, + "step": 5816 + }, + { + "epoch": 0.83, + "grad_norm": 9.385006602945358, + "learning_rate": 6.594639677526489e-06, + "loss": 0.9752, + "step": 5817 + }, + { + "epoch": 0.83, + "grad_norm": 7.566360961004494, + "learning_rate": 6.593545194282087e-06, + "loss": 0.9409, + "step": 5818 + }, + { + "epoch": 0.83, + "grad_norm": 9.588357674522056, + "learning_rate": 6.592450626042421e-06, + "loss": 0.9183, + "step": 5819 + }, + { + "epoch": 0.83, + "grad_norm": 6.301380761052367, + "learning_rate": 6.591355972865873e-06, + "loss": 0.9701, + "step": 5820 + }, + { + "epoch": 0.83, + "grad_norm": 7.59270631531273, + "learning_rate": 6.590261234810825e-06, + "loss": 1.008, + "step": 5821 + }, + { + "epoch": 0.83, + "grad_norm": 7.912858616596038, + "learning_rate": 6.5891664119356705e-06, + "loss": 0.9271, + "step": 5822 + }, + { + "epoch": 0.83, + "grad_norm": 12.453259749726117, + "learning_rate": 6.588071504298802e-06, + "loss": 1.0156, + "step": 5823 + }, + { + "epoch": 0.83, + "grad_norm": 17.94812821329239, + "learning_rate": 6.58697651195862e-06, + "loss": 0.9289, + "step": 5824 + }, + { + 
"epoch": 0.83, + "grad_norm": 8.680245818547448, + "learning_rate": 6.585881434973529e-06, + "loss": 0.8665, + "step": 5825 + }, + { + "epoch": 0.83, + "grad_norm": 7.793939792208754, + "learning_rate": 6.5847862734019355e-06, + "loss": 0.8839, + "step": 5826 + }, + { + "epoch": 0.83, + "grad_norm": 8.26465866469428, + "learning_rate": 6.583691027302255e-06, + "loss": 1.0304, + "step": 5827 + }, + { + "epoch": 0.83, + "grad_norm": 7.439397766851101, + "learning_rate": 6.582595696732901e-06, + "loss": 1.034, + "step": 5828 + }, + { + "epoch": 0.83, + "grad_norm": 5.3723358696040675, + "learning_rate": 6.581500281752299e-06, + "loss": 0.9832, + "step": 5829 + }, + { + "epoch": 0.83, + "grad_norm": 5.805201133458014, + "learning_rate": 6.580404782418875e-06, + "loss": 0.9816, + "step": 5830 + }, + { + "epoch": 0.83, + "grad_norm": 6.27651090624892, + "learning_rate": 6.579309198791059e-06, + "loss": 0.9928, + "step": 5831 + }, + { + "epoch": 0.83, + "grad_norm": 7.1827833898915, + "learning_rate": 6.578213530927284e-06, + "loss": 1.0314, + "step": 5832 + }, + { + "epoch": 0.83, + "grad_norm": 8.55211180902911, + "learning_rate": 6.577117778885996e-06, + "loss": 0.8599, + "step": 5833 + }, + { + "epoch": 0.83, + "grad_norm": 8.39752486019873, + "learning_rate": 6.576021942725632e-06, + "loss": 0.9949, + "step": 5834 + }, + { + "epoch": 0.83, + "grad_norm": 8.020389149384695, + "learning_rate": 6.574926022504646e-06, + "loss": 0.9918, + "step": 5835 + }, + { + "epoch": 0.83, + "grad_norm": 7.379551663922093, + "learning_rate": 6.573830018281491e-06, + "loss": 0.9536, + "step": 5836 + }, + { + "epoch": 0.83, + "grad_norm": 7.419726308814443, + "learning_rate": 6.572733930114621e-06, + "loss": 0.942, + "step": 5837 + }, + { + "epoch": 0.83, + "grad_norm": 6.735655943942924, + "learning_rate": 6.5716377580625016e-06, + "loss": 0.9535, + "step": 5838 + }, + { + "epoch": 0.83, + "grad_norm": 6.52056961759068, + "learning_rate": 6.5705415021836e-06, + "loss": 1.01, + "step": 
5839 + }, + { + "epoch": 0.83, + "grad_norm": 6.315863565736693, + "learning_rate": 6.5694451625363864e-06, + "loss": 0.8517, + "step": 5840 + }, + { + "epoch": 0.83, + "grad_norm": 7.5728524339200725, + "learning_rate": 6.568348739179336e-06, + "loss": 0.9208, + "step": 5841 + }, + { + "epoch": 0.83, + "grad_norm": 5.935986745039494, + "learning_rate": 6.567252232170929e-06, + "loss": 0.9273, + "step": 5842 + }, + { + "epoch": 0.83, + "grad_norm": 6.925608797000203, + "learning_rate": 6.56615564156965e-06, + "loss": 0.9245, + "step": 5843 + }, + { + "epoch": 0.83, + "grad_norm": 6.693634412304057, + "learning_rate": 6.565058967433989e-06, + "loss": 0.9707, + "step": 5844 + }, + { + "epoch": 0.83, + "grad_norm": 6.921747271130119, + "learning_rate": 6.563962209822439e-06, + "loss": 0.9593, + "step": 5845 + }, + { + "epoch": 0.83, + "grad_norm": 9.474354593003897, + "learning_rate": 6.5628653687935e-06, + "loss": 0.9832, + "step": 5846 + }, + { + "epoch": 0.83, + "grad_norm": 6.258907879497539, + "learning_rate": 6.5617684444056685e-06, + "loss": 0.9654, + "step": 5847 + }, + { + "epoch": 0.83, + "grad_norm": 7.3102500079355925, + "learning_rate": 6.560671436717457e-06, + "loss": 0.9508, + "step": 5848 + }, + { + "epoch": 0.83, + "grad_norm": 8.988596552671433, + "learning_rate": 6.5595743457873775e-06, + "loss": 0.9238, + "step": 5849 + }, + { + "epoch": 0.83, + "grad_norm": 9.90304607908211, + "learning_rate": 6.558477171673941e-06, + "loss": 1.0386, + "step": 5850 + }, + { + "epoch": 0.83, + "grad_norm": 4.936172189653762, + "learning_rate": 6.557379914435671e-06, + "loss": 0.9901, + "step": 5851 + }, + { + "epoch": 0.83, + "grad_norm": 7.09270178482959, + "learning_rate": 6.556282574131092e-06, + "loss": 0.9933, + "step": 5852 + }, + { + "epoch": 0.83, + "grad_norm": 8.09048504346991, + "learning_rate": 6.555185150818731e-06, + "loss": 1.0218, + "step": 5853 + }, + { + "epoch": 0.83, + "grad_norm": 7.626623024135975, + "learning_rate": 6.554087644557124e-06, + 
"loss": 0.923, + "step": 5854 + }, + { + "epoch": 0.83, + "grad_norm": 7.0429533212315425, + "learning_rate": 6.552990055404808e-06, + "loss": 0.934, + "step": 5855 + }, + { + "epoch": 0.84, + "grad_norm": 8.735890386476893, + "learning_rate": 6.551892383420324e-06, + "loss": 1.0236, + "step": 5856 + }, + { + "epoch": 0.84, + "grad_norm": 8.689562670505824, + "learning_rate": 6.550794628662222e-06, + "loss": 0.959, + "step": 5857 + }, + { + "epoch": 0.84, + "grad_norm": 8.395279301877231, + "learning_rate": 6.5496967911890485e-06, + "loss": 0.893, + "step": 5858 + }, + { + "epoch": 0.84, + "grad_norm": 7.526871226654918, + "learning_rate": 6.548598871059365e-06, + "loss": 1.0238, + "step": 5859 + }, + { + "epoch": 0.84, + "grad_norm": 9.833193384763938, + "learning_rate": 6.547500868331727e-06, + "loss": 0.9526, + "step": 5860 + }, + { + "epoch": 0.84, + "grad_norm": 6.756745497084771, + "learning_rate": 6.546402783064698e-06, + "loss": 0.9404, + "step": 5861 + }, + { + "epoch": 0.84, + "grad_norm": 9.811774283754255, + "learning_rate": 6.545304615316852e-06, + "loss": 0.9056, + "step": 5862 + }, + { + "epoch": 0.84, + "grad_norm": 8.602116505680936, + "learning_rate": 6.544206365146757e-06, + "loss": 0.9266, + "step": 5863 + }, + { + "epoch": 0.84, + "grad_norm": 8.629993682625654, + "learning_rate": 6.543108032612994e-06, + "loss": 0.8776, + "step": 5864 + }, + { + "epoch": 0.84, + "grad_norm": 5.622806615210316, + "learning_rate": 6.542009617774145e-06, + "loss": 0.8958, + "step": 5865 + }, + { + "epoch": 0.84, + "grad_norm": 7.055386603552718, + "learning_rate": 6.540911120688792e-06, + "loss": 0.8816, + "step": 5866 + }, + { + "epoch": 0.84, + "grad_norm": 7.490679477356835, + "learning_rate": 6.539812541415531e-06, + "loss": 0.9803, + "step": 5867 + }, + { + "epoch": 0.84, + "grad_norm": 6.551896923848565, + "learning_rate": 6.5387138800129555e-06, + "loss": 0.897, + "step": 5868 + }, + { + "epoch": 0.84, + "grad_norm": 9.05076889573273, + "learning_rate": 
6.537615136539663e-06, + "loss": 0.9226, + "step": 5869 + }, + { + "epoch": 0.84, + "grad_norm": 6.584885653424391, + "learning_rate": 6.5365163110542605e-06, + "loss": 1.06, + "step": 5870 + }, + { + "epoch": 0.84, + "grad_norm": 7.045666793561473, + "learning_rate": 6.535417403615355e-06, + "loss": 0.9336, + "step": 5871 + }, + { + "epoch": 0.84, + "grad_norm": 7.601181878058769, + "learning_rate": 6.534318414281558e-06, + "loss": 0.9855, + "step": 5872 + }, + { + "epoch": 0.84, + "grad_norm": 7.950210571369599, + "learning_rate": 6.533219343111487e-06, + "loss": 0.9157, + "step": 5873 + }, + { + "epoch": 0.84, + "grad_norm": 6.533931615840258, + "learning_rate": 6.5321201901637665e-06, + "loss": 0.937, + "step": 5874 + }, + { + "epoch": 0.84, + "grad_norm": 7.359404884597745, + "learning_rate": 6.531020955497019e-06, + "loss": 1.0349, + "step": 5875 + }, + { + "epoch": 0.84, + "grad_norm": 8.333078910940252, + "learning_rate": 6.5299216391698735e-06, + "loss": 0.9874, + "step": 5876 + }, + { + "epoch": 0.84, + "grad_norm": 7.887747727235067, + "learning_rate": 6.528822241240967e-06, + "loss": 0.9938, + "step": 5877 + }, + { + "epoch": 0.84, + "grad_norm": 11.46308807351107, + "learning_rate": 6.52772276176894e-06, + "loss": 0.8952, + "step": 5878 + }, + { + "epoch": 0.84, + "grad_norm": 7.89083455198176, + "learning_rate": 6.526623200812432e-06, + "loss": 0.9397, + "step": 5879 + }, + { + "epoch": 0.84, + "grad_norm": 5.2444475691323165, + "learning_rate": 6.525523558430089e-06, + "loss": 1.0135, + "step": 5880 + }, + { + "epoch": 0.84, + "grad_norm": 6.776270432261819, + "learning_rate": 6.52442383468057e-06, + "loss": 0.9313, + "step": 5881 + }, + { + "epoch": 0.84, + "grad_norm": 6.968952396409455, + "learning_rate": 6.523324029622524e-06, + "loss": 0.88, + "step": 5882 + }, + { + "epoch": 0.84, + "grad_norm": 7.703582349327837, + "learning_rate": 6.522224143314616e-06, + "loss": 0.9643, + "step": 5883 + }, + { + "epoch": 0.84, + "grad_norm": 
6.136139241690351, + "learning_rate": 6.52112417581551e-06, + "loss": 1.0735, + "step": 5884 + }, + { + "epoch": 0.84, + "grad_norm": 6.893861775788539, + "learning_rate": 6.5200241271838724e-06, + "loss": 0.996, + "step": 5885 + }, + { + "epoch": 0.84, + "grad_norm": 8.257709413437091, + "learning_rate": 6.518923997478379e-06, + "loss": 1.004, + "step": 5886 + }, + { + "epoch": 0.84, + "grad_norm": 7.4942700724945785, + "learning_rate": 6.51782378675771e-06, + "loss": 1.0173, + "step": 5887 + }, + { + "epoch": 0.84, + "grad_norm": 6.7850261292477505, + "learning_rate": 6.5167234950805435e-06, + "loss": 0.9613, + "step": 5888 + }, + { + "epoch": 0.84, + "grad_norm": 9.505469276951926, + "learning_rate": 6.5156231225055674e-06, + "loss": 1.1145, + "step": 5889 + }, + { + "epoch": 0.84, + "grad_norm": 7.880426133769793, + "learning_rate": 6.514522669091473e-06, + "loss": 0.9372, + "step": 5890 + }, + { + "epoch": 0.84, + "grad_norm": 6.847335558272738, + "learning_rate": 6.513422134896955e-06, + "loss": 0.8965, + "step": 5891 + }, + { + "epoch": 0.84, + "grad_norm": 5.856283485591732, + "learning_rate": 6.5123215199807135e-06, + "loss": 0.966, + "step": 5892 + }, + { + "epoch": 0.84, + "grad_norm": 5.833866748826879, + "learning_rate": 6.511220824401451e-06, + "loss": 1.0323, + "step": 5893 + }, + { + "epoch": 0.84, + "grad_norm": 5.73801328613292, + "learning_rate": 6.510120048217878e-06, + "loss": 0.8918, + "step": 5894 + }, + { + "epoch": 0.84, + "grad_norm": 7.949695849109109, + "learning_rate": 6.509019191488701e-06, + "loss": 0.9601, + "step": 5895 + }, + { + "epoch": 0.84, + "grad_norm": 9.675754642360953, + "learning_rate": 6.507918254272644e-06, + "loss": 0.9128, + "step": 5896 + }, + { + "epoch": 0.84, + "grad_norm": 7.137440301284355, + "learning_rate": 6.506817236628424e-06, + "loss": 1.0079, + "step": 5897 + }, + { + "epoch": 0.84, + "grad_norm": 5.562994198005793, + "learning_rate": 6.505716138614767e-06, + "loss": 0.9156, + "step": 5898 + }, + { + 
"epoch": 0.84, + "grad_norm": 6.388675613837371, + "learning_rate": 6.504614960290401e-06, + "loss": 0.9597, + "step": 5899 + }, + { + "epoch": 0.84, + "grad_norm": 8.275814300850762, + "learning_rate": 6.5035137017140624e-06, + "loss": 0.9146, + "step": 5900 + }, + { + "epoch": 0.84, + "grad_norm": 8.612080365826614, + "learning_rate": 6.502412362944488e-06, + "loss": 1.0287, + "step": 5901 + }, + { + "epoch": 0.84, + "grad_norm": 7.539632435372789, + "learning_rate": 6.50131094404042e-06, + "loss": 0.9307, + "step": 5902 + }, + { + "epoch": 0.84, + "grad_norm": 7.318794222290295, + "learning_rate": 6.5002094450606055e-06, + "loss": 0.987, + "step": 5903 + }, + { + "epoch": 0.84, + "grad_norm": 6.410831586083219, + "learning_rate": 6.499107866063794e-06, + "loss": 0.9939, + "step": 5904 + }, + { + "epoch": 0.84, + "grad_norm": 6.967279930404857, + "learning_rate": 6.4980062071087426e-06, + "loss": 0.9955, + "step": 5905 + }, + { + "epoch": 0.84, + "grad_norm": 7.396870419558846, + "learning_rate": 6.496904468254211e-06, + "loss": 0.9703, + "step": 5906 + }, + { + "epoch": 0.84, + "grad_norm": 5.9374640516436745, + "learning_rate": 6.49580264955896e-06, + "loss": 0.971, + "step": 5907 + }, + { + "epoch": 0.84, + "grad_norm": 7.917643827688961, + "learning_rate": 6.4947007510817596e-06, + "loss": 0.9285, + "step": 5908 + }, + { + "epoch": 0.84, + "grad_norm": 8.077994033158625, + "learning_rate": 6.493598772881382e-06, + "loss": 0.9251, + "step": 5909 + }, + { + "epoch": 0.84, + "grad_norm": 7.855232434472645, + "learning_rate": 6.492496715016603e-06, + "loss": 0.9599, + "step": 5910 + }, + { + "epoch": 0.84, + "grad_norm": 6.988278455860967, + "learning_rate": 6.491394577546204e-06, + "loss": 0.8739, + "step": 5911 + }, + { + "epoch": 0.84, + "grad_norm": 9.24482783788726, + "learning_rate": 6.4902923605289684e-06, + "loss": 0.9957, + "step": 5912 + }, + { + "epoch": 0.84, + "grad_norm": 5.559501339541358, + "learning_rate": 6.489190064023688e-06, + "loss": 0.9103, 
+ "step": 5913 + }, + { + "epoch": 0.84, + "grad_norm": 6.493560140809792, + "learning_rate": 6.488087688089153e-06, + "loss": 0.9987, + "step": 5914 + }, + { + "epoch": 0.84, + "grad_norm": 8.918137879415028, + "learning_rate": 6.486985232784164e-06, + "loss": 0.9714, + "step": 5915 + }, + { + "epoch": 0.84, + "grad_norm": 6.198640820725265, + "learning_rate": 6.4858826981675225e-06, + "loss": 0.8676, + "step": 5916 + }, + { + "epoch": 0.84, + "grad_norm": 7.571660279768595, + "learning_rate": 6.484780084298033e-06, + "loss": 0.9219, + "step": 5917 + }, + { + "epoch": 0.84, + "grad_norm": 6.80800206900926, + "learning_rate": 6.483677391234507e-06, + "loss": 0.8716, + "step": 5918 + }, + { + "epoch": 0.84, + "grad_norm": 6.904910970919684, + "learning_rate": 6.482574619035758e-06, + "loss": 1.0019, + "step": 5919 + }, + { + "epoch": 0.84, + "grad_norm": 5.154659402838612, + "learning_rate": 6.481471767760606e-06, + "loss": 0.9125, + "step": 5920 + }, + { + "epoch": 0.84, + "grad_norm": 7.585857752218509, + "learning_rate": 6.480368837467874e-06, + "loss": 0.9191, + "step": 5921 + }, + { + "epoch": 0.84, + "grad_norm": 6.669938480362112, + "learning_rate": 6.47926582821639e-06, + "loss": 0.9407, + "step": 5922 + }, + { + "epoch": 0.84, + "grad_norm": 7.123019084709178, + "learning_rate": 6.478162740064982e-06, + "loss": 1.033, + "step": 5923 + }, + { + "epoch": 0.84, + "grad_norm": 7.40652243532434, + "learning_rate": 6.477059573072488e-06, + "loss": 1.0079, + "step": 5924 + }, + { + "epoch": 0.84, + "grad_norm": 7.199585813251833, + "learning_rate": 6.47595632729775e-06, + "loss": 1.0791, + "step": 5925 + }, + { + "epoch": 0.85, + "grad_norm": 6.9040597951664004, + "learning_rate": 6.4748530027996084e-06, + "loss": 0.8843, + "step": 5926 + }, + { + "epoch": 0.85, + "grad_norm": 7.61211798430079, + "learning_rate": 6.473749599636914e-06, + "loss": 0.9147, + "step": 5927 + }, + { + "epoch": 0.85, + "grad_norm": 6.804288744380264, + "learning_rate": 
6.472646117868516e-06, + "loss": 1.0093, + "step": 5928 + }, + { + "epoch": 0.85, + "grad_norm": 7.788657500921742, + "learning_rate": 6.471542557553274e-06, + "loss": 0.9925, + "step": 5929 + }, + { + "epoch": 0.85, + "grad_norm": 8.187479832710899, + "learning_rate": 6.470438918750049e-06, + "loss": 0.9635, + "step": 5930 + }, + { + "epoch": 0.85, + "grad_norm": 7.845931103978723, + "learning_rate": 6.469335201517705e-06, + "loss": 0.9601, + "step": 5931 + }, + { + "epoch": 0.85, + "grad_norm": 8.584927267949215, + "learning_rate": 6.468231405915111e-06, + "loss": 0.9584, + "step": 5932 + }, + { + "epoch": 0.85, + "grad_norm": 6.752988964524523, + "learning_rate": 6.46712753200114e-06, + "loss": 0.9269, + "step": 5933 + }, + { + "epoch": 0.85, + "grad_norm": 5.58089080668602, + "learning_rate": 6.46602357983467e-06, + "loss": 0.9778, + "step": 5934 + }, + { + "epoch": 0.85, + "grad_norm": 7.398747167075255, + "learning_rate": 6.464919549474584e-06, + "loss": 0.8622, + "step": 5935 + }, + { + "epoch": 0.85, + "grad_norm": 5.283569240091941, + "learning_rate": 6.463815440979767e-06, + "loss": 0.8902, + "step": 5936 + }, + { + "epoch": 0.85, + "grad_norm": 7.933633745224791, + "learning_rate": 6.462711254409109e-06, + "loss": 1.0074, + "step": 5937 + }, + { + "epoch": 0.85, + "grad_norm": 6.53732911913593, + "learning_rate": 6.4616069898215025e-06, + "loss": 0.9575, + "step": 5938 + }, + { + "epoch": 0.85, + "grad_norm": 7.324149267876385, + "learning_rate": 6.460502647275849e-06, + "loss": 0.9699, + "step": 5939 + }, + { + "epoch": 0.85, + "grad_norm": 8.394849107086381, + "learning_rate": 6.45939822683105e-06, + "loss": 0.8758, + "step": 5940 + }, + { + "epoch": 0.85, + "grad_norm": 7.344028907489541, + "learning_rate": 6.45829372854601e-06, + "loss": 0.9735, + "step": 5941 + }, + { + "epoch": 0.85, + "grad_norm": 7.578677511374674, + "learning_rate": 6.457189152479644e-06, + "loss": 0.928, + "step": 5942 + }, + { + "epoch": 0.85, + "grad_norm": 6.927884843098724, 
+ "learning_rate": 6.456084498690862e-06, + "loss": 0.9586, + "step": 5943 + }, + { + "epoch": 0.85, + "grad_norm": 7.330934755351843, + "learning_rate": 6.45497976723859e-06, + "loss": 0.9496, + "step": 5944 + }, + { + "epoch": 0.85, + "grad_norm": 10.234560891799164, + "learning_rate": 6.453874958181744e-06, + "loss": 0.9662, + "step": 5945 + }, + { + "epoch": 0.85, + "grad_norm": 5.881831843570916, + "learning_rate": 6.452770071579255e-06, + "loss": 0.9348, + "step": 5946 + }, + { + "epoch": 0.85, + "grad_norm": 6.848715656548259, + "learning_rate": 6.451665107490055e-06, + "loss": 0.9916, + "step": 5947 + }, + { + "epoch": 0.85, + "grad_norm": 10.634825332600315, + "learning_rate": 6.450560065973077e-06, + "loss": 1.0119, + "step": 5948 + }, + { + "epoch": 0.85, + "grad_norm": 6.823894005822091, + "learning_rate": 6.449454947087265e-06, + "loss": 0.9142, + "step": 5949 + }, + { + "epoch": 0.85, + "grad_norm": 7.196170712852626, + "learning_rate": 6.448349750891559e-06, + "loss": 0.8968, + "step": 5950 + }, + { + "epoch": 0.85, + "grad_norm": 6.641359941724805, + "learning_rate": 6.44724447744491e-06, + "loss": 0.9514, + "step": 5951 + }, + { + "epoch": 0.85, + "grad_norm": 8.788056133040678, + "learning_rate": 6.446139126806268e-06, + "loss": 0.8926, + "step": 5952 + }, + { + "epoch": 0.85, + "grad_norm": 6.12979968093846, + "learning_rate": 6.445033699034591e-06, + "loss": 0.9848, + "step": 5953 + }, + { + "epoch": 0.85, + "grad_norm": 6.788460308683707, + "learning_rate": 6.44392819418884e-06, + "loss": 0.9715, + "step": 5954 + }, + { + "epoch": 0.85, + "grad_norm": 9.806390749452929, + "learning_rate": 6.442822612327977e-06, + "loss": 0.945, + "step": 5955 + }, + { + "epoch": 0.85, + "grad_norm": 6.840243318140614, + "learning_rate": 6.441716953510972e-06, + "loss": 0.9229, + "step": 5956 + }, + { + "epoch": 0.85, + "grad_norm": 5.558136718314965, + "learning_rate": 6.440611217796797e-06, + "loss": 1.0302, + "step": 5957 + }, + { + "epoch": 0.85, + 
"grad_norm": 6.710701793039865, + "learning_rate": 6.439505405244431e-06, + "loss": 0.9993, + "step": 5958 + }, + { + "epoch": 0.85, + "grad_norm": 6.697354852471928, + "learning_rate": 6.438399515912852e-06, + "loss": 0.9262, + "step": 5959 + }, + { + "epoch": 0.85, + "grad_norm": 7.624235933258866, + "learning_rate": 6.4372935498610475e-06, + "loss": 0.971, + "step": 5960 + }, + { + "epoch": 0.85, + "grad_norm": 8.840765990774633, + "learning_rate": 6.436187507148007e-06, + "loss": 1.0032, + "step": 5961 + }, + { + "epoch": 0.85, + "grad_norm": 7.9371140751755425, + "learning_rate": 6.435081387832721e-06, + "loss": 0.9046, + "step": 5962 + }, + { + "epoch": 0.85, + "grad_norm": 6.9862132897509275, + "learning_rate": 6.433975191974189e-06, + "loss": 0.9638, + "step": 5963 + }, + { + "epoch": 0.85, + "grad_norm": 8.349713851884355, + "learning_rate": 6.432868919631412e-06, + "loss": 0.9179, + "step": 5964 + }, + { + "epoch": 0.85, + "grad_norm": 7.212606623895222, + "learning_rate": 6.431762570863395e-06, + "loss": 0.8526, + "step": 5965 + }, + { + "epoch": 0.85, + "grad_norm": 9.179975878739873, + "learning_rate": 6.430656145729147e-06, + "loss": 0.9625, + "step": 5966 + }, + { + "epoch": 0.85, + "grad_norm": 7.830694781352321, + "learning_rate": 6.429549644287683e-06, + "loss": 0.9094, + "step": 5967 + }, + { + "epoch": 0.85, + "grad_norm": 7.623174337046981, + "learning_rate": 6.42844306659802e-06, + "loss": 0.9703, + "step": 5968 + }, + { + "epoch": 0.85, + "grad_norm": 7.735790076604665, + "learning_rate": 6.427336412719181e-06, + "loss": 0.8509, + "step": 5969 + }, + { + "epoch": 0.85, + "grad_norm": 7.016473359029781, + "learning_rate": 6.42622968271019e-06, + "loss": 0.8953, + "step": 5970 + }, + { + "epoch": 0.85, + "grad_norm": 7.267317308222935, + "learning_rate": 6.4251228766300766e-06, + "loss": 0.9127, + "step": 5971 + }, + { + "epoch": 0.85, + "grad_norm": 7.925597409400072, + "learning_rate": 6.424015994537877e-06, + "loss": 1.0489, + "step": 5972 + 
}, + { + "epoch": 0.85, + "grad_norm": 6.113956403907744, + "learning_rate": 6.4229090364926285e-06, + "loss": 1.0084, + "step": 5973 + }, + { + "epoch": 0.85, + "grad_norm": 7.881675340664149, + "learning_rate": 6.421802002553371e-06, + "loss": 1.027, + "step": 5974 + }, + { + "epoch": 0.85, + "grad_norm": 6.411152776874449, + "learning_rate": 6.420694892779153e-06, + "loss": 1.0082, + "step": 5975 + }, + { + "epoch": 0.85, + "grad_norm": 8.089186668937419, + "learning_rate": 6.419587707229025e-06, + "loss": 0.9298, + "step": 5976 + }, + { + "epoch": 0.85, + "grad_norm": 8.33308747154412, + "learning_rate": 6.4184804459620386e-06, + "loss": 0.9599, + "step": 5977 + }, + { + "epoch": 0.85, + "grad_norm": 6.489487361606325, + "learning_rate": 6.4173731090372546e-06, + "loss": 1.0293, + "step": 5978 + }, + { + "epoch": 0.85, + "grad_norm": 8.346915975921966, + "learning_rate": 6.416265696513734e-06, + "loss": 0.8816, + "step": 5979 + }, + { + "epoch": 0.85, + "grad_norm": 8.914904119709998, + "learning_rate": 6.415158208450544e-06, + "loss": 0.8591, + "step": 5980 + }, + { + "epoch": 0.85, + "grad_norm": 8.825026952552347, + "learning_rate": 6.414050644906753e-06, + "loss": 0.9913, + "step": 5981 + }, + { + "epoch": 0.85, + "grad_norm": 7.209645040954345, + "learning_rate": 6.4129430059414365e-06, + "loss": 0.9719, + "step": 5982 + }, + { + "epoch": 0.85, + "grad_norm": 6.45676558821073, + "learning_rate": 6.411835291613675e-06, + "loss": 0.8021, + "step": 5983 + }, + { + "epoch": 0.85, + "grad_norm": 7.1752173734783, + "learning_rate": 6.410727501982548e-06, + "loss": 0.9775, + "step": 5984 + }, + { + "epoch": 0.85, + "grad_norm": 5.589655916436899, + "learning_rate": 6.409619637107141e-06, + "loss": 0.948, + "step": 5985 + }, + { + "epoch": 0.85, + "grad_norm": 6.62721684277844, + "learning_rate": 6.408511697046549e-06, + "loss": 0.9428, + "step": 5986 + }, + { + "epoch": 0.85, + "grad_norm": 6.78686504036562, + "learning_rate": 6.407403681859862e-06, + "loss": 
1.004, + "step": 5987 + }, + { + "epoch": 0.85, + "grad_norm": 6.8306251806081395, + "learning_rate": 6.40629559160618e-06, + "loss": 0.9253, + "step": 5988 + }, + { + "epoch": 0.85, + "grad_norm": 8.967192084482187, + "learning_rate": 6.405187426344608e-06, + "loss": 0.9606, + "step": 5989 + }, + { + "epoch": 0.85, + "grad_norm": 7.000236512257383, + "learning_rate": 6.404079186134247e-06, + "loss": 0.8889, + "step": 5990 + }, + { + "epoch": 0.85, + "grad_norm": 7.767780065182078, + "learning_rate": 6.402970871034211e-06, + "loss": 0.9289, + "step": 5991 + }, + { + "epoch": 0.85, + "grad_norm": 6.025838878649492, + "learning_rate": 6.401862481103615e-06, + "loss": 0.917, + "step": 5992 + }, + { + "epoch": 0.85, + "grad_norm": 8.805479391340471, + "learning_rate": 6.400754016401577e-06, + "loss": 0.9324, + "step": 5993 + }, + { + "epoch": 0.85, + "grad_norm": 6.971247509254137, + "learning_rate": 6.399645476987218e-06, + "loss": 0.9424, + "step": 5994 + }, + { + "epoch": 0.85, + "grad_norm": 7.689486404591706, + "learning_rate": 6.398536862919664e-06, + "loss": 0.9947, + "step": 5995 + }, + { + "epoch": 0.86, + "grad_norm": 8.011299277913553, + "learning_rate": 6.397428174258048e-06, + "loss": 0.9674, + "step": 5996 + }, + { + "epoch": 0.86, + "grad_norm": 8.239187662324314, + "learning_rate": 6.3963194110615025e-06, + "loss": 0.9075, + "step": 5997 + }, + { + "epoch": 0.86, + "grad_norm": 7.651049761503494, + "learning_rate": 6.395210573389165e-06, + "loss": 1.0047, + "step": 5998 + }, + { + "epoch": 0.86, + "grad_norm": 6.721117011672478, + "learning_rate": 6.394101661300183e-06, + "loss": 0.9527, + "step": 5999 + }, + { + "epoch": 0.86, + "grad_norm": 7.2300648230682345, + "learning_rate": 6.392992674853696e-06, + "loss": 0.9905, + "step": 6000 + }, + { + "epoch": 0.86, + "grad_norm": 6.068924229142879, + "learning_rate": 6.391883614108856e-06, + "loss": 1.0378, + "step": 6001 + }, + { + "epoch": 0.86, + "grad_norm": 6.9832613567323865, + "learning_rate": 
6.3907744791248215e-06, + "loss": 0.9224, + "step": 6002 + }, + { + "epoch": 0.86, + "grad_norm": 6.2061557710426705, + "learning_rate": 6.3896652699607455e-06, + "loss": 0.8674, + "step": 6003 + }, + { + "epoch": 0.86, + "grad_norm": 9.16902161118114, + "learning_rate": 6.388555986675793e-06, + "loss": 0.9825, + "step": 6004 + }, + { + "epoch": 0.86, + "grad_norm": 6.175495696734838, + "learning_rate": 6.387446629329128e-06, + "loss": 0.9438, + "step": 6005 + }, + { + "epoch": 0.86, + "grad_norm": 8.983998593862708, + "learning_rate": 6.3863371979799226e-06, + "loss": 0.9626, + "step": 6006 + }, + { + "epoch": 0.86, + "grad_norm": 6.496916553694976, + "learning_rate": 6.38522769268735e-06, + "loss": 1.0062, + "step": 6007 + }, + { + "epoch": 0.86, + "grad_norm": 6.480750492058359, + "learning_rate": 6.384118113510589e-06, + "loss": 0.9436, + "step": 6008 + }, + { + "epoch": 0.86, + "grad_norm": 6.875355812284873, + "learning_rate": 6.383008460508818e-06, + "loss": 0.9331, + "step": 6009 + }, + { + "epoch": 0.86, + "grad_norm": 7.224196046683403, + "learning_rate": 6.381898733741228e-06, + "loss": 0.9693, + "step": 6010 + }, + { + "epoch": 0.86, + "grad_norm": 7.628245540861125, + "learning_rate": 6.380788933267005e-06, + "loss": 0.8568, + "step": 6011 + }, + { + "epoch": 0.86, + "grad_norm": 6.600574512424357, + "learning_rate": 6.379679059145344e-06, + "loss": 1.0498, + "step": 6012 + }, + { + "epoch": 0.86, + "grad_norm": 6.376037169837929, + "learning_rate": 6.378569111435442e-06, + "loss": 0.9412, + "step": 6013 + }, + { + "epoch": 0.86, + "grad_norm": 7.018246808633308, + "learning_rate": 6.377459090196501e-06, + "loss": 0.9616, + "step": 6014 + }, + { + "epoch": 0.86, + "grad_norm": 6.486885432996663, + "learning_rate": 6.376348995487725e-06, + "loss": 1.0226, + "step": 6015 + }, + { + "epoch": 0.86, + "grad_norm": 8.716580193744015, + "learning_rate": 6.375238827368326e-06, + "loss": 0.9193, + "step": 6016 + }, + { + "epoch": 0.86, + "grad_norm": 
6.4139985819401035, + "learning_rate": 6.3741285858975165e-06, + "loss": 0.9741, + "step": 6017 + }, + { + "epoch": 0.86, + "grad_norm": 8.771769203321112, + "learning_rate": 6.373018271134514e-06, + "loss": 1.0379, + "step": 6018 + }, + { + "epoch": 0.86, + "grad_norm": 7.678990479939135, + "learning_rate": 6.371907883138535e-06, + "loss": 0.9346, + "step": 6019 + }, + { + "epoch": 0.86, + "grad_norm": 8.078882503274613, + "learning_rate": 6.370797421968811e-06, + "loss": 0.9011, + "step": 6020 + }, + { + "epoch": 0.86, + "grad_norm": 7.359589878828612, + "learning_rate": 6.369686887684568e-06, + "loss": 0.8867, + "step": 6021 + }, + { + "epoch": 0.86, + "grad_norm": 6.475843459869638, + "learning_rate": 6.368576280345039e-06, + "loss": 0.9217, + "step": 6022 + }, + { + "epoch": 0.86, + "grad_norm": 8.775565558756405, + "learning_rate": 6.367465600009461e-06, + "loss": 0.9384, + "step": 6023 + }, + { + "epoch": 0.86, + "grad_norm": 6.629969363831824, + "learning_rate": 6.3663548467370724e-06, + "loss": 0.9606, + "step": 6024 + }, + { + "epoch": 0.86, + "grad_norm": 9.864277377839487, + "learning_rate": 6.365244020587121e-06, + "loss": 0.9851, + "step": 6025 + }, + { + "epoch": 0.86, + "grad_norm": 7.909196900808121, + "learning_rate": 6.364133121618854e-06, + "loss": 1.0528, + "step": 6026 + }, + { + "epoch": 0.86, + "grad_norm": 15.004072586686306, + "learning_rate": 6.363022149891522e-06, + "loss": 1.0419, + "step": 6027 + }, + { + "epoch": 0.86, + "grad_norm": 8.783863266525278, + "learning_rate": 6.361911105464383e-06, + "loss": 0.8707, + "step": 6028 + }, + { + "epoch": 0.86, + "grad_norm": 6.908126185937211, + "learning_rate": 6.360799988396697e-06, + "loss": 0.9811, + "step": 6029 + }, + { + "epoch": 0.86, + "grad_norm": 8.127934331168143, + "learning_rate": 6.359688798747728e-06, + "loss": 0.9095, + "step": 6030 + }, + { + "epoch": 0.86, + "grad_norm": 6.674400832815191, + "learning_rate": 6.358577536576743e-06, + "loss": 0.8697, + "step": 6031 + }, + { + 
"epoch": 0.86, + "grad_norm": 7.045911628908404, + "learning_rate": 6.357466201943013e-06, + "loss": 0.9462, + "step": 6032 + }, + { + "epoch": 0.86, + "grad_norm": 6.126281657007101, + "learning_rate": 6.3563547949058144e-06, + "loss": 0.9514, + "step": 6033 + }, + { + "epoch": 0.86, + "grad_norm": 8.26585771679198, + "learning_rate": 6.355243315524426e-06, + "loss": 1.0015, + "step": 6034 + }, + { + "epoch": 0.86, + "grad_norm": 7.6860674738575625, + "learning_rate": 6.354131763858133e-06, + "loss": 0.8612, + "step": 6035 + }, + { + "epoch": 0.86, + "grad_norm": 6.7130270252193105, + "learning_rate": 6.353020139966219e-06, + "loss": 0.9462, + "step": 6036 + }, + { + "epoch": 0.86, + "grad_norm": 6.771680883874504, + "learning_rate": 6.351908443907981e-06, + "loss": 0.9671, + "step": 6037 + }, + { + "epoch": 0.86, + "grad_norm": 6.474019555328759, + "learning_rate": 6.350796675742705e-06, + "loss": 0.9832, + "step": 6038 + }, + { + "epoch": 0.86, + "grad_norm": 5.810140529456843, + "learning_rate": 6.349684835529696e-06, + "loss": 0.9428, + "step": 6039 + }, + { + "epoch": 0.86, + "grad_norm": 5.883944098796391, + "learning_rate": 6.3485729233282556e-06, + "loss": 0.9624, + "step": 6040 + }, + { + "epoch": 0.86, + "grad_norm": 7.627263846574915, + "learning_rate": 6.347460939197688e-06, + "loss": 0.9565, + "step": 6041 + }, + { + "epoch": 0.86, + "grad_norm": 9.44766200598843, + "learning_rate": 6.346348883197305e-06, + "loss": 0.9484, + "step": 6042 + }, + { + "epoch": 0.86, + "grad_norm": 6.357163310020866, + "learning_rate": 6.34523675538642e-06, + "loss": 0.9877, + "step": 6043 + }, + { + "epoch": 0.86, + "grad_norm": 10.088784733760532, + "learning_rate": 6.344124555824351e-06, + "loss": 1.0536, + "step": 6044 + }, + { + "epoch": 0.86, + "grad_norm": 7.550028755713312, + "learning_rate": 6.34301228457042e-06, + "loss": 0.8241, + "step": 6045 + }, + { + "epoch": 0.86, + "grad_norm": 6.513582711515657, + "learning_rate": 6.341899941683951e-06, + "loss": 0.9322, 
+ "step": 6046 + }, + { + "epoch": 0.86, + "grad_norm": 8.433976697041759, + "learning_rate": 6.340787527224277e-06, + "loss": 0.8405, + "step": 6047 + }, + { + "epoch": 0.86, + "grad_norm": 7.335639977953625, + "learning_rate": 6.339675041250724e-06, + "loss": 0.9002, + "step": 6048 + }, + { + "epoch": 0.86, + "grad_norm": 7.626103549401741, + "learning_rate": 6.338562483822637e-06, + "loss": 0.9735, + "step": 6049 + }, + { + "epoch": 0.86, + "grad_norm": 6.339790170853591, + "learning_rate": 6.337449854999353e-06, + "loss": 0.9987, + "step": 6050 + }, + { + "epoch": 0.86, + "grad_norm": 11.465328838135244, + "learning_rate": 6.336337154840215e-06, + "loss": 1.0165, + "step": 6051 + }, + { + "epoch": 0.86, + "grad_norm": 7.336000321574576, + "learning_rate": 6.3352243834045725e-06, + "loss": 0.9798, + "step": 6052 + }, + { + "epoch": 0.86, + "grad_norm": 6.6749000625347215, + "learning_rate": 6.33411154075178e-06, + "loss": 0.9336, + "step": 6053 + }, + { + "epoch": 0.86, + "grad_norm": 6.954616596303647, + "learning_rate": 6.332998626941189e-06, + "loss": 0.9435, + "step": 6054 + }, + { + "epoch": 0.86, + "grad_norm": 7.372481335888182, + "learning_rate": 6.331885642032163e-06, + "loss": 0.9466, + "step": 6055 + }, + { + "epoch": 0.86, + "grad_norm": 7.233355849699019, + "learning_rate": 6.330772586084065e-06, + "loss": 0.9007, + "step": 6056 + }, + { + "epoch": 0.86, + "grad_norm": 7.495133630652206, + "learning_rate": 6.329659459156259e-06, + "loss": 0.9903, + "step": 6057 + }, + { + "epoch": 0.86, + "grad_norm": 8.841258696268975, + "learning_rate": 6.3285462613081206e-06, + "loss": 0.9825, + "step": 6058 + }, + { + "epoch": 0.86, + "grad_norm": 7.386438454501838, + "learning_rate": 6.327432992599023e-06, + "loss": 0.9728, + "step": 6059 + }, + { + "epoch": 0.86, + "grad_norm": 5.833640403216647, + "learning_rate": 6.326319653088344e-06, + "loss": 0.976, + "step": 6060 + }, + { + "epoch": 0.86, + "grad_norm": 5.098011618820677, + "learning_rate": 
6.3252062428354654e-06, + "loss": 0.9229, + "step": 6061 + }, + { + "epoch": 0.86, + "grad_norm": 8.406283416002214, + "learning_rate": 6.3240927618997765e-06, + "loss": 1.0143, + "step": 6062 + }, + { + "epoch": 0.86, + "grad_norm": 9.37028697914453, + "learning_rate": 6.322979210340663e-06, + "loss": 0.8884, + "step": 6063 + }, + { + "epoch": 0.86, + "grad_norm": 6.530603328753863, + "learning_rate": 6.321865588217522e-06, + "loss": 0.9178, + "step": 6064 + }, + { + "epoch": 0.86, + "grad_norm": 6.52983223626174, + "learning_rate": 6.32075189558975e-06, + "loss": 0.9972, + "step": 6065 + }, + { + "epoch": 0.87, + "grad_norm": 7.256189280869419, + "learning_rate": 6.31963813251675e-06, + "loss": 1.0115, + "step": 6066 + }, + { + "epoch": 0.87, + "grad_norm": 6.565480153635846, + "learning_rate": 6.318524299057922e-06, + "loss": 0.9284, + "step": 6067 + }, + { + "epoch": 0.87, + "grad_norm": 6.8132207811187255, + "learning_rate": 6.31741039527268e-06, + "loss": 0.9731, + "step": 6068 + }, + { + "epoch": 0.87, + "grad_norm": 6.274419378304364, + "learning_rate": 6.316296421220435e-06, + "loss": 1.0131, + "step": 6069 + }, + { + "epoch": 0.87, + "grad_norm": 7.3196152702878905, + "learning_rate": 6.315182376960601e-06, + "loss": 0.9827, + "step": 6070 + }, + { + "epoch": 0.87, + "grad_norm": 6.451096091259678, + "learning_rate": 6.314068262552601e-06, + "loss": 0.9939, + "step": 6071 + }, + { + "epoch": 0.87, + "grad_norm": 6.166649828955559, + "learning_rate": 6.312954078055858e-06, + "loss": 0.9139, + "step": 6072 + }, + { + "epoch": 0.87, + "grad_norm": 6.337827174325725, + "learning_rate": 6.311839823529798e-06, + "loss": 0.9563, + "step": 6073 + }, + { + "epoch": 0.87, + "grad_norm": 7.184536322573175, + "learning_rate": 6.310725499033854e-06, + "loss": 1.0068, + "step": 6074 + }, + { + "epoch": 0.87, + "grad_norm": 6.6609107765254905, + "learning_rate": 6.309611104627461e-06, + "loss": 1.0246, + "step": 6075 + }, + { + "epoch": 0.87, + "grad_norm": 
6.301808681824734, + "learning_rate": 6.3084966403700555e-06, + "loss": 0.9687, + "step": 6076 + }, + { + "epoch": 0.87, + "grad_norm": 6.5631112552959, + "learning_rate": 6.3073821063210814e-06, + "loss": 0.9923, + "step": 6077 + }, + { + "epoch": 0.87, + "grad_norm": 7.322309512814564, + "learning_rate": 6.306267502539987e-06, + "loss": 0.9046, + "step": 6078 + }, + { + "epoch": 0.87, + "grad_norm": 5.727472274328672, + "learning_rate": 6.305152829086219e-06, + "loss": 0.9152, + "step": 6079 + }, + { + "epoch": 0.87, + "grad_norm": 6.417307125553286, + "learning_rate": 6.304038086019231e-06, + "loss": 0.895, + "step": 6080 + }, + { + "epoch": 0.87, + "grad_norm": 6.464310645307437, + "learning_rate": 6.302923273398483e-06, + "loss": 0.9153, + "step": 6081 + }, + { + "epoch": 0.87, + "grad_norm": 8.149281427372317, + "learning_rate": 6.3018083912834325e-06, + "loss": 0.9818, + "step": 6082 + }, + { + "epoch": 0.87, + "grad_norm": 7.898132667260854, + "learning_rate": 6.300693439733546e-06, + "loss": 0.9001, + "step": 6083 + }, + { + "epoch": 0.87, + "grad_norm": 8.134444981962634, + "learning_rate": 6.299578418808293e-06, + "loss": 1.0593, + "step": 6084 + }, + { + "epoch": 0.87, + "grad_norm": 8.34873359863281, + "learning_rate": 6.298463328567146e-06, + "loss": 1.0194, + "step": 6085 + }, + { + "epoch": 0.87, + "grad_norm": 6.986883338842157, + "learning_rate": 6.297348169069576e-06, + "loss": 1.0015, + "step": 6086 + }, + { + "epoch": 0.87, + "grad_norm": 5.202632428036133, + "learning_rate": 6.296232940375067e-06, + "loss": 1.0258, + "step": 6087 + }, + { + "epoch": 0.87, + "grad_norm": 7.383918392804246, + "learning_rate": 6.295117642543103e-06, + "loss": 0.9719, + "step": 6088 + }, + { + "epoch": 0.87, + "grad_norm": 8.158836689064646, + "learning_rate": 6.294002275633167e-06, + "loss": 0.8889, + "step": 6089 + }, + { + "epoch": 0.87, + "grad_norm": 9.148030493831826, + "learning_rate": 6.292886839704752e-06, + "loss": 0.9075, + "step": 6090 + }, + { + 
"epoch": 0.87, + "grad_norm": 9.786520561190233, + "learning_rate": 6.291771334817351e-06, + "loss": 0.9593, + "step": 6091 + }, + { + "epoch": 0.87, + "grad_norm": 6.535153726323533, + "learning_rate": 6.290655761030464e-06, + "loss": 0.9806, + "step": 6092 + }, + { + "epoch": 0.87, + "grad_norm": 7.118400001568581, + "learning_rate": 6.289540118403591e-06, + "loss": 0.9319, + "step": 6093 + }, + { + "epoch": 0.87, + "grad_norm": 9.66711172728103, + "learning_rate": 6.288424406996237e-06, + "loss": 0.9099, + "step": 6094 + }, + { + "epoch": 0.87, + "grad_norm": 8.580300454303156, + "learning_rate": 6.287308626867912e-06, + "loss": 0.9886, + "step": 6095 + }, + { + "epoch": 0.87, + "grad_norm": 10.4294833768206, + "learning_rate": 6.286192778078126e-06, + "loss": 0.9292, + "step": 6096 + }, + { + "epoch": 0.87, + "grad_norm": 7.768811949774612, + "learning_rate": 6.285076860686401e-06, + "loss": 0.9876, + "step": 6097 + }, + { + "epoch": 0.87, + "grad_norm": 7.804959046169453, + "learning_rate": 6.2839608747522504e-06, + "loss": 0.9321, + "step": 6098 + }, + { + "epoch": 0.87, + "grad_norm": 5.516117858475391, + "learning_rate": 6.282844820335202e-06, + "loss": 0.9905, + "step": 6099 + }, + { + "epoch": 0.87, + "grad_norm": 9.814169761320674, + "learning_rate": 6.281728697494782e-06, + "loss": 0.9659, + "step": 6100 + }, + { + "epoch": 0.87, + "grad_norm": 6.546904713079157, + "learning_rate": 6.2806125062905195e-06, + "loss": 0.9524, + "step": 6101 + }, + { + "epoch": 0.87, + "grad_norm": 8.566564836847833, + "learning_rate": 6.279496246781952e-06, + "loss": 1.0059, + "step": 6102 + }, + { + "epoch": 0.87, + "grad_norm": 5.383630883417722, + "learning_rate": 6.278379919028616e-06, + "loss": 0.9778, + "step": 6103 + }, + { + "epoch": 0.87, + "grad_norm": 5.840588259334643, + "learning_rate": 6.277263523090054e-06, + "loss": 0.9955, + "step": 6104 + }, + { + "epoch": 0.87, + "grad_norm": 5.597486342135335, + "learning_rate": 6.276147059025809e-06, + "loss": 0.905, + 
"step": 6105 + }, + { + "epoch": 0.87, + "grad_norm": 7.9612895394976775, + "learning_rate": 6.2750305268954325e-06, + "loss": 0.9837, + "step": 6106 + }, + { + "epoch": 0.87, + "grad_norm": 7.521935649973944, + "learning_rate": 6.273913926758478e-06, + "loss": 0.9729, + "step": 6107 + }, + { + "epoch": 0.87, + "grad_norm": 8.558556296226847, + "learning_rate": 6.2727972586745e-06, + "loss": 0.9456, + "step": 6108 + }, + { + "epoch": 0.87, + "grad_norm": 6.847260120005081, + "learning_rate": 6.27168052270306e-06, + "loss": 1.0684, + "step": 6109 + }, + { + "epoch": 0.87, + "grad_norm": 5.502220336755503, + "learning_rate": 6.270563718903719e-06, + "loss": 0.9322, + "step": 6110 + }, + { + "epoch": 0.87, + "grad_norm": 5.441860871255827, + "learning_rate": 6.269446847336047e-06, + "loss": 0.9748, + "step": 6111 + }, + { + "epoch": 0.87, + "grad_norm": 4.110024977999472, + "learning_rate": 6.268329908059613e-06, + "loss": 0.9613, + "step": 6112 + }, + { + "epoch": 0.87, + "grad_norm": 6.601919244201488, + "learning_rate": 6.267212901133993e-06, + "loss": 0.9514, + "step": 6113 + }, + { + "epoch": 0.87, + "grad_norm": 7.208432886830969, + "learning_rate": 6.266095826618763e-06, + "loss": 0.9594, + "step": 6114 + }, + { + "epoch": 0.87, + "grad_norm": 8.80482987864409, + "learning_rate": 6.264978684573506e-06, + "loss": 0.9591, + "step": 6115 + }, + { + "epoch": 0.87, + "grad_norm": 8.000493009983186, + "learning_rate": 6.263861475057809e-06, + "loss": 0.9148, + "step": 6116 + }, + { + "epoch": 0.87, + "grad_norm": 6.815367084460051, + "learning_rate": 6.262744198131256e-06, + "loss": 0.932, + "step": 6117 + }, + { + "epoch": 0.87, + "grad_norm": 8.449199520701287, + "learning_rate": 6.261626853853444e-06, + "loss": 0.9572, + "step": 6118 + }, + { + "epoch": 0.87, + "grad_norm": 6.576062839445075, + "learning_rate": 6.2605094422839676e-06, + "loss": 0.9584, + "step": 6119 + }, + { + "epoch": 0.87, + "grad_norm": 8.171303498823393, + "learning_rate": 
6.259391963482425e-06, + "loss": 0.977, + "step": 6120 + }, + { + "epoch": 0.87, + "grad_norm": 6.685582933971547, + "learning_rate": 6.258274417508422e-06, + "loss": 1.01, + "step": 6121 + }, + { + "epoch": 0.87, + "grad_norm": 7.901669662928547, + "learning_rate": 6.257156804421563e-06, + "loss": 1.0562, + "step": 6122 + }, + { + "epoch": 0.87, + "grad_norm": 8.36782552336154, + "learning_rate": 6.256039124281461e-06, + "loss": 0.9463, + "step": 6123 + }, + { + "epoch": 0.87, + "grad_norm": 7.126416845370745, + "learning_rate": 6.254921377147726e-06, + "loss": 0.9228, + "step": 6124 + }, + { + "epoch": 0.87, + "grad_norm": 6.4984832805293795, + "learning_rate": 6.253803563079979e-06, + "loss": 0.9036, + "step": 6125 + }, + { + "epoch": 0.87, + "grad_norm": 6.758891222780014, + "learning_rate": 6.25268568213784e-06, + "loss": 0.9332, + "step": 6126 + }, + { + "epoch": 0.87, + "grad_norm": 6.61428324761781, + "learning_rate": 6.251567734380933e-06, + "loss": 0.9667, + "step": 6127 + }, + { + "epoch": 0.87, + "grad_norm": 7.106000261462618, + "learning_rate": 6.250449719868887e-06, + "loss": 0.952, + "step": 6128 + }, + { + "epoch": 0.87, + "grad_norm": 7.022003338785676, + "learning_rate": 6.249331638661333e-06, + "loss": 0.9678, + "step": 6129 + }, + { + "epoch": 0.87, + "grad_norm": 7.722171694478472, + "learning_rate": 6.248213490817908e-06, + "loss": 0.9254, + "step": 6130 + }, + { + "epoch": 0.87, + "grad_norm": 5.999580258247427, + "learning_rate": 6.24709527639825e-06, + "loss": 0.9605, + "step": 6131 + }, + { + "epoch": 0.87, + "grad_norm": 6.185966828587517, + "learning_rate": 6.245976995462001e-06, + "loss": 0.9414, + "step": 6132 + }, + { + "epoch": 0.87, + "grad_norm": 5.063818168704504, + "learning_rate": 6.244858648068808e-06, + "loss": 0.9649, + "step": 6133 + }, + { + "epoch": 0.87, + "grad_norm": 8.169322467606412, + "learning_rate": 6.243740234278317e-06, + "loss": 0.9434, + "step": 6134 + }, + { + "epoch": 0.87, + "grad_norm": 6.803078317971111, 
+ "learning_rate": 6.242621754150187e-06, + "loss": 0.9237, + "step": 6135 + }, + { + "epoch": 0.88, + "grad_norm": 9.093136033113534, + "learning_rate": 6.241503207744071e-06, + "loss": 0.9664, + "step": 6136 + }, + { + "epoch": 0.88, + "grad_norm": 9.100484462498594, + "learning_rate": 6.240384595119629e-06, + "loss": 0.9645, + "step": 6137 + }, + { + "epoch": 0.88, + "grad_norm": 7.553432307416903, + "learning_rate": 6.239265916336525e-06, + "loss": 1.0106, + "step": 6138 + }, + { + "epoch": 0.88, + "grad_norm": 8.642671581412268, + "learning_rate": 6.238147171454426e-06, + "loss": 0.904, + "step": 6139 + }, + { + "epoch": 0.88, + "grad_norm": 7.99109126369758, + "learning_rate": 6.237028360533004e-06, + "loss": 0.9842, + "step": 6140 + }, + { + "epoch": 0.88, + "grad_norm": 6.910937063425274, + "learning_rate": 6.235909483631932e-06, + "loss": 1.0171, + "step": 6141 + }, + { + "epoch": 0.88, + "grad_norm": 6.992353343536915, + "learning_rate": 6.23479054081089e-06, + "loss": 0.9687, + "step": 6142 + }, + { + "epoch": 0.88, + "grad_norm": 7.583386794891712, + "learning_rate": 6.233671532129556e-06, + "loss": 0.9943, + "step": 6143 + }, + { + "epoch": 0.88, + "grad_norm": 5.926015053751738, + "learning_rate": 6.232552457647616e-06, + "loss": 0.9806, + "step": 6144 + }, + { + "epoch": 0.88, + "grad_norm": 6.508423945398248, + "learning_rate": 6.2314333174247606e-06, + "loss": 0.9223, + "step": 6145 + }, + { + "epoch": 0.88, + "grad_norm": 6.231318462828905, + "learning_rate": 6.230314111520679e-06, + "loss": 0.8918, + "step": 6146 + }, + { + "epoch": 0.88, + "grad_norm": 6.815366793198278, + "learning_rate": 6.229194839995067e-06, + "loss": 0.9948, + "step": 6147 + }, + { + "epoch": 0.88, + "grad_norm": 7.673292172501119, + "learning_rate": 6.228075502907625e-06, + "loss": 0.8709, + "step": 6148 + }, + { + "epoch": 0.88, + "grad_norm": 5.891914170718226, + "learning_rate": 6.226956100318053e-06, + "loss": 0.8491, + "step": 6149 + }, + { + "epoch": 0.88, + 
"grad_norm": 8.027441639641253, + "learning_rate": 6.225836632286056e-06, + "loss": 0.9877, + "step": 6150 + }, + { + "epoch": 0.88, + "grad_norm": 7.275268609996738, + "learning_rate": 6.224717098871347e-06, + "loss": 0.9598, + "step": 6151 + }, + { + "epoch": 0.88, + "grad_norm": 8.287877422730066, + "learning_rate": 6.223597500133638e-06, + "loss": 0.9206, + "step": 6152 + }, + { + "epoch": 0.88, + "grad_norm": 9.719924223619383, + "learning_rate": 6.222477836132642e-06, + "loss": 0.925, + "step": 6153 + }, + { + "epoch": 0.88, + "grad_norm": 6.360653796214566, + "learning_rate": 6.221358106928083e-06, + "loss": 0.9962, + "step": 6154 + }, + { + "epoch": 0.88, + "grad_norm": 8.508185787764724, + "learning_rate": 6.220238312579682e-06, + "loss": 0.9813, + "step": 6155 + }, + { + "epoch": 0.88, + "grad_norm": 7.370973652808962, + "learning_rate": 6.2191184531471646e-06, + "loss": 0.9711, + "step": 6156 + }, + { + "epoch": 0.88, + "grad_norm": 7.679752645500928, + "learning_rate": 6.217998528690263e-06, + "loss": 0.9191, + "step": 6157 + }, + { + "epoch": 0.88, + "grad_norm": 7.18783618794953, + "learning_rate": 6.216878539268712e-06, + "loss": 0.9372, + "step": 6158 + }, + { + "epoch": 0.88, + "grad_norm": 6.6864815141931375, + "learning_rate": 6.215758484942243e-06, + "loss": 0.9643, + "step": 6159 + }, + { + "epoch": 0.88, + "grad_norm": 5.418121223776942, + "learning_rate": 6.214638365770603e-06, + "loss": 0.958, + "step": 6160 + }, + { + "epoch": 0.88, + "grad_norm": 6.571950839380838, + "learning_rate": 6.213518181813535e-06, + "loss": 0.9272, + "step": 6161 + }, + { + "epoch": 0.88, + "grad_norm": 6.578770597248555, + "learning_rate": 6.212397933130783e-06, + "loss": 0.9418, + "step": 6162 + }, + { + "epoch": 0.88, + "grad_norm": 7.284713877405985, + "learning_rate": 6.211277619782099e-06, + "loss": 0.971, + "step": 6163 + }, + { + "epoch": 0.88, + "grad_norm": 7.515197129675501, + "learning_rate": 6.210157241827242e-06, + "loss": 0.9776, + "step": 6164 + }, 
+ { + "epoch": 0.88, + "grad_norm": 4.775885882139498, + "learning_rate": 6.209036799325962e-06, + "loss": 0.9273, + "step": 6165 + }, + { + "epoch": 0.88, + "grad_norm": 6.61435610224022, + "learning_rate": 6.207916292338028e-06, + "loss": 0.9288, + "step": 6166 + }, + { + "epoch": 0.88, + "grad_norm": 6.816676428580738, + "learning_rate": 6.206795720923199e-06, + "loss": 0.9217, + "step": 6167 + }, + { + "epoch": 0.88, + "grad_norm": 5.170761230492704, + "learning_rate": 6.205675085141245e-06, + "loss": 0.9242, + "step": 6168 + }, + { + "epoch": 0.88, + "grad_norm": 7.986142535932479, + "learning_rate": 6.20455438505194e-06, + "loss": 0.8676, + "step": 6169 + }, + { + "epoch": 0.88, + "grad_norm": 6.053750140643099, + "learning_rate": 6.203433620715056e-06, + "loss": 0.8791, + "step": 6170 + }, + { + "epoch": 0.88, + "grad_norm": 6.845494150153525, + "learning_rate": 6.202312792190373e-06, + "loss": 1.0321, + "step": 6171 + }, + { + "epoch": 0.88, + "grad_norm": 6.313054590845473, + "learning_rate": 6.201191899537671e-06, + "loss": 1.0148, + "step": 6172 + }, + { + "epoch": 0.88, + "grad_norm": 6.232686846111899, + "learning_rate": 6.200070942816737e-06, + "loss": 1.0045, + "step": 6173 + }, + { + "epoch": 0.88, + "grad_norm": 5.871037287444165, + "learning_rate": 6.198949922087361e-06, + "loss": 0.9039, + "step": 6174 + }, + { + "epoch": 0.88, + "grad_norm": 6.7105154936005516, + "learning_rate": 6.197828837409332e-06, + "loss": 0.936, + "step": 6175 + }, + { + "epoch": 0.88, + "grad_norm": 7.3621303116790315, + "learning_rate": 6.196707688842447e-06, + "loss": 0.9892, + "step": 6176 + }, + { + "epoch": 0.88, + "grad_norm": 6.680341035817459, + "learning_rate": 6.195586476446504e-06, + "loss": 0.946, + "step": 6177 + }, + { + "epoch": 0.88, + "grad_norm": 8.945520158226477, + "learning_rate": 6.194465200281308e-06, + "loss": 0.8966, + "step": 6178 + }, + { + "epoch": 0.88, + "grad_norm": 6.863378389134293, + "learning_rate": 6.193343860406662e-06, + "loss": 
0.9695, + "step": 6179 + }, + { + "epoch": 0.88, + "grad_norm": 5.9563798260868035, + "learning_rate": 6.192222456882378e-06, + "loss": 0.9892, + "step": 6180 + }, + { + "epoch": 0.88, + "grad_norm": 6.655235214116637, + "learning_rate": 6.191100989768264e-06, + "loss": 0.9044, + "step": 6181 + }, + { + "epoch": 0.88, + "grad_norm": 6.354731486959209, + "learning_rate": 6.1899794591241405e-06, + "loss": 0.9641, + "step": 6182 + }, + { + "epoch": 0.88, + "grad_norm": 8.625465426784539, + "learning_rate": 6.188857865009825e-06, + "loss": 0.9269, + "step": 6183 + }, + { + "epoch": 0.88, + "grad_norm": 9.05101605246163, + "learning_rate": 6.18773620748514e-06, + "loss": 0.9874, + "step": 6184 + }, + { + "epoch": 0.88, + "grad_norm": 7.14257717829335, + "learning_rate": 6.186614486609911e-06, + "loss": 0.8748, + "step": 6185 + }, + { + "epoch": 0.88, + "grad_norm": 7.912429447543029, + "learning_rate": 6.185492702443968e-06, + "loss": 0.9956, + "step": 6186 + }, + { + "epoch": 0.88, + "grad_norm": 5.352591454829652, + "learning_rate": 6.1843708550471445e-06, + "loss": 0.9934, + "step": 6187 + }, + { + "epoch": 0.88, + "grad_norm": 8.561742598981711, + "learning_rate": 6.183248944479277e-06, + "loss": 0.8751, + "step": 6188 + }, + { + "epoch": 0.88, + "grad_norm": 6.218243812318832, + "learning_rate": 6.182126970800204e-06, + "loss": 0.9, + "step": 6189 + }, + { + "epoch": 0.88, + "grad_norm": 7.678320564800349, + "learning_rate": 6.181004934069769e-06, + "loss": 0.9933, + "step": 6190 + }, + { + "epoch": 0.88, + "grad_norm": 7.6868609213132855, + "learning_rate": 6.179882834347818e-06, + "loss": 0.8867, + "step": 6191 + }, + { + "epoch": 0.88, + "grad_norm": 10.881242354240605, + "learning_rate": 6.1787606716942004e-06, + "loss": 0.9504, + "step": 6192 + }, + { + "epoch": 0.88, + "grad_norm": 5.889429452467061, + "learning_rate": 6.177638446168771e-06, + "loss": 1.0172, + "step": 6193 + }, + { + "epoch": 0.88, + "grad_norm": 7.337247943748243, + "learning_rate": 
6.176516157831383e-06, + "loss": 0.8821, + "step": 6194 + }, + { + "epoch": 0.88, + "grad_norm": 7.920015475273305, + "learning_rate": 6.175393806741899e-06, + "loss": 0.9934, + "step": 6195 + }, + { + "epoch": 0.88, + "grad_norm": 7.823466267532507, + "learning_rate": 6.174271392960182e-06, + "loss": 0.9359, + "step": 6196 + }, + { + "epoch": 0.88, + "grad_norm": 11.658729653307306, + "learning_rate": 6.173148916546097e-06, + "loss": 0.9932, + "step": 6197 + }, + { + "epoch": 0.88, + "grad_norm": 8.617336073685216, + "learning_rate": 6.1720263775595125e-06, + "loss": 0.9378, + "step": 6198 + }, + { + "epoch": 0.88, + "grad_norm": 9.792609999928652, + "learning_rate": 6.170903776060306e-06, + "loss": 0.9471, + "step": 6199 + }, + { + "epoch": 0.88, + "grad_norm": 7.341446344314116, + "learning_rate": 6.169781112108351e-06, + "loss": 1.0044, + "step": 6200 + }, + { + "epoch": 0.88, + "grad_norm": 6.820193970969101, + "learning_rate": 6.168658385763524e-06, + "loss": 0.9253, + "step": 6201 + }, + { + "epoch": 0.88, + "grad_norm": 4.898233670629846, + "learning_rate": 6.1675355970857164e-06, + "loss": 0.9205, + "step": 6202 + }, + { + "epoch": 0.88, + "grad_norm": 8.697445133678487, + "learning_rate": 6.166412746134806e-06, + "loss": 0.9977, + "step": 6203 + }, + { + "epoch": 0.88, + "grad_norm": 8.421155960209896, + "learning_rate": 6.165289832970689e-06, + "loss": 1.0257, + "step": 6204 + }, + { + "epoch": 0.88, + "grad_norm": 6.421509069798614, + "learning_rate": 6.164166857653255e-06, + "loss": 0.9861, + "step": 6205 + }, + { + "epoch": 0.89, + "grad_norm": 8.261679804958996, + "learning_rate": 6.163043820242402e-06, + "loss": 0.8867, + "step": 6206 + }, + { + "epoch": 0.89, + "grad_norm": 5.445936447961809, + "learning_rate": 6.161920720798028e-06, + "loss": 0.9583, + "step": 6207 + }, + { + "epoch": 0.89, + "grad_norm": 6.888540100147536, + "learning_rate": 6.160797559380037e-06, + "loss": 0.9905, + "step": 6208 + }, + { + "epoch": 0.89, + "grad_norm": 
7.096669746452068, + "learning_rate": 6.1596743360483365e-06, + "loss": 0.9163, + "step": 6209 + }, + { + "epoch": 0.89, + "grad_norm": 5.855744142818732, + "learning_rate": 6.158551050862833e-06, + "loss": 0.8951, + "step": 6210 + }, + { + "epoch": 0.89, + "grad_norm": 6.999583501667255, + "learning_rate": 6.157427703883443e-06, + "loss": 0.9509, + "step": 6211 + }, + { + "epoch": 0.89, + "grad_norm": 7.696582808066232, + "learning_rate": 6.1563042951700815e-06, + "loss": 0.9925, + "step": 6212 + }, + { + "epoch": 0.89, + "grad_norm": 5.284602974306771, + "learning_rate": 6.155180824782666e-06, + "loss": 0.9622, + "step": 6213 + }, + { + "epoch": 0.89, + "grad_norm": 8.208884016991034, + "learning_rate": 6.154057292781122e-06, + "loss": 0.9569, + "step": 6214 + }, + { + "epoch": 0.89, + "grad_norm": 10.890880327639715, + "learning_rate": 6.152933699225373e-06, + "loss": 0.9014, + "step": 6215 + }, + { + "epoch": 0.89, + "grad_norm": 6.446867206694125, + "learning_rate": 6.15181004417535e-06, + "loss": 0.8813, + "step": 6216 + }, + { + "epoch": 0.89, + "grad_norm": 9.017028933729813, + "learning_rate": 6.150686327690985e-06, + "loss": 0.8914, + "step": 6217 + }, + { + "epoch": 0.89, + "grad_norm": 6.7355152826296845, + "learning_rate": 6.149562549832216e-06, + "loss": 0.9101, + "step": 6218 + }, + { + "epoch": 0.89, + "grad_norm": 6.479965600807705, + "learning_rate": 6.148438710658979e-06, + "loss": 0.9338, + "step": 6219 + }, + { + "epoch": 0.89, + "grad_norm": 6.811002250372845, + "learning_rate": 6.147314810231218e-06, + "loss": 1.0227, + "step": 6220 + }, + { + "epoch": 0.89, + "grad_norm": 6.20112147982838, + "learning_rate": 6.146190848608878e-06, + "loss": 0.9757, + "step": 6221 + }, + { + "epoch": 0.89, + "grad_norm": 9.352652572673874, + "learning_rate": 6.145066825851909e-06, + "loss": 1.0118, + "step": 6222 + }, + { + "epoch": 0.89, + "grad_norm": 7.157175871485686, + "learning_rate": 6.1439427420202636e-06, + "loss": 1.0021, + "step": 6223 + }, + { + 
"epoch": 0.89, + "grad_norm": 8.71477744818321, + "learning_rate": 6.142818597173896e-06, + "loss": 0.9069, + "step": 6224 + }, + { + "epoch": 0.89, + "grad_norm": 7.417065119463253, + "learning_rate": 6.141694391372765e-06, + "loss": 0.854, + "step": 6225 + }, + { + "epoch": 0.89, + "grad_norm": 7.158525291170151, + "learning_rate": 6.140570124676834e-06, + "loss": 0.9203, + "step": 6226 + }, + { + "epoch": 0.89, + "grad_norm": 4.898878222310897, + "learning_rate": 6.139445797146067e-06, + "loss": 0.9294, + "step": 6227 + }, + { + "epoch": 0.89, + "grad_norm": 10.165264277750127, + "learning_rate": 6.1383214088404345e-06, + "loss": 0.9415, + "step": 6228 + }, + { + "epoch": 0.89, + "grad_norm": 9.691618198297633, + "learning_rate": 6.137196959819903e-06, + "loss": 0.9293, + "step": 6229 + }, + { + "epoch": 0.89, + "grad_norm": 7.2418402768178325, + "learning_rate": 6.136072450144452e-06, + "loss": 0.9843, + "step": 6230 + }, + { + "epoch": 0.89, + "grad_norm": 8.217326132146985, + "learning_rate": 6.134947879874061e-06, + "loss": 0.9336, + "step": 6231 + }, + { + "epoch": 0.89, + "grad_norm": 5.693598824560405, + "learning_rate": 6.133823249068709e-06, + "loss": 0.923, + "step": 6232 + }, + { + "epoch": 0.89, + "grad_norm": 9.118560465856648, + "learning_rate": 6.132698557788379e-06, + "loss": 0.9512, + "step": 6233 + }, + { + "epoch": 0.89, + "grad_norm": 8.221223474980263, + "learning_rate": 6.131573806093062e-06, + "loss": 0.954, + "step": 6234 + }, + { + "epoch": 0.89, + "grad_norm": 7.933521794482829, + "learning_rate": 6.130448994042747e-06, + "loss": 0.9122, + "step": 6235 + }, + { + "epoch": 0.89, + "grad_norm": 8.15479584485725, + "learning_rate": 6.12932412169743e-06, + "loss": 0.9475, + "step": 6236 + }, + { + "epoch": 0.89, + "grad_norm": 7.415414535801395, + "learning_rate": 6.128199189117108e-06, + "loss": 0.9505, + "step": 6237 + }, + { + "epoch": 0.89, + "grad_norm": 6.477617313059233, + "learning_rate": 6.1270741963617815e-06, + "loss": 0.9154, + 
"step": 6238 + }, + { + "epoch": 0.89, + "grad_norm": 6.79464137033437, + "learning_rate": 6.125949143491455e-06, + "loss": 0.9381, + "step": 6239 + }, + { + "epoch": 0.89, + "grad_norm": 8.024469717210128, + "learning_rate": 6.1248240305661335e-06, + "loss": 0.9826, + "step": 6240 + }, + { + "epoch": 0.89, + "grad_norm": 5.583356581578762, + "learning_rate": 6.123698857645831e-06, + "loss": 0.9326, + "step": 6241 + }, + { + "epoch": 0.89, + "grad_norm": 8.914287809429057, + "learning_rate": 6.122573624790558e-06, + "loss": 0.9451, + "step": 6242 + }, + { + "epoch": 0.89, + "grad_norm": 9.879214013152989, + "learning_rate": 6.121448332060335e-06, + "loss": 0.9783, + "step": 6243 + }, + { + "epoch": 0.89, + "grad_norm": 8.350912070502645, + "learning_rate": 6.1203229795151766e-06, + "loss": 0.9287, + "step": 6244 + }, + { + "epoch": 0.89, + "grad_norm": 6.678554843678845, + "learning_rate": 6.119197567215111e-06, + "loss": 0.9417, + "step": 6245 + }, + { + "epoch": 0.89, + "grad_norm": 10.031021159758284, + "learning_rate": 6.118072095220161e-06, + "loss": 0.9797, + "step": 6246 + }, + { + "epoch": 0.89, + "grad_norm": 6.800018204848895, + "learning_rate": 6.116946563590359e-06, + "loss": 0.9756, + "step": 6247 + }, + { + "epoch": 0.89, + "grad_norm": 6.538538025878059, + "learning_rate": 6.115820972385734e-06, + "loss": 0.9844, + "step": 6248 + }, + { + "epoch": 0.89, + "grad_norm": 9.049064312687848, + "learning_rate": 6.1146953216663265e-06, + "loss": 1.0008, + "step": 6249 + }, + { + "epoch": 0.89, + "grad_norm": 8.873033348553278, + "learning_rate": 6.113569611492174e-06, + "loss": 1.0128, + "step": 6250 + }, + { + "epoch": 0.89, + "grad_norm": 8.959014467206151, + "learning_rate": 6.112443841923315e-06, + "loss": 0.9578, + "step": 6251 + }, + { + "epoch": 0.89, + "grad_norm": 8.538886964439206, + "learning_rate": 6.1113180130198005e-06, + "loss": 0.9349, + "step": 6252 + }, + { + "epoch": 0.89, + "grad_norm": 7.445294544767097, + "learning_rate": 
6.1101921248416765e-06, + "loss": 0.9199, + "step": 6253 + }, + { + "epoch": 0.89, + "grad_norm": 5.167161130087624, + "learning_rate": 6.109066177448994e-06, + "loss": 0.9077, + "step": 6254 + }, + { + "epoch": 0.89, + "grad_norm": 5.801435061477104, + "learning_rate": 6.107940170901807e-06, + "loss": 0.99, + "step": 6255 + }, + { + "epoch": 0.89, + "grad_norm": 8.265860935062834, + "learning_rate": 6.106814105260177e-06, + "loss": 0.8749, + "step": 6256 + }, + { + "epoch": 0.89, + "grad_norm": 6.1383370352492745, + "learning_rate": 6.1056879805841654e-06, + "loss": 0.9322, + "step": 6257 + }, + { + "epoch": 0.89, + "grad_norm": 6.613104502571149, + "learning_rate": 6.104561796933831e-06, + "loss": 0.9647, + "step": 6258 + }, + { + "epoch": 0.89, + "grad_norm": 7.857002109240464, + "learning_rate": 6.103435554369247e-06, + "loss": 0.9268, + "step": 6259 + }, + { + "epoch": 0.89, + "grad_norm": 5.389227988198649, + "learning_rate": 6.102309252950482e-06, + "loss": 0.9373, + "step": 6260 + }, + { + "epoch": 0.89, + "grad_norm": 7.316355015205015, + "learning_rate": 6.101182892737609e-06, + "loss": 0.947, + "step": 6261 + }, + { + "epoch": 0.89, + "grad_norm": 6.590440454002389, + "learning_rate": 6.100056473790706e-06, + "loss": 0.9753, + "step": 6262 + }, + { + "epoch": 0.89, + "grad_norm": 7.618452514370738, + "learning_rate": 6.098929996169853e-06, + "loss": 0.897, + "step": 6263 + }, + { + "epoch": 0.89, + "grad_norm": 8.435612157800671, + "learning_rate": 6.097803459935132e-06, + "loss": 0.97, + "step": 6264 + }, + { + "epoch": 0.89, + "grad_norm": 9.60838239081246, + "learning_rate": 6.096676865146631e-06, + "loss": 1.0242, + "step": 6265 + }, + { + "epoch": 0.89, + "grad_norm": 6.787617417452803, + "learning_rate": 6.095550211864439e-06, + "loss": 0.9193, + "step": 6266 + }, + { + "epoch": 0.89, + "grad_norm": 10.27326064993867, + "learning_rate": 6.094423500148646e-06, + "loss": 0.9489, + "step": 6267 + }, + { + "epoch": 0.89, + "grad_norm": 
8.042936628561458, + "learning_rate": 6.093296730059353e-06, + "loss": 0.956, + "step": 6268 + }, + { + "epoch": 0.89, + "grad_norm": 10.30919068619854, + "learning_rate": 6.092169901656654e-06, + "loss": 0.9733, + "step": 6269 + }, + { + "epoch": 0.89, + "grad_norm": 7.133525078655588, + "learning_rate": 6.091043015000653e-06, + "loss": 0.9379, + "step": 6270 + }, + { + "epoch": 0.89, + "grad_norm": 6.782791080704456, + "learning_rate": 6.089916070151454e-06, + "loss": 1.0116, + "step": 6271 + }, + { + "epoch": 0.89, + "grad_norm": 8.271440310906922, + "learning_rate": 6.0887890671691655e-06, + "loss": 0.9533, + "step": 6272 + }, + { + "epoch": 0.89, + "grad_norm": 8.201322219148642, + "learning_rate": 6.0876620061138995e-06, + "loss": 0.9577, + "step": 6273 + }, + { + "epoch": 0.89, + "grad_norm": 6.260807881765928, + "learning_rate": 6.086534887045769e-06, + "loss": 0.9484, + "step": 6274 + }, + { + "epoch": 0.89, + "grad_norm": 6.538156221031945, + "learning_rate": 6.085407710024891e-06, + "loss": 0.8726, + "step": 6275 + }, + { + "epoch": 0.9, + "grad_norm": 7.527375588479012, + "learning_rate": 6.084280475111389e-06, + "loss": 0.9033, + "step": 6276 + }, + { + "epoch": 0.9, + "grad_norm": 7.679251362931062, + "learning_rate": 6.083153182365383e-06, + "loss": 0.9395, + "step": 6277 + }, + { + "epoch": 0.9, + "grad_norm": 7.503671917168894, + "learning_rate": 6.082025831847003e-06, + "loss": 0.9285, + "step": 6278 + }, + { + "epoch": 0.9, + "grad_norm": 7.722849195836928, + "learning_rate": 6.080898423616376e-06, + "loss": 0.9055, + "step": 6279 + }, + { + "epoch": 0.9, + "grad_norm": 7.163949513670514, + "learning_rate": 6.079770957733636e-06, + "loss": 0.9965, + "step": 6280 + }, + { + "epoch": 0.9, + "grad_norm": 7.446420754815914, + "learning_rate": 6.078643434258919e-06, + "loss": 0.9426, + "step": 6281 + }, + { + "epoch": 0.9, + "grad_norm": 7.815595087291004, + "learning_rate": 6.077515853252363e-06, + "loss": 0.8729, + "step": 6282 + }, + { + "epoch": 
0.9, + "grad_norm": 7.002601911335749, + "learning_rate": 6.076388214774109e-06, + "loss": 1.0232, + "step": 6283 + }, + { + "epoch": 0.9, + "grad_norm": 10.464100669856213, + "learning_rate": 6.075260518884306e-06, + "loss": 0.9525, + "step": 6284 + }, + { + "epoch": 0.9, + "grad_norm": 6.0013100737235705, + "learning_rate": 6.074132765643101e-06, + "loss": 0.9275, + "step": 6285 + }, + { + "epoch": 0.9, + "grad_norm": 7.811444184514894, + "learning_rate": 6.0730049551106416e-06, + "loss": 0.9383, + "step": 6286 + }, + { + "epoch": 0.9, + "grad_norm": 7.832654861733849, + "learning_rate": 6.071877087347084e-06, + "loss": 0.9159, + "step": 6287 + }, + { + "epoch": 0.9, + "grad_norm": 6.878328952117172, + "learning_rate": 6.07074916241259e-06, + "loss": 0.9164, + "step": 6288 + }, + { + "epoch": 0.9, + "grad_norm": 6.969703513211293, + "learning_rate": 6.069621180367313e-06, + "loss": 0.9963, + "step": 6289 + }, + { + "epoch": 0.9, + "grad_norm": 5.861773858855336, + "learning_rate": 6.068493141271421e-06, + "loss": 0.9362, + "step": 6290 + }, + { + "epoch": 0.9, + "grad_norm": 6.37442159214613, + "learning_rate": 6.067365045185078e-06, + "loss": 0.9305, + "step": 6291 + }, + { + "epoch": 0.9, + "grad_norm": 8.659760410271433, + "learning_rate": 6.066236892168455e-06, + "loss": 0.964, + "step": 6292 + }, + { + "epoch": 0.9, + "grad_norm": 8.087825631746396, + "learning_rate": 6.065108682281724e-06, + "loss": 0.9681, + "step": 6293 + }, + { + "epoch": 0.9, + "grad_norm": 7.701619704017305, + "learning_rate": 6.06398041558506e-06, + "loss": 0.9373, + "step": 6294 + }, + { + "epoch": 0.9, + "grad_norm": 7.278153411215065, + "learning_rate": 6.062852092138644e-06, + "loss": 1.0015, + "step": 6295 + }, + { + "epoch": 0.9, + "grad_norm": 7.246153186037975, + "learning_rate": 6.061723712002654e-06, + "loss": 0.9048, + "step": 6296 + }, + { + "epoch": 0.9, + "grad_norm": 7.733611263421648, + "learning_rate": 6.060595275237278e-06, + "loss": 0.8604, + "step": 6297 + }, + { + 
"epoch": 0.9, + "grad_norm": 7.003288289950486, + "learning_rate": 6.059466781902702e-06, + "loss": 1.0094, + "step": 6298 + }, + { + "epoch": 0.9, + "grad_norm": 8.422349725223661, + "learning_rate": 6.058338232059117e-06, + "loss": 1.0427, + "step": 6299 + }, + { + "epoch": 0.9, + "grad_norm": 6.540254995589332, + "learning_rate": 6.0572096257667155e-06, + "loss": 0.9679, + "step": 6300 + }, + { + "epoch": 0.9, + "grad_norm": 7.394385997753196, + "learning_rate": 6.056080963085695e-06, + "loss": 0.8753, + "step": 6301 + }, + { + "epoch": 0.9, + "grad_norm": 7.7572953125031585, + "learning_rate": 6.054952244076256e-06, + "loss": 0.9159, + "step": 6302 + }, + { + "epoch": 0.9, + "grad_norm": 6.275990721638346, + "learning_rate": 6.053823468798602e-06, + "loss": 1.0039, + "step": 6303 + }, + { + "epoch": 0.9, + "grad_norm": 7.0930094270164, + "learning_rate": 6.052694637312937e-06, + "loss": 0.9827, + "step": 6304 + }, + { + "epoch": 0.9, + "grad_norm": 7.536276688559568, + "learning_rate": 6.051565749679471e-06, + "loss": 0.9623, + "step": 6305 + }, + { + "epoch": 0.9, + "grad_norm": 7.602939340356996, + "learning_rate": 6.050436805958413e-06, + "loss": 0.9093, + "step": 6306 + }, + { + "epoch": 0.9, + "grad_norm": 7.181794827352582, + "learning_rate": 6.049307806209981e-06, + "loss": 0.9867, + "step": 6307 + }, + { + "epoch": 0.9, + "grad_norm": 7.502712115762743, + "learning_rate": 6.048178750494391e-06, + "loss": 0.888, + "step": 6308 + }, + { + "epoch": 0.9, + "grad_norm": 6.598791673726268, + "learning_rate": 6.0470496388718634e-06, + "loss": 0.9008, + "step": 6309 + }, + { + "epoch": 0.9, + "grad_norm": 9.123047760532804, + "learning_rate": 6.045920471402623e-06, + "loss": 0.963, + "step": 6310 + }, + { + "epoch": 0.9, + "grad_norm": 9.055225505728329, + "learning_rate": 6.044791248146896e-06, + "loss": 0.9141, + "step": 6311 + }, + { + "epoch": 0.9, + "grad_norm": 7.345734483283756, + "learning_rate": 6.043661969164912e-06, + "loss": 0.9777, + "step": 6312 + 
}, + { + "epoch": 0.9, + "grad_norm": 6.425502910680266, + "learning_rate": 6.042532634516904e-06, + "loss": 0.9463, + "step": 6313 + }, + { + "epoch": 0.9, + "grad_norm": 7.3247171781380755, + "learning_rate": 6.0414032442631085e-06, + "loss": 0.9427, + "step": 6314 + }, + { + "epoch": 0.9, + "grad_norm": 7.878575194470518, + "learning_rate": 6.04027379846376e-06, + "loss": 0.9977, + "step": 6315 + }, + { + "epoch": 0.9, + "grad_norm": 9.238587823652908, + "learning_rate": 6.039144297179104e-06, + "loss": 0.9097, + "step": 6316 + }, + { + "epoch": 0.9, + "grad_norm": 7.620884164943867, + "learning_rate": 6.038014740469384e-06, + "loss": 0.8365, + "step": 6317 + }, + { + "epoch": 0.9, + "grad_norm": 9.381613803125394, + "learning_rate": 6.036885128394847e-06, + "loss": 0.9597, + "step": 6318 + }, + { + "epoch": 0.9, + "grad_norm": 7.477090260089374, + "learning_rate": 6.035755461015743e-06, + "loss": 0.8757, + "step": 6319 + }, + { + "epoch": 0.9, + "grad_norm": 6.407448316049157, + "learning_rate": 6.034625738392326e-06, + "loss": 0.9576, + "step": 6320 + }, + { + "epoch": 0.9, + "grad_norm": 6.479421243740967, + "learning_rate": 6.033495960584852e-06, + "loss": 0.9507, + "step": 6321 + }, + { + "epoch": 0.9, + "grad_norm": 7.038713842827127, + "learning_rate": 6.03236612765358e-06, + "loss": 0.8689, + "step": 6322 + }, + { + "epoch": 0.9, + "grad_norm": 5.846210368431625, + "learning_rate": 6.031236239658772e-06, + "loss": 1.0195, + "step": 6323 + }, + { + "epoch": 0.9, + "grad_norm": 8.744373499263178, + "learning_rate": 6.030106296660695e-06, + "loss": 0.9414, + "step": 6324 + }, + { + "epoch": 0.9, + "grad_norm": 9.254803040594025, + "learning_rate": 6.028976298719613e-06, + "loss": 0.9662, + "step": 6325 + }, + { + "epoch": 0.9, + "grad_norm": 7.571180570606017, + "learning_rate": 6.027846245895801e-06, + "loss": 0.9447, + "step": 6326 + }, + { + "epoch": 0.9, + "grad_norm": 7.4524081284363115, + "learning_rate": 6.026716138249532e-06, + "loss": 1.0193, + 
"step": 6327 + }, + { + "epoch": 0.9, + "grad_norm": 7.971564849449331, + "learning_rate": 6.025585975841081e-06, + "loss": 1.0073, + "step": 6328 + }, + { + "epoch": 0.9, + "grad_norm": 7.365837811681589, + "learning_rate": 6.024455758730728e-06, + "loss": 0.9726, + "step": 6329 + }, + { + "epoch": 0.9, + "grad_norm": 9.343922549741874, + "learning_rate": 6.023325486978758e-06, + "loss": 0.9589, + "step": 6330 + }, + { + "epoch": 0.9, + "grad_norm": 6.768056841540378, + "learning_rate": 6.022195160645454e-06, + "loss": 0.9114, + "step": 6331 + }, + { + "epoch": 0.9, + "grad_norm": 8.289415304055234, + "learning_rate": 6.021064779791106e-06, + "loss": 0.9623, + "step": 6332 + }, + { + "epoch": 0.9, + "grad_norm": 8.487642299763406, + "learning_rate": 6.019934344476005e-06, + "loss": 0.8536, + "step": 6333 + }, + { + "epoch": 0.9, + "grad_norm": 8.615331976417862, + "learning_rate": 6.018803854760445e-06, + "loss": 0.8717, + "step": 6334 + }, + { + "epoch": 0.9, + "grad_norm": 7.6631132908483375, + "learning_rate": 6.017673310704721e-06, + "loss": 0.9598, + "step": 6335 + }, + { + "epoch": 0.9, + "grad_norm": 7.098291456207917, + "learning_rate": 6.0165427123691385e-06, + "loss": 0.9158, + "step": 6336 + }, + { + "epoch": 0.9, + "grad_norm": 5.582816810251171, + "learning_rate": 6.015412059813997e-06, + "loss": 0.9892, + "step": 6337 + }, + { + "epoch": 0.9, + "grad_norm": 8.361070414923752, + "learning_rate": 6.014281353099601e-06, + "loss": 0.9695, + "step": 6338 + }, + { + "epoch": 0.9, + "grad_norm": 6.788610355376981, + "learning_rate": 6.013150592286261e-06, + "loss": 0.9491, + "step": 6339 + }, + { + "epoch": 0.9, + "grad_norm": 6.689346293141047, + "learning_rate": 6.012019777434289e-06, + "loss": 0.8628, + "step": 6340 + }, + { + "epoch": 0.9, + "grad_norm": 10.039696121670067, + "learning_rate": 6.010888908603999e-06, + "loss": 0.9559, + "step": 6341 + }, + { + "epoch": 0.9, + "grad_norm": 6.850731689834099, + "learning_rate": 6.009757985855709e-06, + 
"loss": 0.9414, + "step": 6342 + }, + { + "epoch": 0.9, + "grad_norm": 6.64909760506916, + "learning_rate": 6.008627009249739e-06, + "loss": 0.9471, + "step": 6343 + }, + { + "epoch": 0.9, + "grad_norm": 8.19671908438596, + "learning_rate": 6.00749597884641e-06, + "loss": 0.9722, + "step": 6344 + }, + { + "epoch": 0.9, + "grad_norm": 7.471960788631342, + "learning_rate": 6.006364894706051e-06, + "loss": 0.8642, + "step": 6345 + }, + { + "epoch": 0.91, + "grad_norm": 8.489162129357489, + "learning_rate": 6.0052337568889905e-06, + "loss": 1.008, + "step": 6346 + }, + { + "epoch": 0.91, + "grad_norm": 6.83833358205547, + "learning_rate": 6.004102565455559e-06, + "loss": 0.8679, + "step": 6347 + }, + { + "epoch": 0.91, + "grad_norm": 7.911978411049683, + "learning_rate": 6.002971320466091e-06, + "loss": 0.9896, + "step": 6348 + }, + { + "epoch": 0.91, + "grad_norm": 8.681846991366958, + "learning_rate": 6.001840021980924e-06, + "loss": 0.9949, + "step": 6349 + }, + { + "epoch": 0.91, + "grad_norm": 8.60933889749373, + "learning_rate": 6.0007086700604e-06, + "loss": 0.9231, + "step": 6350 + }, + { + "epoch": 0.91, + "grad_norm": 7.9042446181229105, + "learning_rate": 5.999577264764861e-06, + "loss": 1.0274, + "step": 6351 + }, + { + "epoch": 0.91, + "grad_norm": 8.133148605679377, + "learning_rate": 5.998445806154653e-06, + "loss": 0.9227, + "step": 6352 + }, + { + "epoch": 0.91, + "grad_norm": 8.64054973266747, + "learning_rate": 5.997314294290124e-06, + "loss": 0.9625, + "step": 6353 + }, + { + "epoch": 0.91, + "grad_norm": 8.207486888647656, + "learning_rate": 5.996182729231628e-06, + "loss": 0.9141, + "step": 6354 + }, + { + "epoch": 0.91, + "grad_norm": 6.073694968207992, + "learning_rate": 5.995051111039518e-06, + "loss": 0.9086, + "step": 6355 + }, + { + "epoch": 0.91, + "grad_norm": 7.10458922024136, + "learning_rate": 5.993919439774151e-06, + "loss": 0.9012, + "step": 6356 + }, + { + "epoch": 0.91, + "grad_norm": 8.274795189520923, + "learning_rate": 
5.992787715495887e-06, + "loss": 1.0139, + "step": 6357 + }, + { + "epoch": 0.91, + "grad_norm": 7.972746938104561, + "learning_rate": 5.99165593826509e-06, + "loss": 0.8891, + "step": 6358 + }, + { + "epoch": 0.91, + "grad_norm": 7.655871074499358, + "learning_rate": 5.9905241081421265e-06, + "loss": 0.8823, + "step": 6359 + }, + { + "epoch": 0.91, + "grad_norm": 7.43166499899108, + "learning_rate": 5.989392225187363e-06, + "loss": 0.8926, + "step": 6360 + }, + { + "epoch": 0.91, + "grad_norm": 7.534389623436886, + "learning_rate": 5.9882602894611715e-06, + "loss": 1.0426, + "step": 6361 + }, + { + "epoch": 0.91, + "grad_norm": 5.546868165934997, + "learning_rate": 5.98712830102393e-06, + "loss": 1.0163, + "step": 6362 + }, + { + "epoch": 0.91, + "grad_norm": 10.834007054000958, + "learning_rate": 5.9859962599360096e-06, + "loss": 0.9676, + "step": 6363 + }, + { + "epoch": 0.91, + "grad_norm": 5.997579261895496, + "learning_rate": 5.984864166257794e-06, + "loss": 0.9337, + "step": 6364 + }, + { + "epoch": 0.91, + "grad_norm": 6.82989147033396, + "learning_rate": 5.983732020049667e-06, + "loss": 0.9448, + "step": 6365 + }, + { + "epoch": 0.91, + "grad_norm": 5.979125690835976, + "learning_rate": 5.98259982137201e-06, + "loss": 0.9071, + "step": 6366 + }, + { + "epoch": 0.91, + "grad_norm": 7.292593839343308, + "learning_rate": 5.981467570285216e-06, + "loss": 0.9356, + "step": 6367 + }, + { + "epoch": 0.91, + "grad_norm": 5.250431545813625, + "learning_rate": 5.980335266849671e-06, + "loss": 0.9284, + "step": 6368 + }, + { + "epoch": 0.91, + "grad_norm": 7.415689896358445, + "learning_rate": 5.979202911125773e-06, + "loss": 0.9395, + "step": 6369 + }, + { + "epoch": 0.91, + "grad_norm": 7.824962910948075, + "learning_rate": 5.978070503173917e-06, + "loss": 1.0022, + "step": 6370 + }, + { + "epoch": 0.91, + "grad_norm": 6.575791553725139, + "learning_rate": 5.976938043054504e-06, + "loss": 0.8873, + "step": 6371 + }, + { + "epoch": 0.91, + "grad_norm": 
7.786914150926388, + "learning_rate": 5.975805530827935e-06, + "loss": 0.9785, + "step": 6372 + }, + { + "epoch": 0.91, + "grad_norm": 8.760187446244224, + "learning_rate": 5.974672966554614e-06, + "loss": 0.9295, + "step": 6373 + }, + { + "epoch": 0.91, + "grad_norm": 8.307007336731767, + "learning_rate": 5.973540350294951e-06, + "loss": 0.9675, + "step": 6374 + }, + { + "epoch": 0.91, + "grad_norm": 5.48795726051451, + "learning_rate": 5.972407682109355e-06, + "loss": 0.8557, + "step": 6375 + }, + { + "epoch": 0.91, + "grad_norm": 7.22503136240074, + "learning_rate": 5.971274962058239e-06, + "loss": 0.9596, + "step": 6376 + }, + { + "epoch": 0.91, + "grad_norm": 5.985917690935696, + "learning_rate": 5.9701421902020205e-06, + "loss": 0.9562, + "step": 6377 + }, + { + "epoch": 0.91, + "grad_norm": 5.793056301990185, + "learning_rate": 5.969009366601118e-06, + "loss": 1.0302, + "step": 6378 + }, + { + "epoch": 0.91, + "grad_norm": 7.6383366690835475, + "learning_rate": 5.967876491315954e-06, + "loss": 0.847, + "step": 6379 + }, + { + "epoch": 0.91, + "grad_norm": 7.745696385090265, + "learning_rate": 5.966743564406952e-06, + "loss": 0.9371, + "step": 6380 + }, + { + "epoch": 0.91, + "grad_norm": 9.157127486096272, + "learning_rate": 5.965610585934541e-06, + "loss": 0.8954, + "step": 6381 + }, + { + "epoch": 0.91, + "grad_norm": 5.140090572091237, + "learning_rate": 5.964477555959145e-06, + "loss": 0.968, + "step": 6382 + }, + { + "epoch": 0.91, + "grad_norm": 8.2984287384302, + "learning_rate": 5.963344474541203e-06, + "loss": 0.9516, + "step": 6383 + }, + { + "epoch": 0.91, + "grad_norm": 8.485917841963154, + "learning_rate": 5.9622113417411486e-06, + "loss": 0.9303, + "step": 6384 + }, + { + "epoch": 0.91, + "grad_norm": 8.506519006111633, + "learning_rate": 5.961078157619419e-06, + "loss": 0.949, + "step": 6385 + }, + { + "epoch": 0.91, + "grad_norm": 8.233225068251762, + "learning_rate": 5.959944922236456e-06, + "loss": 0.9207, + "step": 6386 + }, + { + "epoch": 
0.91, + "grad_norm": 7.173244964878238, + "learning_rate": 5.958811635652701e-06, + "loss": 0.9895, + "step": 6387 + }, + { + "epoch": 0.91, + "grad_norm": 7.027311320475883, + "learning_rate": 5.957678297928604e-06, + "loss": 0.9081, + "step": 6388 + }, + { + "epoch": 0.91, + "grad_norm": 7.274645637174, + "learning_rate": 5.956544909124612e-06, + "loss": 0.9569, + "step": 6389 + }, + { + "epoch": 0.91, + "grad_norm": 7.244455020730554, + "learning_rate": 5.9554114693011776e-06, + "loss": 0.9414, + "step": 6390 + }, + { + "epoch": 0.91, + "grad_norm": 5.928521785651705, + "learning_rate": 5.9542779785187554e-06, + "loss": 0.9309, + "step": 6391 + }, + { + "epoch": 0.91, + "grad_norm": 8.743230683335636, + "learning_rate": 5.9531444368378e-06, + "loss": 1.0022, + "step": 6392 + }, + { + "epoch": 0.91, + "grad_norm": 7.852297094231702, + "learning_rate": 5.952010844318775e-06, + "loss": 0.8907, + "step": 6393 + }, + { + "epoch": 0.91, + "grad_norm": 7.889419174003238, + "learning_rate": 5.950877201022141e-06, + "loss": 0.9737, + "step": 6394 + }, + { + "epoch": 0.91, + "grad_norm": 8.539940365941577, + "learning_rate": 5.949743507008364e-06, + "loss": 0.9413, + "step": 6395 + }, + { + "epoch": 0.91, + "grad_norm": 6.843852611059233, + "learning_rate": 5.948609762337912e-06, + "loss": 0.9427, + "step": 6396 + }, + { + "epoch": 0.91, + "grad_norm": 5.93516784704323, + "learning_rate": 5.947475967071256e-06, + "loss": 0.9864, + "step": 6397 + }, + { + "epoch": 0.91, + "grad_norm": 6.307908293143605, + "learning_rate": 5.946342121268868e-06, + "loss": 0.9868, + "step": 6398 + }, + { + "epoch": 0.91, + "grad_norm": 9.426778076248247, + "learning_rate": 5.9452082249912255e-06, + "loss": 0.9051, + "step": 6399 + }, + { + "epoch": 0.91, + "grad_norm": 7.402288909474006, + "learning_rate": 5.94407427829881e-06, + "loss": 1.0383, + "step": 6400 + }, + { + "epoch": 0.91, + "grad_norm": 7.126103045897088, + "learning_rate": 5.9429402812520975e-06, + "loss": 0.845, + "step": 
6401 + }, + { + "epoch": 0.91, + "grad_norm": 7.691808666687324, + "learning_rate": 5.941806233911576e-06, + "loss": 1.0441, + "step": 6402 + }, + { + "epoch": 0.91, + "grad_norm": 5.639975897903551, + "learning_rate": 5.940672136337732e-06, + "loss": 1.0155, + "step": 6403 + }, + { + "epoch": 0.91, + "grad_norm": 7.039265877908943, + "learning_rate": 5.939537988591054e-06, + "loss": 0.8877, + "step": 6404 + }, + { + "epoch": 0.91, + "grad_norm": 7.295034958178049, + "learning_rate": 5.938403790732035e-06, + "loss": 0.8954, + "step": 6405 + }, + { + "epoch": 0.91, + "grad_norm": 8.264218371920876, + "learning_rate": 5.93726954282117e-06, + "loss": 0.9182, + "step": 6406 + }, + { + "epoch": 0.91, + "grad_norm": 6.894594016655635, + "learning_rate": 5.9361352449189555e-06, + "loss": 0.9364, + "step": 6407 + }, + { + "epoch": 0.91, + "grad_norm": 6.997458485895497, + "learning_rate": 5.935000897085894e-06, + "loss": 0.9299, + "step": 6408 + }, + { + "epoch": 0.91, + "grad_norm": 4.9786006439512, + "learning_rate": 5.933866499382487e-06, + "loss": 0.9011, + "step": 6409 + }, + { + "epoch": 0.91, + "grad_norm": 6.506202825684202, + "learning_rate": 5.932732051869241e-06, + "loss": 0.937, + "step": 6410 + }, + { + "epoch": 0.91, + "grad_norm": 7.497548931169013, + "learning_rate": 5.931597554606661e-06, + "loss": 0.9394, + "step": 6411 + }, + { + "epoch": 0.91, + "grad_norm": 7.177581920926674, + "learning_rate": 5.930463007655264e-06, + "loss": 0.9011, + "step": 6412 + }, + { + "epoch": 0.91, + "grad_norm": 5.765318000900268, + "learning_rate": 5.929328411075559e-06, + "loss": 0.9509, + "step": 6413 + }, + { + "epoch": 0.91, + "grad_norm": 7.215999123484754, + "learning_rate": 5.928193764928063e-06, + "loss": 0.9686, + "step": 6414 + }, + { + "epoch": 0.91, + "grad_norm": 6.10332922450582, + "learning_rate": 5.927059069273295e-06, + "loss": 0.9516, + "step": 6415 + }, + { + "epoch": 0.92, + "grad_norm": 6.91791753993804, + "learning_rate": 5.925924324171777e-06, + 
"loss": 0.9904, + "step": 6416 + }, + { + "epoch": 0.92, + "grad_norm": 10.591989510437237, + "learning_rate": 5.924789529684033e-06, + "loss": 0.9345, + "step": 6417 + }, + { + "epoch": 0.92, + "grad_norm": 6.109167721464913, + "learning_rate": 5.923654685870589e-06, + "loss": 0.9543, + "step": 6418 + }, + { + "epoch": 0.92, + "grad_norm": 8.643327963131615, + "learning_rate": 5.922519792791976e-06, + "loss": 0.9135, + "step": 6419 + }, + { + "epoch": 0.92, + "grad_norm": 9.062450796379892, + "learning_rate": 5.921384850508723e-06, + "loss": 0.9486, + "step": 6420 + }, + { + "epoch": 0.92, + "grad_norm": 5.2648521162526265, + "learning_rate": 5.92024985908137e-06, + "loss": 0.9407, + "step": 6421 + }, + { + "epoch": 0.92, + "grad_norm": 7.021880192420104, + "learning_rate": 5.9191148185704496e-06, + "loss": 0.9292, + "step": 6422 + }, + { + "epoch": 0.92, + "grad_norm": 7.82919838916714, + "learning_rate": 5.9179797290365024e-06, + "loss": 0.9565, + "step": 6423 + }, + { + "epoch": 0.92, + "grad_norm": 8.082795022975171, + "learning_rate": 5.9168445905400725e-06, + "loss": 0.9425, + "step": 6424 + }, + { + "epoch": 0.92, + "grad_norm": 4.982737556619429, + "learning_rate": 5.915709403141704e-06, + "loss": 0.9272, + "step": 6425 + }, + { + "epoch": 0.92, + "grad_norm": 8.955828113223848, + "learning_rate": 5.914574166901945e-06, + "loss": 0.9327, + "step": 6426 + }, + { + "epoch": 0.92, + "grad_norm": 9.347196049211657, + "learning_rate": 5.9134388818813445e-06, + "loss": 0.9797, + "step": 6427 + }, + { + "epoch": 0.92, + "grad_norm": 8.454543755350889, + "learning_rate": 5.912303548140457e-06, + "loss": 0.9616, + "step": 6428 + }, + { + "epoch": 0.92, + "grad_norm": 7.619055088470238, + "learning_rate": 5.91116816573984e-06, + "loss": 0.8974, + "step": 6429 + }, + { + "epoch": 0.92, + "grad_norm": 7.99616785816862, + "learning_rate": 5.9100327347400464e-06, + "loss": 0.9785, + "step": 6430 + }, + { + "epoch": 0.92, + "grad_norm": 8.640665944888642, + 
"learning_rate": 5.908897255201641e-06, + "loss": 0.8966, + "step": 6431 + }, + { + "epoch": 0.92, + "grad_norm": 7.816833329315604, + "learning_rate": 5.907761727185187e-06, + "loss": 0.9692, + "step": 6432 + }, + { + "epoch": 0.92, + "grad_norm": 8.102502563563492, + "learning_rate": 5.906626150751248e-06, + "loss": 0.93, + "step": 6433 + }, + { + "epoch": 0.92, + "grad_norm": 5.930909641477919, + "learning_rate": 5.905490525960394e-06, + "loss": 0.9203, + "step": 6434 + }, + { + "epoch": 0.92, + "grad_norm": 6.772776471612524, + "learning_rate": 5.904354852873195e-06, + "loss": 0.9039, + "step": 6435 + }, + { + "epoch": 0.92, + "grad_norm": 7.284261217447669, + "learning_rate": 5.903219131550226e-06, + "loss": 0.9731, + "step": 6436 + }, + { + "epoch": 0.92, + "grad_norm": 8.326487065072095, + "learning_rate": 5.902083362052063e-06, + "loss": 1.0165, + "step": 6437 + }, + { + "epoch": 0.92, + "grad_norm": 5.901407710885028, + "learning_rate": 5.900947544439285e-06, + "loss": 0.9102, + "step": 6438 + }, + { + "epoch": 0.92, + "grad_norm": 10.794751398336034, + "learning_rate": 5.899811678772471e-06, + "loss": 0.9263, + "step": 6439 + }, + { + "epoch": 0.92, + "grad_norm": 5.770925803887722, + "learning_rate": 5.898675765112207e-06, + "loss": 1.0543, + "step": 6440 + }, + { + "epoch": 0.92, + "grad_norm": 8.683558995207475, + "learning_rate": 5.897539803519081e-06, + "loss": 0.9682, + "step": 6441 + }, + { + "epoch": 0.92, + "grad_norm": 6.726719305938526, + "learning_rate": 5.896403794053679e-06, + "loss": 0.9268, + "step": 6442 + }, + { + "epoch": 0.92, + "grad_norm": 8.483106377192064, + "learning_rate": 5.8952677367765945e-06, + "loss": 0.9371, + "step": 6443 + }, + { + "epoch": 0.92, + "grad_norm": 7.843392592915822, + "learning_rate": 5.89413163174842e-06, + "loss": 1.0177, + "step": 6444 + }, + { + "epoch": 0.92, + "grad_norm": 10.569488260221796, + "learning_rate": 5.892995479029752e-06, + "loss": 0.9502, + "step": 6445 + }, + { + "epoch": 0.92, + 
"grad_norm": 5.983253460232901, + "learning_rate": 5.891859278681193e-06, + "loss": 0.9807, + "step": 6446 + }, + { + "epoch": 0.92, + "grad_norm": 6.817337865582941, + "learning_rate": 5.890723030763341e-06, + "loss": 0.9605, + "step": 6447 + }, + { + "epoch": 0.92, + "grad_norm": 5.611608838200134, + "learning_rate": 5.889586735336804e-06, + "loss": 0.9708, + "step": 6448 + }, + { + "epoch": 0.92, + "grad_norm": 6.792411404277176, + "learning_rate": 5.888450392462184e-06, + "loss": 0.9719, + "step": 6449 + }, + { + "epoch": 0.92, + "grad_norm": 6.053692586404921, + "learning_rate": 5.8873140022000956e-06, + "loss": 0.9638, + "step": 6450 + }, + { + "epoch": 0.92, + "grad_norm": 7.254748566265492, + "learning_rate": 5.886177564611148e-06, + "loss": 0.942, + "step": 6451 + }, + { + "epoch": 0.92, + "grad_norm": 5.759118754846146, + "learning_rate": 5.885041079755953e-06, + "loss": 0.9236, + "step": 6452 + }, + { + "epoch": 0.92, + "grad_norm": 6.206578678971167, + "learning_rate": 5.883904547695133e-06, + "loss": 0.8775, + "step": 6453 + }, + { + "epoch": 0.92, + "grad_norm": 7.240710985346472, + "learning_rate": 5.8827679684893035e-06, + "loss": 0.9438, + "step": 6454 + }, + { + "epoch": 0.92, + "grad_norm": 6.793026660686876, + "learning_rate": 5.881631342199088e-06, + "loss": 0.983, + "step": 6455 + }, + { + "epoch": 0.92, + "grad_norm": 8.71357076501276, + "learning_rate": 5.880494668885111e-06, + "loss": 0.8863, + "step": 6456 + }, + { + "epoch": 0.92, + "grad_norm": 7.911445375341453, + "learning_rate": 5.879357948608e-06, + "loss": 1.0065, + "step": 6457 + }, + { + "epoch": 0.92, + "grad_norm": 8.387905451382887, + "learning_rate": 5.878221181428383e-06, + "loss": 0.9507, + "step": 6458 + }, + { + "epoch": 0.92, + "grad_norm": 9.090740100564043, + "learning_rate": 5.8770843674068915e-06, + "loss": 0.9328, + "step": 6459 + }, + { + "epoch": 0.92, + "grad_norm": 7.261941367587323, + "learning_rate": 5.8759475066041624e-06, + "loss": 1.0007, + "step": 6460 + }, 
+ { + "epoch": 0.92, + "grad_norm": 7.633272283912262, + "learning_rate": 5.8748105990808324e-06, + "loss": 0.9646, + "step": 6461 + }, + { + "epoch": 0.92, + "grad_norm": 8.013945258385734, + "learning_rate": 5.873673644897539e-06, + "loss": 0.9501, + "step": 6462 + }, + { + "epoch": 0.92, + "grad_norm": 6.507260519164113, + "learning_rate": 5.872536644114926e-06, + "loss": 0.9861, + "step": 6463 + }, + { + "epoch": 0.92, + "grad_norm": 7.364764389899167, + "learning_rate": 5.871399596793635e-06, + "loss": 0.9039, + "step": 6464 + }, + { + "epoch": 0.92, + "grad_norm": 8.547488900256884, + "learning_rate": 5.870262502994317e-06, + "loss": 0.9458, + "step": 6465 + }, + { + "epoch": 0.92, + "grad_norm": 8.342302619007624, + "learning_rate": 5.869125362777619e-06, + "loss": 0.9636, + "step": 6466 + }, + { + "epoch": 0.92, + "grad_norm": 7.138705935699462, + "learning_rate": 5.867988176204195e-06, + "loss": 0.8865, + "step": 6467 + }, + { + "epoch": 0.92, + "grad_norm": 9.335044035863197, + "learning_rate": 5.866850943334694e-06, + "loss": 1.0059, + "step": 6468 + }, + { + "epoch": 0.92, + "grad_norm": 7.054202152327699, + "learning_rate": 5.86571366422978e-06, + "loss": 0.8486, + "step": 6469 + }, + { + "epoch": 0.92, + "grad_norm": 6.811626467181934, + "learning_rate": 5.864576338950108e-06, + "loss": 0.9694, + "step": 6470 + }, + { + "epoch": 0.92, + "grad_norm": 7.289298194268427, + "learning_rate": 5.86343896755634e-06, + "loss": 0.9953, + "step": 6471 + }, + { + "epoch": 0.92, + "grad_norm": 7.166232224908168, + "learning_rate": 5.8623015501091416e-06, + "loss": 0.9971, + "step": 6472 + }, + { + "epoch": 0.92, + "grad_norm": 6.006245905457609, + "learning_rate": 5.8611640866691775e-06, + "loss": 0.9293, + "step": 6473 + }, + { + "epoch": 0.92, + "grad_norm": 6.874861579612025, + "learning_rate": 5.860026577297119e-06, + "loss": 0.9685, + "step": 6474 + }, + { + "epoch": 0.92, + "grad_norm": 6.368557877140853, + "learning_rate": 5.858889022053637e-06, + "loss": 
0.9353, + "step": 6475 + }, + { + "epoch": 0.92, + "grad_norm": 8.018460639784028, + "learning_rate": 5.857751420999406e-06, + "loss": 0.9116, + "step": 6476 + }, + { + "epoch": 0.92, + "grad_norm": 8.076378332118875, + "learning_rate": 5.8566137741951e-06, + "loss": 0.8763, + "step": 6477 + }, + { + "epoch": 0.92, + "grad_norm": 8.663681618814579, + "learning_rate": 5.8554760817014e-06, + "loss": 0.8895, + "step": 6478 + }, + { + "epoch": 0.92, + "grad_norm": 10.103097861905756, + "learning_rate": 5.854338343578988e-06, + "loss": 0.8952, + "step": 6479 + }, + { + "epoch": 0.92, + "grad_norm": 9.47855651344634, + "learning_rate": 5.853200559888547e-06, + "loss": 0.987, + "step": 6480 + }, + { + "epoch": 0.92, + "grad_norm": 7.225867218967149, + "learning_rate": 5.852062730690762e-06, + "loss": 0.9769, + "step": 6481 + }, + { + "epoch": 0.92, + "grad_norm": 5.946461331827482, + "learning_rate": 5.850924856046323e-06, + "loss": 0.9741, + "step": 6482 + }, + { + "epoch": 0.92, + "grad_norm": 6.154890342583022, + "learning_rate": 5.8497869360159195e-06, + "loss": 0.9311, + "step": 6483 + }, + { + "epoch": 0.92, + "grad_norm": 7.012603227873736, + "learning_rate": 5.8486489706602475e-06, + "loss": 0.905, + "step": 6484 + }, + { + "epoch": 0.92, + "grad_norm": 5.246637033373475, + "learning_rate": 5.847510960040001e-06, + "loss": 0.9971, + "step": 6485 + }, + { + "epoch": 0.92, + "grad_norm": 7.103507786681702, + "learning_rate": 5.846372904215881e-06, + "loss": 0.9273, + "step": 6486 + }, + { + "epoch": 0.93, + "grad_norm": 8.093456959627765, + "learning_rate": 5.845234803248584e-06, + "loss": 0.9636, + "step": 6487 + }, + { + "epoch": 0.93, + "grad_norm": 5.713189874334926, + "learning_rate": 5.844096657198817e-06, + "loss": 0.9182, + "step": 6488 + }, + { + "epoch": 0.93, + "grad_norm": 8.025808468219934, + "learning_rate": 5.842958466127283e-06, + "loss": 0.9473, + "step": 6489 + }, + { + "epoch": 0.93, + "grad_norm": 8.004247074130863, + "learning_rate": 
5.84182023009469e-06, + "loss": 0.9884, + "step": 6490 + }, + { + "epoch": 0.93, + "grad_norm": 9.787180336011202, + "learning_rate": 5.8406819491617506e-06, + "loss": 0.9975, + "step": 6491 + }, + { + "epoch": 0.93, + "grad_norm": 6.80682277987286, + "learning_rate": 5.839543623389175e-06, + "loss": 0.9362, + "step": 6492 + }, + { + "epoch": 0.93, + "grad_norm": 6.952826608919829, + "learning_rate": 5.83840525283768e-06, + "loss": 0.9898, + "step": 6493 + }, + { + "epoch": 0.93, + "grad_norm": 6.491098597321528, + "learning_rate": 5.837266837567982e-06, + "loss": 0.8733, + "step": 6494 + }, + { + "epoch": 0.93, + "grad_norm": 9.262198398335341, + "learning_rate": 5.8361283776408014e-06, + "loss": 1.01, + "step": 6495 + }, + { + "epoch": 0.93, + "grad_norm": 8.550402956220259, + "learning_rate": 5.834989873116862e-06, + "loss": 0.8963, + "step": 6496 + }, + { + "epoch": 0.93, + "grad_norm": 6.460503926464303, + "learning_rate": 5.833851324056885e-06, + "loss": 0.8862, + "step": 6497 + }, + { + "epoch": 0.93, + "grad_norm": 5.637491279723866, + "learning_rate": 5.8327127305216015e-06, + "loss": 0.9322, + "step": 6498 + }, + { + "epoch": 0.93, + "grad_norm": 7.157408795651948, + "learning_rate": 5.831574092571737e-06, + "loss": 1.0168, + "step": 6499 + }, + { + "epoch": 0.93, + "grad_norm": 10.771869880730577, + "learning_rate": 5.830435410268026e-06, + "loss": 1.0304, + "step": 6500 + }, + { + "epoch": 0.93, + "grad_norm": 7.00502089310555, + "learning_rate": 5.829296683671202e-06, + "loss": 0.9405, + "step": 6501 + }, + { + "epoch": 0.93, + "grad_norm": 6.839005696908036, + "learning_rate": 5.828157912841999e-06, + "loss": 0.9398, + "step": 6502 + }, + { + "epoch": 0.93, + "grad_norm": 6.776242349957737, + "learning_rate": 5.82701909784116e-06, + "loss": 0.8217, + "step": 6503 + }, + { + "epoch": 0.93, + "grad_norm": 7.054221219930021, + "learning_rate": 5.825880238729424e-06, + "loss": 1.0048, + "step": 6504 + }, + { + "epoch": 0.93, + "grad_norm": 
6.762384706208638, + "learning_rate": 5.824741335567537e-06, + "loss": 0.9066, + "step": 6505 + }, + { + "epoch": 0.93, + "grad_norm": 8.85543394496009, + "learning_rate": 5.82360238841624e-06, + "loss": 0.8468, + "step": 6506 + }, + { + "epoch": 0.93, + "grad_norm": 8.434217275164741, + "learning_rate": 5.822463397336285e-06, + "loss": 0.8569, + "step": 6507 + }, + { + "epoch": 0.93, + "grad_norm": 8.395453707275342, + "learning_rate": 5.821324362388422e-06, + "loss": 0.9311, + "step": 6508 + }, + { + "epoch": 0.93, + "grad_norm": 6.122628008881121, + "learning_rate": 5.8201852836334035e-06, + "loss": 0.9766, + "step": 6509 + }, + { + "epoch": 0.93, + "grad_norm": 7.451119391988016, + "learning_rate": 5.819046161131985e-06, + "loss": 0.9146, + "step": 6510 + }, + { + "epoch": 0.93, + "grad_norm": 7.984792494834794, + "learning_rate": 5.817906994944924e-06, + "loss": 0.9868, + "step": 6511 + }, + { + "epoch": 0.93, + "grad_norm": 7.536617878604879, + "learning_rate": 5.816767785132981e-06, + "loss": 0.9638, + "step": 6512 + }, + { + "epoch": 0.93, + "grad_norm": 11.207954881450993, + "learning_rate": 5.815628531756916e-06, + "loss": 0.8904, + "step": 6513 + }, + { + "epoch": 0.93, + "grad_norm": 6.558454317935459, + "learning_rate": 5.814489234877497e-06, + "loss": 0.9338, + "step": 6514 + }, + { + "epoch": 0.93, + "grad_norm": 7.81686814854926, + "learning_rate": 5.81334989455549e-06, + "loss": 0.9386, + "step": 6515 + }, + { + "epoch": 0.93, + "grad_norm": 6.855412150995705, + "learning_rate": 5.812210510851662e-06, + "loss": 0.942, + "step": 6516 + }, + { + "epoch": 0.93, + "grad_norm": 8.513049968707868, + "learning_rate": 5.811071083826787e-06, + "loss": 0.8817, + "step": 6517 + }, + { + "epoch": 0.93, + "grad_norm": 6.372617603303653, + "learning_rate": 5.80993161354164e-06, + "loss": 0.9658, + "step": 6518 + }, + { + "epoch": 0.93, + "grad_norm": 10.131595096121321, + "learning_rate": 5.8087921000569925e-06, + "loss": 0.9322, + "step": 6519 + }, + { + 
"epoch": 0.93, + "grad_norm": 6.9100775863897015, + "learning_rate": 5.8076525434336265e-06, + "loss": 0.9574, + "step": 6520 + }, + { + "epoch": 0.93, + "grad_norm": 8.13873624416436, + "learning_rate": 5.806512943732321e-06, + "loss": 0.9141, + "step": 6521 + }, + { + "epoch": 0.93, + "grad_norm": 7.410352579286267, + "learning_rate": 5.80537330101386e-06, + "loss": 0.8895, + "step": 6522 + }, + { + "epoch": 0.93, + "grad_norm": 8.683282051943891, + "learning_rate": 5.80423361533903e-06, + "loss": 0.9507, + "step": 6523 + }, + { + "epoch": 0.93, + "grad_norm": 8.06591445015921, + "learning_rate": 5.8030938867686185e-06, + "loss": 1.0077, + "step": 6524 + }, + { + "epoch": 0.93, + "grad_norm": 8.995970448322343, + "learning_rate": 5.801954115363412e-06, + "loss": 1.0222, + "step": 6525 + }, + { + "epoch": 0.93, + "grad_norm": 7.707125053273567, + "learning_rate": 5.8008143011842065e-06, + "loss": 1.0444, + "step": 6526 + }, + { + "epoch": 0.93, + "grad_norm": 8.34026398145807, + "learning_rate": 5.799674444291797e-06, + "loss": 0.9349, + "step": 6527 + }, + { + "epoch": 0.93, + "grad_norm": 8.671677712201122, + "learning_rate": 5.798534544746977e-06, + "loss": 0.9933, + "step": 6528 + }, + { + "epoch": 0.93, + "grad_norm": 9.176820273058407, + "learning_rate": 5.797394602610545e-06, + "loss": 0.8678, + "step": 6529 + }, + { + "epoch": 0.93, + "grad_norm": 6.902255005559794, + "learning_rate": 5.7962546179433075e-06, + "loss": 0.9423, + "step": 6530 + }, + { + "epoch": 0.93, + "grad_norm": 11.451890634013848, + "learning_rate": 5.795114590806063e-06, + "loss": 0.8949, + "step": 6531 + }, + { + "epoch": 0.93, + "grad_norm": 6.2842583092859785, + "learning_rate": 5.793974521259621e-06, + "loss": 0.911, + "step": 6532 + }, + { + "epoch": 0.93, + "grad_norm": 8.41639877966738, + "learning_rate": 5.792834409364787e-06, + "loss": 0.9376, + "step": 6533 + }, + { + "epoch": 0.93, + "grad_norm": 7.37284809718534, + "learning_rate": 5.791694255182374e-06, + "loss": 0.8937, + 
"step": 6534 + }, + { + "epoch": 0.93, + "grad_norm": 5.820815470233924, + "learning_rate": 5.790554058773191e-06, + "loss": 0.977, + "step": 6535 + }, + { + "epoch": 0.93, + "grad_norm": 8.526820838507351, + "learning_rate": 5.789413820198056e-06, + "loss": 0.973, + "step": 6536 + }, + { + "epoch": 0.93, + "grad_norm": 8.523658588392893, + "learning_rate": 5.788273539517787e-06, + "loss": 0.9206, + "step": 6537 + }, + { + "epoch": 0.93, + "grad_norm": 7.843559669559457, + "learning_rate": 5.787133216793199e-06, + "loss": 0.9003, + "step": 6538 + }, + { + "epoch": 0.93, + "grad_norm": 7.748635026225475, + "learning_rate": 5.7859928520851176e-06, + "loss": 1.0019, + "step": 6539 + }, + { + "epoch": 0.93, + "grad_norm": 7.431451412126577, + "learning_rate": 5.784852445454364e-06, + "loss": 0.907, + "step": 6540 + }, + { + "epoch": 0.93, + "grad_norm": 4.181656960691426, + "learning_rate": 5.783711996961767e-06, + "loss": 0.9355, + "step": 6541 + }, + { + "epoch": 0.93, + "grad_norm": 8.214291630258609, + "learning_rate": 5.782571506668152e-06, + "loss": 0.9595, + "step": 6542 + }, + { + "epoch": 0.93, + "grad_norm": 7.614536260731338, + "learning_rate": 5.781430974634353e-06, + "loss": 0.9472, + "step": 6543 + }, + { + "epoch": 0.93, + "grad_norm": 5.816403995721075, + "learning_rate": 5.7802904009212e-06, + "loss": 0.9686, + "step": 6544 + }, + { + "epoch": 0.93, + "grad_norm": 5.517771908766712, + "learning_rate": 5.779149785589528e-06, + "loss": 0.9269, + "step": 6545 + }, + { + "epoch": 0.93, + "grad_norm": 8.631183754484299, + "learning_rate": 5.778009128700177e-06, + "loss": 0.9526, + "step": 6546 + }, + { + "epoch": 0.93, + "grad_norm": 7.925614723191517, + "learning_rate": 5.776868430313985e-06, + "loss": 0.9307, + "step": 6547 + }, + { + "epoch": 0.93, + "grad_norm": 6.816503577960168, + "learning_rate": 5.7757276904917915e-06, + "loss": 0.9942, + "step": 6548 + }, + { + "epoch": 0.93, + "grad_norm": 9.930546040416978, + "learning_rate": 
5.774586909294443e-06, + "loss": 0.9059, + "step": 6549 + }, + { + "epoch": 0.93, + "grad_norm": 5.84196139956753, + "learning_rate": 5.773446086782785e-06, + "loss": 0.9377, + "step": 6550 + }, + { + "epoch": 0.93, + "grad_norm": 7.389284274946193, + "learning_rate": 5.772305223017665e-06, + "loss": 1.0134, + "step": 6551 + }, + { + "epoch": 0.93, + "grad_norm": 6.970216621801214, + "learning_rate": 5.771164318059934e-06, + "loss": 0.9368, + "step": 6552 + }, + { + "epoch": 0.93, + "grad_norm": 6.84189806658679, + "learning_rate": 5.7700233719704465e-06, + "loss": 0.8689, + "step": 6553 + }, + { + "epoch": 0.93, + "grad_norm": 7.258025364786113, + "learning_rate": 5.768882384810054e-06, + "loss": 1.0276, + "step": 6554 + }, + { + "epoch": 0.93, + "grad_norm": 7.014787856199779, + "learning_rate": 5.767741356639615e-06, + "loss": 0.9134, + "step": 6555 + }, + { + "epoch": 0.93, + "grad_norm": 10.77814612460528, + "learning_rate": 5.7666002875199925e-06, + "loss": 0.9168, + "step": 6556 + }, + { + "epoch": 0.94, + "grad_norm": 6.679767017351306, + "learning_rate": 5.7654591775120405e-06, + "loss": 0.8779, + "step": 6557 + }, + { + "epoch": 0.94, + "grad_norm": 5.961211832618957, + "learning_rate": 5.7643180266766285e-06, + "loss": 0.9369, + "step": 6558 + }, + { + "epoch": 0.94, + "grad_norm": 7.318214624961239, + "learning_rate": 5.7631768350746194e-06, + "loss": 0.8866, + "step": 6559 + }, + { + "epoch": 0.94, + "grad_norm": 6.698004243617374, + "learning_rate": 5.7620356027668835e-06, + "loss": 0.9043, + "step": 6560 + }, + { + "epoch": 0.94, + "grad_norm": 9.247885742591412, + "learning_rate": 5.7608943298142895e-06, + "loss": 1.0029, + "step": 6561 + }, + { + "epoch": 0.94, + "grad_norm": 9.324918552120781, + "learning_rate": 5.75975301627771e-06, + "loss": 0.8567, + "step": 6562 + }, + { + "epoch": 0.94, + "grad_norm": 8.102947336940064, + "learning_rate": 5.758611662218019e-06, + "loss": 0.8959, + "step": 6563 + }, + { + "epoch": 0.94, + "grad_norm": 
7.428827443231963, + "learning_rate": 5.757470267696093e-06, + "loss": 0.9701, + "step": 6564 + }, + { + "epoch": 0.94, + "grad_norm": 7.823133431243138, + "learning_rate": 5.7563288327728115e-06, + "loss": 0.9478, + "step": 6565 + }, + { + "epoch": 0.94, + "grad_norm": 7.304541963901101, + "learning_rate": 5.755187357509056e-06, + "loss": 1.0017, + "step": 6566 + }, + { + "epoch": 0.94, + "grad_norm": 6.964322749672286, + "learning_rate": 5.754045841965708e-06, + "loss": 0.9658, + "step": 6567 + }, + { + "epoch": 0.94, + "grad_norm": 5.713899503522005, + "learning_rate": 5.752904286203655e-06, + "loss": 0.9105, + "step": 6568 + }, + { + "epoch": 0.94, + "grad_norm": 6.452651695956827, + "learning_rate": 5.751762690283782e-06, + "loss": 0.898, + "step": 6569 + }, + { + "epoch": 0.94, + "grad_norm": 6.6797711646946585, + "learning_rate": 5.75062105426698e-06, + "loss": 1.0041, + "step": 6570 + }, + { + "epoch": 0.94, + "grad_norm": 6.257303110577125, + "learning_rate": 5.74947937821414e-06, + "loss": 0.9395, + "step": 6571 + }, + { + "epoch": 0.94, + "grad_norm": 7.315045677715637, + "learning_rate": 5.748337662186158e-06, + "loss": 0.9572, + "step": 6572 + }, + { + "epoch": 0.94, + "grad_norm": 6.249909402406723, + "learning_rate": 5.747195906243925e-06, + "loss": 0.895, + "step": 6573 + }, + { + "epoch": 0.94, + "grad_norm": 6.102069622269635, + "learning_rate": 5.7460541104483435e-06, + "loss": 0.9534, + "step": 6574 + }, + { + "epoch": 0.94, + "grad_norm": 9.721715725233686, + "learning_rate": 5.744912274860314e-06, + "loss": 1.0197, + "step": 6575 + }, + { + "epoch": 0.94, + "grad_norm": 7.490116411995148, + "learning_rate": 5.7437703995407355e-06, + "loss": 0.9124, + "step": 6576 + }, + { + "epoch": 0.94, + "grad_norm": 7.342116023369519, + "learning_rate": 5.742628484550514e-06, + "loss": 0.9873, + "step": 6577 + }, + { + "epoch": 0.94, + "grad_norm": 9.316606177734197, + "learning_rate": 5.741486529950557e-06, + "loss": 0.9385, + "step": 6578 + }, + { + 
"epoch": 0.94, + "grad_norm": 7.951021431138527, + "learning_rate": 5.740344535801773e-06, + "loss": 0.9857, + "step": 6579 + }, + { + "epoch": 0.94, + "grad_norm": 6.595185498385695, + "learning_rate": 5.739202502165071e-06, + "loss": 0.8651, + "step": 6580 + }, + { + "epoch": 0.94, + "grad_norm": 7.302406694285999, + "learning_rate": 5.738060429101365e-06, + "loss": 0.8784, + "step": 6581 + }, + { + "epoch": 0.94, + "grad_norm": 8.437192328087017, + "learning_rate": 5.736918316671573e-06, + "loss": 0.9751, + "step": 6582 + }, + { + "epoch": 0.94, + "grad_norm": 4.949712671197386, + "learning_rate": 5.735776164936605e-06, + "loss": 0.9239, + "step": 6583 + }, + { + "epoch": 0.94, + "grad_norm": 6.319369206761344, + "learning_rate": 5.734633973957387e-06, + "loss": 0.993, + "step": 6584 + }, + { + "epoch": 0.94, + "grad_norm": 6.68777390304464, + "learning_rate": 5.7334917437948376e-06, + "loss": 1.0154, + "step": 6585 + }, + { + "epoch": 0.94, + "grad_norm": 8.150186934211632, + "learning_rate": 5.73234947450988e-06, + "loss": 0.8937, + "step": 6586 + }, + { + "epoch": 0.94, + "grad_norm": 8.619515400964788, + "learning_rate": 5.7312071661634395e-06, + "loss": 0.9007, + "step": 6587 + }, + { + "epoch": 0.94, + "grad_norm": 8.49220514811092, + "learning_rate": 5.7300648188164445e-06, + "loss": 0.9341, + "step": 6588 + }, + { + "epoch": 0.94, + "grad_norm": 6.629092622779984, + "learning_rate": 5.728922432529823e-06, + "loss": 0.985, + "step": 6589 + }, + { + "epoch": 0.94, + "grad_norm": 8.466176265976825, + "learning_rate": 5.727780007364509e-06, + "loss": 0.8966, + "step": 6590 + }, + { + "epoch": 0.94, + "grad_norm": 5.426822577322451, + "learning_rate": 5.726637543381437e-06, + "loss": 1.0215, + "step": 6591 + }, + { + "epoch": 0.94, + "grad_norm": 8.28352745523084, + "learning_rate": 5.725495040641538e-06, + "loss": 0.8977, + "step": 6592 + }, + { + "epoch": 0.94, + "grad_norm": 7.398710626123078, + "learning_rate": 5.724352499205753e-06, + "loss": 0.9168, + 
"step": 6593 + }, + { + "epoch": 0.94, + "grad_norm": 5.425483843658618, + "learning_rate": 5.7232099191350244e-06, + "loss": 1.009, + "step": 6594 + }, + { + "epoch": 0.94, + "grad_norm": 7.041175954270725, + "learning_rate": 5.722067300490291e-06, + "loss": 1.0584, + "step": 6595 + }, + { + "epoch": 0.94, + "grad_norm": 9.290566373667806, + "learning_rate": 5.720924643332496e-06, + "loss": 0.8958, + "step": 6596 + }, + { + "epoch": 0.94, + "grad_norm": 8.035400912026633, + "learning_rate": 5.719781947722589e-06, + "loss": 0.8649, + "step": 6597 + }, + { + "epoch": 0.94, + "grad_norm": 7.919017168649423, + "learning_rate": 5.718639213721515e-06, + "loss": 0.9084, + "step": 6598 + }, + { + "epoch": 0.94, + "grad_norm": 7.244123908457132, + "learning_rate": 5.717496441390226e-06, + "loss": 0.9851, + "step": 6599 + }, + { + "epoch": 0.94, + "grad_norm": 7.5752070639880476, + "learning_rate": 5.716353630789675e-06, + "loss": 0.8728, + "step": 6600 + }, + { + "epoch": 0.94, + "grad_norm": 5.462840868938448, + "learning_rate": 5.715210781980816e-06, + "loss": 0.8906, + "step": 6601 + }, + { + "epoch": 0.94, + "grad_norm": 4.86310720850434, + "learning_rate": 5.714067895024602e-06, + "loss": 1.0679, + "step": 6602 + }, + { + "epoch": 0.94, + "grad_norm": 7.097788838842827, + "learning_rate": 5.712924969981995e-06, + "loss": 0.9976, + "step": 6603 + }, + { + "epoch": 0.94, + "grad_norm": 6.977458353214828, + "learning_rate": 5.711782006913957e-06, + "loss": 0.9716, + "step": 6604 + }, + { + "epoch": 0.94, + "grad_norm": 6.2942321632719604, + "learning_rate": 5.710639005881446e-06, + "loss": 0.9662, + "step": 6605 + }, + { + "epoch": 0.94, + "grad_norm": 6.630033818151364, + "learning_rate": 5.709495966945429e-06, + "loss": 0.8331, + "step": 6606 + }, + { + "epoch": 0.94, + "grad_norm": 7.686797620208163, + "learning_rate": 5.708352890166871e-06, + "loss": 0.9775, + "step": 6607 + }, + { + "epoch": 0.94, + "grad_norm": 6.555764801611139, + "learning_rate": 
5.707209775606742e-06, + "loss": 0.9393, + "step": 6608 + }, + { + "epoch": 0.94, + "grad_norm": 6.524172320210403, + "learning_rate": 5.706066623326012e-06, + "loss": 0.9309, + "step": 6609 + }, + { + "epoch": 0.94, + "grad_norm": 8.119453226238106, + "learning_rate": 5.704923433385655e-06, + "loss": 1.0012, + "step": 6610 + }, + { + "epoch": 0.94, + "grad_norm": 7.118127953973487, + "learning_rate": 5.703780205846642e-06, + "loss": 0.9777, + "step": 6611 + }, + { + "epoch": 0.94, + "grad_norm": 8.392852373873255, + "learning_rate": 5.702636940769953e-06, + "loss": 0.9618, + "step": 6612 + }, + { + "epoch": 0.94, + "grad_norm": 6.287555643810255, + "learning_rate": 5.701493638216566e-06, + "loss": 0.9338, + "step": 6613 + }, + { + "epoch": 0.94, + "grad_norm": 7.083731319741362, + "learning_rate": 5.700350298247461e-06, + "loss": 0.9674, + "step": 6614 + }, + { + "epoch": 0.94, + "grad_norm": 8.336983924618027, + "learning_rate": 5.69920692092362e-06, + "loss": 0.9428, + "step": 6615 + }, + { + "epoch": 0.94, + "grad_norm": 9.481031691598398, + "learning_rate": 5.698063506306027e-06, + "loss": 0.8954, + "step": 6616 + }, + { + "epoch": 0.94, + "grad_norm": 7.272481793469062, + "learning_rate": 5.696920054455671e-06, + "loss": 0.9533, + "step": 6617 + }, + { + "epoch": 0.94, + "grad_norm": 8.641487762318532, + "learning_rate": 5.695776565433539e-06, + "loss": 0.8534, + "step": 6618 + }, + { + "epoch": 0.94, + "grad_norm": 8.647234849118819, + "learning_rate": 5.694633039300621e-06, + "loss": 0.8995, + "step": 6619 + }, + { + "epoch": 0.94, + "grad_norm": 7.478452861512679, + "learning_rate": 5.693489476117911e-06, + "loss": 0.9709, + "step": 6620 + }, + { + "epoch": 0.94, + "grad_norm": 7.783167662369243, + "learning_rate": 5.692345875946402e-06, + "loss": 0.9901, + "step": 6621 + }, + { + "epoch": 0.94, + "grad_norm": 7.573326353540654, + "learning_rate": 5.691202238847091e-06, + "loss": 0.9257, + "step": 6622 + }, + { + "epoch": 0.94, + "grad_norm": 
9.074765724480319, + "learning_rate": 5.690058564880978e-06, + "loss": 0.9864, + "step": 6623 + }, + { + "epoch": 0.94, + "grad_norm": 5.656973628635774, + "learning_rate": 5.688914854109061e-06, + "loss": 1.0148, + "step": 6624 + }, + { + "epoch": 0.94, + "grad_norm": 7.836617172855108, + "learning_rate": 5.687771106592344e-06, + "loss": 0.9301, + "step": 6625 + }, + { + "epoch": 0.94, + "grad_norm": 7.512027919861395, + "learning_rate": 5.686627322391829e-06, + "loss": 1.0006, + "step": 6626 + }, + { + "epoch": 0.95, + "grad_norm": 7.0698520078553555, + "learning_rate": 5.685483501568525e-06, + "loss": 0.9843, + "step": 6627 + }, + { + "epoch": 0.95, + "grad_norm": 8.442019707357195, + "learning_rate": 5.684339644183439e-06, + "loss": 0.9252, + "step": 6628 + }, + { + "epoch": 0.95, + "grad_norm": 8.710917074533429, + "learning_rate": 5.683195750297582e-06, + "loss": 0.9205, + "step": 6629 + }, + { + "epoch": 0.95, + "grad_norm": 8.595744237635419, + "learning_rate": 5.682051819971964e-06, + "loss": 0.8696, + "step": 6630 + }, + { + "epoch": 0.95, + "grad_norm": 6.967651917965765, + "learning_rate": 5.6809078532676e-06, + "loss": 0.8667, + "step": 6631 + }, + { + "epoch": 0.95, + "grad_norm": 8.127080081923216, + "learning_rate": 5.6797638502455084e-06, + "loss": 0.9305, + "step": 6632 + }, + { + "epoch": 0.95, + "grad_norm": 6.197690615655413, + "learning_rate": 5.678619810966705e-06, + "loss": 0.9564, + "step": 6633 + }, + { + "epoch": 0.95, + "grad_norm": 8.86867008647589, + "learning_rate": 5.677475735492208e-06, + "loss": 0.9163, + "step": 6634 + }, + { + "epoch": 0.95, + "grad_norm": 7.528512111347585, + "learning_rate": 5.676331623883042e-06, + "loss": 0.9811, + "step": 6635 + }, + { + "epoch": 0.95, + "grad_norm": 7.059182309809349, + "learning_rate": 5.675187476200229e-06, + "loss": 0.8457, + "step": 6636 + }, + { + "epoch": 0.95, + "grad_norm": 7.065863435565908, + "learning_rate": 5.674043292504796e-06, + "loss": 0.9099, + "step": 6637 + }, + { + 
"epoch": 0.95, + "grad_norm": 5.996079961536828, + "learning_rate": 5.67289907285777e-06, + "loss": 1.026, + "step": 6638 + }, + { + "epoch": 0.95, + "grad_norm": 6.976155219744575, + "learning_rate": 5.671754817320182e-06, + "loss": 0.8993, + "step": 6639 + }, + { + "epoch": 0.95, + "grad_norm": 6.4196843298346655, + "learning_rate": 5.67061052595306e-06, + "loss": 0.8722, + "step": 6640 + }, + { + "epoch": 0.95, + "grad_norm": 6.108589797039491, + "learning_rate": 5.669466198817439e-06, + "loss": 0.8622, + "step": 6641 + }, + { + "epoch": 0.95, + "grad_norm": 5.5630343254042645, + "learning_rate": 5.6683218359743565e-06, + "loss": 0.8726, + "step": 6642 + }, + { + "epoch": 0.95, + "grad_norm": 7.357555858652795, + "learning_rate": 5.667177437484845e-06, + "loss": 0.9704, + "step": 6643 + }, + { + "epoch": 0.95, + "grad_norm": 9.062336216228365, + "learning_rate": 5.666033003409948e-06, + "loss": 0.8479, + "step": 6644 + }, + { + "epoch": 0.95, + "grad_norm": 7.433479322335107, + "learning_rate": 5.664888533810704e-06, + "loss": 0.9229, + "step": 6645 + }, + { + "epoch": 0.95, + "grad_norm": 7.123168678558906, + "learning_rate": 5.663744028748156e-06, + "loss": 0.9444, + "step": 6646 + }, + { + "epoch": 0.95, + "grad_norm": 7.289180881481487, + "learning_rate": 5.662599488283349e-06, + "loss": 0.9796, + "step": 6647 + }, + { + "epoch": 0.95, + "grad_norm": 7.227501434177451, + "learning_rate": 5.661454912477331e-06, + "loss": 0.8659, + "step": 6648 + }, + { + "epoch": 0.95, + "grad_norm": 6.582672914608264, + "learning_rate": 5.660310301391148e-06, + "loss": 0.8948, + "step": 6649 + }, + { + "epoch": 0.95, + "grad_norm": 6.667772225011971, + "learning_rate": 5.65916565508585e-06, + "loss": 0.8716, + "step": 6650 + }, + { + "epoch": 0.95, + "grad_norm": 7.83618118838215, + "learning_rate": 5.6580209736224934e-06, + "loss": 0.9544, + "step": 6651 + }, + { + "epoch": 0.95, + "grad_norm": 6.42092321924134, + "learning_rate": 5.656876257062128e-06, + "loss": 0.9348, + 
"step": 6652 + }, + { + "epoch": 0.95, + "grad_norm": 6.364707167714541, + "learning_rate": 5.655731505465812e-06, + "loss": 0.9235, + "step": 6653 + }, + { + "epoch": 0.95, + "grad_norm": 6.657176845657835, + "learning_rate": 5.654586718894603e-06, + "loss": 0.9542, + "step": 6654 + }, + { + "epoch": 0.95, + "grad_norm": 5.894342486266327, + "learning_rate": 5.65344189740956e-06, + "loss": 1.0054, + "step": 6655 + }, + { + "epoch": 0.95, + "grad_norm": 6.35697709799621, + "learning_rate": 5.652297041071745e-06, + "loss": 0.9551, + "step": 6656 + }, + { + "epoch": 0.95, + "grad_norm": 8.968742929539006, + "learning_rate": 5.651152149942222e-06, + "loss": 0.9781, + "step": 6657 + }, + { + "epoch": 0.95, + "grad_norm": 7.0229462682055015, + "learning_rate": 5.6500072240820555e-06, + "loss": 1.0128, + "step": 6658 + }, + { + "epoch": 0.95, + "grad_norm": 4.824065143078924, + "learning_rate": 5.648862263552312e-06, + "loss": 0.8704, + "step": 6659 + }, + { + "epoch": 0.95, + "grad_norm": 8.583248468715121, + "learning_rate": 5.647717268414063e-06, + "loss": 0.9484, + "step": 6660 + }, + { + "epoch": 0.95, + "grad_norm": 7.5252873685766115, + "learning_rate": 5.646572238728379e-06, + "loss": 0.9743, + "step": 6661 + }, + { + "epoch": 0.95, + "grad_norm": 5.336451691815387, + "learning_rate": 5.645427174556329e-06, + "loss": 0.8591, + "step": 6662 + }, + { + "epoch": 0.95, + "grad_norm": 7.310549240745072, + "learning_rate": 5.644282075958991e-06, + "loss": 0.982, + "step": 6663 + }, + { + "epoch": 0.95, + "grad_norm": 7.3867318955153305, + "learning_rate": 5.64313694299744e-06, + "loss": 0.8596, + "step": 6664 + }, + { + "epoch": 0.95, + "grad_norm": 7.931396033671622, + "learning_rate": 5.641991775732756e-06, + "loss": 0.9292, + "step": 6665 + }, + { + "epoch": 0.95, + "grad_norm": 8.176286281024353, + "learning_rate": 5.6408465742260155e-06, + "loss": 0.9887, + "step": 6666 + }, + { + "epoch": 0.95, + "grad_norm": 9.111151487440075, + "learning_rate": 
5.639701338538304e-06, + "loss": 0.8983, + "step": 6667 + }, + { + "epoch": 0.95, + "grad_norm": 8.401189071302932, + "learning_rate": 5.638556068730705e-06, + "loss": 0.9556, + "step": 6668 + }, + { + "epoch": 0.95, + "grad_norm": 7.5633406772883545, + "learning_rate": 5.6374107648643e-06, + "loss": 0.9349, + "step": 6669 + }, + { + "epoch": 0.95, + "grad_norm": 6.65668459597422, + "learning_rate": 5.63626542700018e-06, + "loss": 0.922, + "step": 6670 + }, + { + "epoch": 0.95, + "grad_norm": 8.550574319856157, + "learning_rate": 5.635120055199435e-06, + "loss": 0.8739, + "step": 6671 + }, + { + "epoch": 0.95, + "grad_norm": 8.143534567079941, + "learning_rate": 5.633974649523152e-06, + "loss": 0.9006, + "step": 6672 + }, + { + "epoch": 0.95, + "grad_norm": 6.792174717188811, + "learning_rate": 5.6328292100324265e-06, + "loss": 0.9373, + "step": 6673 + }, + { + "epoch": 0.95, + "grad_norm": 8.314537943902096, + "learning_rate": 5.6316837367883514e-06, + "loss": 0.9194, + "step": 6674 + }, + { + "epoch": 0.95, + "grad_norm": 6.160442013305696, + "learning_rate": 5.630538229852026e-06, + "loss": 0.9841, + "step": 6675 + }, + { + "epoch": 0.95, + "grad_norm": 7.702546287714611, + "learning_rate": 5.629392689284545e-06, + "loss": 0.8991, + "step": 6676 + }, + { + "epoch": 0.95, + "grad_norm": 6.478314209589246, + "learning_rate": 5.6282471151470115e-06, + "loss": 0.9352, + "step": 6677 + }, + { + "epoch": 0.95, + "grad_norm": 5.466407190912094, + "learning_rate": 5.627101507500523e-06, + "loss": 0.9536, + "step": 6678 + }, + { + "epoch": 0.95, + "grad_norm": 9.394980441394964, + "learning_rate": 5.625955866406188e-06, + "loss": 0.9732, + "step": 6679 + }, + { + "epoch": 0.95, + "grad_norm": 7.487534887011426, + "learning_rate": 5.62481019192511e-06, + "loss": 0.9347, + "step": 6680 + }, + { + "epoch": 0.95, + "grad_norm": 8.211742219412441, + "learning_rate": 5.623664484118394e-06, + "loss": 0.8971, + "step": 6681 + }, + { + "epoch": 0.95, + "grad_norm": 
7.633311137352695, + "learning_rate": 5.622518743047149e-06, + "loss": 0.9926, + "step": 6682 + }, + { + "epoch": 0.95, + "grad_norm": 8.249371106888935, + "learning_rate": 5.621372968772489e-06, + "loss": 0.8746, + "step": 6683 + }, + { + "epoch": 0.95, + "grad_norm": 10.166879916164337, + "learning_rate": 5.620227161355525e-06, + "loss": 0.9421, + "step": 6684 + }, + { + "epoch": 0.95, + "grad_norm": 8.597756790410504, + "learning_rate": 5.619081320857369e-06, + "loss": 0.8947, + "step": 6685 + }, + { + "epoch": 0.95, + "grad_norm": 7.289525357159646, + "learning_rate": 5.61793544733914e-06, + "loss": 0.915, + "step": 6686 + }, + { + "epoch": 0.95, + "grad_norm": 9.147365159785805, + "learning_rate": 5.616789540861954e-06, + "loss": 0.9408, + "step": 6687 + }, + { + "epoch": 0.95, + "grad_norm": 6.117831024694086, + "learning_rate": 5.615643601486929e-06, + "loss": 1.0257, + "step": 6688 + }, + { + "epoch": 0.95, + "grad_norm": 9.223514100096384, + "learning_rate": 5.61449762927519e-06, + "loss": 0.9304, + "step": 6689 + }, + { + "epoch": 0.95, + "grad_norm": 8.592193307260747, + "learning_rate": 5.6133516242878576e-06, + "loss": 0.9283, + "step": 6690 + }, + { + "epoch": 0.95, + "grad_norm": 7.1355447534169425, + "learning_rate": 5.612205586586056e-06, + "loss": 1.0379, + "step": 6691 + }, + { + "epoch": 0.95, + "grad_norm": 5.682276937755735, + "learning_rate": 5.6110595162309135e-06, + "loss": 0.9275, + "step": 6692 + }, + { + "epoch": 0.95, + "grad_norm": 8.196185822145168, + "learning_rate": 5.609913413283557e-06, + "loss": 0.9932, + "step": 6693 + }, + { + "epoch": 0.95, + "grad_norm": 6.532851756331697, + "learning_rate": 5.608767277805116e-06, + "loss": 0.9242, + "step": 6694 + }, + { + "epoch": 0.95, + "grad_norm": 6.798008823456104, + "learning_rate": 5.6076211098567236e-06, + "loss": 0.8697, + "step": 6695 + }, + { + "epoch": 0.95, + "grad_norm": 7.529444859447535, + "learning_rate": 5.606474909499513e-06, + "loss": 0.9297, + "step": 6696 + }, + { + 
"epoch": 0.96, + "grad_norm": 5.962915051970084, + "learning_rate": 5.605328676794617e-06, + "loss": 0.9684, + "step": 6697 + }, + { + "epoch": 0.96, + "grad_norm": 5.750339614366656, + "learning_rate": 5.604182411803175e-06, + "loss": 0.9645, + "step": 6698 + }, + { + "epoch": 0.96, + "grad_norm": 8.413545566353088, + "learning_rate": 5.603036114586326e-06, + "loss": 0.9193, + "step": 6699 + }, + { + "epoch": 0.96, + "grad_norm": 6.386599807103791, + "learning_rate": 5.601889785205209e-06, + "loss": 0.966, + "step": 6700 + }, + { + "epoch": 0.96, + "grad_norm": 8.16384334309982, + "learning_rate": 5.600743423720966e-06, + "loss": 0.9011, + "step": 6701 + }, + { + "epoch": 0.96, + "grad_norm": 7.504863549665674, + "learning_rate": 5.59959703019474e-06, + "loss": 0.8802, + "step": 6702 + }, + { + "epoch": 0.96, + "grad_norm": 7.627522739671885, + "learning_rate": 5.598450604687679e-06, + "loss": 0.9812, + "step": 6703 + }, + { + "epoch": 0.96, + "grad_norm": 7.4838028664060845, + "learning_rate": 5.5973041472609265e-06, + "loss": 1.034, + "step": 6704 + }, + { + "epoch": 0.96, + "grad_norm": 8.81920330082298, + "learning_rate": 5.596157657975635e-06, + "loss": 0.9213, + "step": 6705 + }, + { + "epoch": 0.96, + "grad_norm": 8.802109398979756, + "learning_rate": 5.5950111368929535e-06, + "loss": 0.9676, + "step": 6706 + }, + { + "epoch": 0.96, + "grad_norm": 5.264615502176626, + "learning_rate": 5.593864584074033e-06, + "loss": 0.9447, + "step": 6707 + }, + { + "epoch": 0.96, + "grad_norm": 9.879900458935099, + "learning_rate": 5.59271799958003e-06, + "loss": 0.8885, + "step": 6708 + }, + { + "epoch": 0.96, + "grad_norm": 5.866221613811348, + "learning_rate": 5.591571383472099e-06, + "loss": 0.9116, + "step": 6709 + }, + { + "epoch": 0.96, + "grad_norm": 9.969308071187164, + "learning_rate": 5.590424735811396e-06, + "loss": 0.9073, + "step": 6710 + }, + { + "epoch": 0.96, + "grad_norm": 7.3029755492885515, + "learning_rate": 5.589278056659081e-06, + "loss": 0.9249, + 
"step": 6711 + }, + { + "epoch": 0.96, + "grad_norm": 7.189154753898401, + "learning_rate": 5.588131346076315e-06, + "loss": 0.861, + "step": 6712 + }, + { + "epoch": 0.96, + "grad_norm": 6.694711242950474, + "learning_rate": 5.586984604124259e-06, + "loss": 0.9499, + "step": 6713 + }, + { + "epoch": 0.96, + "grad_norm": 6.3282102335253265, + "learning_rate": 5.5858378308640805e-06, + "loss": 0.9173, + "step": 6714 + }, + { + "epoch": 0.96, + "grad_norm": 6.978882890715793, + "learning_rate": 5.584691026356942e-06, + "loss": 0.9601, + "step": 6715 + }, + { + "epoch": 0.96, + "grad_norm": 5.365765054690419, + "learning_rate": 5.583544190664011e-06, + "loss": 1.0458, + "step": 6716 + }, + { + "epoch": 0.96, + "grad_norm": 7.921473231764942, + "learning_rate": 5.582397323846456e-06, + "loss": 1.0266, + "step": 6717 + }, + { + "epoch": 0.96, + "grad_norm": 7.9015425956945, + "learning_rate": 5.581250425965451e-06, + "loss": 0.9513, + "step": 6718 + }, + { + "epoch": 0.96, + "grad_norm": 5.328510136918186, + "learning_rate": 5.580103497082165e-06, + "loss": 0.9726, + "step": 6719 + }, + { + "epoch": 0.96, + "grad_norm": 5.676797363166277, + "learning_rate": 5.578956537257774e-06, + "loss": 0.9174, + "step": 6720 + }, + { + "epoch": 0.96, + "grad_norm": 4.622421099007199, + "learning_rate": 5.577809546553452e-06, + "loss": 0.922, + "step": 6721 + }, + { + "epoch": 0.96, + "grad_norm": 7.774663461875664, + "learning_rate": 5.576662525030378e-06, + "loss": 0.9083, + "step": 6722 + }, + { + "epoch": 0.96, + "grad_norm": 7.441242958260775, + "learning_rate": 5.575515472749729e-06, + "loss": 0.9596, + "step": 6723 + }, + { + "epoch": 0.96, + "grad_norm": 9.295011055444983, + "learning_rate": 5.574368389772686e-06, + "loss": 0.996, + "step": 6724 + }, + { + "epoch": 0.96, + "grad_norm": 6.911716946853822, + "learning_rate": 5.573221276160434e-06, + "loss": 0.8835, + "step": 6725 + }, + { + "epoch": 0.96, + "grad_norm": 7.469941160630341, + "learning_rate": 
5.572074131974153e-06, + "loss": 1.0012, + "step": 6726 + }, + { + "epoch": 0.96, + "grad_norm": 6.4726139950970145, + "learning_rate": 5.570926957275031e-06, + "loss": 0.9497, + "step": 6727 + }, + { + "epoch": 0.96, + "grad_norm": 7.557392361088963, + "learning_rate": 5.569779752124256e-06, + "loss": 0.9221, + "step": 6728 + }, + { + "epoch": 0.96, + "grad_norm": 8.601846996662626, + "learning_rate": 5.568632516583012e-06, + "loss": 0.9623, + "step": 6729 + }, + { + "epoch": 0.96, + "grad_norm": 7.26489988121964, + "learning_rate": 5.567485250712495e-06, + "loss": 0.7959, + "step": 6730 + }, + { + "epoch": 0.96, + "grad_norm": 6.111169057115388, + "learning_rate": 5.566337954573892e-06, + "loss": 1.0447, + "step": 6731 + }, + { + "epoch": 0.96, + "grad_norm": 6.8057122551008415, + "learning_rate": 5.5651906282284e-06, + "loss": 0.9598, + "step": 6732 + }, + { + "epoch": 0.96, + "grad_norm": 5.904348508554729, + "learning_rate": 5.564043271737213e-06, + "loss": 0.9523, + "step": 6733 + }, + { + "epoch": 0.96, + "grad_norm": 6.974120449028274, + "learning_rate": 5.562895885161527e-06, + "loss": 0.8658, + "step": 6734 + }, + { + "epoch": 0.96, + "grad_norm": 6.853993055164237, + "learning_rate": 5.561748468562543e-06, + "loss": 0.8578, + "step": 6735 + }, + { + "epoch": 0.96, + "grad_norm": 9.880564971267765, + "learning_rate": 5.560601022001457e-06, + "loss": 0.9586, + "step": 6736 + }, + { + "epoch": 0.96, + "grad_norm": 5.67544267677145, + "learning_rate": 5.559453545539476e-06, + "loss": 0.9864, + "step": 6737 + }, + { + "epoch": 0.96, + "grad_norm": 6.890543698202022, + "learning_rate": 5.558306039237799e-06, + "loss": 0.933, + "step": 6738 + }, + { + "epoch": 0.96, + "grad_norm": 5.790521491092636, + "learning_rate": 5.557158503157631e-06, + "loss": 0.8251, + "step": 6739 + }, + { + "epoch": 0.96, + "grad_norm": 9.305222315493507, + "learning_rate": 5.5560109373601804e-06, + "loss": 0.8997, + "step": 6740 + }, + { + "epoch": 0.96, + "grad_norm": 
8.011157602879296, + "learning_rate": 5.554863341906652e-06, + "loss": 0.8798, + "step": 6741 + }, + { + "epoch": 0.96, + "grad_norm": 6.4958668999813325, + "learning_rate": 5.553715716858259e-06, + "loss": 1.0705, + "step": 6742 + }, + { + "epoch": 0.96, + "grad_norm": 8.236263178437468, + "learning_rate": 5.552568062276211e-06, + "loss": 0.9606, + "step": 6743 + }, + { + "epoch": 0.96, + "grad_norm": 8.51283683315417, + "learning_rate": 5.551420378221721e-06, + "loss": 0.8806, + "step": 6744 + }, + { + "epoch": 0.96, + "grad_norm": 6.828088638009503, + "learning_rate": 5.5502726647560014e-06, + "loss": 0.8829, + "step": 6745 + }, + { + "epoch": 0.96, + "grad_norm": 5.938857080240418, + "learning_rate": 5.549124921940271e-06, + "loss": 0.9102, + "step": 6746 + }, + { + "epoch": 0.96, + "grad_norm": 8.793988711173514, + "learning_rate": 5.547977149835746e-06, + "loss": 0.9592, + "step": 6747 + }, + { + "epoch": 0.96, + "grad_norm": 6.791443487415316, + "learning_rate": 5.546829348503644e-06, + "loss": 0.9099, + "step": 6748 + }, + { + "epoch": 0.96, + "grad_norm": 6.6133526589142555, + "learning_rate": 5.5456815180051874e-06, + "loss": 0.9744, + "step": 6749 + }, + { + "epoch": 0.96, + "grad_norm": 5.397279140185512, + "learning_rate": 5.544533658401598e-06, + "loss": 0.8824, + "step": 6750 + }, + { + "epoch": 0.96, + "grad_norm": 7.061729608374341, + "learning_rate": 5.543385769754097e-06, + "loss": 0.9589, + "step": 6751 + }, + { + "epoch": 0.96, + "grad_norm": 7.873727488719164, + "learning_rate": 5.542237852123913e-06, + "loss": 1.0337, + "step": 6752 + }, + { + "epoch": 0.96, + "grad_norm": 6.777327131689348, + "learning_rate": 5.541089905572272e-06, + "loss": 0.934, + "step": 6753 + }, + { + "epoch": 0.96, + "grad_norm": 10.224093700014638, + "learning_rate": 5.5399419301604005e-06, + "loss": 0.9233, + "step": 6754 + }, + { + "epoch": 0.96, + "grad_norm": 6.249835209983717, + "learning_rate": 5.538793925949528e-06, + "loss": 1.0386, + "step": 6755 + }, + { + 
"epoch": 0.96, + "grad_norm": 9.210805205100034, + "learning_rate": 5.53764589300089e-06, + "loss": 0.947, + "step": 6756 + }, + { + "epoch": 0.96, + "grad_norm": 7.834019561759945, + "learning_rate": 5.5364978313757156e-06, + "loss": 1.0263, + "step": 6757 + }, + { + "epoch": 0.96, + "grad_norm": 8.711611382263404, + "learning_rate": 5.53534974113524e-06, + "loss": 0.9892, + "step": 6758 + }, + { + "epoch": 0.96, + "grad_norm": 6.1061682490602545, + "learning_rate": 5.534201622340699e-06, + "loss": 0.9042, + "step": 6759 + }, + { + "epoch": 0.96, + "grad_norm": 7.371600836959289, + "learning_rate": 5.53305347505333e-06, + "loss": 0.8907, + "step": 6760 + }, + { + "epoch": 0.96, + "grad_norm": 6.360933747299024, + "learning_rate": 5.531905299334373e-06, + "loss": 0.9962, + "step": 6761 + }, + { + "epoch": 0.96, + "grad_norm": 8.081271423965713, + "learning_rate": 5.530757095245068e-06, + "loss": 0.924, + "step": 6762 + }, + { + "epoch": 0.96, + "grad_norm": 6.477187305730581, + "learning_rate": 5.529608862846658e-06, + "loss": 0.8838, + "step": 6763 + }, + { + "epoch": 0.96, + "grad_norm": 7.566498865750431, + "learning_rate": 5.528460602200384e-06, + "loss": 0.971, + "step": 6764 + }, + { + "epoch": 0.96, + "grad_norm": 9.890317344328578, + "learning_rate": 5.5273123133674925e-06, + "loss": 1.0446, + "step": 6765 + }, + { + "epoch": 0.96, + "grad_norm": 6.139099414175345, + "learning_rate": 5.526163996409232e-06, + "loss": 0.8979, + "step": 6766 + }, + { + "epoch": 0.97, + "grad_norm": 8.401644820572033, + "learning_rate": 5.525015651386846e-06, + "loss": 0.8872, + "step": 6767 + }, + { + "epoch": 0.97, + "grad_norm": 8.138652239276004, + "learning_rate": 5.523867278361588e-06, + "loss": 0.9411, + "step": 6768 + }, + { + "epoch": 0.97, + "grad_norm": 8.446651508852263, + "learning_rate": 5.522718877394706e-06, + "loss": 0.9295, + "step": 6769 + }, + { + "epoch": 0.97, + "grad_norm": 5.702700002189971, + "learning_rate": 5.521570448547455e-06, + "loss": 0.928, + 
"step": 6770 + }, + { + "epoch": 0.97, + "grad_norm": 5.824393623214954, + "learning_rate": 5.520421991881088e-06, + "loss": 0.9358, + "step": 6771 + }, + { + "epoch": 0.97, + "grad_norm": 8.188408569909228, + "learning_rate": 5.519273507456861e-06, + "loss": 0.9471, + "step": 6772 + }, + { + "epoch": 0.97, + "grad_norm": 9.353289072587467, + "learning_rate": 5.518124995336031e-06, + "loss": 0.9315, + "step": 6773 + }, + { + "epoch": 0.97, + "grad_norm": 7.437711498392971, + "learning_rate": 5.516976455579855e-06, + "loss": 0.9165, + "step": 6774 + }, + { + "epoch": 0.97, + "grad_norm": 6.020226248849883, + "learning_rate": 5.515827888249593e-06, + "loss": 0.9578, + "step": 6775 + }, + { + "epoch": 0.97, + "grad_norm": 8.11671586961297, + "learning_rate": 5.51467929340651e-06, + "loss": 1.0793, + "step": 6776 + }, + { + "epoch": 0.97, + "grad_norm": 8.029741495658326, + "learning_rate": 5.513530671111864e-06, + "loss": 0.9084, + "step": 6777 + }, + { + "epoch": 0.97, + "grad_norm": 7.639396358951685, + "learning_rate": 5.512382021426922e-06, + "loss": 1.0226, + "step": 6778 + }, + { + "epoch": 0.97, + "grad_norm": 5.940726329627674, + "learning_rate": 5.5112333444129486e-06, + "loss": 0.9683, + "step": 6779 + }, + { + "epoch": 0.97, + "grad_norm": 5.193021066476851, + "learning_rate": 5.510084640131212e-06, + "loss": 0.9488, + "step": 6780 + }, + { + "epoch": 0.97, + "grad_norm": 8.762165118997032, + "learning_rate": 5.50893590864298e-06, + "loss": 0.8986, + "step": 6781 + }, + { + "epoch": 0.97, + "grad_norm": 8.799019461420517, + "learning_rate": 5.507787150009526e-06, + "loss": 0.9565, + "step": 6782 + }, + { + "epoch": 0.97, + "grad_norm": 8.547117199912936, + "learning_rate": 5.506638364292115e-06, + "loss": 0.8399, + "step": 6783 + }, + { + "epoch": 0.97, + "grad_norm": 8.265746113969199, + "learning_rate": 5.505489551552026e-06, + "loss": 0.9097, + "step": 6784 + }, + { + "epoch": 0.97, + "grad_norm": 7.878351716100488, + "learning_rate": 
5.504340711850532e-06, + "loss": 0.9931, + "step": 6785 + }, + { + "epoch": 0.97, + "grad_norm": 8.929873262676802, + "learning_rate": 5.5031918452489075e-06, + "loss": 0.9883, + "step": 6786 + }, + { + "epoch": 0.97, + "grad_norm": 7.392053577752544, + "learning_rate": 5.50204295180843e-06, + "loss": 0.8441, + "step": 6787 + }, + { + "epoch": 0.97, + "grad_norm": 6.587482888134948, + "learning_rate": 5.500894031590379e-06, + "loss": 0.9186, + "step": 6788 + }, + { + "epoch": 0.97, + "grad_norm": 11.262368016582965, + "learning_rate": 5.499745084656035e-06, + "loss": 0.909, + "step": 6789 + }, + { + "epoch": 0.97, + "grad_norm": 9.376024131995182, + "learning_rate": 5.498596111066679e-06, + "loss": 0.983, + "step": 6790 + }, + { + "epoch": 0.97, + "grad_norm": 7.192241357001804, + "learning_rate": 5.4974471108835935e-06, + "loss": 0.8889, + "step": 6791 + }, + { + "epoch": 0.97, + "grad_norm": 6.838448795324413, + "learning_rate": 5.496298084168065e-06, + "loss": 1.0443, + "step": 6792 + }, + { + "epoch": 0.97, + "grad_norm": 6.064253263416671, + "learning_rate": 5.495149030981379e-06, + "loss": 0.9269, + "step": 6793 + }, + { + "epoch": 0.97, + "grad_norm": 8.268435292036909, + "learning_rate": 5.4939999513848195e-06, + "loss": 0.8477, + "step": 6794 + }, + { + "epoch": 0.97, + "grad_norm": 6.7085336890997285, + "learning_rate": 5.49285084543968e-06, + "loss": 0.9673, + "step": 6795 + }, + { + "epoch": 0.97, + "grad_norm": 7.229138415511051, + "learning_rate": 5.491701713207247e-06, + "loss": 0.9561, + "step": 6796 + }, + { + "epoch": 0.97, + "grad_norm": 6.978997302087264, + "learning_rate": 5.490552554748814e-06, + "loss": 0.9582, + "step": 6797 + }, + { + "epoch": 0.97, + "grad_norm": 7.165956698247774, + "learning_rate": 5.489403370125672e-06, + "loss": 0.9867, + "step": 6798 + }, + { + "epoch": 0.97, + "grad_norm": 6.327341224728466, + "learning_rate": 5.488254159399117e-06, + "loss": 0.9936, + "step": 6799 + }, + { + "epoch": 0.97, + "grad_norm": 
6.772726391694853, + "learning_rate": 5.487104922630444e-06, + "loss": 0.9477, + "step": 6800 + }, + { + "epoch": 0.97, + "grad_norm": 7.2285076666804, + "learning_rate": 5.485955659880951e-06, + "loss": 0.8875, + "step": 6801 + }, + { + "epoch": 0.97, + "grad_norm": 7.365032050545174, + "learning_rate": 5.484806371211936e-06, + "loss": 0.9527, + "step": 6802 + }, + { + "epoch": 0.97, + "grad_norm": 8.166885119565611, + "learning_rate": 5.483657056684697e-06, + "loss": 0.8868, + "step": 6803 + }, + { + "epoch": 0.97, + "grad_norm": 5.385739036169626, + "learning_rate": 5.482507716360539e-06, + "loss": 0.9289, + "step": 6804 + }, + { + "epoch": 0.97, + "grad_norm": 9.058785010341426, + "learning_rate": 5.481358350300761e-06, + "loss": 0.981, + "step": 6805 + }, + { + "epoch": 0.97, + "grad_norm": 6.654438421737218, + "learning_rate": 5.480208958566669e-06, + "loss": 0.9687, + "step": 6806 + }, + { + "epoch": 0.97, + "grad_norm": 9.14802152785414, + "learning_rate": 5.479059541219568e-06, + "loss": 0.947, + "step": 6807 + }, + { + "epoch": 0.97, + "grad_norm": 5.5423628455936225, + "learning_rate": 5.4779100983207634e-06, + "loss": 0.9389, + "step": 6808 + }, + { + "epoch": 0.97, + "grad_norm": 6.601267242071483, + "learning_rate": 5.476760629931566e-06, + "loss": 0.9523, + "step": 6809 + }, + { + "epoch": 0.97, + "grad_norm": 7.89327542634126, + "learning_rate": 5.4756111361132825e-06, + "loss": 0.9399, + "step": 6810 + }, + { + "epoch": 0.97, + "grad_norm": 5.984153460374775, + "learning_rate": 5.474461616927226e-06, + "loss": 0.9579, + "step": 6811 + }, + { + "epoch": 0.97, + "grad_norm": 6.776298272738595, + "learning_rate": 5.473312072434705e-06, + "loss": 0.8944, + "step": 6812 + }, + { + "epoch": 0.97, + "grad_norm": 8.588074162386478, + "learning_rate": 5.472162502697038e-06, + "loss": 0.9591, + "step": 6813 + }, + { + "epoch": 0.97, + "grad_norm": 7.1231208888383835, + "learning_rate": 5.471012907775537e-06, + "loss": 0.9823, + "step": 6814 + }, + { + 
"epoch": 0.97, + "grad_norm": 6.590470618448251, + "learning_rate": 5.469863287731518e-06, + "loss": 1.0562, + "step": 6815 + }, + { + "epoch": 0.97, + "grad_norm": 6.046438930460561, + "learning_rate": 5.4687136426262995e-06, + "loss": 0.9583, + "step": 6816 + }, + { + "epoch": 0.97, + "grad_norm": 7.151106764725794, + "learning_rate": 5.467563972521201e-06, + "loss": 0.9353, + "step": 6817 + }, + { + "epoch": 0.97, + "grad_norm": 7.64814443333938, + "learning_rate": 5.46641427747754e-06, + "loss": 0.9172, + "step": 6818 + }, + { + "epoch": 0.97, + "grad_norm": 7.01980785341239, + "learning_rate": 5.46526455755664e-06, + "loss": 0.9112, + "step": 6819 + }, + { + "epoch": 0.97, + "grad_norm": 6.291725062712106, + "learning_rate": 5.464114812819825e-06, + "loss": 0.9859, + "step": 6820 + }, + { + "epoch": 0.97, + "grad_norm": 5.687514817762935, + "learning_rate": 5.462965043328416e-06, + "loss": 0.9794, + "step": 6821 + }, + { + "epoch": 0.97, + "grad_norm": 8.401419016928621, + "learning_rate": 5.46181524914374e-06, + "loss": 1.0094, + "step": 6822 + }, + { + "epoch": 0.97, + "grad_norm": 6.6778976109589285, + "learning_rate": 5.460665430327125e-06, + "loss": 0.9029, + "step": 6823 + }, + { + "epoch": 0.97, + "grad_norm": 7.093914555897699, + "learning_rate": 5.459515586939899e-06, + "loss": 0.9272, + "step": 6824 + }, + { + "epoch": 0.97, + "grad_norm": 7.083552243748822, + "learning_rate": 5.458365719043389e-06, + "loss": 0.8301, + "step": 6825 + }, + { + "epoch": 0.97, + "grad_norm": 5.7947806502284385, + "learning_rate": 5.457215826698928e-06, + "loss": 0.9144, + "step": 6826 + }, + { + "epoch": 0.97, + "grad_norm": 8.51743784306273, + "learning_rate": 5.456065909967848e-06, + "loss": 1.0017, + "step": 6827 + }, + { + "epoch": 0.97, + "grad_norm": 5.496277569705117, + "learning_rate": 5.4549159689114815e-06, + "loss": 0.9854, + "step": 6828 + }, + { + "epoch": 0.97, + "grad_norm": 6.67855167425188, + "learning_rate": 5.4537660035911636e-06, + "loss": 0.9402, + 
"step": 6829 + }, + { + "epoch": 0.97, + "grad_norm": 8.025751405444568, + "learning_rate": 5.452616014068232e-06, + "loss": 0.9848, + "step": 6830 + }, + { + "epoch": 0.97, + "grad_norm": 6.481078944625567, + "learning_rate": 5.4514660004040175e-06, + "loss": 0.9013, + "step": 6831 + }, + { + "epoch": 0.97, + "grad_norm": 6.850677139994847, + "learning_rate": 5.450315962659867e-06, + "loss": 0.8744, + "step": 6832 + }, + { + "epoch": 0.97, + "grad_norm": 7.81754996319403, + "learning_rate": 5.449165900897115e-06, + "loss": 0.9744, + "step": 6833 + }, + { + "epoch": 0.97, + "grad_norm": 7.456439401059276, + "learning_rate": 5.448015815177105e-06, + "loss": 0.9518, + "step": 6834 + }, + { + "epoch": 0.97, + "grad_norm": 7.696352816970386, + "learning_rate": 5.446865705561177e-06, + "loss": 0.9413, + "step": 6835 + }, + { + "epoch": 0.97, + "grad_norm": 7.961481042302837, + "learning_rate": 5.445715572110678e-06, + "loss": 0.8987, + "step": 6836 + }, + { + "epoch": 0.98, + "grad_norm": 7.247048216459855, + "learning_rate": 5.44456541488695e-06, + "loss": 0.9256, + "step": 6837 + }, + { + "epoch": 0.98, + "grad_norm": 5.855314633572047, + "learning_rate": 5.44341523395134e-06, + "loss": 0.925, + "step": 6838 + }, + { + "epoch": 0.98, + "grad_norm": 7.778930283071659, + "learning_rate": 5.442265029365197e-06, + "loss": 0.9804, + "step": 6839 + }, + { + "epoch": 0.98, + "grad_norm": 6.254748968405409, + "learning_rate": 5.441114801189868e-06, + "loss": 0.9429, + "step": 6840 + }, + { + "epoch": 0.98, + "grad_norm": 8.846786541908788, + "learning_rate": 5.4399645494867005e-06, + "loss": 0.9684, + "step": 6841 + }, + { + "epoch": 0.98, + "grad_norm": 7.37657441196854, + "learning_rate": 5.438814274317052e-06, + "loss": 0.8208, + "step": 6842 + }, + { + "epoch": 0.98, + "grad_norm": 7.124065546860087, + "learning_rate": 5.43766397574227e-06, + "loss": 0.951, + "step": 6843 + }, + { + "epoch": 0.98, + "grad_norm": 8.07180178682027, + "learning_rate": 5.4365136538237106e-06, 
+ "loss": 0.9189, + "step": 6844 + }, + { + "epoch": 0.98, + "grad_norm": 6.204324922288359, + "learning_rate": 5.435363308622728e-06, + "loss": 0.9348, + "step": 6845 + }, + { + "epoch": 0.98, + "grad_norm": 6.517616996900712, + "learning_rate": 5.434212940200677e-06, + "loss": 1.0178, + "step": 6846 + }, + { + "epoch": 0.98, + "grad_norm": 7.0425060160073745, + "learning_rate": 5.433062548618917e-06, + "loss": 0.8924, + "step": 6847 + }, + { + "epoch": 0.98, + "grad_norm": 9.183411390656302, + "learning_rate": 5.431912133938807e-06, + "loss": 0.8331, + "step": 6848 + }, + { + "epoch": 0.98, + "grad_norm": 4.674159929739105, + "learning_rate": 5.4307616962217055e-06, + "loss": 0.8574, + "step": 6849 + }, + { + "epoch": 0.98, + "grad_norm": 8.210248684752425, + "learning_rate": 5.429611235528973e-06, + "loss": 1.0331, + "step": 6850 + }, + { + "epoch": 0.98, + "grad_norm": 6.574443082928528, + "learning_rate": 5.428460751921974e-06, + "loss": 1.0067, + "step": 6851 + }, + { + "epoch": 0.98, + "grad_norm": 9.173424133949943, + "learning_rate": 5.427310245462073e-06, + "loss": 0.9714, + "step": 6852 + }, + { + "epoch": 0.98, + "grad_norm": 8.142012758798025, + "learning_rate": 5.426159716210631e-06, + "loss": 0.9363, + "step": 6853 + }, + { + "epoch": 0.98, + "grad_norm": 7.199933321604792, + "learning_rate": 5.425009164229016e-06, + "loss": 0.8899, + "step": 6854 + }, + { + "epoch": 0.98, + "grad_norm": 9.284917386780128, + "learning_rate": 5.423858589578594e-06, + "loss": 0.9774, + "step": 6855 + }, + { + "epoch": 0.98, + "grad_norm": 11.212709023641183, + "learning_rate": 5.4227079923207375e-06, + "loss": 0.9339, + "step": 6856 + }, + { + "epoch": 0.98, + "grad_norm": 7.168462125129097, + "learning_rate": 5.42155737251681e-06, + "loss": 0.9231, + "step": 6857 + }, + { + "epoch": 0.98, + "grad_norm": 6.293702350455641, + "learning_rate": 5.4204067302281884e-06, + "loss": 0.9198, + "step": 6858 + }, + { + "epoch": 0.98, + "grad_norm": 5.845777506891584, + 
"learning_rate": 5.419256065516242e-06, + "loss": 0.8705, + "step": 6859 + }, + { + "epoch": 0.98, + "grad_norm": 8.458060205267813, + "learning_rate": 5.418105378442341e-06, + "loss": 0.9478, + "step": 6860 + }, + { + "epoch": 0.98, + "grad_norm": 6.2086605144473195, + "learning_rate": 5.4169546690678666e-06, + "loss": 0.9586, + "step": 6861 + }, + { + "epoch": 0.98, + "grad_norm": 7.8041624340164715, + "learning_rate": 5.415803937454189e-06, + "loss": 0.9048, + "step": 6862 + }, + { + "epoch": 0.98, + "grad_norm": 7.49330396024779, + "learning_rate": 5.414653183662687e-06, + "loss": 0.9131, + "step": 6863 + }, + { + "epoch": 0.98, + "grad_norm": 8.714346809977917, + "learning_rate": 5.413502407754738e-06, + "loss": 0.9522, + "step": 6864 + }, + { + "epoch": 0.98, + "grad_norm": 7.945634770701528, + "learning_rate": 5.412351609791722e-06, + "loss": 1.0454, + "step": 6865 + }, + { + "epoch": 0.98, + "grad_norm": 9.368993115526322, + "learning_rate": 5.411200789835018e-06, + "loss": 1.0014, + "step": 6866 + }, + { + "epoch": 0.98, + "grad_norm": 8.013517862141779, + "learning_rate": 5.41004994794601e-06, + "loss": 0.8688, + "step": 6867 + }, + { + "epoch": 0.98, + "grad_norm": 6.976188951336694, + "learning_rate": 5.408899084186079e-06, + "loss": 0.9348, + "step": 6868 + }, + { + "epoch": 0.98, + "grad_norm": 7.196191343810921, + "learning_rate": 5.4077481986166066e-06, + "loss": 0.9129, + "step": 6869 + }, + { + "epoch": 0.98, + "grad_norm": 9.047867666541292, + "learning_rate": 5.406597291298983e-06, + "loss": 0.9357, + "step": 6870 + }, + { + "epoch": 0.98, + "grad_norm": 6.847925054064161, + "learning_rate": 5.4054463622945915e-06, + "loss": 0.9458, + "step": 6871 + }, + { + "epoch": 0.98, + "grad_norm": 8.559082772892381, + "learning_rate": 5.404295411664818e-06, + "loss": 0.8225, + "step": 6872 + }, + { + "epoch": 0.98, + "grad_norm": 5.918336304997929, + "learning_rate": 5.4031444394710544e-06, + "loss": 0.95, + "step": 6873 + }, + { + "epoch": 0.98, + 
"grad_norm": 6.773408904026947, + "learning_rate": 5.401993445774687e-06, + "loss": 0.9082, + "step": 6874 + }, + { + "epoch": 0.98, + "grad_norm": 7.0016701035996425, + "learning_rate": 5.4008424306371085e-06, + "loss": 0.99, + "step": 6875 + }, + { + "epoch": 0.98, + "grad_norm": 7.55995697813859, + "learning_rate": 5.399691394119711e-06, + "loss": 0.9419, + "step": 6876 + }, + { + "epoch": 0.98, + "grad_norm": 8.74826688147426, + "learning_rate": 5.398540336283887e-06, + "loss": 0.874, + "step": 6877 + }, + { + "epoch": 0.98, + "grad_norm": 5.671584664450551, + "learning_rate": 5.397389257191031e-06, + "loss": 0.8967, + "step": 6878 + }, + { + "epoch": 0.98, + "grad_norm": 9.156126514217135, + "learning_rate": 5.3962381569025365e-06, + "loss": 0.9312, + "step": 6879 + }, + { + "epoch": 0.98, + "grad_norm": 7.373376107057237, + "learning_rate": 5.395087035479803e-06, + "loss": 0.994, + "step": 6880 + }, + { + "epoch": 0.98, + "grad_norm": 8.647274435375692, + "learning_rate": 5.393935892984229e-06, + "loss": 0.9229, + "step": 6881 + }, + { + "epoch": 0.98, + "grad_norm": 6.492698833929639, + "learning_rate": 5.3927847294772085e-06, + "loss": 0.8852, + "step": 6882 + }, + { + "epoch": 0.98, + "grad_norm": 9.081852424851716, + "learning_rate": 5.391633545020144e-06, + "loss": 1.0193, + "step": 6883 + }, + { + "epoch": 0.98, + "grad_norm": 6.9118132784924216, + "learning_rate": 5.3904823396744355e-06, + "loss": 0.9456, + "step": 6884 + }, + { + "epoch": 0.98, + "grad_norm": 5.744634803798532, + "learning_rate": 5.389331113501487e-06, + "loss": 1.0206, + "step": 6885 + }, + { + "epoch": 0.98, + "grad_norm": 8.698850639288345, + "learning_rate": 5.388179866562701e-06, + "loss": 0.9788, + "step": 6886 + }, + { + "epoch": 0.98, + "grad_norm": 7.4070448011667755, + "learning_rate": 5.387028598919481e-06, + "loss": 0.9564, + "step": 6887 + }, + { + "epoch": 0.98, + "grad_norm": 6.915822069539127, + "learning_rate": 5.385877310633233e-06, + "loss": 0.912, + "step": 6888 + 
}, + { + "epoch": 0.98, + "grad_norm": 8.419895181879587, + "learning_rate": 5.384726001765362e-06, + "loss": 0.9532, + "step": 6889 + }, + { + "epoch": 0.98, + "grad_norm": 7.343588229206991, + "learning_rate": 5.383574672377279e-06, + "loss": 0.935, + "step": 6890 + }, + { + "epoch": 0.98, + "grad_norm": 6.392488607092849, + "learning_rate": 5.38242332253039e-06, + "loss": 0.9761, + "step": 6891 + }, + { + "epoch": 0.98, + "grad_norm": 7.331794200266092, + "learning_rate": 5.381271952286104e-06, + "loss": 0.8792, + "step": 6892 + }, + { + "epoch": 0.98, + "grad_norm": 8.044867539372826, + "learning_rate": 5.380120561705835e-06, + "loss": 0.9889, + "step": 6893 + }, + { + "epoch": 0.98, + "grad_norm": 7.917062521183074, + "learning_rate": 5.378969150850993e-06, + "loss": 1.0231, + "step": 6894 + }, + { + "epoch": 0.98, + "grad_norm": 6.190205969379041, + "learning_rate": 5.3778177197829905e-06, + "loss": 0.9671, + "step": 6895 + }, + { + "epoch": 0.98, + "grad_norm": 9.578974154758738, + "learning_rate": 5.376666268563243e-06, + "loss": 0.9011, + "step": 6896 + }, + { + "epoch": 0.98, + "grad_norm": 5.378278839363326, + "learning_rate": 5.375514797253166e-06, + "loss": 0.9171, + "step": 6897 + }, + { + "epoch": 0.98, + "grad_norm": 6.814178480387419, + "learning_rate": 5.374363305914174e-06, + "loss": 0.972, + "step": 6898 + }, + { + "epoch": 0.98, + "grad_norm": 7.204447378442116, + "learning_rate": 5.373211794607686e-06, + "loss": 0.9036, + "step": 6899 + }, + { + "epoch": 0.98, + "grad_norm": 8.562780468529521, + "learning_rate": 5.37206026339512e-06, + "loss": 0.911, + "step": 6900 + }, + { + "epoch": 0.98, + "grad_norm": 5.118328603429874, + "learning_rate": 5.370908712337894e-06, + "loss": 0.9446, + "step": 6901 + }, + { + "epoch": 0.98, + "grad_norm": 6.628662282795075, + "learning_rate": 5.36975714149743e-06, + "loss": 0.8888, + "step": 6902 + }, + { + "epoch": 0.98, + "grad_norm": 6.797392040351055, + "learning_rate": 5.3686055509351495e-06, + "loss": 
0.8954, + "step": 6903 + }, + { + "epoch": 0.98, + "grad_norm": 6.989316288234531, + "learning_rate": 5.367453940712475e-06, + "loss": 1.0062, + "step": 6904 + }, + { + "epoch": 0.98, + "grad_norm": 5.166971366567664, + "learning_rate": 5.3663023108908295e-06, + "loss": 0.9213, + "step": 6905 + }, + { + "epoch": 0.98, + "grad_norm": 7.788835213105813, + "learning_rate": 5.365150661531641e-06, + "loss": 0.9056, + "step": 6906 + }, + { + "epoch": 0.99, + "grad_norm": 6.289960743285933, + "learning_rate": 5.36399899269633e-06, + "loss": 0.9026, + "step": 6907 + }, + { + "epoch": 0.99, + "grad_norm": 5.0774004032455595, + "learning_rate": 5.362847304446326e-06, + "loss": 0.9598, + "step": 6908 + }, + { + "epoch": 0.99, + "grad_norm": 5.423649669803673, + "learning_rate": 5.361695596843057e-06, + "loss": 0.9526, + "step": 6909 + }, + { + "epoch": 0.99, + "grad_norm": 8.063512842979542, + "learning_rate": 5.360543869947953e-06, + "loss": 0.8801, + "step": 6910 + }, + { + "epoch": 0.99, + "grad_norm": 7.914537312927556, + "learning_rate": 5.359392123822441e-06, + "loss": 0.9028, + "step": 6911 + }, + { + "epoch": 0.99, + "grad_norm": 9.534114485937456, + "learning_rate": 5.358240358527954e-06, + "loss": 0.9627, + "step": 6912 + }, + { + "epoch": 0.99, + "grad_norm": 8.07506205091955, + "learning_rate": 5.357088574125923e-06, + "loss": 0.9818, + "step": 6913 + }, + { + "epoch": 0.99, + "grad_norm": 6.738120862461788, + "learning_rate": 5.355936770677782e-06, + "loss": 0.9782, + "step": 6914 + }, + { + "epoch": 0.99, + "grad_norm": 7.208105508235665, + "learning_rate": 5.3547849482449645e-06, + "loss": 0.8341, + "step": 6915 + }, + { + "epoch": 0.99, + "grad_norm": 6.1160177866693965, + "learning_rate": 5.3536331068889055e-06, + "loss": 0.9132, + "step": 6916 + }, + { + "epoch": 0.99, + "grad_norm": 6.725423507489706, + "learning_rate": 5.35248124667104e-06, + "loss": 0.899, + "step": 6917 + }, + { + "epoch": 0.99, + "grad_norm": 6.279023604598163, + "learning_rate": 
5.351329367652805e-06, + "loss": 0.9025, + "step": 6918 + }, + { + "epoch": 0.99, + "grad_norm": 8.498211374747749, + "learning_rate": 5.350177469895643e-06, + "loss": 1.0032, + "step": 6919 + }, + { + "epoch": 0.99, + "grad_norm": 6.338368451317611, + "learning_rate": 5.349025553460987e-06, + "loss": 0.9402, + "step": 6920 + }, + { + "epoch": 0.99, + "grad_norm": 8.19296391912993, + "learning_rate": 5.34787361841028e-06, + "loss": 0.9616, + "step": 6921 + }, + { + "epoch": 0.99, + "grad_norm": 6.657934212025692, + "learning_rate": 5.346721664804962e-06, + "loss": 1.0073, + "step": 6922 + }, + { + "epoch": 0.99, + "grad_norm": 9.559216217775777, + "learning_rate": 5.345569692706476e-06, + "loss": 0.9331, + "step": 6923 + }, + { + "epoch": 0.99, + "grad_norm": 7.467658616254599, + "learning_rate": 5.344417702176264e-06, + "loss": 0.9528, + "step": 6924 + }, + { + "epoch": 0.99, + "grad_norm": 7.779024708375057, + "learning_rate": 5.343265693275772e-06, + "loss": 0.8914, + "step": 6925 + }, + { + "epoch": 0.99, + "grad_norm": 8.96920949830938, + "learning_rate": 5.342113666066443e-06, + "loss": 0.9564, + "step": 6926 + }, + { + "epoch": 0.99, + "grad_norm": 6.6897206373482705, + "learning_rate": 5.340961620609722e-06, + "loss": 0.7908, + "step": 6927 + }, + { + "epoch": 0.99, + "grad_norm": 7.389342925250135, + "learning_rate": 5.339809556967058e-06, + "loss": 0.9442, + "step": 6928 + }, + { + "epoch": 0.99, + "grad_norm": 5.316663129718298, + "learning_rate": 5.3386574751998975e-06, + "loss": 0.925, + "step": 6929 + }, + { + "epoch": 0.99, + "grad_norm": 6.847225750591146, + "learning_rate": 5.337505375369691e-06, + "loss": 1.0503, + "step": 6930 + }, + { + "epoch": 0.99, + "grad_norm": 10.933632421633831, + "learning_rate": 5.336353257537886e-06, + "loss": 0.9338, + "step": 6931 + }, + { + "epoch": 0.99, + "grad_norm": 8.391979570569303, + "learning_rate": 5.335201121765935e-06, + "loss": 1.0148, + "step": 6932 + }, + { + "epoch": 0.99, + "grad_norm": 
8.577638731437146, + "learning_rate": 5.3340489681152895e-06, + "loss": 1.0166, + "step": 6933 + }, + { + "epoch": 0.99, + "grad_norm": 8.457976380209859, + "learning_rate": 5.332896796647401e-06, + "loss": 1.0159, + "step": 6934 + }, + { + "epoch": 0.99, + "grad_norm": 6.6824815424665625, + "learning_rate": 5.331744607423726e-06, + "loss": 0.855, + "step": 6935 + }, + { + "epoch": 0.99, + "grad_norm": 10.235474153546484, + "learning_rate": 5.330592400505715e-06, + "loss": 0.9553, + "step": 6936 + }, + { + "epoch": 0.99, + "grad_norm": 6.694329036417631, + "learning_rate": 5.329440175954826e-06, + "loss": 0.8856, + "step": 6937 + }, + { + "epoch": 0.99, + "grad_norm": 7.250857570318853, + "learning_rate": 5.328287933832517e-06, + "loss": 1.0309, + "step": 6938 + }, + { + "epoch": 0.99, + "grad_norm": 6.615218751309817, + "learning_rate": 5.327135674200241e-06, + "loss": 0.9149, + "step": 6939 + }, + { + "epoch": 0.99, + "grad_norm": 7.649837384683466, + "learning_rate": 5.32598339711946e-06, + "loss": 0.835, + "step": 6940 + }, + { + "epoch": 0.99, + "grad_norm": 7.889362057710452, + "learning_rate": 5.324831102651631e-06, + "loss": 0.9102, + "step": 6941 + }, + { + "epoch": 0.99, + "grad_norm": 6.961442230005438, + "learning_rate": 5.323678790858215e-06, + "loss": 0.8725, + "step": 6942 + }, + { + "epoch": 0.99, + "grad_norm": 7.11823764790924, + "learning_rate": 5.3225264618006744e-06, + "loss": 0.8837, + "step": 6943 + }, + { + "epoch": 0.99, + "grad_norm": 9.344441292315313, + "learning_rate": 5.321374115540469e-06, + "loss": 0.9423, + "step": 6944 + }, + { + "epoch": 0.99, + "grad_norm": 7.14203400575217, + "learning_rate": 5.320221752139065e-06, + "loss": 0.786, + "step": 6945 + }, + { + "epoch": 0.99, + "grad_norm": 9.65022873951392, + "learning_rate": 5.319069371657921e-06, + "loss": 1.026, + "step": 6946 + }, + { + "epoch": 0.99, + "grad_norm": 7.454539638620268, + "learning_rate": 5.3179169741585064e-06, + "loss": 0.9159, + "step": 6947 + }, + { + 
"epoch": 0.99, + "grad_norm": 9.353840130194468, + "learning_rate": 5.3167645597022855e-06, + "loss": 0.8995, + "step": 6948 + }, + { + "epoch": 0.99, + "grad_norm": 6.977441150226569, + "learning_rate": 5.3156121283507245e-06, + "loss": 0.9853, + "step": 6949 + }, + { + "epoch": 0.99, + "grad_norm": 5.75269287256272, + "learning_rate": 5.314459680165292e-06, + "loss": 0.9058, + "step": 6950 + }, + { + "epoch": 0.99, + "grad_norm": 7.018941384268693, + "learning_rate": 5.313307215207454e-06, + "loss": 0.9019, + "step": 6951 + }, + { + "epoch": 0.99, + "grad_norm": 7.629715304154484, + "learning_rate": 5.312154733538681e-06, + "loss": 0.9456, + "step": 6952 + }, + { + "epoch": 0.99, + "grad_norm": 5.823246696320246, + "learning_rate": 5.311002235220444e-06, + "loss": 0.94, + "step": 6953 + }, + { + "epoch": 0.99, + "grad_norm": 7.074113709700687, + "learning_rate": 5.309849720314215e-06, + "loss": 0.9306, + "step": 6954 + }, + { + "epoch": 0.99, + "grad_norm": 8.484553148454658, + "learning_rate": 5.308697188881462e-06, + "loss": 0.9807, + "step": 6955 + }, + { + "epoch": 0.99, + "grad_norm": 6.898134180954949, + "learning_rate": 5.307544640983661e-06, + "loss": 0.8889, + "step": 6956 + }, + { + "epoch": 0.99, + "grad_norm": 7.949073813218991, + "learning_rate": 5.306392076682285e-06, + "loss": 0.9724, + "step": 6957 + }, + { + "epoch": 0.99, + "grad_norm": 7.95319160354962, + "learning_rate": 5.305239496038809e-06, + "loss": 0.9063, + "step": 6958 + }, + { + "epoch": 0.99, + "grad_norm": 8.330123241771672, + "learning_rate": 5.3040868991147075e-06, + "loss": 0.9072, + "step": 6959 + }, + { + "epoch": 0.99, + "grad_norm": 7.801090569939124, + "learning_rate": 5.302934285971457e-06, + "loss": 0.9761, + "step": 6960 + }, + { + "epoch": 0.99, + "grad_norm": 9.151634194260415, + "learning_rate": 5.301781656670535e-06, + "loss": 0.933, + "step": 6961 + }, + { + "epoch": 0.99, + "grad_norm": 8.361677400004387, + "learning_rate": 5.30062901127342e-06, + "loss": 0.9725, + 
"step": 6962 + }, + { + "epoch": 0.99, + "grad_norm": 6.6535951162223865, + "learning_rate": 5.299476349841589e-06, + "loss": 0.857, + "step": 6963 + }, + { + "epoch": 0.99, + "grad_norm": 5.667201919678813, + "learning_rate": 5.298323672436526e-06, + "loss": 0.9153, + "step": 6964 + }, + { + "epoch": 0.99, + "grad_norm": 6.434889679893565, + "learning_rate": 5.297170979119706e-06, + "loss": 0.9629, + "step": 6965 + }, + { + "epoch": 0.99, + "grad_norm": 11.875795253447032, + "learning_rate": 5.296018269952613e-06, + "loss": 0.9254, + "step": 6966 + }, + { + "epoch": 0.99, + "grad_norm": 6.505595224935018, + "learning_rate": 5.294865544996731e-06, + "loss": 0.9285, + "step": 6967 + }, + { + "epoch": 0.99, + "grad_norm": 7.139770227509956, + "learning_rate": 5.293712804313541e-06, + "loss": 0.8982, + "step": 6968 + }, + { + "epoch": 0.99, + "grad_norm": 6.042744552621317, + "learning_rate": 5.292560047964528e-06, + "loss": 0.9373, + "step": 6969 + }, + { + "epoch": 0.99, + "grad_norm": 7.261206333931084, + "learning_rate": 5.291407276011175e-06, + "loss": 1.0188, + "step": 6970 + }, + { + "epoch": 0.99, + "grad_norm": 8.241498555566713, + "learning_rate": 5.29025448851497e-06, + "loss": 0.8863, + "step": 6971 + }, + { + "epoch": 0.99, + "grad_norm": 8.617824216036894, + "learning_rate": 5.289101685537399e-06, + "loss": 0.9771, + "step": 6972 + }, + { + "epoch": 0.99, + "grad_norm": 9.082392905319143, + "learning_rate": 5.2879488671399495e-06, + "loss": 1.0014, + "step": 6973 + }, + { + "epoch": 0.99, + "grad_norm": 6.952475841059934, + "learning_rate": 5.286796033384108e-06, + "loss": 0.9596, + "step": 6974 + }, + { + "epoch": 0.99, + "grad_norm": 7.755479581227858, + "learning_rate": 5.285643184331362e-06, + "loss": 0.8879, + "step": 6975 + }, + { + "epoch": 0.99, + "grad_norm": 5.5236177654103304, + "learning_rate": 5.284490320043206e-06, + "loss": 0.8618, + "step": 6976 + }, + { + "epoch": 1.0, + "grad_norm": 7.059318690029693, + "learning_rate": 
5.2833374405811275e-06, + "loss": 0.8753, + "step": 6977 + }, + { + "epoch": 1.0, + "grad_norm": 9.500984400032753, + "learning_rate": 5.28218454600662e-06, + "loss": 0.9608, + "step": 6978 + }, + { + "epoch": 1.0, + "grad_norm": 6.739459451366246, + "learning_rate": 5.281031636381173e-06, + "loss": 0.9694, + "step": 6979 + }, + { + "epoch": 1.0, + "grad_norm": 7.104530477789307, + "learning_rate": 5.279878711766281e-06, + "loss": 0.9001, + "step": 6980 + }, + { + "epoch": 1.0, + "grad_norm": 7.15149026748469, + "learning_rate": 5.278725772223438e-06, + "loss": 0.9615, + "step": 6981 + }, + { + "epoch": 1.0, + "grad_norm": 9.702013338051419, + "learning_rate": 5.277572817814138e-06, + "loss": 0.9533, + "step": 6982 + }, + { + "epoch": 1.0, + "grad_norm": 6.788010858796413, + "learning_rate": 5.276419848599876e-06, + "loss": 0.9578, + "step": 6983 + }, + { + "epoch": 1.0, + "grad_norm": 7.554283242593308, + "learning_rate": 5.275266864642148e-06, + "loss": 1.0626, + "step": 6984 + }, + { + "epoch": 1.0, + "grad_norm": 4.950893961496237, + "learning_rate": 5.274113866002452e-06, + "loss": 0.9603, + "step": 6985 + }, + { + "epoch": 1.0, + "grad_norm": 6.973598132818833, + "learning_rate": 5.272960852742288e-06, + "loss": 0.9423, + "step": 6986 + }, + { + "epoch": 1.0, + "grad_norm": 6.476906509523488, + "learning_rate": 5.2718078249231495e-06, + "loss": 0.9172, + "step": 6987 + }, + { + "epoch": 1.0, + "grad_norm": 6.892891703366927, + "learning_rate": 5.270654782606539e-06, + "loss": 1.0108, + "step": 6988 + }, + { + "epoch": 1.0, + "grad_norm": 7.169879496362002, + "learning_rate": 5.269501725853955e-06, + "loss": 0.9458, + "step": 6989 + }, + { + "epoch": 1.0, + "grad_norm": 7.5637908000326135, + "learning_rate": 5.268348654726899e-06, + "loss": 0.9548, + "step": 6990 + }, + { + "epoch": 1.0, + "grad_norm": 5.709420407679353, + "learning_rate": 5.267195569286874e-06, + "loss": 0.9434, + "step": 6991 + }, + { + "epoch": 1.0, + "grad_norm": 6.978833517672747, + 
"learning_rate": 5.266042469595382e-06, + "loss": 0.8932, + "step": 6992 + }, + { + "epoch": 1.0, + "grad_norm": 6.079837828266627, + "learning_rate": 5.264889355713922e-06, + "loss": 0.9623, + "step": 6993 + }, + { + "epoch": 1.0, + "grad_norm": 7.144078463250194, + "learning_rate": 5.263736227704002e-06, + "loss": 0.8826, + "step": 6994 + }, + { + "epoch": 1.0, + "grad_norm": 7.659879111188505, + "learning_rate": 5.262583085627128e-06, + "loss": 0.9216, + "step": 6995 + }, + { + "epoch": 1.0, + "grad_norm": 7.339919224659549, + "learning_rate": 5.2614299295448e-06, + "loss": 0.9169, + "step": 6996 + }, + { + "epoch": 1.0, + "grad_norm": 6.7797279989881405, + "learning_rate": 5.2602767595185285e-06, + "loss": 0.9116, + "step": 6997 + }, + { + "epoch": 1.0, + "grad_norm": 6.790632277102803, + "learning_rate": 5.25912357560982e-06, + "loss": 0.8892, + "step": 6998 + }, + { + "epoch": 1.0, + "grad_norm": 8.197174725651713, + "learning_rate": 5.257970377880181e-06, + "loss": 0.9339, + "step": 6999 + }, + { + "epoch": 1.0, + "grad_norm": 6.2463398909365235, + "learning_rate": 5.256817166391119e-06, + "loss": 0.8222, + "step": 7000 + }, + { + "epoch": 1.0, + "grad_norm": 5.4881910503664955, + "learning_rate": 5.255663941204146e-06, + "loss": 0.916, + "step": 7001 + }, + { + "epoch": 1.0, + "grad_norm": 7.365018697105698, + "learning_rate": 5.254510702380769e-06, + "loss": 0.92, + "step": 7002 + }, + { + "epoch": 1.0, + "grad_norm": 6.682012266864785, + "learning_rate": 5.253357449982498e-06, + "loss": 0.9782, + "step": 7003 + }, + { + "epoch": 1.0, + "grad_norm": 5.661743765411276, + "learning_rate": 5.252204184070847e-06, + "loss": 0.8757, + "step": 7004 + }, + { + "epoch": 1.0, + "grad_norm": 7.485832783867891, + "learning_rate": 5.251050904707329e-06, + "loss": 0.9531, + "step": 7005 + }, + { + "epoch": 1.0, + "grad_norm": 7.556498462956931, + "learning_rate": 5.249897611953452e-06, + "loss": 0.9225, + "step": 7006 + }, + { + "epoch": 1.0, + "grad_norm": 
7.896687185271014, + "learning_rate": 5.248744305870733e-06, + "loss": 0.9778, + "step": 7007 + }, + { + "epoch": 1.0, + "grad_norm": 6.884534845534008, + "learning_rate": 5.247590986520684e-06, + "loss": 0.9181, + "step": 7008 + }, + { + "epoch": 1.0, + "grad_norm": 7.678086303196155, + "learning_rate": 5.246437653964822e-06, + "loss": 0.9439, + "step": 7009 + }, + { + "epoch": 1.0, + "grad_norm": 6.483340535032447, + "learning_rate": 5.245284308264661e-06, + "loss": 0.8915, + "step": 7010 + }, + { + "epoch": 1.0, + "grad_norm": 7.629754507712478, + "learning_rate": 5.244130949481719e-06, + "loss": 0.9957, + "step": 7011 + }, + { + "epoch": 1.0, + "grad_norm": 11.70889034421395, + "learning_rate": 5.242977577677512e-06, + "loss": 0.9495, + "step": 7012 + }, + { + "epoch": 1.0, + "grad_norm": 10.358714482160414, + "learning_rate": 5.241824192913554e-06, + "loss": 0.8036, + "step": 7013 + }, + { + "epoch": 1.0, + "grad_norm": 6.518336164239729, + "learning_rate": 5.24067079525137e-06, + "loss": 0.722, + "step": 7014 + }, + { + "epoch": 1.0, + "grad_norm": 6.953476018991004, + "learning_rate": 5.2395173847524735e-06, + "loss": 0.7603, + "step": 7015 + }, + { + "epoch": 1.0, + "grad_norm": 7.900228083134104, + "learning_rate": 5.238363961478388e-06, + "loss": 0.836, + "step": 7016 + }, + { + "epoch": 1.0, + "grad_norm": 7.227416133273312, + "learning_rate": 5.2372105254906315e-06, + "loss": 0.7714, + "step": 7017 + }, + { + "epoch": 1.0, + "grad_norm": 8.144792777682069, + "learning_rate": 5.2360570768507255e-06, + "loss": 0.7711, + "step": 7018 + }, + { + "epoch": 1.0, + "grad_norm": 7.311425602767089, + "learning_rate": 5.234903615620193e-06, + "loss": 0.7634, + "step": 7019 + }, + { + "epoch": 1.0, + "grad_norm": 6.171617251571732, + "learning_rate": 5.233750141860554e-06, + "loss": 0.7281, + "step": 7020 + }, + { + "epoch": 1.0, + "grad_norm": 8.214041192428096, + "learning_rate": 5.232596655633335e-06, + "loss": 0.859, + "step": 7021 + }, + { + "epoch": 1.0, + 
"grad_norm": 7.3566704037620685, + "learning_rate": 5.231443157000057e-06, + "loss": 0.804, + "step": 7022 + }, + { + "epoch": 1.0, + "grad_norm": 7.912399947832568, + "learning_rate": 5.230289646022244e-06, + "loss": 0.7799, + "step": 7023 + }, + { + "epoch": 1.0, + "grad_norm": 8.868550059758572, + "learning_rate": 5.229136122761424e-06, + "loss": 0.7545, + "step": 7024 + }, + { + "epoch": 1.0, + "grad_norm": 10.4136870411318, + "learning_rate": 5.227982587279121e-06, + "loss": 0.8562, + "step": 7025 + }, + { + "epoch": 1.0, + "grad_norm": 8.706465609132026, + "learning_rate": 5.226829039636862e-06, + "loss": 0.7523, + "step": 7026 + }, + { + "epoch": 1.0, + "grad_norm": 6.862113325498154, + "learning_rate": 5.225675479896171e-06, + "loss": 0.7751, + "step": 7027 + }, + { + "epoch": 1.0, + "grad_norm": 9.244439814204819, + "learning_rate": 5.2245219081185795e-06, + "loss": 0.7571, + "step": 7028 + }, + { + "epoch": 1.0, + "grad_norm": 7.938497661812305, + "learning_rate": 5.223368324365614e-06, + "loss": 0.749, + "step": 7029 + }, + { + "epoch": 1.0, + "grad_norm": 8.700591192138033, + "learning_rate": 5.222214728698805e-06, + "loss": 0.8104, + "step": 7030 + }, + { + "epoch": 1.0, + "grad_norm": 9.091444020839186, + "learning_rate": 5.22106112117968e-06, + "loss": 0.7501, + "step": 7031 + }, + { + "epoch": 1.0, + "grad_norm": 7.3924401585656785, + "learning_rate": 5.219907501869769e-06, + "loss": 0.8088, + "step": 7032 + }, + { + "epoch": 1.0, + "grad_norm": 7.339838411020649, + "learning_rate": 5.218753870830606e-06, + "loss": 0.772, + "step": 7033 + }, + { + "epoch": 1.0, + "grad_norm": 9.138385550329572, + "learning_rate": 5.2176002281237195e-06, + "loss": 0.7416, + "step": 7034 + }, + { + "epoch": 1.0, + "grad_norm": 8.544922718334158, + "learning_rate": 5.216446573810643e-06, + "loss": 0.7709, + "step": 7035 + }, + { + "epoch": 1.0, + "grad_norm": 6.064563208140848, + "learning_rate": 5.215292907952908e-06, + "loss": 0.738, + "step": 7036 + }, + { + 
"epoch": 1.0, + "grad_norm": 8.165864788315252, + "learning_rate": 5.214139230612049e-06, + "loss": 0.7094, + "step": 7037 + }, + { + "epoch": 1.0, + "grad_norm": 10.833050920467436, + "learning_rate": 5.212985541849599e-06, + "loss": 0.732, + "step": 7038 + }, + { + "epoch": 1.0, + "grad_norm": 6.626516651064956, + "learning_rate": 5.211831841727093e-06, + "loss": 0.7899, + "step": 7039 + }, + { + "epoch": 1.0, + "grad_norm": 6.623716499780984, + "learning_rate": 5.210678130306069e-06, + "loss": 0.7375, + "step": 7040 + }, + { + "epoch": 1.0, + "grad_norm": 11.901304944559682, + "learning_rate": 5.2095244076480556e-06, + "loss": 0.7285, + "step": 7041 + }, + { + "epoch": 1.0, + "grad_norm": 8.780219008777475, + "learning_rate": 5.208370673814595e-06, + "loss": 0.7635, + "step": 7042 + }, + { + "epoch": 1.0, + "grad_norm": 5.914068927682268, + "learning_rate": 5.207216928867224e-06, + "loss": 0.7282, + "step": 7043 + }, + { + "epoch": 1.0, + "grad_norm": 8.068837938266796, + "learning_rate": 5.2060631728674785e-06, + "loss": 0.8369, + "step": 7044 + }, + { + "epoch": 1.0, + "grad_norm": 8.49508316307726, + "learning_rate": 5.204909405876896e-06, + "loss": 0.7561, + "step": 7045 + }, + { + "epoch": 1.0, + "grad_norm": 7.606174227084139, + "learning_rate": 5.203755627957015e-06, + "loss": 0.7958, + "step": 7046 + }, + { + "epoch": 1.0, + "grad_norm": 8.742479280114669, + "learning_rate": 5.202601839169379e-06, + "loss": 0.7469, + "step": 7047 + }, + { + "epoch": 1.01, + "grad_norm": 8.938555096656684, + "learning_rate": 5.201448039575523e-06, + "loss": 0.7127, + "step": 7048 + }, + { + "epoch": 1.01, + "grad_norm": 9.839536204375264, + "learning_rate": 5.20029422923699e-06, + "loss": 0.7834, + "step": 7049 + }, + { + "epoch": 1.01, + "grad_norm": 9.378869252906957, + "learning_rate": 5.19914040821532e-06, + "loss": 0.7367, + "step": 7050 + }, + { + "epoch": 1.01, + "grad_norm": 7.855737691475216, + "learning_rate": 5.1979865765720535e-06, + "loss": 0.8081, + "step": 
7051 + }, + { + "epoch": 1.01, + "grad_norm": 7.94926528624145, + "learning_rate": 5.196832734368735e-06, + "loss": 0.6986, + "step": 7052 + }, + { + "epoch": 1.01, + "grad_norm": 7.920812490508959, + "learning_rate": 5.195678881666909e-06, + "loss": 0.7566, + "step": 7053 + }, + { + "epoch": 1.01, + "grad_norm": 8.447020000541537, + "learning_rate": 5.194525018528114e-06, + "loss": 0.7747, + "step": 7054 + }, + { + "epoch": 1.01, + "grad_norm": 7.874777794696379, + "learning_rate": 5.1933711450138955e-06, + "loss": 0.7726, + "step": 7055 + }, + { + "epoch": 1.01, + "grad_norm": 12.168537913525064, + "learning_rate": 5.192217261185799e-06, + "loss": 0.7203, + "step": 7056 + }, + { + "epoch": 1.01, + "grad_norm": 9.492595065812067, + "learning_rate": 5.1910633671053674e-06, + "loss": 0.7493, + "step": 7057 + }, + { + "epoch": 1.01, + "grad_norm": 7.666635092441438, + "learning_rate": 5.1899094628341494e-06, + "loss": 0.7518, + "step": 7058 + }, + { + "epoch": 1.01, + "grad_norm": 8.13940957556461, + "learning_rate": 5.1887555484336905e-06, + "loss": 0.7226, + "step": 7059 + }, + { + "epoch": 1.01, + "grad_norm": 8.430019114075924, + "learning_rate": 5.1876016239655334e-06, + "loss": 0.8211, + "step": 7060 + }, + { + "epoch": 1.01, + "grad_norm": 7.330954534926477, + "learning_rate": 5.186447689491227e-06, + "loss": 0.7689, + "step": 7061 + }, + { + "epoch": 1.01, + "grad_norm": 8.847775216669872, + "learning_rate": 5.185293745072324e-06, + "loss": 0.7822, + "step": 7062 + }, + { + "epoch": 1.01, + "grad_norm": 7.137663791767022, + "learning_rate": 5.184139790770365e-06, + "loss": 0.7909, + "step": 7063 + }, + { + "epoch": 1.01, + "grad_norm": 7.720334147049373, + "learning_rate": 5.182985826646904e-06, + "loss": 0.8044, + "step": 7064 + }, + { + "epoch": 1.01, + "grad_norm": 7.441602205730784, + "learning_rate": 5.181831852763487e-06, + "loss": 0.8454, + "step": 7065 + }, + { + "epoch": 1.01, + "grad_norm": 6.821546152157257, + "learning_rate": 
5.180677869181665e-06, + "loss": 0.7744, + "step": 7066 + }, + { + "epoch": 1.01, + "grad_norm": 8.773867257175032, + "learning_rate": 5.17952387596299e-06, + "loss": 0.708, + "step": 7067 + }, + { + "epoch": 1.01, + "grad_norm": 10.124499134714066, + "learning_rate": 5.178369873169009e-06, + "loss": 0.6934, + "step": 7068 + }, + { + "epoch": 1.01, + "grad_norm": 9.524571390505345, + "learning_rate": 5.177215860861278e-06, + "loss": 0.73, + "step": 7069 + }, + { + "epoch": 1.01, + "grad_norm": 8.368118547764476, + "learning_rate": 5.176061839101343e-06, + "loss": 0.7953, + "step": 7070 + }, + { + "epoch": 1.01, + "grad_norm": 9.107800823068349, + "learning_rate": 5.174907807950762e-06, + "loss": 0.78, + "step": 7071 + }, + { + "epoch": 1.01, + "grad_norm": 9.169889985317013, + "learning_rate": 5.173753767471086e-06, + "loss": 0.7665, + "step": 7072 + }, + { + "epoch": 1.01, + "grad_norm": 9.367809939598175, + "learning_rate": 5.172599717723866e-06, + "loss": 0.7019, + "step": 7073 + }, + { + "epoch": 1.01, + "grad_norm": 10.44343630048015, + "learning_rate": 5.171445658770657e-06, + "loss": 0.7497, + "step": 7074 + }, + { + "epoch": 1.01, + "grad_norm": 6.403940931955013, + "learning_rate": 5.170291590673016e-06, + "loss": 0.772, + "step": 7075 + }, + { + "epoch": 1.01, + "grad_norm": 10.185027732361851, + "learning_rate": 5.169137513492494e-06, + "loss": 0.7906, + "step": 7076 + }, + { + "epoch": 1.01, + "grad_norm": 9.686831300525014, + "learning_rate": 5.167983427290648e-06, + "loss": 0.8041, + "step": 7077 + }, + { + "epoch": 1.01, + "grad_norm": 7.784193889963055, + "learning_rate": 5.166829332129034e-06, + "loss": 0.8113, + "step": 7078 + }, + { + "epoch": 1.01, + "grad_norm": 7.171175503587817, + "learning_rate": 5.1656752280692075e-06, + "loss": 0.7254, + "step": 7079 + }, + { + "epoch": 1.01, + "grad_norm": 9.141676613551354, + "learning_rate": 5.164521115172725e-06, + "loss": 0.79, + "step": 7080 + }, + { + "epoch": 1.01, + "grad_norm": 6.768672129061837, 
+ "learning_rate": 5.163366993501146e-06, + "loss": 0.8208, + "step": 7081 + }, + { + "epoch": 1.01, + "grad_norm": 9.618545845412367, + "learning_rate": 5.162212863116024e-06, + "loss": 0.7581, + "step": 7082 + }, + { + "epoch": 1.01, + "grad_norm": 8.2571749391662, + "learning_rate": 5.161058724078921e-06, + "loss": 0.8102, + "step": 7083 + }, + { + "epoch": 1.01, + "grad_norm": 9.828893801087103, + "learning_rate": 5.159904576451393e-06, + "loss": 0.7501, + "step": 7084 + }, + { + "epoch": 1.01, + "grad_norm": 7.222531557207571, + "learning_rate": 5.158750420295e-06, + "loss": 0.7718, + "step": 7085 + }, + { + "epoch": 1.01, + "grad_norm": 7.38166437210362, + "learning_rate": 5.1575962556713025e-06, + "loss": 0.731, + "step": 7086 + }, + { + "epoch": 1.01, + "grad_norm": 6.723814823956714, + "learning_rate": 5.156442082641858e-06, + "loss": 0.7962, + "step": 7087 + }, + { + "epoch": 1.01, + "grad_norm": 7.292352641172711, + "learning_rate": 5.1552879012682296e-06, + "loss": 0.7819, + "step": 7088 + }, + { + "epoch": 1.01, + "grad_norm": 7.89891485907909, + "learning_rate": 5.154133711611975e-06, + "loss": 0.7795, + "step": 7089 + }, + { + "epoch": 1.01, + "grad_norm": 8.873354740638206, + "learning_rate": 5.152979513734658e-06, + "loss": 0.6975, + "step": 7090 + }, + { + "epoch": 1.01, + "grad_norm": 9.867518170427772, + "learning_rate": 5.151825307697842e-06, + "loss": 0.8239, + "step": 7091 + }, + { + "epoch": 1.01, + "grad_norm": 8.751695888908655, + "learning_rate": 5.150671093563084e-06, + "loss": 0.7591, + "step": 7092 + }, + { + "epoch": 1.01, + "grad_norm": 9.729706887349312, + "learning_rate": 5.1495168713919496e-06, + "loss": 0.7761, + "step": 7093 + }, + { + "epoch": 1.01, + "grad_norm": 83.11862338871211, + "learning_rate": 5.148362641246002e-06, + "loss": 0.8586, + "step": 7094 + }, + { + "epoch": 1.01, + "grad_norm": 8.293554807213773, + "learning_rate": 5.147208403186804e-06, + "loss": 0.7131, + "step": 7095 + }, + { + "epoch": 1.01, + 
"grad_norm": 9.173737617863546, + "learning_rate": 5.1460541572759195e-06, + "loss": 0.7962, + "step": 7096 + }, + { + "epoch": 1.01, + "grad_norm": 10.008792378364753, + "learning_rate": 5.144899903574913e-06, + "loss": 0.7908, + "step": 7097 + }, + { + "epoch": 1.01, + "grad_norm": 17.564294948605855, + "learning_rate": 5.143745642145349e-06, + "loss": 0.8261, + "step": 7098 + }, + { + "epoch": 1.01, + "grad_norm": 12.230836071252499, + "learning_rate": 5.142591373048791e-06, + "loss": 0.7989, + "step": 7099 + }, + { + "epoch": 1.01, + "grad_norm": 8.145524979920173, + "learning_rate": 5.141437096346809e-06, + "loss": 0.7388, + "step": 7100 + }, + { + "epoch": 1.01, + "grad_norm": 9.93524663660106, + "learning_rate": 5.140282812100964e-06, + "loss": 0.7579, + "step": 7101 + }, + { + "epoch": 1.01, + "grad_norm": 9.359309129713779, + "learning_rate": 5.139128520372825e-06, + "loss": 0.7196, + "step": 7102 + }, + { + "epoch": 1.01, + "grad_norm": 8.343923229209535, + "learning_rate": 5.1379742212239565e-06, + "loss": 0.749, + "step": 7103 + }, + { + "epoch": 1.01, + "grad_norm": 8.117500560755262, + "learning_rate": 5.136819914715928e-06, + "loss": 0.779, + "step": 7104 + }, + { + "epoch": 1.01, + "grad_norm": 8.224143887089465, + "learning_rate": 5.135665600910307e-06, + "loss": 0.7754, + "step": 7105 + }, + { + "epoch": 1.01, + "grad_norm": 12.201553169047674, + "learning_rate": 5.134511279868659e-06, + "loss": 0.7654, + "step": 7106 + }, + { + "epoch": 1.01, + "grad_norm": 8.657116291678628, + "learning_rate": 5.133356951652556e-06, + "loss": 0.864, + "step": 7107 + }, + { + "epoch": 1.01, + "grad_norm": 6.280209969716078, + "learning_rate": 5.1322026163235614e-06, + "loss": 0.7639, + "step": 7108 + }, + { + "epoch": 1.01, + "grad_norm": 6.8742688209209994, + "learning_rate": 5.131048273943249e-06, + "loss": 0.818, + "step": 7109 + }, + { + "epoch": 1.01, + "grad_norm": 8.673080642956913, + "learning_rate": 5.129893924573187e-06, + "loss": 0.7593, + "step": 7110 
+ }, + { + "epoch": 1.01, + "grad_norm": 8.827376390110448, + "learning_rate": 5.1287395682749444e-06, + "loss": 0.7952, + "step": 7111 + }, + { + "epoch": 1.01, + "grad_norm": 6.353470429402055, + "learning_rate": 5.127585205110091e-06, + "loss": 0.7405, + "step": 7112 + }, + { + "epoch": 1.01, + "grad_norm": 9.088664795257598, + "learning_rate": 5.126430835140197e-06, + "loss": 0.7787, + "step": 7113 + }, + { + "epoch": 1.01, + "grad_norm": 7.412405588922891, + "learning_rate": 5.125276458426834e-06, + "loss": 0.7357, + "step": 7114 + }, + { + "epoch": 1.01, + "grad_norm": 7.822042642839984, + "learning_rate": 5.124122075031576e-06, + "loss": 0.8103, + "step": 7115 + }, + { + "epoch": 1.01, + "grad_norm": 7.731222562013156, + "learning_rate": 5.122967685015991e-06, + "loss": 0.7338, + "step": 7116 + }, + { + "epoch": 1.01, + "grad_norm": 10.038058389777266, + "learning_rate": 5.121813288441651e-06, + "loss": 0.7574, + "step": 7117 + }, + { + "epoch": 1.02, + "grad_norm": 6.683501903305488, + "learning_rate": 5.120658885370129e-06, + "loss": 0.7987, + "step": 7118 + }, + { + "epoch": 1.02, + "grad_norm": 9.85301439368646, + "learning_rate": 5.119504475863e-06, + "loss": 0.7555, + "step": 7119 + }, + { + "epoch": 1.02, + "grad_norm": 8.574629095221157, + "learning_rate": 5.1183500599818325e-06, + "loss": 0.7474, + "step": 7120 + }, + { + "epoch": 1.02, + "grad_norm": 5.7170020177487455, + "learning_rate": 5.117195637788204e-06, + "loss": 0.7441, + "step": 7121 + }, + { + "epoch": 1.02, + "grad_norm": 9.835838648472484, + "learning_rate": 5.116041209343685e-06, + "loss": 0.7399, + "step": 7122 + }, + { + "epoch": 1.02, + "grad_norm": 11.8074120438732, + "learning_rate": 5.1148867747098505e-06, + "loss": 0.7635, + "step": 7123 + }, + { + "epoch": 1.02, + "grad_norm": 6.008616601933668, + "learning_rate": 5.113732333948277e-06, + "loss": 0.7615, + "step": 7124 + }, + { + "epoch": 1.02, + "grad_norm": 9.135942648228177, + "learning_rate": 5.112577887120537e-06, + 
"loss": 0.8531, + "step": 7125 + }, + { + "epoch": 1.02, + "grad_norm": 9.476052859894263, + "learning_rate": 5.111423434288206e-06, + "loss": 0.7855, + "step": 7126 + }, + { + "epoch": 1.02, + "grad_norm": 8.878866190986225, + "learning_rate": 5.110268975512858e-06, + "loss": 0.7433, + "step": 7127 + }, + { + "epoch": 1.02, + "grad_norm": 6.937736421386074, + "learning_rate": 5.10911451085607e-06, + "loss": 0.7403, + "step": 7128 + }, + { + "epoch": 1.02, + "grad_norm": 10.803933998837563, + "learning_rate": 5.107960040379419e-06, + "loss": 0.8233, + "step": 7129 + }, + { + "epoch": 1.02, + "grad_norm": 8.340841478654056, + "learning_rate": 5.1068055641444795e-06, + "loss": 0.768, + "step": 7130 + }, + { + "epoch": 1.02, + "grad_norm": 6.95542559927236, + "learning_rate": 5.105651082212828e-06, + "loss": 0.7999, + "step": 7131 + }, + { + "epoch": 1.02, + "grad_norm": 8.302508231926014, + "learning_rate": 5.1044965946460415e-06, + "loss": 0.7415, + "step": 7132 + }, + { + "epoch": 1.02, + "grad_norm": 5.591785961062443, + "learning_rate": 5.103342101505698e-06, + "loss": 0.7812, + "step": 7133 + }, + { + "epoch": 1.02, + "grad_norm": 7.253523953793232, + "learning_rate": 5.1021876028533744e-06, + "loss": 0.6839, + "step": 7134 + }, + { + "epoch": 1.02, + "grad_norm": 11.268992514956846, + "learning_rate": 5.101033098750649e-06, + "loss": 0.8808, + "step": 7135 + }, + { + "epoch": 1.02, + "grad_norm": 11.161767740702455, + "learning_rate": 5.0998785892591e-06, + "loss": 0.7982, + "step": 7136 + }, + { + "epoch": 1.02, + "grad_norm": 7.842950555455648, + "learning_rate": 5.098724074440302e-06, + "loss": 0.7559, + "step": 7137 + }, + { + "epoch": 1.02, + "grad_norm": 8.924031112456804, + "learning_rate": 5.09756955435584e-06, + "loss": 0.8478, + "step": 7138 + }, + { + "epoch": 1.02, + "grad_norm": 7.765902933366324, + "learning_rate": 5.096415029067288e-06, + "loss": 0.7558, + "step": 7139 + }, + { + "epoch": 1.02, + "grad_norm": 12.253472932431773, + 
"learning_rate": 5.0952604986362266e-06, + "loss": 0.7299, + "step": 7140 + }, + { + "epoch": 1.02, + "grad_norm": 8.748933022042591, + "learning_rate": 5.094105963124235e-06, + "loss": 0.7418, + "step": 7141 + }, + { + "epoch": 1.02, + "grad_norm": 8.128825397336545, + "learning_rate": 5.092951422592895e-06, + "loss": 0.8091, + "step": 7142 + }, + { + "epoch": 1.02, + "grad_norm": 8.08252398173852, + "learning_rate": 5.091796877103783e-06, + "loss": 0.7691, + "step": 7143 + }, + { + "epoch": 1.02, + "grad_norm": 6.2324697566038605, + "learning_rate": 5.090642326718483e-06, + "loss": 0.8161, + "step": 7144 + }, + { + "epoch": 1.02, + "grad_norm": 7.046243034760394, + "learning_rate": 5.089487771498573e-06, + "loss": 0.7651, + "step": 7145 + }, + { + "epoch": 1.02, + "grad_norm": 8.45756337695448, + "learning_rate": 5.0883332115056324e-06, + "loss": 0.742, + "step": 7146 + }, + { + "epoch": 1.02, + "grad_norm": 5.818631797043624, + "learning_rate": 5.087178646801245e-06, + "loss": 0.7642, + "step": 7147 + }, + { + "epoch": 1.02, + "grad_norm": 10.15415347303957, + "learning_rate": 5.086024077446993e-06, + "loss": 0.6558, + "step": 7148 + }, + { + "epoch": 1.02, + "grad_norm": 11.445137995753178, + "learning_rate": 5.084869503504455e-06, + "loss": 0.8516, + "step": 7149 + }, + { + "epoch": 1.02, + "grad_norm": 6.07893623443213, + "learning_rate": 5.083714925035214e-06, + "loss": 0.7164, + "step": 7150 + }, + { + "epoch": 1.02, + "grad_norm": 6.925512377372417, + "learning_rate": 5.082560342100852e-06, + "loss": 0.7328, + "step": 7151 + }, + { + "epoch": 1.02, + "grad_norm": 7.182296796514742, + "learning_rate": 5.08140575476295e-06, + "loss": 0.7767, + "step": 7152 + }, + { + "epoch": 1.02, + "grad_norm": 10.702671885291238, + "learning_rate": 5.080251163083093e-06, + "loss": 0.8032, + "step": 7153 + }, + { + "epoch": 1.02, + "grad_norm": 8.984968889749483, + "learning_rate": 5.079096567122862e-06, + "loss": 0.7213, + "step": 7154 + }, + { + "epoch": 1.02, + 
"grad_norm": 10.151344672389564, + "learning_rate": 5.077941966943841e-06, + "loss": 0.7897, + "step": 7155 + }, + { + "epoch": 1.02, + "grad_norm": 5.740043365868152, + "learning_rate": 5.076787362607611e-06, + "loss": 0.8771, + "step": 7156 + }, + { + "epoch": 1.02, + "grad_norm": 7.667846489552335, + "learning_rate": 5.075632754175758e-06, + "loss": 0.7219, + "step": 7157 + }, + { + "epoch": 1.02, + "grad_norm": 7.5213943223365405, + "learning_rate": 5.074478141709864e-06, + "loss": 0.7745, + "step": 7158 + }, + { + "epoch": 1.02, + "grad_norm": 7.2586840632793015, + "learning_rate": 5.073323525271513e-06, + "loss": 0.8071, + "step": 7159 + }, + { + "epoch": 1.02, + "grad_norm": 9.339025046970148, + "learning_rate": 5.07216890492229e-06, + "loss": 0.7753, + "step": 7160 + }, + { + "epoch": 1.02, + "grad_norm": 5.927564588963645, + "learning_rate": 5.071014280723777e-06, + "loss": 0.7424, + "step": 7161 + }, + { + "epoch": 1.02, + "grad_norm": 7.97806573593628, + "learning_rate": 5.069859652737561e-06, + "loss": 0.7006, + "step": 7162 + }, + { + "epoch": 1.02, + "grad_norm": 8.5924963126191, + "learning_rate": 5.068705021025225e-06, + "loss": 0.7702, + "step": 7163 + }, + { + "epoch": 1.02, + "grad_norm": 6.3838493439842425, + "learning_rate": 5.067550385648357e-06, + "loss": 0.805, + "step": 7164 + }, + { + "epoch": 1.02, + "grad_norm": 8.208923753675782, + "learning_rate": 5.066395746668537e-06, + "loss": 0.7599, + "step": 7165 + }, + { + "epoch": 1.02, + "grad_norm": 11.243948663559, + "learning_rate": 5.065241104147352e-06, + "loss": 0.7088, + "step": 7166 + }, + { + "epoch": 1.02, + "grad_norm": 8.26832782096482, + "learning_rate": 5.06408645814639e-06, + "loss": 0.7345, + "step": 7167 + }, + { + "epoch": 1.02, + "grad_norm": 10.379096226726631, + "learning_rate": 5.062931808727234e-06, + "loss": 0.7603, + "step": 7168 + }, + { + "epoch": 1.02, + "grad_norm": 7.016088113965974, + "learning_rate": 5.061777155951469e-06, + "loss": 0.8442, + "step": 7169 + }, + 
{ + "epoch": 1.02, + "grad_norm": 8.38872069929839, + "learning_rate": 5.060622499880684e-06, + "loss": 0.7541, + "step": 7170 + }, + { + "epoch": 1.02, + "grad_norm": 6.720084825017916, + "learning_rate": 5.059467840576464e-06, + "loss": 0.7243, + "step": 7171 + }, + { + "epoch": 1.02, + "grad_norm": 10.404539562965006, + "learning_rate": 5.058313178100393e-06, + "loss": 0.8243, + "step": 7172 + }, + { + "epoch": 1.02, + "grad_norm": 6.880428931847487, + "learning_rate": 5.057158512514061e-06, + "loss": 0.7339, + "step": 7173 + }, + { + "epoch": 1.02, + "grad_norm": 8.236759369046801, + "learning_rate": 5.056003843879053e-06, + "loss": 0.7649, + "step": 7174 + }, + { + "epoch": 1.02, + "grad_norm": 6.737892760138318, + "learning_rate": 5.054849172256956e-06, + "loss": 0.7619, + "step": 7175 + }, + { + "epoch": 1.02, + "grad_norm": 7.980792439120986, + "learning_rate": 5.0536944977093565e-06, + "loss": 0.7847, + "step": 7176 + }, + { + "epoch": 1.02, + "grad_norm": 8.440849975357711, + "learning_rate": 5.0525398202978426e-06, + "loss": 0.8071, + "step": 7177 + }, + { + "epoch": 1.02, + "grad_norm": 8.2606552193488, + "learning_rate": 5.051385140084e-06, + "loss": 0.7694, + "step": 7178 + }, + { + "epoch": 1.02, + "grad_norm": 9.971727552296494, + "learning_rate": 5.050230457129417e-06, + "loss": 0.6839, + "step": 7179 + }, + { + "epoch": 1.02, + "grad_norm": 8.991207040014423, + "learning_rate": 5.049075771495682e-06, + "loss": 0.7886, + "step": 7180 + }, + { + "epoch": 1.02, + "grad_norm": 6.873070659964527, + "learning_rate": 5.04792108324438e-06, + "loss": 0.7107, + "step": 7181 + }, + { + "epoch": 1.02, + "grad_norm": 10.901323847671511, + "learning_rate": 5.046766392437103e-06, + "loss": 0.7663, + "step": 7182 + }, + { + "epoch": 1.02, + "grad_norm": 10.848849710523458, + "learning_rate": 5.045611699135438e-06, + "loss": 0.8322, + "step": 7183 + }, + { + "epoch": 1.02, + "grad_norm": 8.395260782168108, + "learning_rate": 5.044457003400969e-06, + "loss": 
0.7541, + "step": 7184 + }, + { + "epoch": 1.02, + "grad_norm": 6.818213358293996, + "learning_rate": 5.0433023052952865e-06, + "loss": 0.8042, + "step": 7185 + }, + { + "epoch": 1.02, + "grad_norm": 7.354263590850806, + "learning_rate": 5.042147604879982e-06, + "loss": 0.8131, + "step": 7186 + }, + { + "epoch": 1.02, + "grad_norm": 7.813141854838572, + "learning_rate": 5.040992902216642e-06, + "loss": 0.7396, + "step": 7187 + }, + { + "epoch": 1.03, + "grad_norm": 8.508083454459705, + "learning_rate": 5.039838197366853e-06, + "loss": 0.743, + "step": 7188 + }, + { + "epoch": 1.03, + "grad_norm": 8.832886220573267, + "learning_rate": 5.038683490392206e-06, + "loss": 0.7304, + "step": 7189 + }, + { + "epoch": 1.03, + "grad_norm": 10.744936281649455, + "learning_rate": 5.037528781354288e-06, + "loss": 0.8337, + "step": 7190 + }, + { + "epoch": 1.03, + "grad_norm": 9.148138206942338, + "learning_rate": 5.036374070314691e-06, + "loss": 0.7732, + "step": 7191 + }, + { + "epoch": 1.03, + "grad_norm": 8.25724354804594, + "learning_rate": 5.035219357335002e-06, + "loss": 0.8226, + "step": 7192 + }, + { + "epoch": 1.03, + "grad_norm": 6.300755610250504, + "learning_rate": 5.034064642476811e-06, + "loss": 0.728, + "step": 7193 + }, + { + "epoch": 1.03, + "grad_norm": 8.546986685448447, + "learning_rate": 5.032909925801706e-06, + "loss": 0.8225, + "step": 7194 + }, + { + "epoch": 1.03, + "grad_norm": 7.393523627541755, + "learning_rate": 5.031755207371276e-06, + "loss": 0.7925, + "step": 7195 + }, + { + "epoch": 1.03, + "grad_norm": 8.861589102780691, + "learning_rate": 5.030600487247115e-06, + "loss": 0.7461, + "step": 7196 + }, + { + "epoch": 1.03, + "grad_norm": 7.228287224364795, + "learning_rate": 5.029445765490807e-06, + "loss": 0.8097, + "step": 7197 + }, + { + "epoch": 1.03, + "grad_norm": 7.1574543207319214, + "learning_rate": 5.028291042163943e-06, + "loss": 0.7562, + "step": 7198 + }, + { + "epoch": 1.03, + "grad_norm": 10.314800657181095, + "learning_rate": 
5.027136317328115e-06, + "loss": 0.8347, + "step": 7199 + }, + { + "epoch": 1.03, + "grad_norm": 9.438079801007714, + "learning_rate": 5.02598159104491e-06, + "loss": 0.7186, + "step": 7200 + }, + { + "epoch": 1.03, + "grad_norm": 7.272339385837436, + "learning_rate": 5.024826863375921e-06, + "loss": 0.7882, + "step": 7201 + }, + { + "epoch": 1.03, + "grad_norm": 6.674150935643723, + "learning_rate": 5.023672134382737e-06, + "loss": 0.8139, + "step": 7202 + }, + { + "epoch": 1.03, + "grad_norm": 7.701168709524854, + "learning_rate": 5.022517404126945e-06, + "loss": 0.787, + "step": 7203 + }, + { + "epoch": 1.03, + "grad_norm": 6.8066930752177575, + "learning_rate": 5.021362672670137e-06, + "loss": 0.7661, + "step": 7204 + }, + { + "epoch": 1.03, + "grad_norm": 9.281039116113853, + "learning_rate": 5.020207940073905e-06, + "loss": 0.7715, + "step": 7205 + }, + { + "epoch": 1.03, + "grad_norm": 10.09926383889431, + "learning_rate": 5.019053206399837e-06, + "loss": 0.7254, + "step": 7206 + }, + { + "epoch": 1.03, + "grad_norm": 6.974978448954368, + "learning_rate": 5.017898471709524e-06, + "loss": 0.7427, + "step": 7207 + }, + { + "epoch": 1.03, + "grad_norm": 8.391247100265252, + "learning_rate": 5.016743736064557e-06, + "loss": 0.8271, + "step": 7208 + }, + { + "epoch": 1.03, + "grad_norm": 7.7641543598395115, + "learning_rate": 5.015588999526525e-06, + "loss": 0.7859, + "step": 7209 + }, + { + "epoch": 1.03, + "grad_norm": 8.944481364030407, + "learning_rate": 5.0144342621570185e-06, + "loss": 0.7571, + "step": 7210 + }, + { + "epoch": 1.03, + "grad_norm": 12.3172742555324, + "learning_rate": 5.01327952401763e-06, + "loss": 0.794, + "step": 7211 + }, + { + "epoch": 1.03, + "grad_norm": 6.660134670575342, + "learning_rate": 5.01212478516995e-06, + "loss": 0.7765, + "step": 7212 + }, + { + "epoch": 1.03, + "grad_norm": 8.31319696952524, + "learning_rate": 5.010970045675564e-06, + "loss": 0.7391, + "step": 7213 + }, + { + "epoch": 1.03, + "grad_norm": 
8.30174623644396, + "learning_rate": 5.009815305596069e-06, + "loss": 0.736, + "step": 7214 + }, + { + "epoch": 1.03, + "grad_norm": 7.145750076225014, + "learning_rate": 5.008660564993052e-06, + "loss": 0.7075, + "step": 7215 + }, + { + "epoch": 1.03, + "grad_norm": 7.412102071448449, + "learning_rate": 5.0075058239281046e-06, + "loss": 0.7903, + "step": 7216 + }, + { + "epoch": 1.03, + "grad_norm": 8.551989827820888, + "learning_rate": 5.006351082462818e-06, + "loss": 0.7396, + "step": 7217 + }, + { + "epoch": 1.03, + "grad_norm": 7.717825520357623, + "learning_rate": 5.005196340658781e-06, + "loss": 0.7956, + "step": 7218 + }, + { + "epoch": 1.03, + "grad_norm": 9.178884147800494, + "learning_rate": 5.004041598577587e-06, + "loss": 0.8101, + "step": 7219 + }, + { + "epoch": 1.03, + "grad_norm": 7.012340474632072, + "learning_rate": 5.002886856280824e-06, + "loss": 0.7506, + "step": 7220 + }, + { + "epoch": 1.03, + "grad_norm": 6.159729367720125, + "learning_rate": 5.001732113830085e-06, + "loss": 0.7891, + "step": 7221 + }, + { + "epoch": 1.03, + "grad_norm": 7.984671953391424, + "learning_rate": 5.000577371286962e-06, + "loss": 0.7294, + "step": 7222 + }, + { + "epoch": 1.03, + "grad_norm": 9.125863301401967, + "learning_rate": 4.99942262871304e-06, + "loss": 0.7444, + "step": 7223 + }, + { + "epoch": 1.03, + "grad_norm": 10.462383627090345, + "learning_rate": 4.9982678861699155e-06, + "loss": 0.7884, + "step": 7224 + }, + { + "epoch": 1.03, + "grad_norm": 5.678359761628124, + "learning_rate": 4.997113143719177e-06, + "loss": 0.7521, + "step": 7225 + }, + { + "epoch": 1.03, + "grad_norm": 7.390471947674997, + "learning_rate": 4.995958401422413e-06, + "loss": 0.7096, + "step": 7226 + }, + { + "epoch": 1.03, + "grad_norm": 7.7918756614904146, + "learning_rate": 4.99480365934122e-06, + "loss": 0.8259, + "step": 7227 + }, + { + "epoch": 1.03, + "grad_norm": 8.278952130305482, + "learning_rate": 4.9936489175371835e-06, + "loss": 0.7268, + "step": 7228 + }, + { + 
"epoch": 1.03, + "grad_norm": 9.027952510411625, + "learning_rate": 4.992494176071898e-06, + "loss": 0.7884, + "step": 7229 + }, + { + "epoch": 1.03, + "grad_norm": 8.271523838360043, + "learning_rate": 4.991339435006949e-06, + "loss": 0.682, + "step": 7230 + }, + { + "epoch": 1.03, + "grad_norm": 7.665997112450753, + "learning_rate": 4.990184694403932e-06, + "loss": 0.8034, + "step": 7231 + }, + { + "epoch": 1.03, + "grad_norm": 6.548472738839258, + "learning_rate": 4.989029954324437e-06, + "loss": 0.6968, + "step": 7232 + }, + { + "epoch": 1.03, + "grad_norm": 5.261424704902649, + "learning_rate": 4.987875214830053e-06, + "loss": 0.7191, + "step": 7233 + }, + { + "epoch": 1.03, + "grad_norm": 6.855782618666774, + "learning_rate": 4.986720475982371e-06, + "loss": 0.7196, + "step": 7234 + }, + { + "epoch": 1.03, + "grad_norm": 10.00459520484865, + "learning_rate": 4.985565737842982e-06, + "loss": 0.8153, + "step": 7235 + }, + { + "epoch": 1.03, + "grad_norm": 9.361388514097326, + "learning_rate": 4.9844110004734755e-06, + "loss": 0.738, + "step": 7236 + }, + { + "epoch": 1.03, + "grad_norm": 8.173592508110714, + "learning_rate": 4.983256263935445e-06, + "loss": 0.6791, + "step": 7237 + }, + { + "epoch": 1.03, + "grad_norm": 7.829998899849345, + "learning_rate": 4.982101528290477e-06, + "loss": 0.7779, + "step": 7238 + }, + { + "epoch": 1.03, + "grad_norm": 8.749753207332962, + "learning_rate": 4.980946793600165e-06, + "loss": 0.7684, + "step": 7239 + }, + { + "epoch": 1.03, + "grad_norm": 8.76424203370774, + "learning_rate": 4.979792059926096e-06, + "loss": 0.7478, + "step": 7240 + }, + { + "epoch": 1.03, + "grad_norm": 6.9707428171630905, + "learning_rate": 4.978637327329864e-06, + "loss": 0.7041, + "step": 7241 + }, + { + "epoch": 1.03, + "grad_norm": 9.60449331245503, + "learning_rate": 4.977482595873058e-06, + "loss": 0.767, + "step": 7242 + }, + { + "epoch": 1.03, + "grad_norm": 7.835387917604971, + "learning_rate": 4.976327865617265e-06, + "loss": 0.7599, + 
"step": 7243 + }, + { + "epoch": 1.03, + "grad_norm": 8.887365544716456, + "learning_rate": 4.975173136624082e-06, + "loss": 0.772, + "step": 7244 + }, + { + "epoch": 1.03, + "grad_norm": 6.512104384691002, + "learning_rate": 4.974018408955091e-06, + "loss": 0.8346, + "step": 7245 + }, + { + "epoch": 1.03, + "grad_norm": 8.622758752316265, + "learning_rate": 4.9728636826718865e-06, + "loss": 0.7481, + "step": 7246 + }, + { + "epoch": 1.03, + "grad_norm": 7.445818419036586, + "learning_rate": 4.971708957836059e-06, + "loss": 0.7628, + "step": 7247 + }, + { + "epoch": 1.03, + "grad_norm": 8.57342712536997, + "learning_rate": 4.970554234509195e-06, + "loss": 0.7801, + "step": 7248 + }, + { + "epoch": 1.03, + "grad_norm": 7.9915304219958925, + "learning_rate": 4.969399512752887e-06, + "loss": 0.7413, + "step": 7249 + }, + { + "epoch": 1.03, + "grad_norm": 9.800248308807223, + "learning_rate": 4.968244792628725e-06, + "loss": 0.8113, + "step": 7250 + }, + { + "epoch": 1.03, + "grad_norm": 7.095141499221772, + "learning_rate": 4.967090074198295e-06, + "loss": 0.7553, + "step": 7251 + }, + { + "epoch": 1.03, + "grad_norm": 12.236508515819569, + "learning_rate": 4.9659353575231915e-06, + "loss": 0.7681, + "step": 7252 + }, + { + "epoch": 1.03, + "grad_norm": 6.68932623232966, + "learning_rate": 4.964780642664999e-06, + "loss": 0.7997, + "step": 7253 + }, + { + "epoch": 1.03, + "grad_norm": 9.046703676034422, + "learning_rate": 4.963625929685309e-06, + "loss": 0.7865, + "step": 7254 + }, + { + "epoch": 1.03, + "grad_norm": 6.713155459306096, + "learning_rate": 4.962471218645713e-06, + "loss": 0.8015, + "step": 7255 + }, + { + "epoch": 1.03, + "grad_norm": 9.048801328105336, + "learning_rate": 4.961316509607796e-06, + "loss": 0.8132, + "step": 7256 + }, + { + "epoch": 1.03, + "grad_norm": 10.317415909114917, + "learning_rate": 4.96016180263315e-06, + "loss": 0.7421, + "step": 7257 + }, + { + "epoch": 1.04, + "grad_norm": 10.93675325424012, + "learning_rate": 
4.95900709778336e-06, + "loss": 0.6848, + "step": 7258 + }, + { + "epoch": 1.04, + "grad_norm": 8.17356045627509, + "learning_rate": 4.9578523951200184e-06, + "loss": 0.768, + "step": 7259 + }, + { + "epoch": 1.04, + "grad_norm": 9.681787163629675, + "learning_rate": 4.956697694704714e-06, + "loss": 0.7734, + "step": 7260 + }, + { + "epoch": 1.04, + "grad_norm": 7.169881716432663, + "learning_rate": 4.955542996599033e-06, + "loss": 0.7712, + "step": 7261 + }, + { + "epoch": 1.04, + "grad_norm": 10.140692175331473, + "learning_rate": 4.9543883008645655e-06, + "loss": 0.7987, + "step": 7262 + }, + { + "epoch": 1.04, + "grad_norm": 8.107251515869669, + "learning_rate": 4.953233607562898e-06, + "loss": 0.7603, + "step": 7263 + }, + { + "epoch": 1.04, + "grad_norm": 8.888297333746907, + "learning_rate": 4.952078916755619e-06, + "loss": 0.8252, + "step": 7264 + }, + { + "epoch": 1.04, + "grad_norm": 6.848162164027927, + "learning_rate": 4.9509242285043206e-06, + "loss": 0.8069, + "step": 7265 + }, + { + "epoch": 1.04, + "grad_norm": 8.623741728577683, + "learning_rate": 4.949769542870584e-06, + "loss": 0.8143, + "step": 7266 + }, + { + "epoch": 1.04, + "grad_norm": 6.548776993658931, + "learning_rate": 4.948614859916003e-06, + "loss": 0.7955, + "step": 7267 + }, + { + "epoch": 1.04, + "grad_norm": 9.402542918593976, + "learning_rate": 4.947460179702159e-06, + "loss": 0.7937, + "step": 7268 + }, + { + "epoch": 1.04, + "grad_norm": 10.681294917283143, + "learning_rate": 4.946305502290644e-06, + "loss": 0.7665, + "step": 7269 + }, + { + "epoch": 1.04, + "grad_norm": 7.676257584105571, + "learning_rate": 4.945150827743046e-06, + "loss": 0.8284, + "step": 7270 + }, + { + "epoch": 1.04, + "grad_norm": 10.060193084426636, + "learning_rate": 4.9439961561209475e-06, + "loss": 0.7854, + "step": 7271 + }, + { + "epoch": 1.04, + "grad_norm": 8.279667866104374, + "learning_rate": 4.94284148748594e-06, + "loss": 0.7979, + "step": 7272 + }, + { + "epoch": 1.04, + "grad_norm": 
8.179602176715736, + "learning_rate": 4.9416868218996074e-06, + "loss": 0.7431, + "step": 7273 + }, + { + "epoch": 1.04, + "grad_norm": 9.096238199861629, + "learning_rate": 4.940532159423537e-06, + "loss": 0.667, + "step": 7274 + }, + { + "epoch": 1.04, + "grad_norm": 7.522774055840552, + "learning_rate": 4.939377500119317e-06, + "loss": 0.7982, + "step": 7275 + }, + { + "epoch": 1.04, + "grad_norm": 7.387245247384804, + "learning_rate": 4.938222844048532e-06, + "loss": 0.8844, + "step": 7276 + }, + { + "epoch": 1.04, + "grad_norm": 9.02902532994442, + "learning_rate": 4.937068191272769e-06, + "loss": 0.7664, + "step": 7277 + }, + { + "epoch": 1.04, + "grad_norm": 8.08791637364738, + "learning_rate": 4.935913541853612e-06, + "loss": 0.7792, + "step": 7278 + }, + { + "epoch": 1.04, + "grad_norm": 9.656211351940653, + "learning_rate": 4.934758895852649e-06, + "loss": 0.7099, + "step": 7279 + }, + { + "epoch": 1.04, + "grad_norm": 6.892987104856052, + "learning_rate": 4.933604253331466e-06, + "loss": 0.7734, + "step": 7280 + }, + { + "epoch": 1.04, + "grad_norm": 6.931016629313172, + "learning_rate": 4.932449614351645e-06, + "loss": 0.722, + "step": 7281 + }, + { + "epoch": 1.04, + "grad_norm": 7.649521869208438, + "learning_rate": 4.931294978974776e-06, + "loss": 0.7276, + "step": 7282 + }, + { + "epoch": 1.04, + "grad_norm": 6.956703762835998, + "learning_rate": 4.93014034726244e-06, + "loss": 0.7642, + "step": 7283 + }, + { + "epoch": 1.04, + "grad_norm": 8.48857729460036, + "learning_rate": 4.928985719276223e-06, + "loss": 0.7202, + "step": 7284 + }, + { + "epoch": 1.04, + "grad_norm": 9.910787919888246, + "learning_rate": 4.927831095077713e-06, + "loss": 0.7651, + "step": 7285 + }, + { + "epoch": 1.04, + "grad_norm": 7.756397782097134, + "learning_rate": 4.926676474728488e-06, + "loss": 0.7843, + "step": 7286 + }, + { + "epoch": 1.04, + "grad_norm": 7.762692884403299, + "learning_rate": 4.925521858290138e-06, + "loss": 0.7999, + "step": 7287 + }, + { + "epoch": 
1.04, + "grad_norm": 6.973598965685363, + "learning_rate": 4.924367245824244e-06, + "loss": 0.8029, + "step": 7288 + }, + { + "epoch": 1.04, + "grad_norm": 7.341510215309941, + "learning_rate": 4.923212637392389e-06, + "loss": 0.8527, + "step": 7289 + }, + { + "epoch": 1.04, + "grad_norm": 8.919848282660848, + "learning_rate": 4.922058033056161e-06, + "loss": 0.8271, + "step": 7290 + }, + { + "epoch": 1.04, + "grad_norm": 8.229962364336425, + "learning_rate": 4.920903432877139e-06, + "loss": 0.7443, + "step": 7291 + }, + { + "epoch": 1.04, + "grad_norm": 8.248594443510056, + "learning_rate": 4.9197488369169075e-06, + "loss": 0.7659, + "step": 7292 + }, + { + "epoch": 1.04, + "grad_norm": 8.684803403566598, + "learning_rate": 4.9185942452370515e-06, + "loss": 0.7895, + "step": 7293 + }, + { + "epoch": 1.04, + "grad_norm": 7.807364229244529, + "learning_rate": 4.91743965789915e-06, + "loss": 0.709, + "step": 7294 + }, + { + "epoch": 1.04, + "grad_norm": 8.739685266370616, + "learning_rate": 4.916285074964789e-06, + "loss": 0.8238, + "step": 7295 + }, + { + "epoch": 1.04, + "grad_norm": 8.525141010939771, + "learning_rate": 4.915130496495547e-06, + "loss": 0.7107, + "step": 7296 + }, + { + "epoch": 1.04, + "grad_norm": 10.115506587988383, + "learning_rate": 4.913975922553008e-06, + "loss": 0.7679, + "step": 7297 + }, + { + "epoch": 1.04, + "grad_norm": 5.178401636963003, + "learning_rate": 4.912821353198756e-06, + "loss": 0.7465, + "step": 7298 + }, + { + "epoch": 1.04, + "grad_norm": 6.758208071013043, + "learning_rate": 4.911666788494368e-06, + "loss": 0.7984, + "step": 7299 + }, + { + "epoch": 1.04, + "grad_norm": 8.40311030378148, + "learning_rate": 4.91051222850143e-06, + "loss": 0.7229, + "step": 7300 + }, + { + "epoch": 1.04, + "grad_norm": 9.503473280088539, + "learning_rate": 4.90935767328152e-06, + "loss": 0.7315, + "step": 7301 + }, + { + "epoch": 1.04, + "grad_norm": 6.681502213836631, + "learning_rate": 4.9082031228962176e-06, + "loss": 0.8066, + "step": 
7302 + }, + { + "epoch": 1.04, + "grad_norm": 8.71375819524653, + "learning_rate": 4.907048577407107e-06, + "loss": 0.7674, + "step": 7303 + }, + { + "epoch": 1.04, + "grad_norm": 7.357326440991901, + "learning_rate": 4.905894036875766e-06, + "loss": 0.6968, + "step": 7304 + }, + { + "epoch": 1.04, + "grad_norm": 6.408842784728621, + "learning_rate": 4.904739501363776e-06, + "loss": 0.794, + "step": 7305 + }, + { + "epoch": 1.04, + "grad_norm": 6.638857228934147, + "learning_rate": 4.903584970932714e-06, + "loss": 0.8083, + "step": 7306 + }, + { + "epoch": 1.04, + "grad_norm": 10.053411037864143, + "learning_rate": 4.902430445644162e-06, + "loss": 0.8111, + "step": 7307 + }, + { + "epoch": 1.04, + "grad_norm": 9.29548646266281, + "learning_rate": 4.901275925559699e-06, + "loss": 0.7639, + "step": 7308 + }, + { + "epoch": 1.04, + "grad_norm": 8.364365263465567, + "learning_rate": 4.9001214107409025e-06, + "loss": 0.8084, + "step": 7309 + }, + { + "epoch": 1.04, + "grad_norm": 6.593688275676285, + "learning_rate": 4.898966901249353e-06, + "loss": 0.8087, + "step": 7310 + }, + { + "epoch": 1.04, + "grad_norm": 8.300838933183293, + "learning_rate": 4.897812397146627e-06, + "loss": 0.8112, + "step": 7311 + }, + { + "epoch": 1.04, + "grad_norm": 7.8927299828694935, + "learning_rate": 4.896657898494303e-06, + "loss": 0.7242, + "step": 7312 + }, + { + "epoch": 1.04, + "grad_norm": 9.149835605058339, + "learning_rate": 4.89550340535396e-06, + "loss": 0.794, + "step": 7313 + }, + { + "epoch": 1.04, + "grad_norm": 6.709219434353125, + "learning_rate": 4.894348917787174e-06, + "loss": 0.8477, + "step": 7314 + }, + { + "epoch": 1.04, + "grad_norm": 6.229744093845979, + "learning_rate": 4.893194435855523e-06, + "loss": 0.7835, + "step": 7315 + }, + { + "epoch": 1.04, + "grad_norm": 9.336476486818844, + "learning_rate": 4.892039959620583e-06, + "loss": 0.7982, + "step": 7316 + }, + { + "epoch": 1.04, + "grad_norm": 8.587202076091772, + "learning_rate": 4.890885489143931e-06, + 
"loss": 0.8075, + "step": 7317 + }, + { + "epoch": 1.04, + "grad_norm": 9.645192239407862, + "learning_rate": 4.889731024487144e-06, + "loss": 0.752, + "step": 7318 + }, + { + "epoch": 1.04, + "grad_norm": 9.913395838794754, + "learning_rate": 4.888576565711795e-06, + "loss": 0.804, + "step": 7319 + }, + { + "epoch": 1.04, + "grad_norm": 8.203803958141918, + "learning_rate": 4.887422112879465e-06, + "loss": 0.7366, + "step": 7320 + }, + { + "epoch": 1.04, + "grad_norm": 8.190027438224861, + "learning_rate": 4.8862676660517244e-06, + "loss": 0.7233, + "step": 7321 + }, + { + "epoch": 1.04, + "grad_norm": 9.117223063205387, + "learning_rate": 4.885113225290149e-06, + "loss": 0.8047, + "step": 7322 + }, + { + "epoch": 1.04, + "grad_norm": 7.266709718844858, + "learning_rate": 4.883958790656317e-06, + "loss": 0.7609, + "step": 7323 + }, + { + "epoch": 1.04, + "grad_norm": 7.517273819367243, + "learning_rate": 4.8828043622117985e-06, + "loss": 0.779, + "step": 7324 + }, + { + "epoch": 1.04, + "grad_norm": 9.683208656809587, + "learning_rate": 4.88164994001817e-06, + "loss": 0.8215, + "step": 7325 + }, + { + "epoch": 1.04, + "grad_norm": 7.705555460795916, + "learning_rate": 4.880495524137002e-06, + "loss": 0.7567, + "step": 7326 + }, + { + "epoch": 1.04, + "grad_norm": 9.424204431670983, + "learning_rate": 4.879341114629872e-06, + "loss": 0.7758, + "step": 7327 + }, + { + "epoch": 1.05, + "grad_norm": 7.631937353591548, + "learning_rate": 4.878186711558351e-06, + "loss": 0.7591, + "step": 7328 + }, + { + "epoch": 1.05, + "grad_norm": 7.606532693493977, + "learning_rate": 4.87703231498401e-06, + "loss": 0.7716, + "step": 7329 + }, + { + "epoch": 1.05, + "grad_norm": 7.923736866384223, + "learning_rate": 4.875877924968427e-06, + "loss": 0.7757, + "step": 7330 + }, + { + "epoch": 1.05, + "grad_norm": 7.601232333674376, + "learning_rate": 4.874723541573167e-06, + "loss": 0.8689, + "step": 7331 + }, + { + "epoch": 1.05, + "grad_norm": 7.896805036767863, + "learning_rate": 
4.873569164859804e-06, + "loss": 0.7585, + "step": 7332 + }, + { + "epoch": 1.05, + "grad_norm": 9.657463486971777, + "learning_rate": 4.8724147948899125e-06, + "loss": 0.7877, + "step": 7333 + }, + { + "epoch": 1.05, + "grad_norm": 5.855685090386087, + "learning_rate": 4.871260431725058e-06, + "loss": 0.7694, + "step": 7334 + }, + { + "epoch": 1.05, + "grad_norm": 8.95192867330951, + "learning_rate": 4.870106075426814e-06, + "loss": 0.7333, + "step": 7335 + }, + { + "epoch": 1.05, + "grad_norm": 8.706387824770747, + "learning_rate": 4.868951726056753e-06, + "loss": 0.7337, + "step": 7336 + }, + { + "epoch": 1.05, + "grad_norm": 7.522179823046623, + "learning_rate": 4.8677973836764385e-06, + "loss": 0.7828, + "step": 7337 + }, + { + "epoch": 1.05, + "grad_norm": 9.612702663997975, + "learning_rate": 4.8666430483474466e-06, + "loss": 0.8037, + "step": 7338 + }, + { + "epoch": 1.05, + "grad_norm": 7.9467137938183665, + "learning_rate": 4.865488720131343e-06, + "loss": 0.7308, + "step": 7339 + }, + { + "epoch": 1.05, + "grad_norm": 9.638889124845038, + "learning_rate": 4.864334399089693e-06, + "loss": 0.7051, + "step": 7340 + }, + { + "epoch": 1.05, + "grad_norm": 7.779831964082278, + "learning_rate": 4.863180085284073e-06, + "loss": 0.7803, + "step": 7341 + }, + { + "epoch": 1.05, + "grad_norm": 9.95520894298698, + "learning_rate": 4.862025778776044e-06, + "loss": 0.8196, + "step": 7342 + }, + { + "epoch": 1.05, + "grad_norm": 6.957586225829754, + "learning_rate": 4.860871479627179e-06, + "loss": 0.7513, + "step": 7343 + }, + { + "epoch": 1.05, + "grad_norm": 10.19909240881773, + "learning_rate": 4.859717187899037e-06, + "loss": 0.7749, + "step": 7344 + }, + { + "epoch": 1.05, + "grad_norm": 9.918840142084788, + "learning_rate": 4.858562903653193e-06, + "loss": 0.7267, + "step": 7345 + }, + { + "epoch": 1.05, + "grad_norm": 8.520299767417717, + "learning_rate": 4.8574086269512096e-06, + "loss": 0.7102, + "step": 7346 + }, + { + "epoch": 1.05, + "grad_norm": 
7.943173863731135, + "learning_rate": 4.856254357854652e-06, + "loss": 0.7566, + "step": 7347 + }, + { + "epoch": 1.05, + "grad_norm": 9.335862829384102, + "learning_rate": 4.855100096425089e-06, + "loss": 0.6627, + "step": 7348 + }, + { + "epoch": 1.05, + "grad_norm": 7.289589904126261, + "learning_rate": 4.853945842724082e-06, + "loss": 0.8572, + "step": 7349 + }, + { + "epoch": 1.05, + "grad_norm": 6.764164228614686, + "learning_rate": 4.8527915968131965e-06, + "loss": 0.7481, + "step": 7350 + }, + { + "epoch": 1.05, + "grad_norm": 8.606410435136318, + "learning_rate": 4.851637358753999e-06, + "loss": 0.8308, + "step": 7351 + }, + { + "epoch": 1.05, + "grad_norm": 8.352756948370992, + "learning_rate": 4.850483128608051e-06, + "loss": 0.8069, + "step": 7352 + }, + { + "epoch": 1.05, + "grad_norm": 8.462116136988653, + "learning_rate": 4.849328906436918e-06, + "loss": 0.8186, + "step": 7353 + }, + { + "epoch": 1.05, + "grad_norm": 7.421047756708276, + "learning_rate": 4.848174692302159e-06, + "loss": 0.8012, + "step": 7354 + }, + { + "epoch": 1.05, + "grad_norm": 7.312660093362268, + "learning_rate": 4.847020486265341e-06, + "loss": 0.7726, + "step": 7355 + }, + { + "epoch": 1.05, + "grad_norm": 8.202542237942831, + "learning_rate": 4.845866288388026e-06, + "loss": 0.7575, + "step": 7356 + }, + { + "epoch": 1.05, + "grad_norm": 8.632114570434776, + "learning_rate": 4.844712098731771e-06, + "loss": 0.7725, + "step": 7357 + }, + { + "epoch": 1.05, + "grad_norm": 11.065391485175473, + "learning_rate": 4.8435579173581435e-06, + "loss": 0.7323, + "step": 7358 + }, + { + "epoch": 1.05, + "grad_norm": 10.288077984262749, + "learning_rate": 4.8424037443287e-06, + "loss": 0.7619, + "step": 7359 + }, + { + "epoch": 1.05, + "grad_norm": 10.18797342972138, + "learning_rate": 4.841249579705e-06, + "loss": 0.7965, + "step": 7360 + }, + { + "epoch": 1.05, + "grad_norm": 9.107163980796159, + "learning_rate": 4.8400954235486085e-06, + "loss": 0.7652, + "step": 7361 + }, + { + 
"epoch": 1.05, + "grad_norm": 8.473746336949636, + "learning_rate": 4.838941275921081e-06, + "loss": 0.6808, + "step": 7362 + }, + { + "epoch": 1.05, + "grad_norm": 6.8059576606161105, + "learning_rate": 4.8377871368839776e-06, + "loss": 0.794, + "step": 7363 + }, + { + "epoch": 1.05, + "grad_norm": 10.434527743729769, + "learning_rate": 4.836633006498856e-06, + "loss": 0.8043, + "step": 7364 + }, + { + "epoch": 1.05, + "grad_norm": 6.6679117988979595, + "learning_rate": 4.835478884827276e-06, + "loss": 0.7138, + "step": 7365 + }, + { + "epoch": 1.05, + "grad_norm": 8.068707293401966, + "learning_rate": 4.834324771930794e-06, + "loss": 0.7899, + "step": 7366 + }, + { + "epoch": 1.05, + "grad_norm": 9.407392238997758, + "learning_rate": 4.833170667870967e-06, + "loss": 0.7537, + "step": 7367 + }, + { + "epoch": 1.05, + "grad_norm": 9.710901429886121, + "learning_rate": 4.832016572709354e-06, + "loss": 0.8035, + "step": 7368 + }, + { + "epoch": 1.05, + "grad_norm": 8.94615497768793, + "learning_rate": 4.830862486507508e-06, + "loss": 0.7928, + "step": 7369 + }, + { + "epoch": 1.05, + "grad_norm": 8.522764055481616, + "learning_rate": 4.829708409326984e-06, + "loss": 0.7956, + "step": 7370 + }, + { + "epoch": 1.05, + "grad_norm": 6.980379926739998, + "learning_rate": 4.8285543412293444e-06, + "loss": 0.6801, + "step": 7371 + }, + { + "epoch": 1.05, + "grad_norm": 9.13757485796811, + "learning_rate": 4.827400282276136e-06, + "loss": 0.7779, + "step": 7372 + }, + { + "epoch": 1.05, + "grad_norm": 8.39396456280935, + "learning_rate": 4.8262462325289175e-06, + "loss": 0.7959, + "step": 7373 + }, + { + "epoch": 1.05, + "grad_norm": 7.564725984910043, + "learning_rate": 4.82509219204924e-06, + "loss": 0.7592, + "step": 7374 + }, + { + "epoch": 1.05, + "grad_norm": 8.377738201176546, + "learning_rate": 4.823938160898657e-06, + "loss": 0.8375, + "step": 7375 + }, + { + "epoch": 1.05, + "grad_norm": 8.722410548646415, + "learning_rate": 4.8227841391387245e-06, + "loss": 
0.7647, + "step": 7376 + }, + { + "epoch": 1.05, + "grad_norm": 9.16247782371567, + "learning_rate": 4.821630126830993e-06, + "loss": 0.7651, + "step": 7377 + }, + { + "epoch": 1.05, + "grad_norm": 4.6637877046831395, + "learning_rate": 4.820476124037011e-06, + "loss": 0.786, + "step": 7378 + }, + { + "epoch": 1.05, + "grad_norm": 7.841687768421192, + "learning_rate": 4.8193221308183365e-06, + "loss": 0.7212, + "step": 7379 + }, + { + "epoch": 1.05, + "grad_norm": 9.245657847866541, + "learning_rate": 4.818168147236515e-06, + "loss": 0.7452, + "step": 7380 + }, + { + "epoch": 1.05, + "grad_norm": 8.352318145893278, + "learning_rate": 4.817014173353099e-06, + "loss": 0.7672, + "step": 7381 + }, + { + "epoch": 1.05, + "grad_norm": 7.9290940460880694, + "learning_rate": 4.8158602092296365e-06, + "loss": 0.8097, + "step": 7382 + }, + { + "epoch": 1.05, + "grad_norm": 6.167156875143176, + "learning_rate": 4.814706254927678e-06, + "loss": 0.8616, + "step": 7383 + }, + { + "epoch": 1.05, + "grad_norm": 7.324321803059384, + "learning_rate": 4.813552310508774e-06, + "loss": 0.8019, + "step": 7384 + }, + { + "epoch": 1.05, + "grad_norm": 9.00191785284454, + "learning_rate": 4.812398376034468e-06, + "loss": 0.7556, + "step": 7385 + }, + { + "epoch": 1.05, + "grad_norm": 9.211567737895669, + "learning_rate": 4.811244451566313e-06, + "loss": 0.7432, + "step": 7386 + }, + { + "epoch": 1.05, + "grad_norm": 7.072932734325653, + "learning_rate": 4.810090537165852e-06, + "loss": 0.8303, + "step": 7387 + }, + { + "epoch": 1.05, + "grad_norm": 7.387170825490112, + "learning_rate": 4.8089366328946325e-06, + "loss": 0.7699, + "step": 7388 + }, + { + "epoch": 1.05, + "grad_norm": 6.6009855062230045, + "learning_rate": 4.807782738814203e-06, + "loss": 0.7926, + "step": 7389 + }, + { + "epoch": 1.05, + "grad_norm": 7.6438750490734835, + "learning_rate": 4.806628854986106e-06, + "loss": 0.7143, + "step": 7390 + }, + { + "epoch": 1.05, + "grad_norm": 7.066279596111792, + "learning_rate": 
4.8054749814718894e-06, + "loss": 0.7723, + "step": 7391 + }, + { + "epoch": 1.05, + "grad_norm": 8.767745131919524, + "learning_rate": 4.804321118333093e-06, + "loss": 0.7491, + "step": 7392 + }, + { + "epoch": 1.05, + "grad_norm": 10.815576137763198, + "learning_rate": 4.803167265631264e-06, + "loss": 0.8007, + "step": 7393 + }, + { + "epoch": 1.05, + "grad_norm": 6.1268742416494435, + "learning_rate": 4.802013423427947e-06, + "loss": 0.7477, + "step": 7394 + }, + { + "epoch": 1.05, + "grad_norm": 8.363295098227136, + "learning_rate": 4.800859591784681e-06, + "loss": 0.8181, + "step": 7395 + }, + { + "epoch": 1.05, + "grad_norm": 9.340254112900343, + "learning_rate": 4.799705770763013e-06, + "loss": 0.7871, + "step": 7396 + }, + { + "epoch": 1.05, + "grad_norm": 5.758388670455839, + "learning_rate": 4.798551960424479e-06, + "loss": 0.692, + "step": 7397 + }, + { + "epoch": 1.06, + "grad_norm": 9.150965993860538, + "learning_rate": 4.797398160830622e-06, + "loss": 0.7393, + "step": 7398 + }, + { + "epoch": 1.06, + "grad_norm": 8.177148962096027, + "learning_rate": 4.796244372042986e-06, + "loss": 0.6991, + "step": 7399 + }, + { + "epoch": 1.06, + "grad_norm": 6.416742536376394, + "learning_rate": 4.795090594123106e-06, + "loss": 0.7188, + "step": 7400 + }, + { + "epoch": 1.06, + "grad_norm": 10.459524111412819, + "learning_rate": 4.793936827132525e-06, + "loss": 0.7667, + "step": 7401 + }, + { + "epoch": 1.06, + "grad_norm": 7.955475809212945, + "learning_rate": 4.792783071132777e-06, + "loss": 0.7743, + "step": 7402 + }, + { + "epoch": 1.06, + "grad_norm": 10.074088561174388, + "learning_rate": 4.791629326185405e-06, + "loss": 0.8285, + "step": 7403 + }, + { + "epoch": 1.06, + "grad_norm": 8.199307681195338, + "learning_rate": 4.790475592351946e-06, + "loss": 0.7334, + "step": 7404 + }, + { + "epoch": 1.06, + "grad_norm": 7.339812608259651, + "learning_rate": 4.789321869693934e-06, + "loss": 0.7729, + "step": 7405 + }, + { + "epoch": 1.06, + "grad_norm": 
9.407336287792633, + "learning_rate": 4.788168158272908e-06, + "loss": 0.7984, + "step": 7406 + }, + { + "epoch": 1.06, + "grad_norm": 8.21256127914734, + "learning_rate": 4.787014458150402e-06, + "loss": 0.8062, + "step": 7407 + }, + { + "epoch": 1.06, + "grad_norm": 7.531651821637176, + "learning_rate": 4.785860769387952e-06, + "loss": 0.739, + "step": 7408 + }, + { + "epoch": 1.06, + "grad_norm": 8.643242038526903, + "learning_rate": 4.784707092047093e-06, + "loss": 0.8056, + "step": 7409 + }, + { + "epoch": 1.06, + "grad_norm": 11.334340460182863, + "learning_rate": 4.783553426189359e-06, + "loss": 0.7838, + "step": 7410 + }, + { + "epoch": 1.06, + "grad_norm": 5.65414049623275, + "learning_rate": 4.782399771876283e-06, + "loss": 0.7409, + "step": 7411 + }, + { + "epoch": 1.06, + "grad_norm": 8.247750686375648, + "learning_rate": 4.781246129169395e-06, + "loss": 0.7977, + "step": 7412 + }, + { + "epoch": 1.06, + "grad_norm": 9.12305168567162, + "learning_rate": 4.780092498130231e-06, + "loss": 0.7962, + "step": 7413 + }, + { + "epoch": 1.06, + "grad_norm": 7.369526062446959, + "learning_rate": 4.7789388788203225e-06, + "loss": 0.7503, + "step": 7414 + }, + { + "epoch": 1.06, + "grad_norm": 8.118981360142536, + "learning_rate": 4.777785271301196e-06, + "loss": 0.7148, + "step": 7415 + }, + { + "epoch": 1.06, + "grad_norm": 6.228889818859425, + "learning_rate": 4.776631675634389e-06, + "loss": 0.8348, + "step": 7416 + }, + { + "epoch": 1.06, + "grad_norm": 5.383268657932324, + "learning_rate": 4.775478091881422e-06, + "loss": 0.7617, + "step": 7417 + }, + { + "epoch": 1.06, + "grad_norm": 6.876275416606365, + "learning_rate": 4.77432452010383e-06, + "loss": 0.8078, + "step": 7418 + }, + { + "epoch": 1.06, + "grad_norm": 6.750888415277019, + "learning_rate": 4.773170960363142e-06, + "loss": 0.8092, + "step": 7419 + }, + { + "epoch": 1.06, + "grad_norm": 8.90827871473178, + "learning_rate": 4.77201741272088e-06, + "loss": 0.709, + "step": 7420 + }, + { + "epoch": 
1.06, + "grad_norm": 7.766346673072725, + "learning_rate": 4.770863877238577e-06, + "loss": 0.8103, + "step": 7421 + }, + { + "epoch": 1.06, + "grad_norm": 9.407376500702652, + "learning_rate": 4.769710353977756e-06, + "loss": 0.7751, + "step": 7422 + }, + { + "epoch": 1.06, + "grad_norm": 8.371680725982415, + "learning_rate": 4.768556842999944e-06, + "loss": 0.7988, + "step": 7423 + }, + { + "epoch": 1.06, + "grad_norm": 8.392567884064086, + "learning_rate": 4.767403344366667e-06, + "loss": 0.7618, + "step": 7424 + }, + { + "epoch": 1.06, + "grad_norm": 6.576488655745396, + "learning_rate": 4.766249858139447e-06, + "loss": 0.7627, + "step": 7425 + }, + { + "epoch": 1.06, + "grad_norm": 8.78993744680041, + "learning_rate": 4.765096384379808e-06, + "loss": 0.8209, + "step": 7426 + }, + { + "epoch": 1.06, + "grad_norm": 8.226094454486091, + "learning_rate": 4.763942923149275e-06, + "loss": 0.7757, + "step": 7427 + }, + { + "epoch": 1.06, + "grad_norm": 7.303693237640598, + "learning_rate": 4.76278947450937e-06, + "loss": 0.7799, + "step": 7428 + }, + { + "epoch": 1.06, + "grad_norm": 10.319040859000888, + "learning_rate": 4.761636038521615e-06, + "loss": 0.6999, + "step": 7429 + }, + { + "epoch": 1.06, + "grad_norm": 9.234031970804836, + "learning_rate": 4.760482615247527e-06, + "loss": 0.7656, + "step": 7430 + }, + { + "epoch": 1.06, + "grad_norm": 10.369751836002095, + "learning_rate": 4.759329204748631e-06, + "loss": 0.7821, + "step": 7431 + }, + { + "epoch": 1.06, + "grad_norm": 8.939370673242468, + "learning_rate": 4.758175807086448e-06, + "loss": 0.7819, + "step": 7432 + }, + { + "epoch": 1.06, + "grad_norm": 7.529446854408764, + "learning_rate": 4.757022422322491e-06, + "loss": 0.7882, + "step": 7433 + }, + { + "epoch": 1.06, + "grad_norm": 7.038738789157782, + "learning_rate": 4.755869050518283e-06, + "loss": 0.7361, + "step": 7434 + }, + { + "epoch": 1.06, + "grad_norm": 6.944913952374191, + "learning_rate": 4.75471569173534e-06, + "loss": 0.8408, + "step": 
7435 + }, + { + "epoch": 1.06, + "grad_norm": 9.664003034059041, + "learning_rate": 4.753562346035178e-06, + "loss": 0.8187, + "step": 7436 + }, + { + "epoch": 1.06, + "grad_norm": 9.643096827339468, + "learning_rate": 4.7524090134793165e-06, + "loss": 0.8028, + "step": 7437 + }, + { + "epoch": 1.06, + "grad_norm": 6.967433933945254, + "learning_rate": 4.751255694129268e-06, + "loss": 0.6666, + "step": 7438 + }, + { + "epoch": 1.06, + "grad_norm": 8.752526454020199, + "learning_rate": 4.750102388046551e-06, + "loss": 0.7785, + "step": 7439 + }, + { + "epoch": 1.06, + "grad_norm": 10.17734538701991, + "learning_rate": 4.748949095292672e-06, + "loss": 0.7463, + "step": 7440 + }, + { + "epoch": 1.06, + "grad_norm": 9.63552874049823, + "learning_rate": 4.747795815929152e-06, + "loss": 0.8368, + "step": 7441 + }, + { + "epoch": 1.06, + "grad_norm": 7.127815503415288, + "learning_rate": 4.746642550017503e-06, + "loss": 0.7416, + "step": 7442 + }, + { + "epoch": 1.06, + "grad_norm": 8.442123977151414, + "learning_rate": 4.745489297619232e-06, + "loss": 0.7798, + "step": 7443 + }, + { + "epoch": 1.06, + "grad_norm": 7.9606274718900005, + "learning_rate": 4.744336058795857e-06, + "loss": 0.8413, + "step": 7444 + }, + { + "epoch": 1.06, + "grad_norm": 8.832505450894402, + "learning_rate": 4.7431828336088825e-06, + "loss": 0.7395, + "step": 7445 + }, + { + "epoch": 1.06, + "grad_norm": 5.901138348492691, + "learning_rate": 4.74202962211982e-06, + "loss": 0.8605, + "step": 7446 + }, + { + "epoch": 1.06, + "grad_norm": 8.078268313103637, + "learning_rate": 4.740876424390181e-06, + "loss": 0.7491, + "step": 7447 + }, + { + "epoch": 1.06, + "grad_norm": 7.618637835339919, + "learning_rate": 4.739723240481472e-06, + "loss": 0.6773, + "step": 7448 + }, + { + "epoch": 1.06, + "grad_norm": 9.131381972231464, + "learning_rate": 4.738570070455202e-06, + "loss": 0.8615, + "step": 7449 + }, + { + "epoch": 1.06, + "grad_norm": 9.167206128957915, + "learning_rate": 4.737416914372874e-06, + 
"loss": 0.781, + "step": 7450 + }, + { + "epoch": 1.06, + "grad_norm": 9.438379581815825, + "learning_rate": 4.736263772295998e-06, + "loss": 0.7556, + "step": 7451 + }, + { + "epoch": 1.06, + "grad_norm": 6.9636470556254855, + "learning_rate": 4.73511064428608e-06, + "loss": 0.762, + "step": 7452 + }, + { + "epoch": 1.06, + "grad_norm": 7.792725565364799, + "learning_rate": 4.733957530404621e-06, + "loss": 0.749, + "step": 7453 + }, + { + "epoch": 1.06, + "grad_norm": 7.835230521082119, + "learning_rate": 4.732804430713128e-06, + "loss": 0.8123, + "step": 7454 + }, + { + "epoch": 1.06, + "grad_norm": 8.985979004849204, + "learning_rate": 4.731651345273102e-06, + "loss": 0.7472, + "step": 7455 + }, + { + "epoch": 1.06, + "grad_norm": 8.587286403470586, + "learning_rate": 4.730498274146046e-06, + "loss": 0.7552, + "step": 7456 + }, + { + "epoch": 1.06, + "grad_norm": 8.115694227671465, + "learning_rate": 4.729345217393464e-06, + "loss": 0.7548, + "step": 7457 + }, + { + "epoch": 1.06, + "grad_norm": 8.717430883938004, + "learning_rate": 4.728192175076851e-06, + "loss": 0.7765, + "step": 7458 + }, + { + "epoch": 1.06, + "grad_norm": 8.117011471792818, + "learning_rate": 4.7270391472577145e-06, + "loss": 0.7881, + "step": 7459 + }, + { + "epoch": 1.06, + "grad_norm": 9.153824467858128, + "learning_rate": 4.7258861339975485e-06, + "loss": 0.8173, + "step": 7460 + }, + { + "epoch": 1.06, + "grad_norm": 8.240063719519119, + "learning_rate": 4.724733135357852e-06, + "loss": 0.749, + "step": 7461 + }, + { + "epoch": 1.06, + "grad_norm": 10.016801730173578, + "learning_rate": 4.723580151400126e-06, + "loss": 0.816, + "step": 7462 + }, + { + "epoch": 1.06, + "grad_norm": 7.749507234914095, + "learning_rate": 4.722427182185864e-06, + "loss": 0.8218, + "step": 7463 + }, + { + "epoch": 1.06, + "grad_norm": 10.05994038071962, + "learning_rate": 4.721274227776563e-06, + "loss": 0.7104, + "step": 7464 + }, + { + "epoch": 1.06, + "grad_norm": 8.207251213568163, + "learning_rate": 
4.72012128823372e-06, + "loss": 0.7482, + "step": 7465 + }, + { + "epoch": 1.06, + "grad_norm": 8.327389444145775, + "learning_rate": 4.7189683636188285e-06, + "loss": 0.7292, + "step": 7466 + }, + { + "epoch": 1.06, + "grad_norm": 8.273909881177772, + "learning_rate": 4.717815453993383e-06, + "loss": 0.7449, + "step": 7467 + }, + { + "epoch": 1.07, + "grad_norm": 5.731894501814709, + "learning_rate": 4.716662559418873e-06, + "loss": 0.8036, + "step": 7468 + }, + { + "epoch": 1.07, + "grad_norm": 11.344694337105803, + "learning_rate": 4.715509679956795e-06, + "loss": 0.8395, + "step": 7469 + }, + { + "epoch": 1.07, + "grad_norm": 8.01093500635786, + "learning_rate": 4.71435681566864e-06, + "loss": 0.763, + "step": 7470 + }, + { + "epoch": 1.07, + "grad_norm": 7.604647810531275, + "learning_rate": 4.713203966615895e-06, + "loss": 0.7572, + "step": 7471 + }, + { + "epoch": 1.07, + "grad_norm": 6.860312662871299, + "learning_rate": 4.712051132860054e-06, + "loss": 0.7158, + "step": 7472 + }, + { + "epoch": 1.07, + "grad_norm": 9.145462042325617, + "learning_rate": 4.710898314462603e-06, + "loss": 0.7699, + "step": 7473 + }, + { + "epoch": 1.07, + "grad_norm": 12.034136671249666, + "learning_rate": 4.70974551148503e-06, + "loss": 0.771, + "step": 7474 + }, + { + "epoch": 1.07, + "grad_norm": 5.9887368332238085, + "learning_rate": 4.708592723988826e-06, + "loss": 0.8142, + "step": 7475 + }, + { + "epoch": 1.07, + "grad_norm": 6.762944361648879, + "learning_rate": 4.707439952035474e-06, + "loss": 0.7814, + "step": 7476 + }, + { + "epoch": 1.07, + "grad_norm": 9.329615545660431, + "learning_rate": 4.706287195686461e-06, + "loss": 0.7579, + "step": 7477 + }, + { + "epoch": 1.07, + "grad_norm": 9.849506202666717, + "learning_rate": 4.70513445500327e-06, + "loss": 0.6781, + "step": 7478 + }, + { + "epoch": 1.07, + "grad_norm": 7.560867997488974, + "learning_rate": 4.703981730047387e-06, + "loss": 0.8764, + "step": 7479 + }, + { + "epoch": 1.07, + "grad_norm": 
10.727636537501873, + "learning_rate": 4.702829020880296e-06, + "loss": 0.7596, + "step": 7480 + }, + { + "epoch": 1.07, + "grad_norm": 12.247555974999702, + "learning_rate": 4.7016763275634756e-06, + "loss": 0.6987, + "step": 7481 + }, + { + "epoch": 1.07, + "grad_norm": 6.792064975993648, + "learning_rate": 4.7005236501584114e-06, + "loss": 0.7657, + "step": 7482 + }, + { + "epoch": 1.07, + "grad_norm": 10.665859634618345, + "learning_rate": 4.699370988726581e-06, + "loss": 0.7879, + "step": 7483 + }, + { + "epoch": 1.07, + "grad_norm": 9.318898534810783, + "learning_rate": 4.698218343329466e-06, + "loss": 0.7993, + "step": 7484 + }, + { + "epoch": 1.07, + "grad_norm": 7.003911750936435, + "learning_rate": 4.697065714028544e-06, + "loss": 0.6896, + "step": 7485 + }, + { + "epoch": 1.07, + "grad_norm": 9.479796369529927, + "learning_rate": 4.695913100885293e-06, + "loss": 0.8054, + "step": 7486 + }, + { + "epoch": 1.07, + "grad_norm": 10.433020365551107, + "learning_rate": 4.694760503961193e-06, + "loss": 0.7095, + "step": 7487 + }, + { + "epoch": 1.07, + "grad_norm": 7.14218706004475, + "learning_rate": 4.693607923317716e-06, + "loss": 0.8163, + "step": 7488 + }, + { + "epoch": 1.07, + "grad_norm": 7.000700748075955, + "learning_rate": 4.69245535901634e-06, + "loss": 0.7885, + "step": 7489 + }, + { + "epoch": 1.07, + "grad_norm": 7.909292454893863, + "learning_rate": 4.69130281111854e-06, + "loss": 0.8039, + "step": 7490 + }, + { + "epoch": 1.07, + "grad_norm": 6.7707956733443355, + "learning_rate": 4.690150279685787e-06, + "loss": 0.7754, + "step": 7491 + }, + { + "epoch": 1.07, + "grad_norm": 12.423669462382009, + "learning_rate": 4.688997764779558e-06, + "loss": 0.7429, + "step": 7492 + }, + { + "epoch": 1.07, + "grad_norm": 8.639116252555729, + "learning_rate": 4.68784526646132e-06, + "loss": 0.7403, + "step": 7493 + }, + { + "epoch": 1.07, + "grad_norm": 8.465127090817388, + "learning_rate": 4.686692784792547e-06, + "loss": 0.7693, + "step": 7494 + }, + { + 
"epoch": 1.07, + "grad_norm": 6.447967053053244, + "learning_rate": 4.685540319834711e-06, + "loss": 0.7161, + "step": 7495 + }, + { + "epoch": 1.07, + "grad_norm": 9.954155490716156, + "learning_rate": 4.684387871649277e-06, + "loss": 0.7546, + "step": 7496 + }, + { + "epoch": 1.07, + "grad_norm": 6.45909440042962, + "learning_rate": 4.683235440297717e-06, + "loss": 0.7848, + "step": 7497 + }, + { + "epoch": 1.07, + "grad_norm": 7.662284448113951, + "learning_rate": 4.682083025841494e-06, + "loss": 0.7969, + "step": 7498 + }, + { + "epoch": 1.07, + "grad_norm": 9.804962363453448, + "learning_rate": 4.680930628342079e-06, + "loss": 0.8052, + "step": 7499 + }, + { + "epoch": 1.07, + "grad_norm": 7.529725166921905, + "learning_rate": 4.679778247860938e-06, + "loss": 0.7605, + "step": 7500 + }, + { + "epoch": 1.07, + "grad_norm": 7.756336095784511, + "learning_rate": 4.678625884459532e-06, + "loss": 0.7698, + "step": 7501 + }, + { + "epoch": 1.07, + "grad_norm": 9.327407986005776, + "learning_rate": 4.677473538199329e-06, + "loss": 0.8123, + "step": 7502 + }, + { + "epoch": 1.07, + "grad_norm": 7.41829904156605, + "learning_rate": 4.676321209141786e-06, + "loss": 0.7564, + "step": 7503 + }, + { + "epoch": 1.07, + "grad_norm": 7.795738263681656, + "learning_rate": 4.67516889734837e-06, + "loss": 0.7908, + "step": 7504 + }, + { + "epoch": 1.07, + "grad_norm": 12.669742188063232, + "learning_rate": 4.674016602880544e-06, + "loss": 0.7327, + "step": 7505 + }, + { + "epoch": 1.07, + "grad_norm": 8.003641359353642, + "learning_rate": 4.672864325799761e-06, + "loss": 0.7568, + "step": 7506 + }, + { + "epoch": 1.07, + "grad_norm": 9.76207915382109, + "learning_rate": 4.6717120661674856e-06, + "loss": 0.7459, + "step": 7507 + }, + { + "epoch": 1.07, + "grad_norm": 8.092355920281049, + "learning_rate": 4.670559824045175e-06, + "loss": 0.7329, + "step": 7508 + }, + { + "epoch": 1.07, + "grad_norm": 6.952724553691049, + "learning_rate": 4.669407599494286e-06, + "loss": 0.7579, + 
"step": 7509 + }, + { + "epoch": 1.07, + "grad_norm": 7.17385306403984, + "learning_rate": 4.668255392576276e-06, + "loss": 0.8557, + "step": 7510 + }, + { + "epoch": 1.07, + "grad_norm": 11.598059331270385, + "learning_rate": 4.6671032033526e-06, + "loss": 0.801, + "step": 7511 + }, + { + "epoch": 1.07, + "grad_norm": 8.457711634153616, + "learning_rate": 4.665951031884711e-06, + "loss": 0.8107, + "step": 7512 + }, + { + "epoch": 1.07, + "grad_norm": 11.17952018670345, + "learning_rate": 4.664798878234067e-06, + "loss": 0.7559, + "step": 7513 + }, + { + "epoch": 1.07, + "grad_norm": 7.680108314414941, + "learning_rate": 4.663646742462115e-06, + "loss": 0.7591, + "step": 7514 + }, + { + "epoch": 1.07, + "grad_norm": 5.5290934496376085, + "learning_rate": 4.662494624630312e-06, + "loss": 0.7554, + "step": 7515 + }, + { + "epoch": 1.07, + "grad_norm": 9.471417972444161, + "learning_rate": 4.661342524800103e-06, + "loss": 0.7792, + "step": 7516 + }, + { + "epoch": 1.07, + "grad_norm": 8.90046996941556, + "learning_rate": 4.660190443032944e-06, + "loss": 0.7642, + "step": 7517 + }, + { + "epoch": 1.07, + "grad_norm": 9.340164171813461, + "learning_rate": 4.659038379390281e-06, + "loss": 0.7041, + "step": 7518 + }, + { + "epoch": 1.07, + "grad_norm": 5.084757199877626, + "learning_rate": 4.657886333933559e-06, + "loss": 0.7937, + "step": 7519 + }, + { + "epoch": 1.07, + "grad_norm": 5.9139853782125975, + "learning_rate": 4.6567343067242305e-06, + "loss": 0.8135, + "step": 7520 + }, + { + "epoch": 1.07, + "grad_norm": 7.915960324005659, + "learning_rate": 4.6555822978237366e-06, + "loss": 0.7617, + "step": 7521 + }, + { + "epoch": 1.07, + "grad_norm": 9.06250780839779, + "learning_rate": 4.654430307293525e-06, + "loss": 0.7673, + "step": 7522 + }, + { + "epoch": 1.07, + "grad_norm": 7.959381505859823, + "learning_rate": 4.653278335195039e-06, + "loss": 0.803, + "step": 7523 + }, + { + "epoch": 1.07, + "grad_norm": 7.608122363277654, + "learning_rate": 
4.652126381589722e-06, + "loss": 0.7985, + "step": 7524 + }, + { + "epoch": 1.07, + "grad_norm": 12.254769417256902, + "learning_rate": 4.650974446539015e-06, + "loss": 0.6874, + "step": 7525 + }, + { + "epoch": 1.07, + "grad_norm": 9.308838997671511, + "learning_rate": 4.649822530104359e-06, + "loss": 0.7266, + "step": 7526 + }, + { + "epoch": 1.07, + "grad_norm": 9.049259257671302, + "learning_rate": 4.648670632347195e-06, + "loss": 0.8003, + "step": 7527 + }, + { + "epoch": 1.07, + "grad_norm": 9.277845642223054, + "learning_rate": 4.647518753328962e-06, + "loss": 0.7409, + "step": 7528 + }, + { + "epoch": 1.07, + "grad_norm": 8.160768220346496, + "learning_rate": 4.646366893111096e-06, + "loss": 0.6662, + "step": 7529 + }, + { + "epoch": 1.07, + "grad_norm": 7.856073342705332, + "learning_rate": 4.645215051755037e-06, + "loss": 0.8104, + "step": 7530 + }, + { + "epoch": 1.07, + "grad_norm": 9.909149156893047, + "learning_rate": 4.64406322932222e-06, + "loss": 0.7274, + "step": 7531 + }, + { + "epoch": 1.07, + "grad_norm": 6.013321240522778, + "learning_rate": 4.642911425874077e-06, + "loss": 0.7602, + "step": 7532 + }, + { + "epoch": 1.07, + "grad_norm": 9.652888529974375, + "learning_rate": 4.641759641472048e-06, + "loss": 0.7536, + "step": 7533 + }, + { + "epoch": 1.07, + "grad_norm": 7.4580218580087365, + "learning_rate": 4.64060787617756e-06, + "loss": 0.7901, + "step": 7534 + }, + { + "epoch": 1.07, + "grad_norm": 7.3186127565521355, + "learning_rate": 4.63945613005205e-06, + "loss": 0.8226, + "step": 7535 + }, + { + "epoch": 1.07, + "grad_norm": 10.513170459002493, + "learning_rate": 4.638304403156944e-06, + "loss": 0.6635, + "step": 7536 + }, + { + "epoch": 1.07, + "grad_norm": 9.213332151235903, + "learning_rate": 4.637152695553675e-06, + "loss": 0.7404, + "step": 7537 + }, + { + "epoch": 1.08, + "grad_norm": 10.584764854434018, + "learning_rate": 4.6360010073036724e-06, + "loss": 0.7794, + "step": 7538 + }, + { + "epoch": 1.08, + "grad_norm": 
8.308691800505281, + "learning_rate": 4.634849338468361e-06, + "loss": 0.7299, + "step": 7539 + }, + { + "epoch": 1.08, + "grad_norm": 6.808119449823074, + "learning_rate": 4.633697689109172e-06, + "loss": 0.7064, + "step": 7540 + }, + { + "epoch": 1.08, + "grad_norm": 7.553926039247217, + "learning_rate": 4.632546059287527e-06, + "loss": 0.7277, + "step": 7541 + }, + { + "epoch": 1.08, + "grad_norm": 7.069816564157566, + "learning_rate": 4.6313944490648504e-06, + "loss": 0.7534, + "step": 7542 + }, + { + "epoch": 1.08, + "grad_norm": 7.517142242124813, + "learning_rate": 4.630242858502572e-06, + "loss": 0.7704, + "step": 7543 + }, + { + "epoch": 1.08, + "grad_norm": 10.947106103815274, + "learning_rate": 4.6290912876621076e-06, + "loss": 0.7311, + "step": 7544 + }, + { + "epoch": 1.08, + "grad_norm": 8.367667286973397, + "learning_rate": 4.627939736604883e-06, + "loss": 0.7435, + "step": 7545 + }, + { + "epoch": 1.08, + "grad_norm": 7.867062870369238, + "learning_rate": 4.626788205392317e-06, + "loss": 0.8043, + "step": 7546 + }, + { + "epoch": 1.08, + "grad_norm": 9.13908360968104, + "learning_rate": 4.625636694085827e-06, + "loss": 0.7974, + "step": 7547 + }, + { + "epoch": 1.08, + "grad_norm": 7.9332110454058755, + "learning_rate": 4.6244852027468356e-06, + "loss": 0.7667, + "step": 7548 + }, + { + "epoch": 1.08, + "grad_norm": 8.113277572999534, + "learning_rate": 4.623333731436758e-06, + "loss": 0.7241, + "step": 7549 + }, + { + "epoch": 1.08, + "grad_norm": 10.532693409526557, + "learning_rate": 4.6221822802170095e-06, + "loss": 0.7615, + "step": 7550 + }, + { + "epoch": 1.08, + "grad_norm": 9.049691784529179, + "learning_rate": 4.621030849149008e-06, + "loss": 0.8682, + "step": 7551 + }, + { + "epoch": 1.08, + "grad_norm": 5.468001780276567, + "learning_rate": 4.619879438294167e-06, + "loss": 0.8354, + "step": 7552 + }, + { + "epoch": 1.08, + "grad_norm": 9.98849887875617, + "learning_rate": 4.618728047713898e-06, + "loss": 0.7498, + "step": 7553 + }, + { + 
"epoch": 1.08, + "grad_norm": 7.775704326548758, + "learning_rate": 4.617576677469612e-06, + "loss": 0.8244, + "step": 7554 + }, + { + "epoch": 1.08, + "grad_norm": 10.116660735307144, + "learning_rate": 4.616425327622722e-06, + "loss": 0.7886, + "step": 7555 + }, + { + "epoch": 1.08, + "grad_norm": 9.247641748033534, + "learning_rate": 4.6152739982346396e-06, + "loss": 0.7544, + "step": 7556 + }, + { + "epoch": 1.08, + "grad_norm": 7.801537242256017, + "learning_rate": 4.614122689366769e-06, + "loss": 0.6928, + "step": 7557 + }, + { + "epoch": 1.08, + "grad_norm": 9.006964024463363, + "learning_rate": 4.612971401080521e-06, + "loss": 0.7263, + "step": 7558 + }, + { + "epoch": 1.08, + "grad_norm": 8.838757838767249, + "learning_rate": 4.611820133437301e-06, + "loss": 0.8017, + "step": 7559 + }, + { + "epoch": 1.08, + "grad_norm": 9.418953551996532, + "learning_rate": 4.610668886498513e-06, + "loss": 0.6836, + "step": 7560 + }, + { + "epoch": 1.08, + "grad_norm": 6.635897361189798, + "learning_rate": 4.609517660325565e-06, + "loss": 0.7969, + "step": 7561 + }, + { + "epoch": 1.08, + "grad_norm": 10.336448908113496, + "learning_rate": 4.608366454979858e-06, + "loss": 0.7969, + "step": 7562 + }, + { + "epoch": 1.08, + "grad_norm": 11.21769521913177, + "learning_rate": 4.607215270522795e-06, + "loss": 0.8455, + "step": 7563 + }, + { + "epoch": 1.08, + "grad_norm": 10.290840905727599, + "learning_rate": 4.606064107015773e-06, + "loss": 0.6933, + "step": 7564 + }, + { + "epoch": 1.08, + "grad_norm": 8.023493951762385, + "learning_rate": 4.6049129645201966e-06, + "loss": 0.7093, + "step": 7565 + }, + { + "epoch": 1.08, + "grad_norm": 9.901376062938615, + "learning_rate": 4.603761843097464e-06, + "loss": 0.8485, + "step": 7566 + }, + { + "epoch": 1.08, + "grad_norm": 8.933192814178142, + "learning_rate": 4.60261074280897e-06, + "loss": 0.7893, + "step": 7567 + }, + { + "epoch": 1.08, + "grad_norm": 10.79521353443479, + "learning_rate": 4.6014596637161145e-06, + "loss": 
0.7914, + "step": 7568 + }, + { + "epoch": 1.08, + "grad_norm": 6.8559292831410215, + "learning_rate": 4.6003086058802904e-06, + "loss": 0.8005, + "step": 7569 + }, + { + "epoch": 1.08, + "grad_norm": 6.38313423889435, + "learning_rate": 4.5991575693628914e-06, + "loss": 0.7509, + "step": 7570 + }, + { + "epoch": 1.08, + "grad_norm": 6.913256059398141, + "learning_rate": 4.598006554225314e-06, + "loss": 0.7426, + "step": 7571 + }, + { + "epoch": 1.08, + "grad_norm": 9.327382563746758, + "learning_rate": 4.596855560528947e-06, + "loss": 0.8216, + "step": 7572 + }, + { + "epoch": 1.08, + "grad_norm": 9.79163645000244, + "learning_rate": 4.595704588335184e-06, + "loss": 0.7692, + "step": 7573 + }, + { + "epoch": 1.08, + "grad_norm": 10.793559275087647, + "learning_rate": 4.59455363770541e-06, + "loss": 0.6972, + "step": 7574 + }, + { + "epoch": 1.08, + "grad_norm": 9.362559890171053, + "learning_rate": 4.593402708701017e-06, + "loss": 0.8014, + "step": 7575 + }, + { + "epoch": 1.08, + "grad_norm": 8.792752357592974, + "learning_rate": 4.592251801383394e-06, + "loss": 0.8065, + "step": 7576 + }, + { + "epoch": 1.08, + "grad_norm": 9.53378879637801, + "learning_rate": 4.5911009158139224e-06, + "loss": 0.7811, + "step": 7577 + }, + { + "epoch": 1.08, + "grad_norm": 9.686220902102859, + "learning_rate": 4.589950052053992e-06, + "loss": 0.766, + "step": 7578 + }, + { + "epoch": 1.08, + "grad_norm": 8.587383562426885, + "learning_rate": 4.588799210164983e-06, + "loss": 0.8034, + "step": 7579 + }, + { + "epoch": 1.08, + "grad_norm": 6.721339514318494, + "learning_rate": 4.587648390208279e-06, + "loss": 0.7462, + "step": 7580 + }, + { + "epoch": 1.08, + "grad_norm": 6.0604842243166495, + "learning_rate": 4.586497592245264e-06, + "loss": 0.7295, + "step": 7581 + }, + { + "epoch": 1.08, + "grad_norm": 9.004622113097012, + "learning_rate": 4.585346816337314e-06, + "loss": 0.8254, + "step": 7582 + }, + { + "epoch": 1.08, + "grad_norm": 10.614071497016573, + "learning_rate": 
4.584196062545814e-06, + "loss": 0.7911, + "step": 7583 + }, + { + "epoch": 1.08, + "grad_norm": 13.705792067004033, + "learning_rate": 4.583045330932135e-06, + "loss": 0.7361, + "step": 7584 + }, + { + "epoch": 1.08, + "grad_norm": 7.436387240445698, + "learning_rate": 4.5818946215576585e-06, + "loss": 0.7536, + "step": 7585 + }, + { + "epoch": 1.08, + "grad_norm": 8.243854139468747, + "learning_rate": 4.580743934483761e-06, + "loss": 0.7431, + "step": 7586 + }, + { + "epoch": 1.08, + "grad_norm": 9.918984813511534, + "learning_rate": 4.579593269771814e-06, + "loss": 0.8165, + "step": 7587 + }, + { + "epoch": 1.08, + "grad_norm": 9.047734879218934, + "learning_rate": 4.5784426274831914e-06, + "loss": 0.815, + "step": 7588 + }, + { + "epoch": 1.08, + "grad_norm": 10.778552469497107, + "learning_rate": 4.577292007679265e-06, + "loss": 0.7868, + "step": 7589 + }, + { + "epoch": 1.08, + "grad_norm": 10.188208410191757, + "learning_rate": 4.576141410421407e-06, + "loss": 0.6944, + "step": 7590 + }, + { + "epoch": 1.08, + "grad_norm": 9.179813619632462, + "learning_rate": 4.574990835770987e-06, + "loss": 0.7664, + "step": 7591 + }, + { + "epoch": 1.08, + "grad_norm": 8.33344708788023, + "learning_rate": 4.573840283789371e-06, + "loss": 0.6986, + "step": 7592 + }, + { + "epoch": 1.08, + "grad_norm": 9.460278068834004, + "learning_rate": 4.572689754537929e-06, + "loss": 0.7861, + "step": 7593 + }, + { + "epoch": 1.08, + "grad_norm": 8.600673262564944, + "learning_rate": 4.571539248078027e-06, + "loss": 0.8242, + "step": 7594 + }, + { + "epoch": 1.08, + "grad_norm": 8.43108025190181, + "learning_rate": 4.570388764471027e-06, + "loss": 0.7928, + "step": 7595 + }, + { + "epoch": 1.08, + "grad_norm": 7.529250240057811, + "learning_rate": 4.569238303778297e-06, + "loss": 0.7242, + "step": 7596 + }, + { + "epoch": 1.08, + "grad_norm": 8.254663604274626, + "learning_rate": 4.568087866061195e-06, + "loss": 0.8503, + "step": 7597 + }, + { + "epoch": 1.08, + "grad_norm": 
7.227152587400561, + "learning_rate": 4.566937451381083e-06, + "loss": 0.7873, + "step": 7598 + }, + { + "epoch": 1.08, + "grad_norm": 8.070979489964152, + "learning_rate": 4.565787059799324e-06, + "loss": 0.8443, + "step": 7599 + }, + { + "epoch": 1.08, + "grad_norm": 8.564602613435794, + "learning_rate": 4.564636691377274e-06, + "loss": 0.7619, + "step": 7600 + }, + { + "epoch": 1.08, + "grad_norm": 8.09682358970575, + "learning_rate": 4.563486346176292e-06, + "loss": 0.6729, + "step": 7601 + }, + { + "epoch": 1.08, + "grad_norm": 8.382380896829323, + "learning_rate": 4.562336024257731e-06, + "loss": 0.7421, + "step": 7602 + }, + { + "epoch": 1.08, + "grad_norm": 7.201809162772191, + "learning_rate": 4.561185725682949e-06, + "loss": 0.8021, + "step": 7603 + }, + { + "epoch": 1.08, + "grad_norm": 6.845997602955798, + "learning_rate": 4.5600354505133e-06, + "loss": 0.722, + "step": 7604 + }, + { + "epoch": 1.08, + "grad_norm": 9.439602006442437, + "learning_rate": 4.558885198810134e-06, + "loss": 0.7992, + "step": 7605 + }, + { + "epoch": 1.08, + "grad_norm": 9.81763735153241, + "learning_rate": 4.5577349706348055e-06, + "loss": 0.8074, + "step": 7606 + }, + { + "epoch": 1.08, + "grad_norm": 8.291768442003388, + "learning_rate": 4.556584766048661e-06, + "loss": 0.7777, + "step": 7607 + }, + { + "epoch": 1.08, + "grad_norm": 9.528434489103832, + "learning_rate": 4.55543458511305e-06, + "loss": 0.754, + "step": 7608 + }, + { + "epoch": 1.09, + "grad_norm": 8.607545810849007, + "learning_rate": 4.554284427889324e-06, + "loss": 0.7433, + "step": 7609 + }, + { + "epoch": 1.09, + "grad_norm": 9.69460270533606, + "learning_rate": 4.553134294438824e-06, + "loss": 0.7018, + "step": 7610 + }, + { + "epoch": 1.09, + "grad_norm": 8.32079159832062, + "learning_rate": 4.551984184822898e-06, + "loss": 0.8093, + "step": 7611 + }, + { + "epoch": 1.09, + "grad_norm": 6.804919763557846, + "learning_rate": 4.550834099102886e-06, + "loss": 0.7905, + "step": 7612 + }, + { + "epoch": 
1.09, + "grad_norm": 6.787103905401153, + "learning_rate": 4.549684037340134e-06, + "loss": 0.7342, + "step": 7613 + }, + { + "epoch": 1.09, + "grad_norm": 7.2093693225720274, + "learning_rate": 4.548533999595983e-06, + "loss": 0.7124, + "step": 7614 + }, + { + "epoch": 1.09, + "grad_norm": 6.948091484740607, + "learning_rate": 4.547383985931771e-06, + "loss": 0.8164, + "step": 7615 + }, + { + "epoch": 1.09, + "grad_norm": 7.44760475602227, + "learning_rate": 4.546233996408837e-06, + "loss": 0.7378, + "step": 7616 + }, + { + "epoch": 1.09, + "grad_norm": 9.90941900261026, + "learning_rate": 4.545084031088519e-06, + "loss": 0.7451, + "step": 7617 + }, + { + "epoch": 1.09, + "grad_norm": 6.096336597014878, + "learning_rate": 4.543934090032152e-06, + "loss": 0.688, + "step": 7618 + }, + { + "epoch": 1.09, + "grad_norm": 8.447420016639105, + "learning_rate": 4.542784173301073e-06, + "loss": 0.7783, + "step": 7619 + }, + { + "epoch": 1.09, + "grad_norm": 7.863713309090894, + "learning_rate": 4.541634280956612e-06, + "loss": 0.738, + "step": 7620 + }, + { + "epoch": 1.09, + "grad_norm": 10.159206570711936, + "learning_rate": 4.540484413060104e-06, + "loss": 0.7928, + "step": 7621 + }, + { + "epoch": 1.09, + "grad_norm": 8.624994388320749, + "learning_rate": 4.539334569672876e-06, + "loss": 0.7076, + "step": 7622 + }, + { + "epoch": 1.09, + "grad_norm": 9.202415816693144, + "learning_rate": 4.5381847508562605e-06, + "loss": 0.7552, + "step": 7623 + }, + { + "epoch": 1.09, + "grad_norm": 13.782661041443859, + "learning_rate": 4.537034956671586e-06, + "loss": 0.8248, + "step": 7624 + }, + { + "epoch": 1.09, + "grad_norm": 7.685364732695519, + "learning_rate": 4.535885187180176e-06, + "loss": 0.7452, + "step": 7625 + }, + { + "epoch": 1.09, + "grad_norm": 9.743509622499708, + "learning_rate": 4.534735442443363e-06, + "loss": 0.7174, + "step": 7626 + }, + { + "epoch": 1.09, + "grad_norm": 8.329475733319322, + "learning_rate": 4.5335857225224615e-06, + "loss": 0.7374, + 
"step": 7627 + }, + { + "epoch": 1.09, + "grad_norm": 8.909122684056594, + "learning_rate": 4.5324360274788e-06, + "loss": 0.7806, + "step": 7628 + }, + { + "epoch": 1.09, + "grad_norm": 7.860583470288195, + "learning_rate": 4.531286357373703e-06, + "loss": 0.7693, + "step": 7629 + }, + { + "epoch": 1.09, + "grad_norm": 7.665315343616475, + "learning_rate": 4.5301367122684826e-06, + "loss": 0.7506, + "step": 7630 + }, + { + "epoch": 1.09, + "grad_norm": 8.221691548178574, + "learning_rate": 4.528987092224465e-06, + "loss": 0.7904, + "step": 7631 + }, + { + "epoch": 1.09, + "grad_norm": 7.143943446444984, + "learning_rate": 4.527837497302963e-06, + "loss": 0.7172, + "step": 7632 + }, + { + "epoch": 1.09, + "grad_norm": 11.358533454967338, + "learning_rate": 4.526687927565295e-06, + "loss": 0.7468, + "step": 7633 + }, + { + "epoch": 1.09, + "grad_norm": 8.309435622409435, + "learning_rate": 4.525538383072776e-06, + "loss": 0.753, + "step": 7634 + }, + { + "epoch": 1.09, + "grad_norm": 7.277649346747383, + "learning_rate": 4.524388863886719e-06, + "loss": 0.7591, + "step": 7635 + }, + { + "epoch": 1.09, + "grad_norm": 9.205885799215311, + "learning_rate": 4.523239370068435e-06, + "loss": 0.8228, + "step": 7636 + }, + { + "epoch": 1.09, + "grad_norm": 7.512179924635817, + "learning_rate": 4.522089901679238e-06, + "loss": 0.735, + "step": 7637 + }, + { + "epoch": 1.09, + "grad_norm": 10.858709583879378, + "learning_rate": 4.520940458780434e-06, + "loss": 0.6987, + "step": 7638 + }, + { + "epoch": 1.09, + "grad_norm": 10.054108092801611, + "learning_rate": 4.519791041433333e-06, + "loss": 0.7646, + "step": 7639 + }, + { + "epoch": 1.09, + "grad_norm": 8.02220283954799, + "learning_rate": 4.51864164969924e-06, + "loss": 0.7181, + "step": 7640 + }, + { + "epoch": 1.09, + "grad_norm": 10.45484479415744, + "learning_rate": 4.517492283639463e-06, + "loss": 0.744, + "step": 7641 + }, + { + "epoch": 1.09, + "grad_norm": 8.69396512815345, + "learning_rate": 
4.5163429433153045e-06, + "loss": 0.7225, + "step": 7642 + }, + { + "epoch": 1.09, + "grad_norm": 8.953577927528825, + "learning_rate": 4.515193628788066e-06, + "loss": 0.7725, + "step": 7643 + }, + { + "epoch": 1.09, + "grad_norm": 9.904723030132443, + "learning_rate": 4.514044340119051e-06, + "loss": 0.8409, + "step": 7644 + }, + { + "epoch": 1.09, + "grad_norm": 10.226973341970561, + "learning_rate": 4.512895077369557e-06, + "loss": 0.8386, + "step": 7645 + }, + { + "epoch": 1.09, + "grad_norm": 6.430745240599196, + "learning_rate": 4.511745840600883e-06, + "loss": 0.7106, + "step": 7646 + }, + { + "epoch": 1.09, + "grad_norm": 9.299872593209674, + "learning_rate": 4.5105966298743294e-06, + "loss": 0.6806, + "step": 7647 + }, + { + "epoch": 1.09, + "grad_norm": 8.372821751618968, + "learning_rate": 4.509447445251188e-06, + "loss": 0.7827, + "step": 7648 + }, + { + "epoch": 1.09, + "grad_norm": 7.726069283794234, + "learning_rate": 4.5082982867927555e-06, + "loss": 0.7025, + "step": 7649 + }, + { + "epoch": 1.09, + "grad_norm": 10.03375454166149, + "learning_rate": 4.507149154560321e-06, + "loss": 0.8098, + "step": 7650 + }, + { + "epoch": 1.09, + "grad_norm": 10.918037728734053, + "learning_rate": 4.506000048615181e-06, + "loss": 0.739, + "step": 7651 + }, + { + "epoch": 1.09, + "grad_norm": 7.381669816066459, + "learning_rate": 4.504850969018624e-06, + "loss": 0.7775, + "step": 7652 + }, + { + "epoch": 1.09, + "grad_norm": 9.402680686091577, + "learning_rate": 4.5037019158319355e-06, + "loss": 0.7769, + "step": 7653 + }, + { + "epoch": 1.09, + "grad_norm": 8.24320904589887, + "learning_rate": 4.502552889116407e-06, + "loss": 0.7233, + "step": 7654 + }, + { + "epoch": 1.09, + "grad_norm": 7.374477077142105, + "learning_rate": 4.501403888933323e-06, + "loss": 0.687, + "step": 7655 + }, + { + "epoch": 1.09, + "grad_norm": 7.3765168146112625, + "learning_rate": 4.500254915343966e-06, + "loss": 0.6907, + "step": 7656 + }, + { + "epoch": 1.09, + "grad_norm": 
8.090717719816979, + "learning_rate": 4.499105968409623e-06, + "loss": 0.7525, + "step": 7657 + }, + { + "epoch": 1.09, + "grad_norm": 8.640308609196067, + "learning_rate": 4.497957048191572e-06, + "loss": 0.731, + "step": 7658 + }, + { + "epoch": 1.09, + "grad_norm": 15.082443577847313, + "learning_rate": 4.496808154751096e-06, + "loss": 0.7422, + "step": 7659 + }, + { + "epoch": 1.09, + "grad_norm": 9.63852021779594, + "learning_rate": 4.49565928814947e-06, + "loss": 0.7562, + "step": 7660 + }, + { + "epoch": 1.09, + "grad_norm": 8.033686770004996, + "learning_rate": 4.494510448447974e-06, + "loss": 0.8039, + "step": 7661 + }, + { + "epoch": 1.09, + "grad_norm": 9.548620457739407, + "learning_rate": 4.4933616357078865e-06, + "loss": 0.7247, + "step": 7662 + }, + { + "epoch": 1.09, + "grad_norm": 9.184834739709249, + "learning_rate": 4.492212849990476e-06, + "loss": 0.8143, + "step": 7663 + }, + { + "epoch": 1.09, + "grad_norm": 6.902359638819193, + "learning_rate": 4.4910640913570205e-06, + "loss": 0.717, + "step": 7664 + }, + { + "epoch": 1.09, + "grad_norm": 7.967869752486344, + "learning_rate": 4.489915359868789e-06, + "loss": 0.766, + "step": 7665 + }, + { + "epoch": 1.09, + "grad_norm": 10.164283247321922, + "learning_rate": 4.488766655587051e-06, + "loss": 0.8799, + "step": 7666 + }, + { + "epoch": 1.09, + "grad_norm": 10.370648270431829, + "learning_rate": 4.4876179785730794e-06, + "loss": 0.7526, + "step": 7667 + }, + { + "epoch": 1.09, + "grad_norm": 10.10909190822096, + "learning_rate": 4.486469328888138e-06, + "loss": 0.7861, + "step": 7668 + }, + { + "epoch": 1.09, + "grad_norm": 9.403720229249332, + "learning_rate": 4.485320706593493e-06, + "loss": 0.8076, + "step": 7669 + }, + { + "epoch": 1.09, + "grad_norm": 7.358056114145409, + "learning_rate": 4.484172111750408e-06, + "loss": 0.7907, + "step": 7670 + }, + { + "epoch": 1.09, + "grad_norm": 7.757784247496993, + "learning_rate": 4.483023544420146e-06, + "loss": 0.7936, + "step": 7671 + }, + { + 
"epoch": 1.09, + "grad_norm": 6.723671028465615, + "learning_rate": 4.481875004663971e-06, + "loss": 0.7515, + "step": 7672 + }, + { + "epoch": 1.09, + "grad_norm": 7.97137152952443, + "learning_rate": 4.4807264925431405e-06, + "loss": 0.7716, + "step": 7673 + }, + { + "epoch": 1.09, + "grad_norm": 8.298532539004915, + "learning_rate": 4.479578008118914e-06, + "loss": 0.76, + "step": 7674 + }, + { + "epoch": 1.09, + "grad_norm": 6.527613043321986, + "learning_rate": 4.478429551452546e-06, + "loss": 0.7194, + "step": 7675 + }, + { + "epoch": 1.09, + "grad_norm": 6.660325240780266, + "learning_rate": 4.477281122605295e-06, + "loss": 0.7272, + "step": 7676 + }, + { + "epoch": 1.09, + "grad_norm": 6.895460146472968, + "learning_rate": 4.4761327216384145e-06, + "loss": 0.8077, + "step": 7677 + }, + { + "epoch": 1.09, + "grad_norm": 8.466393381271237, + "learning_rate": 4.474984348613155e-06, + "loss": 0.7089, + "step": 7678 + }, + { + "epoch": 1.1, + "grad_norm": 7.3354849597631135, + "learning_rate": 4.4738360035907704e-06, + "loss": 0.8304, + "step": 7679 + }, + { + "epoch": 1.1, + "grad_norm": 8.527994802121786, + "learning_rate": 4.472687686632508e-06, + "loss": 0.7024, + "step": 7680 + }, + { + "epoch": 1.1, + "grad_norm": 8.18290756074071, + "learning_rate": 4.471539397799617e-06, + "loss": 0.8675, + "step": 7681 + }, + { + "epoch": 1.1, + "grad_norm": 7.030361175875533, + "learning_rate": 4.470391137153344e-06, + "loss": 0.7685, + "step": 7682 + }, + { + "epoch": 1.1, + "grad_norm": 10.081946222813183, + "learning_rate": 4.469242904754933e-06, + "loss": 0.8457, + "step": 7683 + }, + { + "epoch": 1.1, + "grad_norm": 7.646457479274031, + "learning_rate": 4.468094700665627e-06, + "loss": 0.834, + "step": 7684 + }, + { + "epoch": 1.1, + "grad_norm": 9.289350631540225, + "learning_rate": 4.466946524946671e-06, + "loss": 0.7563, + "step": 7685 + }, + { + "epoch": 1.1, + "grad_norm": 7.210155181576814, + "learning_rate": 4.4657983776593025e-06, + "loss": 0.827, + 
"step": 7686 + }, + { + "epoch": 1.1, + "grad_norm": 8.005037547410954, + "learning_rate": 4.4646502588647634e-06, + "loss": 0.7558, + "step": 7687 + }, + { + "epoch": 1.1, + "grad_norm": 7.861886208691222, + "learning_rate": 4.463502168624286e-06, + "loss": 0.6581, + "step": 7688 + }, + { + "epoch": 1.1, + "grad_norm": 9.553645398419286, + "learning_rate": 4.462354106999112e-06, + "loss": 0.8166, + "step": 7689 + }, + { + "epoch": 1.1, + "grad_norm": 8.648375569630272, + "learning_rate": 4.461206074050473e-06, + "loss": 0.738, + "step": 7690 + }, + { + "epoch": 1.1, + "grad_norm": 6.830955767562904, + "learning_rate": 4.460058069839601e-06, + "loss": 0.7421, + "step": 7691 + }, + { + "epoch": 1.1, + "grad_norm": 8.226068276666812, + "learning_rate": 4.458910094427731e-06, + "loss": 0.7171, + "step": 7692 + }, + { + "epoch": 1.1, + "grad_norm": 9.667462158164108, + "learning_rate": 4.457762147876089e-06, + "loss": 0.7076, + "step": 7693 + }, + { + "epoch": 1.1, + "grad_norm": 9.796544083118432, + "learning_rate": 4.456614230245903e-06, + "loss": 0.7296, + "step": 7694 + }, + { + "epoch": 1.1, + "grad_norm": 10.118841765572618, + "learning_rate": 4.455466341598405e-06, + "loss": 0.752, + "step": 7695 + }, + { + "epoch": 1.1, + "grad_norm": 7.690175150178459, + "learning_rate": 4.454318481994814e-06, + "loss": 0.76, + "step": 7696 + }, + { + "epoch": 1.1, + "grad_norm": 10.830609063792227, + "learning_rate": 4.453170651496358e-06, + "loss": 0.6378, + "step": 7697 + }, + { + "epoch": 1.1, + "grad_norm": 6.5927072362990335, + "learning_rate": 4.452022850164255e-06, + "loss": 0.6981, + "step": 7698 + }, + { + "epoch": 1.1, + "grad_norm": 9.05604203514957, + "learning_rate": 4.45087507805973e-06, + "loss": 0.8045, + "step": 7699 + }, + { + "epoch": 1.1, + "grad_norm": 8.079857938508797, + "learning_rate": 4.449727335243999e-06, + "loss": 0.7678, + "step": 7700 + }, + { + "epoch": 1.1, + "grad_norm": 7.277264904225629, + "learning_rate": 4.44857962177828e-06, + "loss": 
0.7986, + "step": 7701 + }, + { + "epoch": 1.1, + "grad_norm": 8.244919651126315, + "learning_rate": 4.447431937723791e-06, + "loss": 0.6966, + "step": 7702 + }, + { + "epoch": 1.1, + "grad_norm": 8.043893516519244, + "learning_rate": 4.446284283141742e-06, + "loss": 0.7549, + "step": 7703 + }, + { + "epoch": 1.1, + "grad_norm": 8.844357623120038, + "learning_rate": 4.445136658093348e-06, + "loss": 0.773, + "step": 7704 + }, + { + "epoch": 1.1, + "grad_norm": 9.905650163082907, + "learning_rate": 4.443989062639822e-06, + "loss": 0.8151, + "step": 7705 + }, + { + "epoch": 1.1, + "grad_norm": 10.705058979891593, + "learning_rate": 4.442841496842371e-06, + "loss": 0.7624, + "step": 7706 + }, + { + "epoch": 1.1, + "grad_norm": 7.474320595921196, + "learning_rate": 4.441693960762204e-06, + "loss": 0.8062, + "step": 7707 + }, + { + "epoch": 1.1, + "grad_norm": 8.28279573268035, + "learning_rate": 4.440546454460525e-06, + "loss": 0.772, + "step": 7708 + }, + { + "epoch": 1.1, + "grad_norm": 7.93742570637122, + "learning_rate": 4.439398977998543e-06, + "loss": 0.8152, + "step": 7709 + }, + { + "epoch": 1.1, + "grad_norm": 8.186806798751176, + "learning_rate": 4.4382515314374595e-06, + "loss": 0.7594, + "step": 7710 + }, + { + "epoch": 1.1, + "grad_norm": 8.226984093206065, + "learning_rate": 4.437104114838473e-06, + "loss": 0.7178, + "step": 7711 + }, + { + "epoch": 1.1, + "grad_norm": 7.100081749067884, + "learning_rate": 4.43595672826279e-06, + "loss": 0.7354, + "step": 7712 + }, + { + "epoch": 1.1, + "grad_norm": 10.221561006526212, + "learning_rate": 4.434809371771602e-06, + "loss": 0.7116, + "step": 7713 + }, + { + "epoch": 1.1, + "grad_norm": 10.134026415252327, + "learning_rate": 4.433662045426108e-06, + "loss": 0.7877, + "step": 7714 + }, + { + "epoch": 1.1, + "grad_norm": 7.115679301630648, + "learning_rate": 4.432514749287509e-06, + "loss": 0.7039, + "step": 7715 + }, + { + "epoch": 1.1, + "grad_norm": 6.635239073488832, + "learning_rate": 4.431367483416989e-06, 
+ "loss": 0.8008, + "step": 7716 + }, + { + "epoch": 1.1, + "grad_norm": 7.819563141834577, + "learning_rate": 4.430220247875747e-06, + "loss": 0.6735, + "step": 7717 + }, + { + "epoch": 1.1, + "grad_norm": 8.914356967263913, + "learning_rate": 4.42907304272497e-06, + "loss": 0.7243, + "step": 7718 + }, + { + "epoch": 1.1, + "grad_norm": 7.233959127244643, + "learning_rate": 4.427925868025847e-06, + "loss": 0.7443, + "step": 7719 + }, + { + "epoch": 1.1, + "grad_norm": 10.555742874892617, + "learning_rate": 4.426778723839568e-06, + "loss": 0.719, + "step": 7720 + }, + { + "epoch": 1.1, + "grad_norm": 8.335316222872677, + "learning_rate": 4.4256316102273146e-06, + "loss": 0.779, + "step": 7721 + }, + { + "epoch": 1.1, + "grad_norm": 8.72604654352678, + "learning_rate": 4.424484527250272e-06, + "loss": 0.848, + "step": 7722 + }, + { + "epoch": 1.1, + "grad_norm": 9.144197331159548, + "learning_rate": 4.423337474969625e-06, + "loss": 0.6852, + "step": 7723 + }, + { + "epoch": 1.1, + "grad_norm": 9.281310572614945, + "learning_rate": 4.42219045344655e-06, + "loss": 0.7724, + "step": 7724 + }, + { + "epoch": 1.1, + "grad_norm": 7.156744946507731, + "learning_rate": 4.421043462742229e-06, + "loss": 0.769, + "step": 7725 + }, + { + "epoch": 1.1, + "grad_norm": 10.254474986482176, + "learning_rate": 4.4198965029178365e-06, + "loss": 0.7394, + "step": 7726 + }, + { + "epoch": 1.1, + "grad_norm": 6.954503620332393, + "learning_rate": 4.418749574034551e-06, + "loss": 0.7646, + "step": 7727 + }, + { + "epoch": 1.1, + "grad_norm": 8.92293810408438, + "learning_rate": 4.417602676153546e-06, + "loss": 0.7514, + "step": 7728 + }, + { + "epoch": 1.1, + "grad_norm": 8.420225831031733, + "learning_rate": 4.41645580933599e-06, + "loss": 0.7369, + "step": 7729 + }, + { + "epoch": 1.1, + "grad_norm": 6.157772463622785, + "learning_rate": 4.415308973643061e-06, + "loss": 0.8306, + "step": 7730 + }, + { + "epoch": 1.1, + "grad_norm": 8.026235305044592, + "learning_rate": 
4.414162169135921e-06, + "loss": 0.7587, + "step": 7731 + }, + { + "epoch": 1.1, + "grad_norm": 8.706562392027172, + "learning_rate": 4.4130153958757406e-06, + "loss": 0.7507, + "step": 7732 + }, + { + "epoch": 1.1, + "grad_norm": 9.255132940102952, + "learning_rate": 4.411868653923687e-06, + "loss": 0.7244, + "step": 7733 + }, + { + "epoch": 1.1, + "grad_norm": 10.188367702875762, + "learning_rate": 4.41072194334092e-06, + "loss": 0.7378, + "step": 7734 + }, + { + "epoch": 1.1, + "grad_norm": 9.1493908919841, + "learning_rate": 4.409575264188607e-06, + "loss": 0.7501, + "step": 7735 + }, + { + "epoch": 1.1, + "grad_norm": 8.62916711632811, + "learning_rate": 4.408428616527903e-06, + "loss": 0.7578, + "step": 7736 + }, + { + "epoch": 1.1, + "grad_norm": 10.427721742318736, + "learning_rate": 4.407282000419971e-06, + "loss": 0.7205, + "step": 7737 + }, + { + "epoch": 1.1, + "grad_norm": 8.383366317045438, + "learning_rate": 4.406135415925968e-06, + "loss": 0.7368, + "step": 7738 + }, + { + "epoch": 1.1, + "grad_norm": 7.116115874057547, + "learning_rate": 4.404988863107047e-06, + "loss": 0.7306, + "step": 7739 + }, + { + "epoch": 1.1, + "grad_norm": 8.623987384605933, + "learning_rate": 4.4038423420243655e-06, + "loss": 0.7396, + "step": 7740 + }, + { + "epoch": 1.1, + "grad_norm": 12.800363328530436, + "learning_rate": 4.402695852739074e-06, + "loss": 0.7319, + "step": 7741 + }, + { + "epoch": 1.1, + "grad_norm": 8.210839215832811, + "learning_rate": 4.401549395312322e-06, + "loss": 0.7265, + "step": 7742 + }, + { + "epoch": 1.1, + "grad_norm": 8.627497270919667, + "learning_rate": 4.400402969805261e-06, + "loss": 0.8439, + "step": 7743 + }, + { + "epoch": 1.1, + "grad_norm": 9.722265348373758, + "learning_rate": 4.399256576279036e-06, + "loss": 0.7045, + "step": 7744 + }, + { + "epoch": 1.1, + "grad_norm": 10.588797745788746, + "learning_rate": 4.398110214794793e-06, + "loss": 0.7865, + "step": 7745 + }, + { + "epoch": 1.1, + "grad_norm": 7.824075741453073, + 
"learning_rate": 4.3969638854136746e-06, + "loss": 0.6786, + "step": 7746 + }, + { + "epoch": 1.1, + "grad_norm": 8.807760743750235, + "learning_rate": 4.395817588196825e-06, + "loss": 0.7472, + "step": 7747 + }, + { + "epoch": 1.1, + "grad_norm": 10.899230410315925, + "learning_rate": 4.394671323205385e-06, + "loss": 0.7334, + "step": 7748 + }, + { + "epoch": 1.11, + "grad_norm": 8.026095014864048, + "learning_rate": 4.393525090500489e-06, + "loss": 0.7794, + "step": 7749 + }, + { + "epoch": 1.11, + "grad_norm": 8.461583937190172, + "learning_rate": 4.392378890143278e-06, + "loss": 0.7912, + "step": 7750 + }, + { + "epoch": 1.11, + "grad_norm": 9.9293661979747, + "learning_rate": 4.391232722194886e-06, + "loss": 0.7604, + "step": 7751 + }, + { + "epoch": 1.11, + "grad_norm": 8.767101280823196, + "learning_rate": 4.390086586716444e-06, + "loss": 0.7461, + "step": 7752 + }, + { + "epoch": 1.11, + "grad_norm": 7.963301665791785, + "learning_rate": 4.388940483769088e-06, + "loss": 0.7369, + "step": 7753 + }, + { + "epoch": 1.11, + "grad_norm": 7.398740292779094, + "learning_rate": 4.387794413413945e-06, + "loss": 0.6967, + "step": 7754 + }, + { + "epoch": 1.11, + "grad_norm": 6.3762240664720125, + "learning_rate": 4.386648375712145e-06, + "loss": 0.9064, + "step": 7755 + }, + { + "epoch": 1.11, + "grad_norm": 8.47913748173651, + "learning_rate": 4.385502370724812e-06, + "loss": 0.8271, + "step": 7756 + }, + { + "epoch": 1.11, + "grad_norm": 11.657682796676385, + "learning_rate": 4.384356398513071e-06, + "loss": 0.7026, + "step": 7757 + }, + { + "epoch": 1.11, + "grad_norm": 10.18106301139484, + "learning_rate": 4.383210459138048e-06, + "loss": 0.7343, + "step": 7758 + }, + { + "epoch": 1.11, + "grad_norm": 5.81294946644556, + "learning_rate": 4.382064552660862e-06, + "loss": 0.8292, + "step": 7759 + }, + { + "epoch": 1.11, + "grad_norm": 10.182622061131749, + "learning_rate": 4.380918679142633e-06, + "loss": 0.7582, + "step": 7760 + }, + { + "epoch": 1.11, + 
"grad_norm": 8.620507236744528, + "learning_rate": 4.379772838644477e-06, + "loss": 0.7706, + "step": 7761 + }, + { + "epoch": 1.11, + "grad_norm": 11.632889288892782, + "learning_rate": 4.3786270312275116e-06, + "loss": 0.7474, + "step": 7762 + }, + { + "epoch": 1.11, + "grad_norm": 8.501774202482032, + "learning_rate": 4.3774812569528526e-06, + "loss": 0.7873, + "step": 7763 + }, + { + "epoch": 1.11, + "grad_norm": 8.797206604032386, + "learning_rate": 4.376335515881608e-06, + "loss": 0.8023, + "step": 7764 + }, + { + "epoch": 1.11, + "grad_norm": 9.067181663337566, + "learning_rate": 4.3751898080748925e-06, + "loss": 0.7206, + "step": 7765 + }, + { + "epoch": 1.11, + "grad_norm": 8.668814216198943, + "learning_rate": 4.374044133593814e-06, + "loss": 0.7786, + "step": 7766 + }, + { + "epoch": 1.11, + "grad_norm": 10.329074084062878, + "learning_rate": 4.372898492499477e-06, + "loss": 0.7486, + "step": 7767 + }, + { + "epoch": 1.11, + "grad_norm": 7.195780029266628, + "learning_rate": 4.371752884852991e-06, + "loss": 0.7749, + "step": 7768 + }, + { + "epoch": 1.11, + "grad_norm": 10.944151216152864, + "learning_rate": 4.370607310715456e-06, + "loss": 0.7589, + "step": 7769 + }, + { + "epoch": 1.11, + "grad_norm": 8.936234042591337, + "learning_rate": 4.369461770147974e-06, + "loss": 0.7824, + "step": 7770 + }, + { + "epoch": 1.11, + "grad_norm": 8.33625872002121, + "learning_rate": 4.368316263211649e-06, + "loss": 0.7517, + "step": 7771 + }, + { + "epoch": 1.11, + "grad_norm": 9.496541717888254, + "learning_rate": 4.367170789967575e-06, + "loss": 0.7173, + "step": 7772 + }, + { + "epoch": 1.11, + "grad_norm": 10.507472970199563, + "learning_rate": 4.366025350476851e-06, + "loss": 0.8589, + "step": 7773 + }, + { + "epoch": 1.11, + "grad_norm": 6.732358587080357, + "learning_rate": 4.364879944800567e-06, + "loss": 0.7904, + "step": 7774 + }, + { + "epoch": 1.11, + "grad_norm": 8.58091174273791, + "learning_rate": 4.363734572999821e-06, + "loss": 0.7678, + "step": 
7775 + }, + { + "epoch": 1.11, + "grad_norm": 6.6317512444201485, + "learning_rate": 4.362589235135702e-06, + "loss": 0.7987, + "step": 7776 + }, + { + "epoch": 1.11, + "grad_norm": 7.6587169923589045, + "learning_rate": 4.361443931269297e-06, + "loss": 0.7793, + "step": 7777 + }, + { + "epoch": 1.11, + "grad_norm": 7.553557238220289, + "learning_rate": 4.360298661461697e-06, + "loss": 0.7493, + "step": 7778 + }, + { + "epoch": 1.11, + "grad_norm": 12.298884139428276, + "learning_rate": 4.359153425773985e-06, + "loss": 0.7258, + "step": 7779 + }, + { + "epoch": 1.11, + "grad_norm": 9.497565028040265, + "learning_rate": 4.358008224267245e-06, + "loss": 0.7988, + "step": 7780 + }, + { + "epoch": 1.11, + "grad_norm": 8.259472903199573, + "learning_rate": 4.356863057002561e-06, + "loss": 0.8572, + "step": 7781 + }, + { + "epoch": 1.11, + "grad_norm": 10.705808790985172, + "learning_rate": 4.35571792404101e-06, + "loss": 0.7576, + "step": 7782 + }, + { + "epoch": 1.11, + "grad_norm": 7.143611392086527, + "learning_rate": 4.354572825443674e-06, + "loss": 0.7884, + "step": 7783 + }, + { + "epoch": 1.11, + "grad_norm": 9.117895371891011, + "learning_rate": 4.353427761271623e-06, + "loss": 0.7411, + "step": 7784 + }, + { + "epoch": 1.11, + "grad_norm": 8.081980687169816, + "learning_rate": 4.3522827315859375e-06, + "loss": 0.7493, + "step": 7785 + }, + { + "epoch": 1.11, + "grad_norm": 7.543111912554629, + "learning_rate": 4.351137736447689e-06, + "loss": 0.7335, + "step": 7786 + }, + { + "epoch": 1.11, + "grad_norm": 9.248128884937767, + "learning_rate": 4.349992775917945e-06, + "loss": 0.7424, + "step": 7787 + }, + { + "epoch": 1.11, + "grad_norm": 9.162241107496918, + "learning_rate": 4.34884785005778e-06, + "loss": 0.7564, + "step": 7788 + }, + { + "epoch": 1.11, + "grad_norm": 8.840612550534052, + "learning_rate": 4.3477029589282565e-06, + "loss": 0.7677, + "step": 7789 + }, + { + "epoch": 1.11, + "grad_norm": 7.629836291165609, + "learning_rate": 
4.346558102590441e-06, + "loss": 0.7121, + "step": 7790 + }, + { + "epoch": 1.11, + "grad_norm": 10.13519332487513, + "learning_rate": 4.345413281105399e-06, + "loss": 0.7589, + "step": 7791 + }, + { + "epoch": 1.11, + "grad_norm": 7.805451316335746, + "learning_rate": 4.344268494534189e-06, + "loss": 0.7401, + "step": 7792 + }, + { + "epoch": 1.11, + "grad_norm": 11.557510425735694, + "learning_rate": 4.343123742937875e-06, + "loss": 0.8091, + "step": 7793 + }, + { + "epoch": 1.11, + "grad_norm": 8.28398381427393, + "learning_rate": 4.341979026377508e-06, + "loss": 0.7551, + "step": 7794 + }, + { + "epoch": 1.11, + "grad_norm": 8.75974051585288, + "learning_rate": 4.34083434491415e-06, + "loss": 0.774, + "step": 7795 + }, + { + "epoch": 1.11, + "grad_norm": 7.12593356137242, + "learning_rate": 4.339689698608855e-06, + "loss": 0.7721, + "step": 7796 + }, + { + "epoch": 1.11, + "grad_norm": 6.041748920883709, + "learning_rate": 4.338545087522671e-06, + "loss": 0.7112, + "step": 7797 + }, + { + "epoch": 1.11, + "grad_norm": 12.048435641775587, + "learning_rate": 4.337400511716654e-06, + "loss": 0.7837, + "step": 7798 + }, + { + "epoch": 1.11, + "grad_norm": 9.108682046983063, + "learning_rate": 4.336255971251846e-06, + "loss": 0.8096, + "step": 7799 + }, + { + "epoch": 1.11, + "grad_norm": 8.338498394469587, + "learning_rate": 4.335111466189297e-06, + "loss": 0.7502, + "step": 7800 + }, + { + "epoch": 1.11, + "grad_norm": 7.784777823870131, + "learning_rate": 4.333966996590055e-06, + "loss": 0.7494, + "step": 7801 + }, + { + "epoch": 1.11, + "grad_norm": 8.220546005074267, + "learning_rate": 4.332822562515156e-06, + "loss": 0.7761, + "step": 7802 + }, + { + "epoch": 1.11, + "grad_norm": 7.771665862073582, + "learning_rate": 4.331678164025647e-06, + "loss": 0.7238, + "step": 7803 + }, + { + "epoch": 1.11, + "grad_norm": 10.122648745018699, + "learning_rate": 4.330533801182562e-06, + "loss": 0.7257, + "step": 7804 + }, + { + "epoch": 1.11, + "grad_norm": 
8.556339724408206, + "learning_rate": 4.329389474046941e-06, + "loss": 0.7651, + "step": 7805 + }, + { + "epoch": 1.11, + "grad_norm": 8.695963204595706, + "learning_rate": 4.32824518267982e-06, + "loss": 0.7723, + "step": 7806 + }, + { + "epoch": 1.11, + "grad_norm": 10.819905539977881, + "learning_rate": 4.327100927142231e-06, + "loss": 0.7914, + "step": 7807 + }, + { + "epoch": 1.11, + "grad_norm": 9.945500012489923, + "learning_rate": 4.325956707495204e-06, + "loss": 0.7713, + "step": 7808 + }, + { + "epoch": 1.11, + "grad_norm": 9.715852272686819, + "learning_rate": 4.324812523799772e-06, + "loss": 0.712, + "step": 7809 + }, + { + "epoch": 1.11, + "grad_norm": 9.36821873368035, + "learning_rate": 4.323668376116959e-06, + "loss": 0.7408, + "step": 7810 + }, + { + "epoch": 1.11, + "grad_norm": 6.883506163191858, + "learning_rate": 4.322524264507795e-06, + "loss": 0.7235, + "step": 7811 + }, + { + "epoch": 1.11, + "grad_norm": 9.148381116065556, + "learning_rate": 4.3213801890332976e-06, + "loss": 0.8224, + "step": 7812 + }, + { + "epoch": 1.11, + "grad_norm": 6.156213098222671, + "learning_rate": 4.320236149754493e-06, + "loss": 0.758, + "step": 7813 + }, + { + "epoch": 1.11, + "grad_norm": 10.348652039094526, + "learning_rate": 4.3190921467324006e-06, + "loss": 0.7463, + "step": 7814 + }, + { + "epoch": 1.11, + "grad_norm": 10.412976711996947, + "learning_rate": 4.317948180028037e-06, + "loss": 0.7982, + "step": 7815 + }, + { + "epoch": 1.11, + "grad_norm": 8.069603737158005, + "learning_rate": 4.31680424970242e-06, + "loss": 0.8375, + "step": 7816 + }, + { + "epoch": 1.11, + "grad_norm": 7.766216831594718, + "learning_rate": 4.315660355816562e-06, + "loss": 0.7614, + "step": 7817 + }, + { + "epoch": 1.11, + "grad_norm": 7.915676871981661, + "learning_rate": 4.3145164984314755e-06, + "loss": 0.7349, + "step": 7818 + }, + { + "epoch": 1.12, + "grad_norm": 6.488825765488138, + "learning_rate": 4.313372677608172e-06, + "loss": 0.7087, + "step": 7819 + }, + { + 
"epoch": 1.12, + "grad_norm": 10.125443708708278, + "learning_rate": 4.312228893407658e-06, + "loss": 0.7589, + "step": 7820 + }, + { + "epoch": 1.12, + "grad_norm": 11.612773127446268, + "learning_rate": 4.3110851458909405e-06, + "loss": 0.8346, + "step": 7821 + }, + { + "epoch": 1.12, + "grad_norm": 9.756706821954275, + "learning_rate": 4.309941435119023e-06, + "loss": 0.7723, + "step": 7822 + }, + { + "epoch": 1.12, + "grad_norm": 10.14415070889681, + "learning_rate": 4.308797761152909e-06, + "loss": 0.709, + "step": 7823 + }, + { + "epoch": 1.12, + "grad_norm": 8.66330703043872, + "learning_rate": 4.3076541240536e-06, + "loss": 0.761, + "step": 7824 + }, + { + "epoch": 1.12, + "grad_norm": 8.6472703357724, + "learning_rate": 4.3065105238820894e-06, + "loss": 0.7034, + "step": 7825 + }, + { + "epoch": 1.12, + "grad_norm": 7.386165993454099, + "learning_rate": 4.305366960699381e-06, + "loss": 0.7787, + "step": 7826 + }, + { + "epoch": 1.12, + "grad_norm": 9.170906484605885, + "learning_rate": 4.304223434566463e-06, + "loss": 0.7332, + "step": 7827 + }, + { + "epoch": 1.12, + "grad_norm": 8.094459411540639, + "learning_rate": 4.3030799455443296e-06, + "loss": 0.8034, + "step": 7828 + }, + { + "epoch": 1.12, + "grad_norm": 8.52105601161485, + "learning_rate": 4.301936493693974e-06, + "loss": 0.7108, + "step": 7829 + }, + { + "epoch": 1.12, + "grad_norm": 7.143297613794136, + "learning_rate": 4.300793079076382e-06, + "loss": 0.7605, + "step": 7830 + }, + { + "epoch": 1.12, + "grad_norm": 12.719634653869559, + "learning_rate": 4.299649701752541e-06, + "loss": 0.7001, + "step": 7831 + }, + { + "epoch": 1.12, + "grad_norm": 9.46338982013372, + "learning_rate": 4.298506361783435e-06, + "loss": 0.7951, + "step": 7832 + }, + { + "epoch": 1.12, + "grad_norm": 5.199036148143277, + "learning_rate": 4.297363059230048e-06, + "loss": 0.7094, + "step": 7833 + }, + { + "epoch": 1.12, + "grad_norm": 9.263077650131182, + "learning_rate": 4.296219794153359e-06, + "loss": 0.7587, + 
"step": 7834 + }, + { + "epoch": 1.12, + "grad_norm": 8.74887146561903, + "learning_rate": 4.295076566614346e-06, + "loss": 0.8484, + "step": 7835 + }, + { + "epoch": 1.12, + "grad_norm": 9.832211655560865, + "learning_rate": 4.293933376673989e-06, + "loss": 0.7364, + "step": 7836 + }, + { + "epoch": 1.12, + "grad_norm": 7.552065497756182, + "learning_rate": 4.2927902243932595e-06, + "loss": 0.6768, + "step": 7837 + }, + { + "epoch": 1.12, + "grad_norm": 9.848513509360394, + "learning_rate": 4.291647109833129e-06, + "loss": 0.7939, + "step": 7838 + }, + { + "epoch": 1.12, + "grad_norm": 9.515162811588786, + "learning_rate": 4.290504033054573e-06, + "loss": 0.7715, + "step": 7839 + }, + { + "epoch": 1.12, + "grad_norm": 8.955637286548248, + "learning_rate": 4.2893609941185555e-06, + "loss": 0.8224, + "step": 7840 + }, + { + "epoch": 1.12, + "grad_norm": 9.194935548784306, + "learning_rate": 4.288217993086046e-06, + "loss": 0.735, + "step": 7841 + }, + { + "epoch": 1.12, + "grad_norm": 6.696491616707653, + "learning_rate": 4.287075030018006e-06, + "loss": 0.8009, + "step": 7842 + }, + { + "epoch": 1.12, + "grad_norm": 8.169652334848267, + "learning_rate": 4.2859321049753986e-06, + "loss": 0.7572, + "step": 7843 + }, + { + "epoch": 1.12, + "grad_norm": 7.307106964920857, + "learning_rate": 4.284789218019187e-06, + "loss": 0.7756, + "step": 7844 + }, + { + "epoch": 1.12, + "grad_norm": 7.9970667844778065, + "learning_rate": 4.283646369210327e-06, + "loss": 0.7706, + "step": 7845 + }, + { + "epoch": 1.12, + "grad_norm": 8.290069269220925, + "learning_rate": 4.282503558609776e-06, + "loss": 0.7263, + "step": 7846 + }, + { + "epoch": 1.12, + "grad_norm": 7.180050845438995, + "learning_rate": 4.281360786278486e-06, + "loss": 0.6795, + "step": 7847 + }, + { + "epoch": 1.12, + "grad_norm": 11.951931567633858, + "learning_rate": 4.2802180522774126e-06, + "loss": 0.7375, + "step": 7848 + }, + { + "epoch": 1.12, + "grad_norm": 8.966931713999848, + "learning_rate": 
4.279075356667506e-06, + "loss": 0.7698, + "step": 7849 + }, + { + "epoch": 1.12, + "grad_norm": 8.157852051632162, + "learning_rate": 4.277932699509711e-06, + "loss": 0.7285, + "step": 7850 + }, + { + "epoch": 1.12, + "grad_norm": 8.02205664277105, + "learning_rate": 4.276790080864977e-06, + "loss": 0.8232, + "step": 7851 + }, + { + "epoch": 1.12, + "grad_norm": 9.369853618713206, + "learning_rate": 4.275647500794248e-06, + "loss": 0.8013, + "step": 7852 + }, + { + "epoch": 1.12, + "grad_norm": 8.624262613685188, + "learning_rate": 4.274504959358463e-06, + "loss": 0.7303, + "step": 7853 + }, + { + "epoch": 1.12, + "grad_norm": 7.142440894899141, + "learning_rate": 4.273362456618566e-06, + "loss": 0.7221, + "step": 7854 + }, + { + "epoch": 1.12, + "grad_norm": 5.545300557691877, + "learning_rate": 4.2722199926354924e-06, + "loss": 0.7657, + "step": 7855 + }, + { + "epoch": 1.12, + "grad_norm": 12.126725580189852, + "learning_rate": 4.271077567470176e-06, + "loss": 0.7742, + "step": 7856 + }, + { + "epoch": 1.12, + "grad_norm": 7.984916390696273, + "learning_rate": 4.269935181183558e-06, + "loss": 0.8408, + "step": 7857 + }, + { + "epoch": 1.12, + "grad_norm": 7.834652986996509, + "learning_rate": 4.268792833836562e-06, + "loss": 0.7903, + "step": 7858 + }, + { + "epoch": 1.12, + "grad_norm": 7.567952438673904, + "learning_rate": 4.267650525490123e-06, + "loss": 0.7759, + "step": 7859 + }, + { + "epoch": 1.12, + "grad_norm": 9.62548314663186, + "learning_rate": 4.266508256205164e-06, + "loss": 0.7828, + "step": 7860 + }, + { + "epoch": 1.12, + "grad_norm": 6.559165063300317, + "learning_rate": 4.2653660260426134e-06, + "loss": 0.8218, + "step": 7861 + }, + { + "epoch": 1.12, + "grad_norm": 8.361631795599484, + "learning_rate": 4.264223835063396e-06, + "loss": 0.7017, + "step": 7862 + }, + { + "epoch": 1.12, + "grad_norm": 8.290197876168069, + "learning_rate": 4.263081683328429e-06, + "loss": 0.6957, + "step": 7863 + }, + { + "epoch": 1.12, + "grad_norm": 
10.639182908403772, + "learning_rate": 4.261939570898636e-06, + "loss": 0.6859, + "step": 7864 + }, + { + "epoch": 1.12, + "grad_norm": 7.756554381209552, + "learning_rate": 4.26079749783493e-06, + "loss": 0.7988, + "step": 7865 + }, + { + "epoch": 1.12, + "grad_norm": 7.994082207698434, + "learning_rate": 4.259655464198227e-06, + "loss": 0.7681, + "step": 7866 + }, + { + "epoch": 1.12, + "grad_norm": 8.293962721775214, + "learning_rate": 4.258513470049444e-06, + "loss": 0.7542, + "step": 7867 + }, + { + "epoch": 1.12, + "grad_norm": 8.237730825739643, + "learning_rate": 4.257371515449487e-06, + "loss": 0.8476, + "step": 7868 + }, + { + "epoch": 1.12, + "grad_norm": 5.769958941642787, + "learning_rate": 4.256229600459267e-06, + "loss": 0.7771, + "step": 7869 + }, + { + "epoch": 1.12, + "grad_norm": 7.507825982755467, + "learning_rate": 4.255087725139688e-06, + "loss": 0.7684, + "step": 7870 + }, + { + "epoch": 1.12, + "grad_norm": 5.516883763818306, + "learning_rate": 4.2539458895516564e-06, + "loss": 0.7562, + "step": 7871 + }, + { + "epoch": 1.12, + "grad_norm": 7.700770051504516, + "learning_rate": 4.2528040937560765e-06, + "loss": 0.6509, + "step": 7872 + }, + { + "epoch": 1.12, + "grad_norm": 7.795269422794475, + "learning_rate": 4.251662337813844e-06, + "loss": 0.7907, + "step": 7873 + }, + { + "epoch": 1.12, + "grad_norm": 8.894746810334327, + "learning_rate": 4.2505206217858614e-06, + "loss": 0.7493, + "step": 7874 + }, + { + "epoch": 1.12, + "grad_norm": 8.858111894039403, + "learning_rate": 4.249378945733021e-06, + "loss": 0.765, + "step": 7875 + }, + { + "epoch": 1.12, + "grad_norm": 5.9814928420669675, + "learning_rate": 4.248237309716218e-06, + "loss": 0.7845, + "step": 7876 + }, + { + "epoch": 1.12, + "grad_norm": 7.7931824465871316, + "learning_rate": 4.247095713796346e-06, + "loss": 0.7753, + "step": 7877 + }, + { + "epoch": 1.12, + "grad_norm": 10.14840888924265, + "learning_rate": 4.245954158034293e-06, + "loss": 0.7467, + "step": 7878 + }, + { + 
"epoch": 1.12, + "grad_norm": 7.9467955072769625, + "learning_rate": 4.2448126424909466e-06, + "loss": 0.7438, + "step": 7879 + }, + { + "epoch": 1.12, + "grad_norm": 9.171594807345821, + "learning_rate": 4.243671167227189e-06, + "loss": 0.7806, + "step": 7880 + }, + { + "epoch": 1.12, + "grad_norm": 10.69670962113383, + "learning_rate": 4.242529732303908e-06, + "loss": 0.7275, + "step": 7881 + }, + { + "epoch": 1.12, + "grad_norm": 10.162854329601887, + "learning_rate": 4.2413883377819835e-06, + "loss": 0.8374, + "step": 7882 + }, + { + "epoch": 1.12, + "grad_norm": 9.608898560321972, + "learning_rate": 4.240246983722292e-06, + "loss": 0.6967, + "step": 7883 + }, + { + "epoch": 1.12, + "grad_norm": 7.924372962419822, + "learning_rate": 4.239105670185714e-06, + "loss": 0.8154, + "step": 7884 + }, + { + "epoch": 1.12, + "grad_norm": 6.323987507684559, + "learning_rate": 4.237964397233118e-06, + "loss": 0.7212, + "step": 7885 + }, + { + "epoch": 1.12, + "grad_norm": 8.055123247010966, + "learning_rate": 4.236823164925381e-06, + "loss": 0.8357, + "step": 7886 + }, + { + "epoch": 1.12, + "grad_norm": 10.409009452357033, + "learning_rate": 4.235681973323374e-06, + "loss": 0.7847, + "step": 7887 + }, + { + "epoch": 1.12, + "grad_norm": 8.309103484376003, + "learning_rate": 4.23454082248796e-06, + "loss": 0.6881, + "step": 7888 + }, + { + "epoch": 1.13, + "grad_norm": 11.726185207389703, + "learning_rate": 4.233399712480011e-06, + "loss": 0.7444, + "step": 7889 + }, + { + "epoch": 1.13, + "grad_norm": 5.903766434092012, + "learning_rate": 4.2322586433603855e-06, + "loss": 0.6776, + "step": 7890 + }, + { + "epoch": 1.13, + "grad_norm": 9.496459073301365, + "learning_rate": 4.231117615189946e-06, + "loss": 0.7839, + "step": 7891 + }, + { + "epoch": 1.13, + "grad_norm": 6.544475161638433, + "learning_rate": 4.229976628029556e-06, + "loss": 0.8289, + "step": 7892 + }, + { + "epoch": 1.13, + "grad_norm": 10.123397328777493, + "learning_rate": 4.228835681940067e-06, + "loss": 
0.7358, + "step": 7893 + }, + { + "epoch": 1.13, + "grad_norm": 7.576331340854543, + "learning_rate": 4.227694776982335e-06, + "loss": 0.78, + "step": 7894 + }, + { + "epoch": 1.13, + "grad_norm": 7.368636027068057, + "learning_rate": 4.226553913217217e-06, + "loss": 0.6697, + "step": 7895 + }, + { + "epoch": 1.13, + "grad_norm": 7.486397120697179, + "learning_rate": 4.225413090705558e-06, + "loss": 0.7408, + "step": 7896 + }, + { + "epoch": 1.13, + "grad_norm": 5.826043868163263, + "learning_rate": 4.224272309508211e-06, + "loss": 0.7708, + "step": 7897 + }, + { + "epoch": 1.13, + "grad_norm": 6.15158691617901, + "learning_rate": 4.223131569686017e-06, + "loss": 0.7326, + "step": 7898 + }, + { + "epoch": 1.13, + "grad_norm": 7.24228672816997, + "learning_rate": 4.221990871299823e-06, + "loss": 0.733, + "step": 7899 + }, + { + "epoch": 1.13, + "grad_norm": 8.801597156207027, + "learning_rate": 4.220850214410473e-06, + "loss": 0.7517, + "step": 7900 + }, + { + "epoch": 1.13, + "grad_norm": 9.469468457686006, + "learning_rate": 4.2197095990788005e-06, + "loss": 0.8048, + "step": 7901 + }, + { + "epoch": 1.13, + "grad_norm": 10.253990766602708, + "learning_rate": 4.218569025365648e-06, + "loss": 0.745, + "step": 7902 + }, + { + "epoch": 1.13, + "grad_norm": 7.388218188849292, + "learning_rate": 4.217428493331848e-06, + "loss": 0.7822, + "step": 7903 + }, + { + "epoch": 1.13, + "grad_norm": 7.44771709443514, + "learning_rate": 4.216288003038234e-06, + "loss": 0.7133, + "step": 7904 + }, + { + "epoch": 1.13, + "grad_norm": 8.36628151473727, + "learning_rate": 4.2151475545456375e-06, + "loss": 0.7902, + "step": 7905 + }, + { + "epoch": 1.13, + "grad_norm": 13.703892788337884, + "learning_rate": 4.214007147914884e-06, + "loss": 0.7022, + "step": 7906 + }, + { + "epoch": 1.13, + "grad_norm": 7.332759303919909, + "learning_rate": 4.212866783206803e-06, + "loss": 0.7419, + "step": 7907 + }, + { + "epoch": 1.13, + "grad_norm": 9.024915220595021, + "learning_rate": 
4.211726460482215e-06, + "loss": 0.7924, + "step": 7908 + }, + { + "epoch": 1.13, + "grad_norm": 8.457192710538004, + "learning_rate": 4.2105861798019445e-06, + "loss": 0.7664, + "step": 7909 + }, + { + "epoch": 1.13, + "grad_norm": 9.024692516082366, + "learning_rate": 4.20944594122681e-06, + "loss": 0.7611, + "step": 7910 + }, + { + "epoch": 1.13, + "grad_norm": 9.150968217183639, + "learning_rate": 4.208305744817627e-06, + "loss": 0.7659, + "step": 7911 + }, + { + "epoch": 1.13, + "grad_norm": 7.0094521486690216, + "learning_rate": 4.207165590635214e-06, + "loss": 0.7351, + "step": 7912 + }, + { + "epoch": 1.13, + "grad_norm": 8.442606503369012, + "learning_rate": 4.20602547874038e-06, + "loss": 0.7456, + "step": 7913 + }, + { + "epoch": 1.13, + "grad_norm": 7.272530864065261, + "learning_rate": 4.204885409193937e-06, + "loss": 0.7017, + "step": 7914 + }, + { + "epoch": 1.13, + "grad_norm": 8.548985313517809, + "learning_rate": 4.203745382056694e-06, + "loss": 0.8088, + "step": 7915 + }, + { + "epoch": 1.13, + "grad_norm": 7.634267229443137, + "learning_rate": 4.202605397389456e-06, + "loss": 0.7537, + "step": 7916 + }, + { + "epoch": 1.13, + "grad_norm": 12.96730675354199, + "learning_rate": 4.201465455253027e-06, + "loss": 0.736, + "step": 7917 + }, + { + "epoch": 1.13, + "grad_norm": 9.27077616291066, + "learning_rate": 4.200325555708205e-06, + "loss": 0.8041, + "step": 7918 + }, + { + "epoch": 1.13, + "grad_norm": 7.02951210779518, + "learning_rate": 4.1991856988157935e-06, + "loss": 0.7409, + "step": 7919 + }, + { + "epoch": 1.13, + "grad_norm": 7.17270280853045, + "learning_rate": 4.198045884636589e-06, + "loss": 0.6831, + "step": 7920 + }, + { + "epoch": 1.13, + "grad_norm": 7.001934695294399, + "learning_rate": 4.196906113231382e-06, + "loss": 0.7696, + "step": 7921 + }, + { + "epoch": 1.13, + "grad_norm": 9.065264248358003, + "learning_rate": 4.195766384660971e-06, + "loss": 0.7051, + "step": 7922 + }, + { + "epoch": 1.13, + "grad_norm": 
9.034964764293651, + "learning_rate": 4.194626698986141e-06, + "loss": 0.7697, + "step": 7923 + }, + { + "epoch": 1.13, + "grad_norm": 7.747145806918408, + "learning_rate": 4.19348705626768e-06, + "loss": 0.7384, + "step": 7924 + }, + { + "epoch": 1.13, + "grad_norm": 10.843247758982672, + "learning_rate": 4.192347456566377e-06, + "loss": 0.6776, + "step": 7925 + }, + { + "epoch": 1.13, + "grad_norm": 10.58700337323699, + "learning_rate": 4.191207899943009e-06, + "loss": 0.6856, + "step": 7926 + }, + { + "epoch": 1.13, + "grad_norm": 12.715001448958597, + "learning_rate": 4.190068386458364e-06, + "loss": 0.7669, + "step": 7927 + }, + { + "epoch": 1.13, + "grad_norm": 9.205586287281445, + "learning_rate": 4.188928916173214e-06, + "loss": 0.7085, + "step": 7928 + }, + { + "epoch": 1.13, + "grad_norm": 9.389357266434395, + "learning_rate": 4.187789489148339e-06, + "loss": 0.7288, + "step": 7929 + }, + { + "epoch": 1.13, + "grad_norm": 7.429309418729395, + "learning_rate": 4.186650105444512e-06, + "loss": 0.7707, + "step": 7930 + }, + { + "epoch": 1.13, + "grad_norm": 7.2850584631100475, + "learning_rate": 4.185510765122504e-06, + "loss": 0.7591, + "step": 7931 + }, + { + "epoch": 1.13, + "grad_norm": 8.487262077862946, + "learning_rate": 4.184371468243086e-06, + "loss": 0.775, + "step": 7932 + }, + { + "epoch": 1.13, + "grad_norm": 9.808249979124412, + "learning_rate": 4.1832322148670214e-06, + "loss": 0.763, + "step": 7933 + }, + { + "epoch": 1.13, + "grad_norm": 7.372371074386948, + "learning_rate": 4.1820930050550776e-06, + "loss": 0.7749, + "step": 7934 + }, + { + "epoch": 1.13, + "grad_norm": 9.220352195754959, + "learning_rate": 4.180953838868017e-06, + "loss": 0.6608, + "step": 7935 + }, + { + "epoch": 1.13, + "grad_norm": 10.393066814031453, + "learning_rate": 4.179814716366598e-06, + "loss": 0.7284, + "step": 7936 + }, + { + "epoch": 1.13, + "grad_norm": 7.358071941966468, + "learning_rate": 4.17867563761158e-06, + "loss": 0.7845, + "step": 7937 + }, + { + 
"epoch": 1.13, + "grad_norm": 9.36866453258283, + "learning_rate": 4.177536602663717e-06, + "loss": 0.7197, + "step": 7938 + }, + { + "epoch": 1.13, + "grad_norm": 8.658474272051635, + "learning_rate": 4.176397611583761e-06, + "loss": 0.7931, + "step": 7939 + }, + { + "epoch": 1.13, + "grad_norm": 8.268205021168402, + "learning_rate": 4.175258664432466e-06, + "loss": 0.6877, + "step": 7940 + }, + { + "epoch": 1.13, + "grad_norm": 7.056011423838092, + "learning_rate": 4.174119761270577e-06, + "loss": 0.7358, + "step": 7941 + }, + { + "epoch": 1.13, + "grad_norm": 7.472876987790148, + "learning_rate": 4.17298090215884e-06, + "loss": 0.8022, + "step": 7942 + }, + { + "epoch": 1.13, + "grad_norm": 6.801182400135178, + "learning_rate": 4.171842087158002e-06, + "loss": 0.7613, + "step": 7943 + }, + { + "epoch": 1.13, + "grad_norm": 9.130828803477897, + "learning_rate": 4.1707033163288e-06, + "loss": 0.724, + "step": 7944 + }, + { + "epoch": 1.13, + "grad_norm": 10.41536165287554, + "learning_rate": 4.169564589731977e-06, + "loss": 0.7355, + "step": 7945 + }, + { + "epoch": 1.13, + "grad_norm": 6.459448264743364, + "learning_rate": 4.168425907428264e-06, + "loss": 0.7321, + "step": 7946 + }, + { + "epoch": 1.13, + "grad_norm": 9.624575764200964, + "learning_rate": 4.1672872694784e-06, + "loss": 0.7506, + "step": 7947 + }, + { + "epoch": 1.13, + "grad_norm": 8.235687577893678, + "learning_rate": 4.1661486759431164e-06, + "loss": 0.7886, + "step": 7948 + }, + { + "epoch": 1.13, + "grad_norm": 8.052152042275976, + "learning_rate": 4.165010126883139e-06, + "loss": 0.7217, + "step": 7949 + }, + { + "epoch": 1.13, + "grad_norm": 9.551959017617497, + "learning_rate": 4.163871622359199e-06, + "loss": 0.8429, + "step": 7950 + }, + { + "epoch": 1.13, + "grad_norm": 8.181211052495826, + "learning_rate": 4.16273316243202e-06, + "loss": 0.725, + "step": 7951 + }, + { + "epoch": 1.13, + "grad_norm": 7.984472154272746, + "learning_rate": 4.16159474716232e-06, + "loss": 0.7371, + "step": 
7952 + }, + { + "epoch": 1.13, + "grad_norm": 12.001263287858727, + "learning_rate": 4.160456376610826e-06, + "loss": 0.7296, + "step": 7953 + }, + { + "epoch": 1.13, + "grad_norm": 7.110622377746712, + "learning_rate": 4.159318050838251e-06, + "loss": 0.7628, + "step": 7954 + }, + { + "epoch": 1.13, + "grad_norm": 7.851144731406501, + "learning_rate": 4.158179769905311e-06, + "loss": 0.7458, + "step": 7955 + }, + { + "epoch": 1.13, + "grad_norm": 11.1068083057623, + "learning_rate": 4.157041533872718e-06, + "loss": 0.7771, + "step": 7956 + }, + { + "epoch": 1.13, + "grad_norm": 7.901418843341832, + "learning_rate": 4.155903342801184e-06, + "loss": 0.7062, + "step": 7957 + }, + { + "epoch": 1.13, + "grad_norm": 10.773869221982633, + "learning_rate": 4.154765196751417e-06, + "loss": 0.7627, + "step": 7958 + }, + { + "epoch": 1.14, + "grad_norm": 9.227402493105572, + "learning_rate": 4.15362709578412e-06, + "loss": 0.6898, + "step": 7959 + }, + { + "epoch": 1.14, + "grad_norm": 9.662792640825476, + "learning_rate": 4.152489039959999e-06, + "loss": 0.7691, + "step": 7960 + }, + { + "epoch": 1.14, + "grad_norm": 7.478842021224669, + "learning_rate": 4.151351029339753e-06, + "loss": 0.7295, + "step": 7961 + }, + { + "epoch": 1.14, + "grad_norm": 6.9969277828245335, + "learning_rate": 4.1502130639840805e-06, + "loss": 0.7975, + "step": 7962 + }, + { + "epoch": 1.14, + "grad_norm": 10.418017580024205, + "learning_rate": 4.149075143953679e-06, + "loss": 0.7087, + "step": 7963 + }, + { + "epoch": 1.14, + "grad_norm": 9.522646612734341, + "learning_rate": 4.14793726930924e-06, + "loss": 0.7492, + "step": 7964 + }, + { + "epoch": 1.14, + "grad_norm": 7.453733563288234, + "learning_rate": 4.146799440111456e-06, + "loss": 0.7003, + "step": 7965 + }, + { + "epoch": 1.14, + "grad_norm": 8.7479704903694, + "learning_rate": 4.145661656421013e-06, + "loss": 0.7459, + "step": 7966 + }, + { + "epoch": 1.14, + "grad_norm": 9.047813822188413, + "learning_rate": 4.144523918298601e-06, + 
"loss": 0.7857, + "step": 7967 + }, + { + "epoch": 1.14, + "grad_norm": 8.128556378042557, + "learning_rate": 4.143386225804903e-06, + "loss": 0.7905, + "step": 7968 + }, + { + "epoch": 1.14, + "grad_norm": 8.480157389387434, + "learning_rate": 4.1422485790005955e-06, + "loss": 0.8114, + "step": 7969 + }, + { + "epoch": 1.14, + "grad_norm": 11.497540953141042, + "learning_rate": 4.141110977946366e-06, + "loss": 0.7846, + "step": 7970 + }, + { + "epoch": 1.14, + "grad_norm": 8.064251114078763, + "learning_rate": 4.139973422702882e-06, + "loss": 0.7904, + "step": 7971 + }, + { + "epoch": 1.14, + "grad_norm": 9.890217885412008, + "learning_rate": 4.138835913330823e-06, + "loss": 0.7669, + "step": 7972 + }, + { + "epoch": 1.14, + "grad_norm": 8.052944112657675, + "learning_rate": 4.137698449890861e-06, + "loss": 0.7082, + "step": 7973 + }, + { + "epoch": 1.14, + "grad_norm": 10.583051389851736, + "learning_rate": 4.136561032443661e-06, + "loss": 0.7317, + "step": 7974 + }, + { + "epoch": 1.14, + "grad_norm": 7.625341812884481, + "learning_rate": 4.135423661049894e-06, + "loss": 0.7733, + "step": 7975 + }, + { + "epoch": 1.14, + "grad_norm": 8.503208877041967, + "learning_rate": 4.134286335770222e-06, + "loss": 0.8138, + "step": 7976 + }, + { + "epoch": 1.14, + "grad_norm": 8.354074003038788, + "learning_rate": 4.133149056665306e-06, + "loss": 0.8238, + "step": 7977 + }, + { + "epoch": 1.14, + "grad_norm": 5.605140727786811, + "learning_rate": 4.1320118237958076e-06, + "loss": 0.7384, + "step": 7978 + }, + { + "epoch": 1.14, + "grad_norm": 5.968235457469932, + "learning_rate": 4.1308746372223825e-06, + "loss": 0.8578, + "step": 7979 + }, + { + "epoch": 1.14, + "grad_norm": 9.359853917834164, + "learning_rate": 4.129737497005684e-06, + "loss": 0.7617, + "step": 7980 + }, + { + "epoch": 1.14, + "grad_norm": 10.4642741573858, + "learning_rate": 4.128600403206366e-06, + "loss": 0.8255, + "step": 7981 + }, + { + "epoch": 1.14, + "grad_norm": 9.895860191050481, + 
"learning_rate": 4.127463355885077e-06, + "loss": 0.815, + "step": 7982 + }, + { + "epoch": 1.14, + "grad_norm": 8.795182526802591, + "learning_rate": 4.126326355102464e-06, + "loss": 0.7805, + "step": 7983 + }, + { + "epoch": 1.14, + "grad_norm": 8.974147259777954, + "learning_rate": 4.125189400919169e-06, + "loss": 0.7271, + "step": 7984 + }, + { + "epoch": 1.14, + "grad_norm": 6.585836640117158, + "learning_rate": 4.124052493395838e-06, + "loss": 0.7641, + "step": 7985 + }, + { + "epoch": 1.14, + "grad_norm": 9.843245029902477, + "learning_rate": 4.12291563259311e-06, + "loss": 0.7984, + "step": 7986 + }, + { + "epoch": 1.14, + "grad_norm": 9.405639612203297, + "learning_rate": 4.121778818571619e-06, + "loss": 0.7402, + "step": 7987 + }, + { + "epoch": 1.14, + "grad_norm": 7.4574992537558416, + "learning_rate": 4.120642051392003e-06, + "loss": 0.7449, + "step": 7988 + }, + { + "epoch": 1.14, + "grad_norm": 11.87970218503001, + "learning_rate": 4.11950533111489e-06, + "loss": 0.73, + "step": 7989 + }, + { + "epoch": 1.14, + "grad_norm": 8.169207172613136, + "learning_rate": 4.1183686578009115e-06, + "loss": 0.7502, + "step": 7990 + }, + { + "epoch": 1.14, + "grad_norm": 12.334139703485533, + "learning_rate": 4.117232031510698e-06, + "loss": 0.765, + "step": 7991 + }, + { + "epoch": 1.14, + "grad_norm": 7.444080867856584, + "learning_rate": 4.116095452304869e-06, + "loss": 0.7527, + "step": 7992 + }, + { + "epoch": 1.14, + "grad_norm": 9.341892583013777, + "learning_rate": 4.1149589202440485e-06, + "loss": 0.7098, + "step": 7993 + }, + { + "epoch": 1.14, + "grad_norm": 12.538869956825318, + "learning_rate": 4.113822435388855e-06, + "loss": 0.7357, + "step": 7994 + }, + { + "epoch": 1.14, + "grad_norm": 7.831522879725489, + "learning_rate": 4.112685997799905e-06, + "loss": 0.7554, + "step": 7995 + }, + { + "epoch": 1.14, + "grad_norm": 8.890668433524885, + "learning_rate": 4.111549607537817e-06, + "loss": 0.7058, + "step": 7996 + }, + { + "epoch": 1.14, + 
"grad_norm": 7.6130119757428085, + "learning_rate": 4.1104132646631975e-06, + "loss": 0.7638, + "step": 7997 + }, + { + "epoch": 1.14, + "grad_norm": 8.62572076673396, + "learning_rate": 4.10927696923666e-06, + "loss": 0.7655, + "step": 7998 + }, + { + "epoch": 1.14, + "grad_norm": 7.657551961446681, + "learning_rate": 4.108140721318808e-06, + "loss": 0.8172, + "step": 7999 + }, + { + "epoch": 1.14, + "grad_norm": 8.697237452724654, + "learning_rate": 4.1070045209702476e-06, + "loss": 0.8017, + "step": 8000 + }, + { + "epoch": 1.14, + "grad_norm": 7.560871206155974, + "learning_rate": 4.105868368251582e-06, + "loss": 0.8256, + "step": 8001 + }, + { + "epoch": 1.14, + "grad_norm": 7.434387153915806, + "learning_rate": 4.104732263223408e-06, + "loss": 0.7536, + "step": 8002 + }, + { + "epoch": 1.14, + "grad_norm": 8.708853033276693, + "learning_rate": 4.103596205946323e-06, + "loss": 0.7382, + "step": 8003 + }, + { + "epoch": 1.14, + "grad_norm": 8.753375480220454, + "learning_rate": 4.10246019648092e-06, + "loss": 0.8034, + "step": 8004 + }, + { + "epoch": 1.14, + "grad_norm": 9.98756707748311, + "learning_rate": 4.101324234887793e-06, + "loss": 0.74, + "step": 8005 + }, + { + "epoch": 1.14, + "grad_norm": 8.214984329469539, + "learning_rate": 4.1001883212275304e-06, + "loss": 0.7156, + "step": 8006 + }, + { + "epoch": 1.14, + "grad_norm": 9.525186817361723, + "learning_rate": 4.099052455560716e-06, + "loss": 0.6872, + "step": 8007 + }, + { + "epoch": 1.14, + "grad_norm": 8.913136384875195, + "learning_rate": 4.097916637947939e-06, + "loss": 0.75, + "step": 8008 + }, + { + "epoch": 1.14, + "grad_norm": 8.977822789741424, + "learning_rate": 4.096780868449775e-06, + "loss": 0.7553, + "step": 8009 + }, + { + "epoch": 1.14, + "grad_norm": 9.919114511507876, + "learning_rate": 4.095645147126805e-06, + "loss": 0.7891, + "step": 8010 + }, + { + "epoch": 1.14, + "grad_norm": 8.168905754016373, + "learning_rate": 4.094509474039609e-06, + "loss": 0.7722, + "step": 8011 + }, + 
{ + "epoch": 1.14, + "grad_norm": 10.591371428378391, + "learning_rate": 4.093373849248753e-06, + "loss": 0.7925, + "step": 8012 + }, + { + "epoch": 1.14, + "grad_norm": 6.968508413250735, + "learning_rate": 4.0922382728148155e-06, + "loss": 0.7761, + "step": 8013 + }, + { + "epoch": 1.14, + "grad_norm": 6.39103243254125, + "learning_rate": 4.09110274479836e-06, + "loss": 0.7228, + "step": 8014 + }, + { + "epoch": 1.14, + "grad_norm": 8.99707864154168, + "learning_rate": 4.0899672652599535e-06, + "loss": 0.6984, + "step": 8015 + }, + { + "epoch": 1.14, + "grad_norm": 10.605294483500147, + "learning_rate": 4.088831834260162e-06, + "loss": 0.7375, + "step": 8016 + }, + { + "epoch": 1.14, + "grad_norm": 7.665792110724499, + "learning_rate": 4.087696451859543e-06, + "loss": 0.7999, + "step": 8017 + }, + { + "epoch": 1.14, + "grad_norm": 7.97614931659269, + "learning_rate": 4.086561118118657e-06, + "loss": 0.7609, + "step": 8018 + }, + { + "epoch": 1.14, + "grad_norm": 6.127948103996735, + "learning_rate": 4.085425833098057e-06, + "loss": 0.6924, + "step": 8019 + }, + { + "epoch": 1.14, + "grad_norm": 7.997525546616246, + "learning_rate": 4.084290596858298e-06, + "loss": 0.7237, + "step": 8020 + }, + { + "epoch": 1.14, + "grad_norm": 8.098982416210069, + "learning_rate": 4.083155409459929e-06, + "loss": 0.7943, + "step": 8021 + }, + { + "epoch": 1.14, + "grad_norm": 9.00629293718439, + "learning_rate": 4.082020270963498e-06, + "loss": 0.7703, + "step": 8022 + }, + { + "epoch": 1.14, + "grad_norm": 11.458301155633182, + "learning_rate": 4.080885181429552e-06, + "loss": 0.7417, + "step": 8023 + }, + { + "epoch": 1.14, + "grad_norm": 11.184719808942988, + "learning_rate": 4.079750140918632e-06, + "loss": 0.7731, + "step": 8024 + }, + { + "epoch": 1.14, + "grad_norm": 9.702088884087832, + "learning_rate": 4.078615149491277e-06, + "loss": 0.7141, + "step": 8025 + }, + { + "epoch": 1.14, + "grad_norm": 8.739523433275858, + "learning_rate": 4.0774802072080264e-06, + "loss": 
0.7483, + "step": 8026 + }, + { + "epoch": 1.14, + "grad_norm": 8.271660042371511, + "learning_rate": 4.076345314129412e-06, + "loss": 0.7613, + "step": 8027 + }, + { + "epoch": 1.14, + "grad_norm": 8.250717433463471, + "learning_rate": 4.0752104703159675e-06, + "loss": 0.786, + "step": 8028 + }, + { + "epoch": 1.15, + "grad_norm": 8.984033117112743, + "learning_rate": 4.074075675828224e-06, + "loss": 0.8852, + "step": 8029 + }, + { + "epoch": 1.15, + "grad_norm": 6.09707826397324, + "learning_rate": 4.0729409307267064e-06, + "loss": 0.7821, + "step": 8030 + }, + { + "epoch": 1.15, + "grad_norm": 9.425515147297284, + "learning_rate": 4.07180623507194e-06, + "loss": 0.8523, + "step": 8031 + }, + { + "epoch": 1.15, + "grad_norm": 7.1183999916334395, + "learning_rate": 4.070671588924442e-06, + "loss": 0.7752, + "step": 8032 + }, + { + "epoch": 1.15, + "grad_norm": 10.289301638727018, + "learning_rate": 4.069536992344737e-06, + "loss": 0.7293, + "step": 8033 + }, + { + "epoch": 1.15, + "grad_norm": 9.3478268603436, + "learning_rate": 4.068402445393339e-06, + "loss": 0.7286, + "step": 8034 + }, + { + "epoch": 1.15, + "grad_norm": 8.834218303087578, + "learning_rate": 4.06726794813076e-06, + "loss": 0.7751, + "step": 8035 + }, + { + "epoch": 1.15, + "grad_norm": 9.684181586934178, + "learning_rate": 4.066133500617515e-06, + "loss": 0.7431, + "step": 8036 + }, + { + "epoch": 1.15, + "grad_norm": 8.758161841166027, + "learning_rate": 4.064999102914107e-06, + "loss": 0.7342, + "step": 8037 + }, + { + "epoch": 1.15, + "grad_norm": 8.74616987106939, + "learning_rate": 4.063864755081044e-06, + "loss": 0.7256, + "step": 8038 + }, + { + "epoch": 1.15, + "grad_norm": 8.989456184968244, + "learning_rate": 4.0627304571788314e-06, + "loss": 0.7655, + "step": 8039 + }, + { + "epoch": 1.15, + "grad_norm": 6.828470648875749, + "learning_rate": 4.061596209267966e-06, + "loss": 0.7405, + "step": 8040 + }, + { + "epoch": 1.15, + "grad_norm": 7.286789586164389, + "learning_rate": 
4.060462011408949e-06, + "loss": 0.773, + "step": 8041 + }, + { + "epoch": 1.15, + "grad_norm": 13.642771808862866, + "learning_rate": 4.05932786366227e-06, + "loss": 0.7424, + "step": 8042 + }, + { + "epoch": 1.15, + "grad_norm": 6.901218421626648, + "learning_rate": 4.058193766088425e-06, + "loss": 0.7239, + "step": 8043 + }, + { + "epoch": 1.15, + "grad_norm": 8.809460868389255, + "learning_rate": 4.057059718747904e-06, + "loss": 0.7504, + "step": 8044 + }, + { + "epoch": 1.15, + "grad_norm": 8.608046280943704, + "learning_rate": 4.055925721701192e-06, + "loss": 0.7807, + "step": 8045 + }, + { + "epoch": 1.15, + "grad_norm": 8.67953953829587, + "learning_rate": 4.054791775008775e-06, + "loss": 0.7567, + "step": 8046 + }, + { + "epoch": 1.15, + "grad_norm": 10.212681981631452, + "learning_rate": 4.053657878731133e-06, + "loss": 0.7751, + "step": 8047 + }, + { + "epoch": 1.15, + "grad_norm": 11.449096399917881, + "learning_rate": 4.052524032928745e-06, + "loss": 0.784, + "step": 8048 + }, + { + "epoch": 1.15, + "grad_norm": 13.147684940552006, + "learning_rate": 4.051390237662089e-06, + "loss": 0.7501, + "step": 8049 + }, + { + "epoch": 1.15, + "grad_norm": 10.395549971499273, + "learning_rate": 4.050256492991638e-06, + "loss": 0.7527, + "step": 8050 + }, + { + "epoch": 1.15, + "grad_norm": 8.73640989206266, + "learning_rate": 4.049122798977861e-06, + "loss": 0.7385, + "step": 8051 + }, + { + "epoch": 1.15, + "grad_norm": 7.003806992578856, + "learning_rate": 4.047989155681226e-06, + "loss": 0.7503, + "step": 8052 + }, + { + "epoch": 1.15, + "grad_norm": 7.653945582318363, + "learning_rate": 4.046855563162201e-06, + "loss": 0.7723, + "step": 8053 + }, + { + "epoch": 1.15, + "grad_norm": 8.6360627335241, + "learning_rate": 4.045722021481248e-06, + "loss": 0.7248, + "step": 8054 + }, + { + "epoch": 1.15, + "grad_norm": 9.53401499323021, + "learning_rate": 4.044588530698824e-06, + "loss": 0.7036, + "step": 8055 + }, + { + "epoch": 1.15, + "grad_norm": 
9.938430447977238, + "learning_rate": 4.04345509087539e-06, + "loss": 0.742, + "step": 8056 + }, + { + "epoch": 1.15, + "grad_norm": 7.983706958945469, + "learning_rate": 4.042321702071397e-06, + "loss": 0.7964, + "step": 8057 + }, + { + "epoch": 1.15, + "grad_norm": 6.9401274589522854, + "learning_rate": 4.0411883643473e-06, + "loss": 0.7805, + "step": 8058 + }, + { + "epoch": 1.15, + "grad_norm": 8.31034125057991, + "learning_rate": 4.040055077763547e-06, + "loss": 0.7621, + "step": 8059 + }, + { + "epoch": 1.15, + "grad_norm": 7.235923812962243, + "learning_rate": 4.038921842380583e-06, + "loss": 0.6999, + "step": 8060 + }, + { + "epoch": 1.15, + "grad_norm": 7.68293560377897, + "learning_rate": 4.037788658258854e-06, + "loss": 0.7444, + "step": 8061 + }, + { + "epoch": 1.15, + "grad_norm": 8.091056477030055, + "learning_rate": 4.036655525458799e-06, + "loss": 0.7843, + "step": 8062 + }, + { + "epoch": 1.15, + "grad_norm": 5.866620368175618, + "learning_rate": 4.035522444040856e-06, + "loss": 0.7569, + "step": 8063 + }, + { + "epoch": 1.15, + "grad_norm": 7.767731749431985, + "learning_rate": 4.034389414065463e-06, + "loss": 0.7079, + "step": 8064 + }, + { + "epoch": 1.15, + "grad_norm": 7.64115490984475, + "learning_rate": 4.033256435593049e-06, + "loss": 0.7869, + "step": 8065 + }, + { + "epoch": 1.15, + "grad_norm": 7.373650521513021, + "learning_rate": 4.032123508684046e-06, + "loss": 0.7907, + "step": 8066 + }, + { + "epoch": 1.15, + "grad_norm": 8.750202640966037, + "learning_rate": 4.0309906333988825e-06, + "loss": 0.8245, + "step": 8067 + }, + { + "epoch": 1.15, + "grad_norm": 9.56814072606939, + "learning_rate": 4.02985780979798e-06, + "loss": 0.6981, + "step": 8068 + }, + { + "epoch": 1.15, + "grad_norm": 7.369938534071219, + "learning_rate": 4.0287250379417636e-06, + "loss": 0.6425, + "step": 8069 + }, + { + "epoch": 1.15, + "grad_norm": 8.633069617312184, + "learning_rate": 4.027592317890648e-06, + "loss": 0.851, + "step": 8070 + }, + { + "epoch": 
1.15, + "grad_norm": 7.384913662859872, + "learning_rate": 4.0264596497050516e-06, + "loss": 0.7612, + "step": 8071 + }, + { + "epoch": 1.15, + "grad_norm": 8.080673463817176, + "learning_rate": 4.025327033445389e-06, + "loss": 0.7333, + "step": 8072 + }, + { + "epoch": 1.15, + "grad_norm": 7.864450591771992, + "learning_rate": 4.024194469172067e-06, + "loss": 0.8051, + "step": 8073 + }, + { + "epoch": 1.15, + "grad_norm": 9.346382860281041, + "learning_rate": 4.023061956945497e-06, + "loss": 0.7233, + "step": 8074 + }, + { + "epoch": 1.15, + "grad_norm": 7.4159195779761795, + "learning_rate": 4.021929496826084e-06, + "loss": 0.7618, + "step": 8075 + }, + { + "epoch": 1.15, + "grad_norm": 7.547390958515986, + "learning_rate": 4.020797088874227e-06, + "loss": 0.7497, + "step": 8076 + }, + { + "epoch": 1.15, + "grad_norm": 7.395816072514645, + "learning_rate": 4.019664733150329e-06, + "loss": 0.7295, + "step": 8077 + }, + { + "epoch": 1.15, + "grad_norm": 10.822974790005283, + "learning_rate": 4.018532429714786e-06, + "loss": 0.8036, + "step": 8078 + }, + { + "epoch": 1.15, + "grad_norm": 8.777767937392477, + "learning_rate": 4.017400178627991e-06, + "loss": 0.6951, + "step": 8079 + }, + { + "epoch": 1.15, + "grad_norm": 9.403165723515864, + "learning_rate": 4.0162679799503345e-06, + "loss": 0.7977, + "step": 8080 + }, + { + "epoch": 1.15, + "grad_norm": 8.600545267835653, + "learning_rate": 4.015135833742206e-06, + "loss": 0.7045, + "step": 8081 + }, + { + "epoch": 1.15, + "grad_norm": 8.232060864998981, + "learning_rate": 4.014003740063992e-06, + "loss": 0.7246, + "step": 8082 + }, + { + "epoch": 1.15, + "grad_norm": 7.829527844845795, + "learning_rate": 4.012871698976071e-06, + "loss": 0.7053, + "step": 8083 + }, + { + "epoch": 1.15, + "grad_norm": 11.178396675760272, + "learning_rate": 4.011739710538829e-06, + "loss": 0.7358, + "step": 8084 + }, + { + "epoch": 1.15, + "grad_norm": 8.503721914601028, + "learning_rate": 4.010607774812639e-06, + "loss": 0.731, + 
"step": 8085 + }, + { + "epoch": 1.15, + "grad_norm": 9.856149978871608, + "learning_rate": 4.009475891857874e-06, + "loss": 0.7919, + "step": 8086 + }, + { + "epoch": 1.15, + "grad_norm": 7.374714826173947, + "learning_rate": 4.008344061734911e-06, + "loss": 0.8048, + "step": 8087 + }, + { + "epoch": 1.15, + "grad_norm": 10.247721536694096, + "learning_rate": 4.007212284504115e-06, + "loss": 0.7499, + "step": 8088 + }, + { + "epoch": 1.15, + "grad_norm": 7.261233283647043, + "learning_rate": 4.006080560225852e-06, + "loss": 0.7994, + "step": 8089 + }, + { + "epoch": 1.15, + "grad_norm": 9.901105793003282, + "learning_rate": 4.0049488889604836e-06, + "loss": 0.7976, + "step": 8090 + }, + { + "epoch": 1.15, + "grad_norm": 11.288188409557788, + "learning_rate": 4.003817270768373e-06, + "loss": 0.7107, + "step": 8091 + }, + { + "epoch": 1.15, + "grad_norm": 7.560264405575211, + "learning_rate": 4.002685705709877e-06, + "loss": 0.7842, + "step": 8092 + }, + { + "epoch": 1.15, + "grad_norm": 9.108593117711719, + "learning_rate": 4.0015541938453475e-06, + "loss": 0.7504, + "step": 8093 + }, + { + "epoch": 1.15, + "grad_norm": 9.694444070530285, + "learning_rate": 4.000422735235142e-06, + "loss": 0.7678, + "step": 8094 + }, + { + "epoch": 1.15, + "grad_norm": 10.2772465132898, + "learning_rate": 3.999291329939601e-06, + "loss": 0.7251, + "step": 8095 + }, + { + "epoch": 1.15, + "grad_norm": 6.6290246131780535, + "learning_rate": 3.9981599780190764e-06, + "loss": 0.7761, + "step": 8096 + }, + { + "epoch": 1.15, + "grad_norm": 7.1911115094815, + "learning_rate": 3.9970286795339115e-06, + "loss": 0.7762, + "step": 8097 + }, + { + "epoch": 1.15, + "grad_norm": 11.817369868295884, + "learning_rate": 3.995897434544443e-06, + "loss": 0.7057, + "step": 8098 + }, + { + "epoch": 1.16, + "grad_norm": 8.281136142621655, + "learning_rate": 3.994766243111012e-06, + "loss": 0.7362, + "step": 8099 + }, + { + "epoch": 1.16, + "grad_norm": 8.924803547733115, + "learning_rate": 
3.993635105293951e-06, + "loss": 0.7641, + "step": 8100 + }, + { + "epoch": 1.16, + "grad_norm": 7.269271984444937, + "learning_rate": 3.992504021153591e-06, + "loss": 0.8332, + "step": 8101 + }, + { + "epoch": 1.16, + "grad_norm": 7.372154241283868, + "learning_rate": 3.991372990750264e-06, + "loss": 0.7556, + "step": 8102 + }, + { + "epoch": 1.16, + "grad_norm": 9.135207255188428, + "learning_rate": 3.990242014144293e-06, + "loss": 0.7544, + "step": 8103 + }, + { + "epoch": 1.16, + "grad_norm": 8.078592679257435, + "learning_rate": 3.989111091396003e-06, + "loss": 0.743, + "step": 8104 + }, + { + "epoch": 1.16, + "grad_norm": 8.551679807143351, + "learning_rate": 3.987980222565712e-06, + "loss": 0.7345, + "step": 8105 + }, + { + "epoch": 1.16, + "grad_norm": 10.063946166446174, + "learning_rate": 3.98684940771374e-06, + "loss": 0.7566, + "step": 8106 + }, + { + "epoch": 1.16, + "grad_norm": 6.1347435173583165, + "learning_rate": 3.985718646900402e-06, + "loss": 0.8267, + "step": 8107 + }, + { + "epoch": 1.16, + "grad_norm": 7.1054398192752055, + "learning_rate": 3.984587940186006e-06, + "loss": 0.8173, + "step": 8108 + }, + { + "epoch": 1.16, + "grad_norm": 7.726648858165014, + "learning_rate": 3.983457287630862e-06, + "loss": 0.7475, + "step": 8109 + }, + { + "epoch": 1.16, + "grad_norm": 8.157915673546936, + "learning_rate": 3.98232668929528e-06, + "loss": 0.8051, + "step": 8110 + }, + { + "epoch": 1.16, + "grad_norm": 9.466097329952614, + "learning_rate": 3.981196145239556e-06, + "loss": 0.7424, + "step": 8111 + }, + { + "epoch": 1.16, + "grad_norm": 14.04931001533715, + "learning_rate": 3.980065655523997e-06, + "loss": 0.7951, + "step": 8112 + }, + { + "epoch": 1.16, + "grad_norm": 8.308968377504534, + "learning_rate": 3.978935220208896e-06, + "loss": 0.7449, + "step": 8113 + }, + { + "epoch": 1.16, + "grad_norm": 7.624624482491161, + "learning_rate": 3.977804839354546e-06, + "loss": 0.7101, + "step": 8114 + }, + { + "epoch": 1.16, + "grad_norm": 
9.190516199165435, + "learning_rate": 3.976674513021243e-06, + "loss": 0.7206, + "step": 8115 + }, + { + "epoch": 1.16, + "grad_norm": 5.69737233996654, + "learning_rate": 3.975544241269273e-06, + "loss": 0.7227, + "step": 8116 + }, + { + "epoch": 1.16, + "grad_norm": 8.46489903445464, + "learning_rate": 3.974414024158922e-06, + "loss": 0.7523, + "step": 8117 + }, + { + "epoch": 1.16, + "grad_norm": 6.518898117273208, + "learning_rate": 3.973283861750469e-06, + "loss": 0.7885, + "step": 8118 + }, + { + "epoch": 1.16, + "grad_norm": 6.85923676860482, + "learning_rate": 3.972153754104199e-06, + "loss": 0.7137, + "step": 8119 + }, + { + "epoch": 1.16, + "grad_norm": 8.692623676387285, + "learning_rate": 3.971023701280388e-06, + "loss": 0.7645, + "step": 8120 + }, + { + "epoch": 1.16, + "grad_norm": 6.125970986674405, + "learning_rate": 3.969893703339307e-06, + "loss": 0.7639, + "step": 8121 + }, + { + "epoch": 1.16, + "grad_norm": 9.058311217258069, + "learning_rate": 3.96876376034123e-06, + "loss": 0.7576, + "step": 8122 + }, + { + "epoch": 1.16, + "grad_norm": 9.03797673708015, + "learning_rate": 3.967633872346421e-06, + "loss": 0.7481, + "step": 8123 + }, + { + "epoch": 1.16, + "grad_norm": 6.7422599677561035, + "learning_rate": 3.966504039415149e-06, + "loss": 0.6887, + "step": 8124 + }, + { + "epoch": 1.16, + "grad_norm": 9.752728779664709, + "learning_rate": 3.965374261607676e-06, + "loss": 0.8249, + "step": 8125 + }, + { + "epoch": 1.16, + "grad_norm": 12.619642872746363, + "learning_rate": 3.964244538984258e-06, + "loss": 0.7567, + "step": 8126 + }, + { + "epoch": 1.16, + "grad_norm": 6.7336340504175265, + "learning_rate": 3.9631148716051556e-06, + "loss": 0.7196, + "step": 8127 + }, + { + "epoch": 1.16, + "grad_norm": 8.79854077074099, + "learning_rate": 3.9619852595306165e-06, + "loss": 0.7811, + "step": 8128 + }, + { + "epoch": 1.16, + "grad_norm": 8.769558153105889, + "learning_rate": 3.960855702820897e-06, + "loss": 0.7767, + "step": 8129 + }, + { + 
"epoch": 1.16, + "grad_norm": 6.655502432882598, + "learning_rate": 3.959726201536241e-06, + "loss": 0.7523, + "step": 8130 + }, + { + "epoch": 1.16, + "grad_norm": 5.829058702119254, + "learning_rate": 3.958596755736893e-06, + "loss": 0.7538, + "step": 8131 + }, + { + "epoch": 1.16, + "grad_norm": 10.314746726997878, + "learning_rate": 3.957467365483097e-06, + "loss": 0.7518, + "step": 8132 + }, + { + "epoch": 1.16, + "grad_norm": 5.829523387622083, + "learning_rate": 3.956338030835089e-06, + "loss": 0.7846, + "step": 8133 + }, + { + "epoch": 1.16, + "grad_norm": 9.118414698074009, + "learning_rate": 3.9552087518531036e-06, + "loss": 0.7122, + "step": 8134 + }, + { + "epoch": 1.16, + "grad_norm": 9.227743251213813, + "learning_rate": 3.954079528597378e-06, + "loss": 0.7142, + "step": 8135 + }, + { + "epoch": 1.16, + "grad_norm": 8.92623037999293, + "learning_rate": 3.952950361128137e-06, + "loss": 0.7363, + "step": 8136 + }, + { + "epoch": 1.16, + "grad_norm": 9.231977518236635, + "learning_rate": 3.951821249505612e-06, + "loss": 0.7201, + "step": 8137 + }, + { + "epoch": 1.16, + "grad_norm": 8.902447150459722, + "learning_rate": 3.950692193790021e-06, + "loss": 0.7241, + "step": 8138 + }, + { + "epoch": 1.16, + "grad_norm": 8.275041904603174, + "learning_rate": 3.949563194041588e-06, + "loss": 0.7615, + "step": 8139 + }, + { + "epoch": 1.16, + "grad_norm": 9.303290586295379, + "learning_rate": 3.948434250320532e-06, + "loss": 0.739, + "step": 8140 + }, + { + "epoch": 1.16, + "grad_norm": 6.29941657163439, + "learning_rate": 3.947305362687065e-06, + "loss": 0.7263, + "step": 8141 + }, + { + "epoch": 1.16, + "grad_norm": 10.766087903422743, + "learning_rate": 3.946176531201401e-06, + "loss": 0.8226, + "step": 8142 + }, + { + "epoch": 1.16, + "grad_norm": 7.609659563852658, + "learning_rate": 3.9450477559237445e-06, + "loss": 0.7589, + "step": 8143 + }, + { + "epoch": 1.16, + "grad_norm": 7.780823034859427, + "learning_rate": 3.943919036914306e-06, + "loss": 0.8324, 
+ "step": 8144 + }, + { + "epoch": 1.16, + "grad_norm": 7.982363073824229, + "learning_rate": 3.942790374233287e-06, + "loss": 0.7729, + "step": 8145 + }, + { + "epoch": 1.16, + "grad_norm": 8.47211435698665, + "learning_rate": 3.941661767940886e-06, + "loss": 0.8027, + "step": 8146 + }, + { + "epoch": 1.16, + "grad_norm": 10.303402286197246, + "learning_rate": 3.9405332180973e-06, + "loss": 0.7503, + "step": 8147 + }, + { + "epoch": 1.16, + "grad_norm": 9.299652803572846, + "learning_rate": 3.939404724762724e-06, + "loss": 0.7849, + "step": 8148 + }, + { + "epoch": 1.16, + "grad_norm": 7.064216618338495, + "learning_rate": 3.938276287997347e-06, + "loss": 0.7649, + "step": 8149 + }, + { + "epoch": 1.16, + "grad_norm": 8.989344680886754, + "learning_rate": 3.937147907861358e-06, + "loss": 0.8291, + "step": 8150 + }, + { + "epoch": 1.16, + "grad_norm": 7.610042682484435, + "learning_rate": 3.936019584414941e-06, + "loss": 0.7576, + "step": 8151 + }, + { + "epoch": 1.16, + "grad_norm": 10.950461148838063, + "learning_rate": 3.934891317718276e-06, + "loss": 0.7763, + "step": 8152 + }, + { + "epoch": 1.16, + "grad_norm": 6.620741115416676, + "learning_rate": 3.933763107831546e-06, + "loss": 0.8045, + "step": 8153 + }, + { + "epoch": 1.16, + "grad_norm": 9.706294322122552, + "learning_rate": 3.932634954814923e-06, + "loss": 0.724, + "step": 8154 + }, + { + "epoch": 1.16, + "grad_norm": 9.118888875684366, + "learning_rate": 3.931506858728582e-06, + "loss": 0.7848, + "step": 8155 + }, + { + "epoch": 1.16, + "grad_norm": 8.008025984229906, + "learning_rate": 3.930378819632688e-06, + "loss": 0.7835, + "step": 8156 + }, + { + "epoch": 1.16, + "grad_norm": 7.23378014607513, + "learning_rate": 3.9292508375874115e-06, + "loss": 0.7437, + "step": 8157 + }, + { + "epoch": 1.16, + "grad_norm": 7.231657220142668, + "learning_rate": 3.9281229126529165e-06, + "loss": 0.744, + "step": 8158 + }, + { + "epoch": 1.16, + "grad_norm": 9.142821121949167, + "learning_rate": 
3.926995044889359e-06, + "loss": 0.7658, + "step": 8159 + }, + { + "epoch": 1.16, + "grad_norm": 8.37795703623269, + "learning_rate": 3.925867234356902e-06, + "loss": 0.7952, + "step": 8160 + }, + { + "epoch": 1.16, + "grad_norm": 8.953308095060255, + "learning_rate": 3.924739481115696e-06, + "loss": 0.7723, + "step": 8161 + }, + { + "epoch": 1.16, + "grad_norm": 7.295078662982093, + "learning_rate": 3.92361178522589e-06, + "loss": 0.7327, + "step": 8162 + }, + { + "epoch": 1.16, + "grad_norm": 11.12753398847354, + "learning_rate": 3.9224841467476396e-06, + "loss": 0.7736, + "step": 8163 + }, + { + "epoch": 1.16, + "grad_norm": 9.404127491210897, + "learning_rate": 3.921356565741084e-06, + "loss": 0.7658, + "step": 8164 + }, + { + "epoch": 1.16, + "grad_norm": 7.651842917464342, + "learning_rate": 3.920229042266367e-06, + "loss": 0.7836, + "step": 8165 + }, + { + "epoch": 1.16, + "grad_norm": 9.130729133370663, + "learning_rate": 3.9191015763836256e-06, + "loss": 0.7661, + "step": 8166 + }, + { + "epoch": 1.16, + "grad_norm": 10.65332801303595, + "learning_rate": 3.917974168152998e-06, + "loss": 0.7377, + "step": 8167 + }, + { + "epoch": 1.16, + "grad_norm": 6.282309339741953, + "learning_rate": 3.916846817634618e-06, + "loss": 0.7459, + "step": 8168 + }, + { + "epoch": 1.17, + "grad_norm": 8.0743242883668, + "learning_rate": 3.915719524888612e-06, + "loss": 0.7806, + "step": 8169 + }, + { + "epoch": 1.17, + "grad_norm": 6.137004651550019, + "learning_rate": 3.9145922899751095e-06, + "loss": 0.7737, + "step": 8170 + }, + { + "epoch": 1.17, + "grad_norm": 6.827118522740564, + "learning_rate": 3.913465112954233e-06, + "loss": 0.7221, + "step": 8171 + }, + { + "epoch": 1.17, + "grad_norm": 10.387614722717942, + "learning_rate": 3.912337993886101e-06, + "loss": 0.7115, + "step": 8172 + }, + { + "epoch": 1.17, + "grad_norm": 8.471242921945674, + "learning_rate": 3.911210932830836e-06, + "loss": 0.769, + "step": 8173 + }, + { + "epoch": 1.17, + "grad_norm": 
7.162532596861036, + "learning_rate": 3.910083929848548e-06, + "loss": 0.7893, + "step": 8174 + }, + { + "epoch": 1.17, + "grad_norm": 8.668321475838413, + "learning_rate": 3.90895698499935e-06, + "loss": 0.8063, + "step": 8175 + }, + { + "epoch": 1.17, + "grad_norm": 12.252164753492295, + "learning_rate": 3.907830098343347e-06, + "loss": 0.7407, + "step": 8176 + }, + { + "epoch": 1.17, + "grad_norm": 8.625322271535648, + "learning_rate": 3.906703269940648e-06, + "loss": 0.7425, + "step": 8177 + }, + { + "epoch": 1.17, + "grad_norm": 9.93178543396798, + "learning_rate": 3.905576499851354e-06, + "loss": 0.7392, + "step": 8178 + }, + { + "epoch": 1.17, + "grad_norm": 11.814735288141817, + "learning_rate": 3.904449788135562e-06, + "loss": 0.7695, + "step": 8179 + }, + { + "epoch": 1.17, + "grad_norm": 8.446793697609989, + "learning_rate": 3.903323134853372e-06, + "loss": 0.7599, + "step": 8180 + }, + { + "epoch": 1.17, + "grad_norm": 8.261283739242213, + "learning_rate": 3.902196540064869e-06, + "loss": 0.7554, + "step": 8181 + }, + { + "epoch": 1.17, + "grad_norm": 7.901378426941926, + "learning_rate": 3.901070003830148e-06, + "loss": 0.713, + "step": 8182 + }, + { + "epoch": 1.17, + "grad_norm": 7.365324491344955, + "learning_rate": 3.899943526209297e-06, + "loss": 0.7523, + "step": 8183 + }, + { + "epoch": 1.17, + "grad_norm": 9.940031014714215, + "learning_rate": 3.898817107262392e-06, + "loss": 0.7592, + "step": 8184 + }, + { + "epoch": 1.17, + "grad_norm": 8.196987297784977, + "learning_rate": 3.8976907470495205e-06, + "loss": 0.7647, + "step": 8185 + }, + { + "epoch": 1.17, + "grad_norm": 10.016636161015963, + "learning_rate": 3.896564445630755e-06, + "loss": 0.7339, + "step": 8186 + }, + { + "epoch": 1.17, + "grad_norm": 12.743160761725074, + "learning_rate": 3.895438203066169e-06, + "loss": 0.7297, + "step": 8187 + }, + { + "epoch": 1.17, + "grad_norm": 9.386242693408798, + "learning_rate": 3.894312019415837e-06, + "loss": 0.7611, + "step": 8188 + }, + { + 
"epoch": 1.17, + "grad_norm": 6.961166138424856, + "learning_rate": 3.893185894739824e-06, + "loss": 0.774, + "step": 8189 + }, + { + "epoch": 1.17, + "grad_norm": 8.302068534771145, + "learning_rate": 3.892059829098195e-06, + "loss": 0.7288, + "step": 8190 + }, + { + "epoch": 1.17, + "grad_norm": 7.7964721776048345, + "learning_rate": 3.890933822551009e-06, + "loss": 0.7332, + "step": 8191 + }, + { + "epoch": 1.17, + "grad_norm": 6.324483234282235, + "learning_rate": 3.889807875158326e-06, + "loss": 0.6748, + "step": 8192 + }, + { + "epoch": 1.17, + "grad_norm": 9.325823740034936, + "learning_rate": 3.888681986980202e-06, + "loss": 0.7859, + "step": 8193 + }, + { + "epoch": 1.17, + "grad_norm": 7.7851041619308, + "learning_rate": 3.8875561580766855e-06, + "loss": 0.7544, + "step": 8194 + }, + { + "epoch": 1.17, + "grad_norm": 13.507465394941265, + "learning_rate": 3.886430388507828e-06, + "loss": 0.8001, + "step": 8195 + }, + { + "epoch": 1.17, + "grad_norm": 10.680506261358905, + "learning_rate": 3.885304678333675e-06, + "loss": 0.6945, + "step": 8196 + }, + { + "epoch": 1.17, + "grad_norm": 6.287051597317435, + "learning_rate": 3.884179027614265e-06, + "loss": 0.6976, + "step": 8197 + }, + { + "epoch": 1.17, + "grad_norm": 8.304418242999855, + "learning_rate": 3.883053436409643e-06, + "loss": 0.7063, + "step": 8198 + }, + { + "epoch": 1.17, + "grad_norm": 7.860895535105733, + "learning_rate": 3.88192790477984e-06, + "loss": 0.6861, + "step": 8199 + }, + { + "epoch": 1.17, + "grad_norm": 7.697615213049095, + "learning_rate": 3.88080243278489e-06, + "loss": 0.8654, + "step": 8200 + }, + { + "epoch": 1.17, + "grad_norm": 8.239817894198922, + "learning_rate": 3.879677020484824e-06, + "loss": 0.7939, + "step": 8201 + }, + { + "epoch": 1.17, + "grad_norm": 9.162790684314658, + "learning_rate": 3.8785516679396676e-06, + "loss": 0.8181, + "step": 8202 + }, + { + "epoch": 1.17, + "grad_norm": 9.230616052087518, + "learning_rate": 3.877426375209443e-06, + "loss": 0.7672, 
+ "step": 8203 + }, + { + "epoch": 1.17, + "grad_norm": 9.787536735183313, + "learning_rate": 3.87630114235417e-06, + "loss": 0.6895, + "step": 8204 + }, + { + "epoch": 1.17, + "grad_norm": 7.588295168606698, + "learning_rate": 3.8751759694338665e-06, + "loss": 0.7414, + "step": 8205 + }, + { + "epoch": 1.17, + "grad_norm": 10.354763248106991, + "learning_rate": 3.874050856508548e-06, + "loss": 0.7495, + "step": 8206 + }, + { + "epoch": 1.17, + "grad_norm": 7.731948713794327, + "learning_rate": 3.872925803638219e-06, + "loss": 0.7732, + "step": 8207 + }, + { + "epoch": 1.17, + "grad_norm": 10.146984857466327, + "learning_rate": 3.871800810882894e-06, + "loss": 0.7673, + "step": 8208 + }, + { + "epoch": 1.17, + "grad_norm": 8.394193507159907, + "learning_rate": 3.870675878302571e-06, + "loss": 0.7867, + "step": 8209 + }, + { + "epoch": 1.17, + "grad_norm": 10.48515083289146, + "learning_rate": 3.869551005957253e-06, + "loss": 0.7749, + "step": 8210 + }, + { + "epoch": 1.17, + "grad_norm": 8.117618477970451, + "learning_rate": 3.86842619390694e-06, + "loss": 0.802, + "step": 8211 + }, + { + "epoch": 1.17, + "grad_norm": 5.984651680397915, + "learning_rate": 3.867301442211622e-06, + "loss": 0.7225, + "step": 8212 + }, + { + "epoch": 1.17, + "grad_norm": 5.871632159161867, + "learning_rate": 3.866176750931294e-06, + "loss": 0.7092, + "step": 8213 + }, + { + "epoch": 1.17, + "grad_norm": 10.62907744472718, + "learning_rate": 3.86505212012594e-06, + "loss": 0.7673, + "step": 8214 + }, + { + "epoch": 1.17, + "grad_norm": 8.360295783701117, + "learning_rate": 3.863927549855548e-06, + "loss": 0.8114, + "step": 8215 + }, + { + "epoch": 1.17, + "grad_norm": 6.021917937361955, + "learning_rate": 3.862803040180098e-06, + "loss": 0.7001, + "step": 8216 + }, + { + "epoch": 1.17, + "grad_norm": 8.739656416941333, + "learning_rate": 3.861678591159568e-06, + "loss": 0.8193, + "step": 8217 + }, + { + "epoch": 1.17, + "grad_norm": 9.918127270176393, + "learning_rate": 
3.860554202853934e-06, + "loss": 0.7775, + "step": 8218 + }, + { + "epoch": 1.17, + "grad_norm": 9.684839931472457, + "learning_rate": 3.8594298753231675e-06, + "loss": 0.7061, + "step": 8219 + }, + { + "epoch": 1.17, + "grad_norm": 9.584766448886667, + "learning_rate": 3.858305608627235e-06, + "loss": 0.7503, + "step": 8220 + }, + { + "epoch": 1.17, + "grad_norm": 7.211105613933779, + "learning_rate": 3.8571814028261055e-06, + "loss": 0.7428, + "step": 8221 + }, + { + "epoch": 1.17, + "grad_norm": 10.088330100483674, + "learning_rate": 3.856057257979737e-06, + "loss": 0.7104, + "step": 8222 + }, + { + "epoch": 1.17, + "grad_norm": 7.0469376691070975, + "learning_rate": 3.854933174148093e-06, + "loss": 0.7606, + "step": 8223 + }, + { + "epoch": 1.17, + "grad_norm": 10.882494870952375, + "learning_rate": 3.853809151391123e-06, + "loss": 0.7917, + "step": 8224 + }, + { + "epoch": 1.17, + "grad_norm": 11.766151910388698, + "learning_rate": 3.852685189768783e-06, + "loss": 0.7643, + "step": 8225 + }, + { + "epoch": 1.17, + "grad_norm": 6.525509774676447, + "learning_rate": 3.851561289341023e-06, + "loss": 0.747, + "step": 8226 + }, + { + "epoch": 1.17, + "grad_norm": 7.173105167519195, + "learning_rate": 3.850437450167787e-06, + "loss": 0.7717, + "step": 8227 + }, + { + "epoch": 1.17, + "grad_norm": 10.14680948146781, + "learning_rate": 3.849313672309017e-06, + "loss": 0.7812, + "step": 8228 + }, + { + "epoch": 1.17, + "grad_norm": 7.8633245273981265, + "learning_rate": 3.848189955824652e-06, + "loss": 0.7535, + "step": 8229 + }, + { + "epoch": 1.17, + "grad_norm": 6.29425784983349, + "learning_rate": 3.847066300774629e-06, + "loss": 0.7136, + "step": 8230 + }, + { + "epoch": 1.17, + "grad_norm": 10.206213618514678, + "learning_rate": 3.845942707218882e-06, + "loss": 0.7856, + "step": 8231 + }, + { + "epoch": 1.17, + "grad_norm": 7.497889724597089, + "learning_rate": 3.844819175217335e-06, + "loss": 0.7451, + "step": 8232 + }, + { + "epoch": 1.17, + "grad_norm": 
7.554484139760068, + "learning_rate": 3.843695704829921e-06, + "loss": 0.7619, + "step": 8233 + }, + { + "epoch": 1.17, + "grad_norm": 9.703450139707389, + "learning_rate": 3.842572296116558e-06, + "loss": 0.7861, + "step": 8234 + }, + { + "epoch": 1.17, + "grad_norm": 9.432250539634039, + "learning_rate": 3.841448949137167e-06, + "loss": 0.769, + "step": 8235 + }, + { + "epoch": 1.17, + "grad_norm": 9.972425430980131, + "learning_rate": 3.840325663951665e-06, + "loss": 0.7379, + "step": 8236 + }, + { + "epoch": 1.17, + "grad_norm": 10.645288986339295, + "learning_rate": 3.839202440619964e-06, + "loss": 0.7379, + "step": 8237 + }, + { + "epoch": 1.17, + "grad_norm": 7.852519084162917, + "learning_rate": 3.838079279201972e-06, + "loss": 0.7592, + "step": 8238 + }, + { + "epoch": 1.17, + "grad_norm": 10.47256407334003, + "learning_rate": 3.836956179757599e-06, + "loss": 0.7541, + "step": 8239 + }, + { + "epoch": 1.18, + "grad_norm": 7.854860932742812, + "learning_rate": 3.8358331423467456e-06, + "loss": 0.7736, + "step": 8240 + }, + { + "epoch": 1.18, + "grad_norm": 10.767382270340445, + "learning_rate": 3.834710167029313e-06, + "loss": 0.7668, + "step": 8241 + }, + { + "epoch": 1.18, + "grad_norm": 11.754877212048818, + "learning_rate": 3.8335872538651944e-06, + "loss": 0.8495, + "step": 8242 + }, + { + "epoch": 1.18, + "grad_norm": 10.564194260641385, + "learning_rate": 3.832464402914285e-06, + "loss": 0.6942, + "step": 8243 + }, + { + "epoch": 1.18, + "grad_norm": 9.72502274969685, + "learning_rate": 3.831341614236477e-06, + "loss": 0.7682, + "step": 8244 + }, + { + "epoch": 1.18, + "grad_norm": 8.693037268418108, + "learning_rate": 3.830218887891651e-06, + "loss": 0.8299, + "step": 8245 + }, + { + "epoch": 1.18, + "grad_norm": 9.580113477154395, + "learning_rate": 3.8290962239396965e-06, + "loss": 0.6755, + "step": 8246 + }, + { + "epoch": 1.18, + "grad_norm": 6.296623822718163, + "learning_rate": 3.827973622440488e-06, + "loss": 0.751, + "step": 8247 + }, + { + 
"epoch": 1.18, + "grad_norm": 10.677793421815027, + "learning_rate": 3.826851083453904e-06, + "loss": 0.798, + "step": 8248 + }, + { + "epoch": 1.18, + "grad_norm": 9.734639362972729, + "learning_rate": 3.82572860703982e-06, + "loss": 0.7991, + "step": 8249 + }, + { + "epoch": 1.18, + "grad_norm": 9.54335936492447, + "learning_rate": 3.824606193258102e-06, + "loss": 0.7134, + "step": 8250 + }, + { + "epoch": 1.18, + "grad_norm": 8.393991061282524, + "learning_rate": 3.823483842168618e-06, + "loss": 0.7201, + "step": 8251 + }, + { + "epoch": 1.18, + "grad_norm": 10.117080545717327, + "learning_rate": 3.8223615538312305e-06, + "loss": 0.7936, + "step": 8252 + }, + { + "epoch": 1.18, + "grad_norm": 11.836547561358906, + "learning_rate": 3.8212393283058e-06, + "loss": 0.6992, + "step": 8253 + }, + { + "epoch": 1.18, + "grad_norm": 10.279531324297318, + "learning_rate": 3.820117165652184e-06, + "loss": 0.8251, + "step": 8254 + }, + { + "epoch": 1.18, + "grad_norm": 7.5489536752182484, + "learning_rate": 3.8189950659302315e-06, + "loss": 0.7592, + "step": 8255 + }, + { + "epoch": 1.18, + "grad_norm": 10.466062246019094, + "learning_rate": 3.817873029199797e-06, + "loss": 0.6897, + "step": 8256 + }, + { + "epoch": 1.18, + "grad_norm": 9.639582821602206, + "learning_rate": 3.816751055520724e-06, + "loss": 0.7999, + "step": 8257 + }, + { + "epoch": 1.18, + "grad_norm": 11.417836226250893, + "learning_rate": 3.8156291449528555e-06, + "loss": 0.7377, + "step": 8258 + }, + { + "epoch": 1.18, + "grad_norm": 7.372763132523433, + "learning_rate": 3.8145072975560333e-06, + "loss": 0.8198, + "step": 8259 + }, + { + "epoch": 1.18, + "grad_norm": 7.4131686514429225, + "learning_rate": 3.8133855133900905e-06, + "loss": 0.7411, + "step": 8260 + }, + { + "epoch": 1.18, + "grad_norm": 6.734874365163843, + "learning_rate": 3.8122637925148633e-06, + "loss": 0.7068, + "step": 8261 + }, + { + "epoch": 1.18, + "grad_norm": 8.207911924937312, + "learning_rate": 3.811142134990177e-06, + "loss": 
0.7246, + "step": 8262 + }, + { + "epoch": 1.18, + "grad_norm": 7.156125904884193, + "learning_rate": 3.8100205408758608e-06, + "loss": 0.7754, + "step": 8263 + }, + { + "epoch": 1.18, + "grad_norm": 5.93773271197172, + "learning_rate": 3.8088990102317373e-06, + "loss": 0.8653, + "step": 8264 + }, + { + "epoch": 1.18, + "grad_norm": 7.231592036678661, + "learning_rate": 3.8077775431176234e-06, + "loss": 0.7523, + "step": 8265 + }, + { + "epoch": 1.18, + "grad_norm": 9.84433089687121, + "learning_rate": 3.80665613959334e-06, + "loss": 0.7858, + "step": 8266 + }, + { + "epoch": 1.18, + "grad_norm": 7.929401256347232, + "learning_rate": 3.805534799718694e-06, + "loss": 0.7617, + "step": 8267 + }, + { + "epoch": 1.18, + "grad_norm": 11.17435584913018, + "learning_rate": 3.8044135235534958e-06, + "loss": 0.7262, + "step": 8268 + }, + { + "epoch": 1.18, + "grad_norm": 9.231273612129137, + "learning_rate": 3.803292311157556e-06, + "loss": 0.7636, + "step": 8269 + }, + { + "epoch": 1.18, + "grad_norm": 9.991546761383976, + "learning_rate": 3.80217116259067e-06, + "loss": 0.7319, + "step": 8270 + }, + { + "epoch": 1.18, + "grad_norm": 11.858673315105767, + "learning_rate": 3.801050077912642e-06, + "loss": 0.752, + "step": 8271 + }, + { + "epoch": 1.18, + "grad_norm": 8.083042973431759, + "learning_rate": 3.7999290571832646e-06, + "loss": 0.7715, + "step": 8272 + }, + { + "epoch": 1.18, + "grad_norm": 8.764783407901437, + "learning_rate": 3.79880810046233e-06, + "loss": 0.7678, + "step": 8273 + }, + { + "epoch": 1.18, + "grad_norm": 7.839116960345164, + "learning_rate": 3.7976872078096296e-06, + "loss": 0.6824, + "step": 8274 + }, + { + "epoch": 1.18, + "grad_norm": 12.19531333532413, + "learning_rate": 3.796566379284946e-06, + "loss": 0.7893, + "step": 8275 + }, + { + "epoch": 1.18, + "grad_norm": 10.052090707765391, + "learning_rate": 3.795445614948063e-06, + "loss": 0.7932, + "step": 8276 + }, + { + "epoch": 1.18, + "grad_norm": 7.069869375132423, + "learning_rate": 
3.7943249148587557e-06, + "loss": 0.7596, + "step": 8277 + }, + { + "epoch": 1.18, + "grad_norm": 8.532317247500435, + "learning_rate": 3.7932042790768025e-06, + "loss": 0.789, + "step": 8278 + }, + { + "epoch": 1.18, + "grad_norm": 6.481150348517012, + "learning_rate": 3.7920837076619753e-06, + "loss": 0.7244, + "step": 8279 + }, + { + "epoch": 1.18, + "grad_norm": 6.80497531982404, + "learning_rate": 3.790963200674039e-06, + "loss": 0.7751, + "step": 8280 + }, + { + "epoch": 1.18, + "grad_norm": 10.338321663443725, + "learning_rate": 3.7898427581727604e-06, + "loss": 0.7711, + "step": 8281 + }, + { + "epoch": 1.18, + "grad_norm": 6.0196734681354185, + "learning_rate": 3.788722380217902e-06, + "loss": 0.7484, + "step": 8282 + }, + { + "epoch": 1.18, + "grad_norm": 8.799450829060598, + "learning_rate": 3.787602066869218e-06, + "loss": 0.6964, + "step": 8283 + }, + { + "epoch": 1.18, + "grad_norm": 10.855331562496113, + "learning_rate": 3.7864818181864675e-06, + "loss": 0.7033, + "step": 8284 + }, + { + "epoch": 1.18, + "grad_norm": 11.098207878971689, + "learning_rate": 3.7853616342293974e-06, + "loss": 0.7062, + "step": 8285 + }, + { + "epoch": 1.18, + "grad_norm": 8.055125402532756, + "learning_rate": 3.7842415150577563e-06, + "loss": 0.7979, + "step": 8286 + }, + { + "epoch": 1.18, + "grad_norm": 9.068881707020227, + "learning_rate": 3.7831214607312903e-06, + "loss": 0.8227, + "step": 8287 + }, + { + "epoch": 1.18, + "grad_norm": 7.8498461958901755, + "learning_rate": 3.7820014713097382e-06, + "loss": 0.7018, + "step": 8288 + }, + { + "epoch": 1.18, + "grad_norm": 7.442155809597508, + "learning_rate": 3.780881546852837e-06, + "loss": 0.6835, + "step": 8289 + }, + { + "epoch": 1.18, + "grad_norm": 6.729080935468035, + "learning_rate": 3.7797616874203196e-06, + "loss": 0.7692, + "step": 8290 + }, + { + "epoch": 1.18, + "grad_norm": 9.009073626106138, + "learning_rate": 3.778641893071918e-06, + "loss": 0.8326, + "step": 8291 + }, + { + "epoch": 1.18, + "grad_norm": 
8.229210974679424, + "learning_rate": 3.777522163867359e-06, + "loss": 0.6592, + "step": 8292 + }, + { + "epoch": 1.18, + "grad_norm": 8.492782303358876, + "learning_rate": 3.776402499866363e-06, + "loss": 0.7716, + "step": 8293 + }, + { + "epoch": 1.18, + "grad_norm": 10.129198476646028, + "learning_rate": 3.775282901128654e-06, + "loss": 0.8365, + "step": 8294 + }, + { + "epoch": 1.18, + "grad_norm": 10.09934831006522, + "learning_rate": 3.774163367713945e-06, + "loss": 0.8216, + "step": 8295 + }, + { + "epoch": 1.18, + "grad_norm": 9.134176455390376, + "learning_rate": 3.7730438996819486e-06, + "loss": 0.7255, + "step": 8296 + }, + { + "epoch": 1.18, + "grad_norm": 9.477052681424635, + "learning_rate": 3.7719244970923775e-06, + "loss": 0.7878, + "step": 8297 + }, + { + "epoch": 1.18, + "grad_norm": 8.371846253806602, + "learning_rate": 3.7708051600049344e-06, + "loss": 0.7781, + "step": 8298 + }, + { + "epoch": 1.18, + "grad_norm": 6.905682725345172, + "learning_rate": 3.769685888479323e-06, + "loss": 0.7296, + "step": 8299 + }, + { + "epoch": 1.18, + "grad_norm": 12.709350786006214, + "learning_rate": 3.7685666825752403e-06, + "loss": 0.8094, + "step": 8300 + }, + { + "epoch": 1.18, + "grad_norm": 10.158159735178424, + "learning_rate": 3.767447542352384e-06, + "loss": 0.7393, + "step": 8301 + }, + { + "epoch": 1.18, + "grad_norm": 8.073637645627336, + "learning_rate": 3.766328467870446e-06, + "loss": 0.7352, + "step": 8302 + }, + { + "epoch": 1.18, + "grad_norm": 14.383106901790422, + "learning_rate": 3.765209459189111e-06, + "loss": 0.762, + "step": 8303 + }, + { + "epoch": 1.18, + "grad_norm": 9.287138868641327, + "learning_rate": 3.7640905163680693e-06, + "loss": 0.7778, + "step": 8304 + }, + { + "epoch": 1.18, + "grad_norm": 6.654986222547686, + "learning_rate": 3.7629716394669976e-06, + "loss": 0.7876, + "step": 8305 + }, + { + "epoch": 1.18, + "grad_norm": 7.84183509373912, + "learning_rate": 3.7618528285455747e-06, + "loss": 0.686, + "step": 8306 + }, + 
{ + "epoch": 1.18, + "grad_norm": 8.412288318135003, + "learning_rate": 3.760734083663477e-06, + "loss": 0.7662, + "step": 8307 + }, + { + "epoch": 1.18, + "grad_norm": 10.60388345059768, + "learning_rate": 3.7596154048803734e-06, + "loss": 0.8018, + "step": 8308 + }, + { + "epoch": 1.18, + "grad_norm": 9.511703679577828, + "learning_rate": 3.758496792255932e-06, + "loss": 0.7183, + "step": 8309 + }, + { + "epoch": 1.19, + "grad_norm": 8.081909314512993, + "learning_rate": 3.7573782458498154e-06, + "loss": 0.6845, + "step": 8310 + }, + { + "epoch": 1.19, + "grad_norm": 9.4963312447837, + "learning_rate": 3.7562597657216826e-06, + "loss": 0.8179, + "step": 8311 + }, + { + "epoch": 1.19, + "grad_norm": 8.649415346486379, + "learning_rate": 3.7551413519311946e-06, + "loss": 0.7573, + "step": 8312 + }, + { + "epoch": 1.19, + "grad_norm": 6.1493808607017195, + "learning_rate": 3.7540230045380006e-06, + "loss": 0.7875, + "step": 8313 + }, + { + "epoch": 1.19, + "grad_norm": 7.03893478084814, + "learning_rate": 3.7529047236017525e-06, + "loss": 0.7967, + "step": 8314 + }, + { + "epoch": 1.19, + "grad_norm": 8.19612647026777, + "learning_rate": 3.7517865091820925e-06, + "loss": 0.7682, + "step": 8315 + }, + { + "epoch": 1.19, + "grad_norm": 10.625585163207555, + "learning_rate": 3.7506683613386673e-06, + "loss": 0.7241, + "step": 8316 + }, + { + "epoch": 1.19, + "grad_norm": 10.801167721709572, + "learning_rate": 3.749550280131115e-06, + "loss": 0.7896, + "step": 8317 + }, + { + "epoch": 1.19, + "grad_norm": 12.570339317273536, + "learning_rate": 3.7484322656190684e-06, + "loss": 0.6985, + "step": 8318 + }, + { + "epoch": 1.19, + "grad_norm": 8.067826508190938, + "learning_rate": 3.747314317862163e-06, + "loss": 0.6989, + "step": 8319 + }, + { + "epoch": 1.19, + "grad_norm": 10.051389620760562, + "learning_rate": 3.746196436920023e-06, + "loss": 0.7104, + "step": 8320 + }, + { + "epoch": 1.19, + "grad_norm": 8.37529874477203, + "learning_rate": 3.745078622852275e-06, + 
"loss": 0.7643, + "step": 8321 + }, + { + "epoch": 1.19, + "grad_norm": 8.47891072923103, + "learning_rate": 3.743960875718542e-06, + "loss": 0.7843, + "step": 8322 + }, + { + "epoch": 1.19, + "grad_norm": 6.656748873388745, + "learning_rate": 3.7428431955784384e-06, + "loss": 0.7393, + "step": 8323 + }, + { + "epoch": 1.19, + "grad_norm": 7.363439026918951, + "learning_rate": 3.741725582491579e-06, + "loss": 0.7658, + "step": 8324 + }, + { + "epoch": 1.19, + "grad_norm": 11.837913096691898, + "learning_rate": 3.740608036517576e-06, + "loss": 0.7533, + "step": 8325 + }, + { + "epoch": 1.19, + "grad_norm": 6.65663887385476, + "learning_rate": 3.739490557716034e-06, + "loss": 0.7403, + "step": 8326 + }, + { + "epoch": 1.19, + "grad_norm": 9.199249092599208, + "learning_rate": 3.7383731461465577e-06, + "loss": 0.7562, + "step": 8327 + }, + { + "epoch": 1.19, + "grad_norm": 6.131466811939889, + "learning_rate": 3.7372558018687444e-06, + "loss": 0.6969, + "step": 8328 + }, + { + "epoch": 1.19, + "grad_norm": 7.9672525881207825, + "learning_rate": 3.7361385249421934e-06, + "loss": 0.8358, + "step": 8329 + }, + { + "epoch": 1.19, + "grad_norm": 6.55134045605751, + "learning_rate": 3.7350213154264956e-06, + "loss": 0.751, + "step": 8330 + }, + { + "epoch": 1.19, + "grad_norm": 8.206625470363077, + "learning_rate": 3.7339041733812374e-06, + "loss": 0.7557, + "step": 8331 + }, + { + "epoch": 1.19, + "grad_norm": 10.004628419265767, + "learning_rate": 3.732787098866009e-06, + "loss": 0.7884, + "step": 8332 + }, + { + "epoch": 1.19, + "grad_norm": 10.706309270851499, + "learning_rate": 3.7316700919403885e-06, + "loss": 0.7435, + "step": 8333 + }, + { + "epoch": 1.19, + "grad_norm": 9.019084011277192, + "learning_rate": 3.7305531526639538e-06, + "loss": 0.7876, + "step": 8334 + }, + { + "epoch": 1.19, + "grad_norm": 6.145882042559039, + "learning_rate": 3.7294362810962827e-06, + "loss": 0.7855, + "step": 8335 + }, + { + "epoch": 1.19, + "grad_norm": 8.511686581785654, + 
"learning_rate": 3.7283194772969424e-06, + "loss": 0.6963, + "step": 8336 + }, + { + "epoch": 1.19, + "grad_norm": 6.6509968731099045, + "learning_rate": 3.727202741325502e-06, + "loss": 0.7393, + "step": 8337 + }, + { + "epoch": 1.19, + "grad_norm": 6.462933273900989, + "learning_rate": 3.7260860732415225e-06, + "loss": 0.7649, + "step": 8338 + }, + { + "epoch": 1.19, + "grad_norm": 8.106158613605304, + "learning_rate": 3.7249694731045675e-06, + "loss": 0.6921, + "step": 8339 + }, + { + "epoch": 1.19, + "grad_norm": 9.11410009332401, + "learning_rate": 3.723852940974193e-06, + "loss": 0.7699, + "step": 8340 + }, + { + "epoch": 1.19, + "grad_norm": 7.033751360736371, + "learning_rate": 3.7227364769099477e-06, + "loss": 0.727, + "step": 8341 + }, + { + "epoch": 1.19, + "grad_norm": 8.743477434039432, + "learning_rate": 3.721620080971386e-06, + "loss": 0.7246, + "step": 8342 + }, + { + "epoch": 1.19, + "grad_norm": 8.008043121253094, + "learning_rate": 3.7205037532180493e-06, + "loss": 0.7615, + "step": 8343 + }, + { + "epoch": 1.19, + "grad_norm": 6.453789178549875, + "learning_rate": 3.7193874937094796e-06, + "loss": 0.7186, + "step": 8344 + }, + { + "epoch": 1.19, + "grad_norm": 8.21130142642593, + "learning_rate": 3.71827130250522e-06, + "loss": 0.7907, + "step": 8345 + }, + { + "epoch": 1.19, + "grad_norm": 11.559590527656674, + "learning_rate": 3.7171551796647988e-06, + "loss": 0.707, + "step": 8346 + }, + { + "epoch": 1.19, + "grad_norm": 7.86727621963732, + "learning_rate": 3.7160391252477512e-06, + "loss": 0.6818, + "step": 8347 + }, + { + "epoch": 1.19, + "grad_norm": 13.827100231983755, + "learning_rate": 3.7149231393136004e-06, + "loss": 0.7774, + "step": 8348 + }, + { + "epoch": 1.19, + "grad_norm": 10.316386387252093, + "learning_rate": 3.7138072219218736e-06, + "loss": 0.7308, + "step": 8349 + }, + { + "epoch": 1.19, + "grad_norm": 8.245842024942972, + "learning_rate": 3.71269137313209e-06, + "loss": 0.7275, + "step": 8350 + }, + { + "epoch": 1.19, + 
"grad_norm": 9.606966807175919, + "learning_rate": 3.7115755930037634e-06, + "loss": 0.7639, + "step": 8351 + }, + { + "epoch": 1.19, + "grad_norm": 8.855144886440614, + "learning_rate": 3.710459881596412e-06, + "loss": 0.8239, + "step": 8352 + }, + { + "epoch": 1.19, + "grad_norm": 6.749731829875019, + "learning_rate": 3.7093442389695376e-06, + "loss": 0.7575, + "step": 8353 + }, + { + "epoch": 1.19, + "grad_norm": 9.839562528173493, + "learning_rate": 3.7082286651826487e-06, + "loss": 0.7015, + "step": 8354 + }, + { + "epoch": 1.19, + "grad_norm": 8.379728286634522, + "learning_rate": 3.70711316029525e-06, + "loss": 0.7748, + "step": 8355 + }, + { + "epoch": 1.19, + "grad_norm": 10.825673022148257, + "learning_rate": 3.7059977243668343e-06, + "loss": 0.8199, + "step": 8356 + }, + { + "epoch": 1.19, + "grad_norm": 7.17248805670414, + "learning_rate": 3.7048823574568995e-06, + "loss": 0.6921, + "step": 8357 + }, + { + "epoch": 1.19, + "grad_norm": 8.423976837828828, + "learning_rate": 3.703767059624934e-06, + "loss": 0.7162, + "step": 8358 + }, + { + "epoch": 1.19, + "grad_norm": 10.341352781773871, + "learning_rate": 3.7026518309304243e-06, + "loss": 0.7775, + "step": 8359 + }, + { + "epoch": 1.19, + "grad_norm": 10.78490141199895, + "learning_rate": 3.701536671432857e-06, + "loss": 0.7629, + "step": 8360 + }, + { + "epoch": 1.19, + "grad_norm": 8.948589504219187, + "learning_rate": 3.700421581191708e-06, + "loss": 0.8061, + "step": 8361 + }, + { + "epoch": 1.19, + "grad_norm": 8.890732967541036, + "learning_rate": 3.6993065602664558e-06, + "loss": 0.781, + "step": 8362 + }, + { + "epoch": 1.19, + "grad_norm": 13.270226070797134, + "learning_rate": 3.698191608716569e-06, + "loss": 0.8028, + "step": 8363 + }, + { + "epoch": 1.19, + "grad_norm": 8.458136481184924, + "learning_rate": 3.6970767266015194e-06, + "loss": 0.7949, + "step": 8364 + }, + { + "epoch": 1.19, + "grad_norm": 8.187420439925097, + "learning_rate": 3.695961913980771e-06, + "loss": 0.7048, + "step": 
8365 + }, + { + "epoch": 1.19, + "grad_norm": 10.725873266190929, + "learning_rate": 3.694847170913783e-06, + "loss": 0.7364, + "step": 8366 + }, + { + "epoch": 1.19, + "grad_norm": 11.403912364155396, + "learning_rate": 3.693732497460014e-06, + "loss": 0.7448, + "step": 8367 + }, + { + "epoch": 1.19, + "grad_norm": 8.9783282112985, + "learning_rate": 3.6926178936789194e-06, + "loss": 0.7733, + "step": 8368 + }, + { + "epoch": 1.19, + "grad_norm": 6.916466151584519, + "learning_rate": 3.6915033596299453e-06, + "loss": 0.7127, + "step": 8369 + }, + { + "epoch": 1.19, + "grad_norm": 8.556913416287824, + "learning_rate": 3.6903888953725407e-06, + "loss": 0.7801, + "step": 8370 + }, + { + "epoch": 1.19, + "grad_norm": 12.452553474638425, + "learning_rate": 3.689274500966147e-06, + "loss": 0.7086, + "step": 8371 + }, + { + "epoch": 1.19, + "grad_norm": 9.390449495880206, + "learning_rate": 3.6881601764702025e-06, + "loss": 0.7427, + "step": 8372 + }, + { + "epoch": 1.19, + "grad_norm": 10.0945858379858, + "learning_rate": 3.6870459219441436e-06, + "loss": 0.6739, + "step": 8373 + }, + { + "epoch": 1.19, + "grad_norm": 11.52223758469315, + "learning_rate": 3.6859317374474002e-06, + "loss": 0.8095, + "step": 8374 + }, + { + "epoch": 1.19, + "grad_norm": 7.258413798306889, + "learning_rate": 3.6848176230394005e-06, + "loss": 0.7471, + "step": 8375 + }, + { + "epoch": 1.19, + "grad_norm": 9.28030215444405, + "learning_rate": 3.683703578779567e-06, + "loss": 0.7635, + "step": 8376 + }, + { + "epoch": 1.19, + "grad_norm": 9.877936973035212, + "learning_rate": 3.6825896047273206e-06, + "loss": 0.7329, + "step": 8377 + }, + { + "epoch": 1.19, + "grad_norm": 7.169606405541422, + "learning_rate": 3.681475700942079e-06, + "loss": 0.7668, + "step": 8378 + }, + { + "epoch": 1.19, + "grad_norm": 9.890611996327175, + "learning_rate": 3.680361867483252e-06, + "loss": 0.7657, + "step": 8379 + }, + { + "epoch": 1.2, + "grad_norm": 9.395790412766381, + "learning_rate": 
3.6792481044102514e-06, + "loss": 0.7124, + "step": 8380 + }, + { + "epoch": 1.2, + "grad_norm": 9.657828637558843, + "learning_rate": 3.6781344117824793e-06, + "loss": 0.7735, + "step": 8381 + }, + { + "epoch": 1.2, + "grad_norm": 7.080010572201637, + "learning_rate": 3.677020789659337e-06, + "loss": 0.7221, + "step": 8382 + }, + { + "epoch": 1.2, + "grad_norm": 10.499007968201063, + "learning_rate": 3.675907238100226e-06, + "loss": 0.7314, + "step": 8383 + }, + { + "epoch": 1.2, + "grad_norm": 10.974189568354761, + "learning_rate": 3.6747937571645354e-06, + "loss": 0.7593, + "step": 8384 + }, + { + "epoch": 1.2, + "grad_norm": 8.847407629698044, + "learning_rate": 3.6736803469116587e-06, + "loss": 0.758, + "step": 8385 + }, + { + "epoch": 1.2, + "grad_norm": 6.524152382691209, + "learning_rate": 3.6725670074009783e-06, + "loss": 0.7763, + "step": 8386 + }, + { + "epoch": 1.2, + "grad_norm": 9.283162230160636, + "learning_rate": 3.6714537386918803e-06, + "loss": 0.7409, + "step": 8387 + }, + { + "epoch": 1.2, + "grad_norm": 10.285678782237405, + "learning_rate": 3.670340540843742e-06, + "loss": 0.7267, + "step": 8388 + }, + { + "epoch": 1.2, + "grad_norm": 9.023046538804792, + "learning_rate": 3.6692274139159367e-06, + "loss": 0.765, + "step": 8389 + }, + { + "epoch": 1.2, + "grad_norm": 9.553165937282628, + "learning_rate": 3.6681143579678384e-06, + "loss": 0.8232, + "step": 8390 + }, + { + "epoch": 1.2, + "grad_norm": 6.678471178720299, + "learning_rate": 3.667001373058812e-06, + "loss": 0.6737, + "step": 8391 + }, + { + "epoch": 1.2, + "grad_norm": 8.215434184154907, + "learning_rate": 3.6658884592482214e-06, + "loss": 0.7815, + "step": 8392 + }, + { + "epoch": 1.2, + "grad_norm": 6.374815426363875, + "learning_rate": 3.6647756165954283e-06, + "loss": 0.801, + "step": 8393 + }, + { + "epoch": 1.2, + "grad_norm": 9.131321245543825, + "learning_rate": 3.663662845159787e-06, + "loss": 0.7829, + "step": 8394 + }, + { + "epoch": 1.2, + "grad_norm": 
7.242262213112552, + "learning_rate": 3.6625501450006503e-06, + "loss": 0.7911, + "step": 8395 + }, + { + "epoch": 1.2, + "grad_norm": 8.290372211677818, + "learning_rate": 3.661437516177365e-06, + "loss": 0.7298, + "step": 8396 + }, + { + "epoch": 1.2, + "grad_norm": 6.75922505516537, + "learning_rate": 3.6603249587492752e-06, + "loss": 0.7542, + "step": 8397 + }, + { + "epoch": 1.2, + "grad_norm": 8.114930957029939, + "learning_rate": 3.6592124727757263e-06, + "loss": 0.7064, + "step": 8398 + }, + { + "epoch": 1.2, + "grad_norm": 6.49037986865688, + "learning_rate": 3.6581000583160498e-06, + "loss": 0.7153, + "step": 8399 + }, + { + "epoch": 1.2, + "grad_norm": 12.157805474937737, + "learning_rate": 3.656987715429583e-06, + "loss": 0.7571, + "step": 8400 + }, + { + "epoch": 1.2, + "grad_norm": 7.546549293802553, + "learning_rate": 3.65587544417565e-06, + "loss": 0.7073, + "step": 8401 + }, + { + "epoch": 1.2, + "grad_norm": 11.140927817256166, + "learning_rate": 3.654763244613581e-06, + "loss": 0.7054, + "step": 8402 + }, + { + "epoch": 1.2, + "grad_norm": 7.129741962859503, + "learning_rate": 3.653651116802697e-06, + "loss": 0.767, + "step": 8403 + }, + { + "epoch": 1.2, + "grad_norm": 12.188497128592303, + "learning_rate": 3.652539060802313e-06, + "loss": 0.7539, + "step": 8404 + }, + { + "epoch": 1.2, + "grad_norm": 9.892669760574062, + "learning_rate": 3.651427076671747e-06, + "loss": 0.7355, + "step": 8405 + }, + { + "epoch": 1.2, + "grad_norm": 10.455119509771386, + "learning_rate": 3.6503151644703057e-06, + "loss": 0.7146, + "step": 8406 + }, + { + "epoch": 1.2, + "grad_norm": 9.3166538809159, + "learning_rate": 3.6492033242572955e-06, + "loss": 0.7821, + "step": 8407 + }, + { + "epoch": 1.2, + "grad_norm": 12.45133236387548, + "learning_rate": 3.6480915560920222e-06, + "loss": 0.6862, + "step": 8408 + }, + { + "epoch": 1.2, + "grad_norm": 12.880724885007435, + "learning_rate": 3.6469798600337816e-06, + "loss": 0.7671, + "step": 8409 + }, + { + "epoch": 
1.2, + "grad_norm": 10.891888571554087, + "learning_rate": 3.6458682361418675e-06, + "loss": 0.7287, + "step": 8410 + }, + { + "epoch": 1.2, + "grad_norm": 8.532055731442407, + "learning_rate": 3.6447566844755745e-06, + "loss": 0.655, + "step": 8411 + }, + { + "epoch": 1.2, + "grad_norm": 10.952287932431064, + "learning_rate": 3.643645205094187e-06, + "loss": 0.7681, + "step": 8412 + }, + { + "epoch": 1.2, + "grad_norm": 9.397884545970449, + "learning_rate": 3.6425337980569895e-06, + "loss": 0.6762, + "step": 8413 + }, + { + "epoch": 1.2, + "grad_norm": 8.411415999240834, + "learning_rate": 3.641422463423259e-06, + "loss": 0.7666, + "step": 8414 + }, + { + "epoch": 1.2, + "grad_norm": 13.83118449225171, + "learning_rate": 3.640311201252273e-06, + "loss": 0.7955, + "step": 8415 + }, + { + "epoch": 1.2, + "grad_norm": 13.07647178965351, + "learning_rate": 3.6392000116033043e-06, + "loss": 0.7326, + "step": 8416 + }, + { + "epoch": 1.2, + "grad_norm": 9.000961643187024, + "learning_rate": 3.6380888945356167e-06, + "loss": 0.6931, + "step": 8417 + }, + { + "epoch": 1.2, + "grad_norm": 8.46346574179052, + "learning_rate": 3.6369778501084795e-06, + "loss": 0.6852, + "step": 8418 + }, + { + "epoch": 1.2, + "grad_norm": 7.515536493373632, + "learning_rate": 3.635866878381148e-06, + "loss": 0.7933, + "step": 8419 + }, + { + "epoch": 1.2, + "grad_norm": 10.060834586301793, + "learning_rate": 3.6347559794128794e-06, + "loss": 0.7531, + "step": 8420 + }, + { + "epoch": 1.2, + "grad_norm": 9.620818513587466, + "learning_rate": 3.6336451532629284e-06, + "loss": 0.7588, + "step": 8421 + }, + { + "epoch": 1.2, + "grad_norm": 7.363510685817783, + "learning_rate": 3.632534399990541e-06, + "loss": 0.728, + "step": 8422 + }, + { + "epoch": 1.2, + "grad_norm": 9.727495347855053, + "learning_rate": 3.6314237196549634e-06, + "loss": 0.7433, + "step": 8423 + }, + { + "epoch": 1.2, + "grad_norm": 6.656284433984691, + "learning_rate": 3.630313112315433e-06, + "loss": 0.7955, + "step": 8424 
+ }, + { + "epoch": 1.2, + "grad_norm": 9.401787013858653, + "learning_rate": 3.629202578031189e-06, + "loss": 0.773, + "step": 8425 + }, + { + "epoch": 1.2, + "grad_norm": 7.9916670615224055, + "learning_rate": 3.6280921168614658e-06, + "loss": 0.7728, + "step": 8426 + }, + { + "epoch": 1.2, + "grad_norm": 7.023735556895596, + "learning_rate": 3.626981728865488e-06, + "loss": 0.7733, + "step": 8427 + }, + { + "epoch": 1.2, + "grad_norm": 7.686706930847839, + "learning_rate": 3.6258714141024852e-06, + "loss": 0.6573, + "step": 8428 + }, + { + "epoch": 1.2, + "grad_norm": 6.157516015864547, + "learning_rate": 3.6247611726316746e-06, + "loss": 0.731, + "step": 8429 + }, + { + "epoch": 1.2, + "grad_norm": 9.31643842708706, + "learning_rate": 3.623651004512275e-06, + "loss": 0.7159, + "step": 8430 + }, + { + "epoch": 1.2, + "grad_norm": 7.761031415940602, + "learning_rate": 3.6225409098035006e-06, + "loss": 0.7249, + "step": 8431 + }, + { + "epoch": 1.2, + "grad_norm": 9.090563119199183, + "learning_rate": 3.6214308885645597e-06, + "loss": 0.8187, + "step": 8432 + }, + { + "epoch": 1.2, + "grad_norm": 8.733206806244683, + "learning_rate": 3.6203209408546592e-06, + "loss": 0.7247, + "step": 8433 + }, + { + "epoch": 1.2, + "grad_norm": 6.8807052133543305, + "learning_rate": 3.619211066732996e-06, + "loss": 0.7971, + "step": 8434 + }, + { + "epoch": 1.2, + "grad_norm": 6.798587466054207, + "learning_rate": 3.6181012662587735e-06, + "loss": 0.7235, + "step": 8435 + }, + { + "epoch": 1.2, + "grad_norm": 9.13214678480559, + "learning_rate": 3.616991539491183e-06, + "loss": 0.7962, + "step": 8436 + }, + { + "epoch": 1.2, + "grad_norm": 10.61093583268632, + "learning_rate": 3.6158818864894124e-06, + "loss": 0.7814, + "step": 8437 + }, + { + "epoch": 1.2, + "grad_norm": 8.813856791902548, + "learning_rate": 3.6147723073126527e-06, + "loss": 0.784, + "step": 8438 + }, + { + "epoch": 1.2, + "grad_norm": 7.353331379534261, + "learning_rate": 3.6136628020200787e-06, + "loss": 
0.7097, + "step": 8439 + }, + { + "epoch": 1.2, + "grad_norm": 8.16744041705293, + "learning_rate": 3.6125533706708734e-06, + "loss": 0.7723, + "step": 8440 + }, + { + "epoch": 1.2, + "grad_norm": 7.608607896775953, + "learning_rate": 3.61144401332421e-06, + "loss": 0.8231, + "step": 8441 + }, + { + "epoch": 1.2, + "grad_norm": 8.966647841740382, + "learning_rate": 3.6103347300392557e-06, + "loss": 0.7068, + "step": 8442 + }, + { + "epoch": 1.2, + "grad_norm": 11.2632947725382, + "learning_rate": 3.6092255208751815e-06, + "loss": 0.7014, + "step": 8443 + }, + { + "epoch": 1.2, + "grad_norm": 9.6091986522979, + "learning_rate": 3.608116385891145e-06, + "loss": 0.7881, + "step": 8444 + }, + { + "epoch": 1.2, + "grad_norm": 8.299382389553333, + "learning_rate": 3.6070073251463056e-06, + "loss": 0.8121, + "step": 8445 + }, + { + "epoch": 1.2, + "grad_norm": 10.75015695096307, + "learning_rate": 3.6058983386998196e-06, + "loss": 0.6875, + "step": 8446 + }, + { + "epoch": 1.2, + "grad_norm": 7.987545485257819, + "learning_rate": 3.604789426610835e-06, + "loss": 0.7191, + "step": 8447 + }, + { + "epoch": 1.2, + "grad_norm": 11.187431225862055, + "learning_rate": 3.6036805889384996e-06, + "loss": 0.7287, + "step": 8448 + }, + { + "epoch": 1.2, + "grad_norm": 8.847221238542629, + "learning_rate": 3.6025718257419532e-06, + "loss": 0.7726, + "step": 8449 + }, + { + "epoch": 1.21, + "grad_norm": 7.5714143248032, + "learning_rate": 3.6014631370803366e-06, + "loss": 0.7711, + "step": 8450 + }, + { + "epoch": 1.21, + "grad_norm": 8.347707231946464, + "learning_rate": 3.6003545230127844e-06, + "loss": 0.8025, + "step": 8451 + }, + { + "epoch": 1.21, + "grad_norm": 9.838920190765414, + "learning_rate": 3.5992459835984244e-06, + "loss": 0.7783, + "step": 8452 + }, + { + "epoch": 1.21, + "grad_norm": 9.539160890401554, + "learning_rate": 3.598137518896385e-06, + "loss": 0.7683, + "step": 8453 + }, + { + "epoch": 1.21, + "grad_norm": 9.049046465610388, + "learning_rate": 
3.5970291289657897e-06, + "loss": 0.7733, + "step": 8454 + }, + { + "epoch": 1.21, + "grad_norm": 10.78908645308829, + "learning_rate": 3.595920813865754e-06, + "loss": 0.7821, + "step": 8455 + }, + { + "epoch": 1.21, + "grad_norm": 10.212247651446047, + "learning_rate": 3.594812573655395e-06, + "loss": 0.7151, + "step": 8456 + }, + { + "epoch": 1.21, + "grad_norm": 8.975070812814963, + "learning_rate": 3.593704408393821e-06, + "loss": 0.726, + "step": 8457 + }, + { + "epoch": 1.21, + "grad_norm": 8.626412421300575, + "learning_rate": 3.5925963181401387e-06, + "loss": 0.723, + "step": 8458 + }, + { + "epoch": 1.21, + "grad_norm": 10.66373369073308, + "learning_rate": 3.5914883029534533e-06, + "loss": 0.7484, + "step": 8459 + }, + { + "epoch": 1.21, + "grad_norm": 7.5529245181899665, + "learning_rate": 3.5903803628928596e-06, + "loss": 0.7015, + "step": 8460 + }, + { + "epoch": 1.21, + "grad_norm": 6.425594295473126, + "learning_rate": 3.5892724980174553e-06, + "loss": 0.7875, + "step": 8461 + }, + { + "epoch": 1.21, + "grad_norm": 11.938544100592804, + "learning_rate": 3.5881647083863267e-06, + "loss": 0.8227, + "step": 8462 + }, + { + "epoch": 1.21, + "grad_norm": 7.794637092574945, + "learning_rate": 3.587056994058564e-06, + "loss": 0.76, + "step": 8463 + }, + { + "epoch": 1.21, + "grad_norm": 9.844197623253486, + "learning_rate": 3.585949355093249e-06, + "loss": 0.7966, + "step": 8464 + }, + { + "epoch": 1.21, + "grad_norm": 8.636186925585314, + "learning_rate": 3.584841791549457e-06, + "loss": 0.7485, + "step": 8465 + }, + { + "epoch": 1.21, + "grad_norm": 9.767114088516303, + "learning_rate": 3.5837343034862678e-06, + "loss": 0.6821, + "step": 8466 + }, + { + "epoch": 1.21, + "grad_norm": 9.151891146882978, + "learning_rate": 3.582626890962747e-06, + "loss": 0.7498, + "step": 8467 + }, + { + "epoch": 1.21, + "grad_norm": 7.911382966332842, + "learning_rate": 3.5815195540379614e-06, + "loss": 0.7427, + "step": 8468 + }, + { + "epoch": 1.21, + "grad_norm": 
7.775804474225866, + "learning_rate": 3.5804122927709768e-06, + "loss": 0.7124, + "step": 8469 + }, + { + "epoch": 1.21, + "grad_norm": 6.750084607417917, + "learning_rate": 3.579305107220848e-06, + "loss": 0.7879, + "step": 8470 + }, + { + "epoch": 1.21, + "grad_norm": 9.196265944853456, + "learning_rate": 3.578197997446631e-06, + "loss": 0.8287, + "step": 8471 + }, + { + "epoch": 1.21, + "grad_norm": 8.070654986452753, + "learning_rate": 3.5770909635073735e-06, + "loss": 0.742, + "step": 8472 + }, + { + "epoch": 1.21, + "grad_norm": 6.610740650143758, + "learning_rate": 3.5759840054621243e-06, + "loss": 0.7588, + "step": 8473 + }, + { + "epoch": 1.21, + "grad_norm": 8.400838917199444, + "learning_rate": 3.574877123369925e-06, + "loss": 0.7523, + "step": 8474 + }, + { + "epoch": 1.21, + "grad_norm": 10.785577038024636, + "learning_rate": 3.573770317289812e-06, + "loss": 0.7028, + "step": 8475 + }, + { + "epoch": 1.21, + "grad_norm": 7.546790487652793, + "learning_rate": 3.5726635872808213e-06, + "loss": 0.8071, + "step": 8476 + }, + { + "epoch": 1.21, + "grad_norm": 9.089492588349922, + "learning_rate": 3.5715569334019817e-06, + "loss": 0.7435, + "step": 8477 + }, + { + "epoch": 1.21, + "grad_norm": 9.12710652042245, + "learning_rate": 3.570450355712317e-06, + "loss": 0.8085, + "step": 8478 + }, + { + "epoch": 1.21, + "grad_norm": 8.498482021969362, + "learning_rate": 3.5693438542708548e-06, + "loss": 0.7086, + "step": 8479 + }, + { + "epoch": 1.21, + "grad_norm": 10.290308651885537, + "learning_rate": 3.5682374291366066e-06, + "loss": 0.7786, + "step": 8480 + }, + { + "epoch": 1.21, + "grad_norm": 7.322828968529745, + "learning_rate": 3.5671310803685904e-06, + "loss": 0.7287, + "step": 8481 + }, + { + "epoch": 1.21, + "grad_norm": 7.069762660571734, + "learning_rate": 3.566024808025813e-06, + "loss": 0.7627, + "step": 8482 + }, + { + "epoch": 1.21, + "grad_norm": 9.055419243612011, + "learning_rate": 3.56491861216728e-06, + "loss": 0.7216, + "step": 8483 + }, + { 
+ "epoch": 1.21, + "grad_norm": 8.996397597325904, + "learning_rate": 3.5638124928519945e-06, + "loss": 0.7152, + "step": 8484 + }, + { + "epoch": 1.21, + "grad_norm": 7.204487590466511, + "learning_rate": 3.562706450138953e-06, + "loss": 0.7235, + "step": 8485 + }, + { + "epoch": 1.21, + "grad_norm": 8.77217993845042, + "learning_rate": 3.56160048408715e-06, + "loss": 0.783, + "step": 8486 + }, + { + "epoch": 1.21, + "grad_norm": 9.474509619122752, + "learning_rate": 3.56049459475557e-06, + "loss": 0.7648, + "step": 8487 + }, + { + "epoch": 1.21, + "grad_norm": 8.065155409894757, + "learning_rate": 3.5593887822032036e-06, + "loss": 0.726, + "step": 8488 + }, + { + "epoch": 1.21, + "grad_norm": 7.301229143679938, + "learning_rate": 3.5582830464890307e-06, + "loss": 0.6941, + "step": 8489 + }, + { + "epoch": 1.21, + "grad_norm": 8.475219956749799, + "learning_rate": 3.5571773876720243e-06, + "loss": 0.6769, + "step": 8490 + }, + { + "epoch": 1.21, + "grad_norm": 9.520461210926376, + "learning_rate": 3.5560718058111626e-06, + "loss": 0.7652, + "step": 8491 + }, + { + "epoch": 1.21, + "grad_norm": 8.239766616760408, + "learning_rate": 3.55496630096541e-06, + "loss": 0.7878, + "step": 8492 + }, + { + "epoch": 1.21, + "grad_norm": 11.543446596607728, + "learning_rate": 3.553860873193732e-06, + "loss": 0.7518, + "step": 8493 + }, + { + "epoch": 1.21, + "grad_norm": 9.416971869884643, + "learning_rate": 3.5527555225550915e-06, + "loss": 0.7494, + "step": 8494 + }, + { + "epoch": 1.21, + "grad_norm": 10.441218435340613, + "learning_rate": 3.551650249108442e-06, + "loss": 0.7411, + "step": 8495 + }, + { + "epoch": 1.21, + "grad_norm": 6.193810552766717, + "learning_rate": 3.550545052912736e-06, + "loss": 0.7959, + "step": 8496 + }, + { + "epoch": 1.21, + "grad_norm": 9.676880952175695, + "learning_rate": 3.549439934026924e-06, + "loss": 0.7607, + "step": 8497 + }, + { + "epoch": 1.21, + "grad_norm": 7.099672450970294, + "learning_rate": 3.548334892509947e-06, + "loss": 
0.7369, + "step": 8498 + }, + { + "epoch": 1.21, + "grad_norm": 10.063973109350087, + "learning_rate": 3.5472299284207478e-06, + "loss": 0.7592, + "step": 8499 + }, + { + "epoch": 1.21, + "grad_norm": 7.927774849291947, + "learning_rate": 3.546125041818258e-06, + "loss": 0.7003, + "step": 8500 + }, + { + "epoch": 1.21, + "grad_norm": 7.826449226399131, + "learning_rate": 3.5450202327614124e-06, + "loss": 0.7734, + "step": 8501 + }, + { + "epoch": 1.21, + "grad_norm": 11.027242221333852, + "learning_rate": 3.5439155013091385e-06, + "loss": 0.7118, + "step": 8502 + }, + { + "epoch": 1.21, + "grad_norm": 6.627577586871528, + "learning_rate": 3.542810847520357e-06, + "loss": 0.7959, + "step": 8503 + }, + { + "epoch": 1.21, + "grad_norm": 8.376981695379488, + "learning_rate": 3.541706271453991e-06, + "loss": 0.704, + "step": 8504 + }, + { + "epoch": 1.21, + "grad_norm": 11.416255876960768, + "learning_rate": 3.5406017731689517e-06, + "loss": 0.6284, + "step": 8505 + }, + { + "epoch": 1.21, + "grad_norm": 11.0101629159699, + "learning_rate": 3.539497352724151e-06, + "loss": 0.7225, + "step": 8506 + }, + { + "epoch": 1.21, + "grad_norm": 8.30587993501216, + "learning_rate": 3.538393010178498e-06, + "loss": 0.7517, + "step": 8507 + }, + { + "epoch": 1.21, + "grad_norm": 11.46738668575602, + "learning_rate": 3.537288745590893e-06, + "loss": 0.7872, + "step": 8508 + }, + { + "epoch": 1.21, + "grad_norm": 8.297038571548963, + "learning_rate": 3.536184559020235e-06, + "loss": 0.8015, + "step": 8509 + }, + { + "epoch": 1.21, + "grad_norm": 10.295966933083509, + "learning_rate": 3.5350804505254165e-06, + "loss": 0.7354, + "step": 8510 + }, + { + "epoch": 1.21, + "grad_norm": 9.366396590499795, + "learning_rate": 3.5339764201653302e-06, + "loss": 0.8052, + "step": 8511 + }, + { + "epoch": 1.21, + "grad_norm": 11.803918733681684, + "learning_rate": 3.5328724679988618e-06, + "loss": 0.7491, + "step": 8512 + }, + { + "epoch": 1.21, + "grad_norm": 9.137436646211363, + 
"learning_rate": 3.53176859408489e-06, + "loss": 0.7443, + "step": 8513 + }, + { + "epoch": 1.21, + "grad_norm": 9.603023230080023, + "learning_rate": 3.5306647984822967e-06, + "loss": 0.7573, + "step": 8514 + }, + { + "epoch": 1.21, + "grad_norm": 7.555814638063555, + "learning_rate": 3.5295610812499524e-06, + "loss": 0.6678, + "step": 8515 + }, + { + "epoch": 1.21, + "grad_norm": 7.915849269534117, + "learning_rate": 3.5284574424467254e-06, + "loss": 0.6925, + "step": 8516 + }, + { + "epoch": 1.21, + "grad_norm": 11.259258380197966, + "learning_rate": 3.5273538821314846e-06, + "loss": 0.7451, + "step": 8517 + }, + { + "epoch": 1.21, + "grad_norm": 8.98419459826559, + "learning_rate": 3.5262504003630884e-06, + "loss": 0.7983, + "step": 8518 + }, + { + "epoch": 1.21, + "grad_norm": 7.868112882642927, + "learning_rate": 3.5251469972003936e-06, + "loss": 0.7059, + "step": 8519 + }, + { + "epoch": 1.22, + "grad_norm": 9.161355425353811, + "learning_rate": 3.524043672702251e-06, + "loss": 0.726, + "step": 8520 + }, + { + "epoch": 1.22, + "grad_norm": 8.81487851107767, + "learning_rate": 3.5229404269275113e-06, + "loss": 0.7591, + "step": 8521 + }, + { + "epoch": 1.22, + "grad_norm": 5.689496201439047, + "learning_rate": 3.5218372599350194e-06, + "loss": 0.7599, + "step": 8522 + }, + { + "epoch": 1.22, + "grad_norm": 11.742782010844659, + "learning_rate": 3.5207341717836113e-06, + "loss": 0.8209, + "step": 8523 + }, + { + "epoch": 1.22, + "grad_norm": 7.8545256821190215, + "learning_rate": 3.5196311625321276e-06, + "loss": 0.7168, + "step": 8524 + }, + { + "epoch": 1.22, + "grad_norm": 9.528542401589231, + "learning_rate": 3.518528232239394e-06, + "loss": 0.7469, + "step": 8525 + }, + { + "epoch": 1.22, + "grad_norm": 11.333479688550238, + "learning_rate": 3.517425380964243e-06, + "loss": 0.7743, + "step": 8526 + }, + { + "epoch": 1.22, + "grad_norm": 8.76883244269053, + "learning_rate": 3.5163226087654954e-06, + "loss": 0.7134, + "step": 8527 + }, + { + "epoch": 1.22, 
+ "grad_norm": 8.90147595150148, + "learning_rate": 3.5152199157019677e-06, + "loss": 0.7698, + "step": 8528 + }, + { + "epoch": 1.22, + "grad_norm": 7.023451032387824, + "learning_rate": 3.514117301832479e-06, + "loss": 0.7116, + "step": 8529 + }, + { + "epoch": 1.22, + "grad_norm": 8.863813030857965, + "learning_rate": 3.513014767215837e-06, + "loss": 0.6848, + "step": 8530 + }, + { + "epoch": 1.22, + "grad_norm": 7.436522168566707, + "learning_rate": 3.511912311910847e-06, + "loss": 0.737, + "step": 8531 + }, + { + "epoch": 1.22, + "grad_norm": 6.353148870965614, + "learning_rate": 3.5108099359763135e-06, + "loss": 0.7777, + "step": 8532 + }, + { + "epoch": 1.22, + "grad_norm": 7.602121568681379, + "learning_rate": 3.509707639471033e-06, + "loss": 0.7155, + "step": 8533 + }, + { + "epoch": 1.22, + "grad_norm": 7.576400923444247, + "learning_rate": 3.508605422453799e-06, + "loss": 0.7439, + "step": 8534 + }, + { + "epoch": 1.22, + "grad_norm": 7.808791555444899, + "learning_rate": 3.507503284983399e-06, + "loss": 0.7092, + "step": 8535 + }, + { + "epoch": 1.22, + "grad_norm": 10.194639115246318, + "learning_rate": 3.5064012271186197e-06, + "loss": 0.6948, + "step": 8536 + }, + { + "epoch": 1.22, + "grad_norm": 9.38349505380605, + "learning_rate": 3.5052992489182425e-06, + "loss": 0.7543, + "step": 8537 + }, + { + "epoch": 1.22, + "grad_norm": 7.344243515091533, + "learning_rate": 3.504197350441041e-06, + "loss": 0.7041, + "step": 8538 + }, + { + "epoch": 1.22, + "grad_norm": 8.850247818034662, + "learning_rate": 3.5030955317457904e-06, + "loss": 0.7024, + "step": 8539 + }, + { + "epoch": 1.22, + "grad_norm": 8.995647094100748, + "learning_rate": 3.5019937928912583e-06, + "loss": 0.7308, + "step": 8540 + }, + { + "epoch": 1.22, + "grad_norm": 10.794343162644994, + "learning_rate": 3.5008921339362056e-06, + "loss": 0.7759, + "step": 8541 + }, + { + "epoch": 1.22, + "grad_norm": 10.881194671547615, + "learning_rate": 3.499790554939396e-06, + "loss": 0.7456, + 
"step": 8542 + }, + { + "epoch": 1.22, + "grad_norm": 5.7163475454843065, + "learning_rate": 3.498689055959581e-06, + "loss": 0.6831, + "step": 8543 + }, + { + "epoch": 1.22, + "grad_norm": 10.63992498098832, + "learning_rate": 3.4975876370555117e-06, + "loss": 0.7283, + "step": 8544 + }, + { + "epoch": 1.22, + "grad_norm": 8.186339600306157, + "learning_rate": 3.4964862982859384e-06, + "loss": 0.7374, + "step": 8545 + }, + { + "epoch": 1.22, + "grad_norm": 10.23841375680027, + "learning_rate": 3.4953850397095994e-06, + "loss": 0.7422, + "step": 8546 + }, + { + "epoch": 1.22, + "grad_norm": 10.044311700419462, + "learning_rate": 3.494283861385236e-06, + "loss": 0.8016, + "step": 8547 + }, + { + "epoch": 1.22, + "grad_norm": 9.907210160433857, + "learning_rate": 3.4931827633715774e-06, + "loss": 0.7482, + "step": 8548 + }, + { + "epoch": 1.22, + "grad_norm": 7.449631729079498, + "learning_rate": 3.492081745727357e-06, + "loss": 0.7159, + "step": 8549 + }, + { + "epoch": 1.22, + "grad_norm": 10.638831698494998, + "learning_rate": 3.4909808085113e-06, + "loss": 0.7522, + "step": 8550 + }, + { + "epoch": 1.22, + "grad_norm": 12.048833423739675, + "learning_rate": 3.489879951782125e-06, + "loss": 0.7291, + "step": 8551 + }, + { + "epoch": 1.22, + "grad_norm": 8.110084670298553, + "learning_rate": 3.488779175598551e-06, + "loss": 0.7012, + "step": 8552 + }, + { + "epoch": 1.22, + "grad_norm": 8.68355317061793, + "learning_rate": 3.4876784800192886e-06, + "loss": 0.7839, + "step": 8553 + }, + { + "epoch": 1.22, + "grad_norm": 8.045412631189876, + "learning_rate": 3.4865778651030456e-06, + "loss": 0.7532, + "step": 8554 + }, + { + "epoch": 1.22, + "grad_norm": 10.04056257688272, + "learning_rate": 3.485477330908529e-06, + "loss": 0.7325, + "step": 8555 + }, + { + "epoch": 1.22, + "grad_norm": 6.653319079655716, + "learning_rate": 3.4843768774944342e-06, + "loss": 0.7447, + "step": 8556 + }, + { + "epoch": 1.22, + "grad_norm": 8.938484344761136, + "learning_rate": 
3.483276504919459e-06, + "loss": 0.7511, + "step": 8557 + }, + { + "epoch": 1.22, + "grad_norm": 9.127183096439618, + "learning_rate": 3.482176213242291e-06, + "loss": 0.6525, + "step": 8558 + }, + { + "epoch": 1.22, + "grad_norm": 7.4143750416648055, + "learning_rate": 3.4810760025216207e-06, + "loss": 0.7788, + "step": 8559 + }, + { + "epoch": 1.22, + "grad_norm": 9.695396429592632, + "learning_rate": 3.4799758728161292e-06, + "loss": 0.6927, + "step": 8560 + }, + { + "epoch": 1.22, + "grad_norm": 9.412003490556334, + "learning_rate": 3.4788758241844912e-06, + "loss": 0.7801, + "step": 8561 + }, + { + "epoch": 1.22, + "grad_norm": 8.083589220929111, + "learning_rate": 3.477775856685385e-06, + "loss": 0.7157, + "step": 8562 + }, + { + "epoch": 1.22, + "grad_norm": 9.10847035440901, + "learning_rate": 3.476675970377477e-06, + "loss": 0.7219, + "step": 8563 + }, + { + "epoch": 1.22, + "grad_norm": 9.322068051984738, + "learning_rate": 3.475576165319431e-06, + "loss": 0.7659, + "step": 8564 + }, + { + "epoch": 1.22, + "grad_norm": 8.295603724582083, + "learning_rate": 3.4744764415699116e-06, + "loss": 0.7628, + "step": 8565 + }, + { + "epoch": 1.22, + "grad_norm": 9.484687374157943, + "learning_rate": 3.47337679918757e-06, + "loss": 0.6765, + "step": 8566 + }, + { + "epoch": 1.22, + "grad_norm": 10.178953533519255, + "learning_rate": 3.472277238231062e-06, + "loss": 0.8766, + "step": 8567 + }, + { + "epoch": 1.22, + "grad_norm": 10.599840346272787, + "learning_rate": 3.4711777587590333e-06, + "loss": 0.8036, + "step": 8568 + }, + { + "epoch": 1.22, + "grad_norm": 9.313108545914952, + "learning_rate": 3.4700783608301265e-06, + "loss": 0.798, + "step": 8569 + }, + { + "epoch": 1.22, + "grad_norm": 8.109607408731378, + "learning_rate": 3.4689790445029838e-06, + "loss": 0.7409, + "step": 8570 + }, + { + "epoch": 1.22, + "grad_norm": 9.09755602455764, + "learning_rate": 3.4678798098362356e-06, + "loss": 0.8066, + "step": 8571 + }, + { + "epoch": 1.22, + "grad_norm": 
8.420834674431221, + "learning_rate": 3.4667806568885144e-06, + "loss": 0.7929, + "step": 8572 + }, + { + "epoch": 1.22, + "grad_norm": 8.28083910575682, + "learning_rate": 3.4656815857184433e-06, + "loss": 0.7456, + "step": 8573 + }, + { + "epoch": 1.22, + "grad_norm": 11.61801986397133, + "learning_rate": 3.4645825963846468e-06, + "loss": 0.7654, + "step": 8574 + }, + { + "epoch": 1.22, + "grad_norm": 8.823527555047981, + "learning_rate": 3.463483688945741e-06, + "loss": 0.7624, + "step": 8575 + }, + { + "epoch": 1.22, + "grad_norm": 10.563008760689561, + "learning_rate": 3.4623848634603373e-06, + "loss": 0.7632, + "step": 8576 + }, + { + "epoch": 1.22, + "grad_norm": 6.6494484554991145, + "learning_rate": 3.4612861199870474e-06, + "loss": 0.7491, + "step": 8577 + }, + { + "epoch": 1.22, + "grad_norm": 7.82226546669423, + "learning_rate": 3.460187458584471e-06, + "loss": 0.7209, + "step": 8578 + }, + { + "epoch": 1.22, + "grad_norm": 10.113805920751089, + "learning_rate": 3.459088879311209e-06, + "loss": 0.7946, + "step": 8579 + }, + { + "epoch": 1.22, + "grad_norm": 8.94217944251775, + "learning_rate": 3.457990382225858e-06, + "loss": 0.8222, + "step": 8580 + }, + { + "epoch": 1.22, + "grad_norm": 8.674336113969096, + "learning_rate": 3.4568919673870073e-06, + "loss": 0.7033, + "step": 8581 + }, + { + "epoch": 1.22, + "grad_norm": 10.868454393050298, + "learning_rate": 3.455793634853243e-06, + "loss": 0.7733, + "step": 8582 + }, + { + "epoch": 1.22, + "grad_norm": 12.384951642259363, + "learning_rate": 3.4546953846831498e-06, + "loss": 0.7575, + "step": 8583 + }, + { + "epoch": 1.22, + "grad_norm": 7.4087718133004925, + "learning_rate": 3.4535972169353026e-06, + "loss": 0.7201, + "step": 8584 + }, + { + "epoch": 1.22, + "grad_norm": 8.810944778933338, + "learning_rate": 3.4524991316682767e-06, + "loss": 0.7371, + "step": 8585 + }, + { + "epoch": 1.22, + "grad_norm": 7.740254931415976, + "learning_rate": 3.451401128940637e-06, + "loss": 0.7264, + "step": 8586 + 
}, + { + "epoch": 1.22, + "grad_norm": 8.78169414115717, + "learning_rate": 3.450303208810951e-06, + "loss": 0.722, + "step": 8587 + }, + { + "epoch": 1.22, + "grad_norm": 7.411381689577122, + "learning_rate": 3.4492053713377803e-06, + "loss": 0.6638, + "step": 8588 + }, + { + "epoch": 1.22, + "grad_norm": 12.47119307399443, + "learning_rate": 3.4481076165796756e-06, + "loss": 0.7775, + "step": 8589 + }, + { + "epoch": 1.23, + "grad_norm": 7.036552983572059, + "learning_rate": 3.447009944595194e-06, + "loss": 0.8118, + "step": 8590 + }, + { + "epoch": 1.23, + "grad_norm": 9.244104672158924, + "learning_rate": 3.4459123554428775e-06, + "loss": 0.7982, + "step": 8591 + }, + { + "epoch": 1.23, + "grad_norm": 12.999815989675907, + "learning_rate": 3.444814849181269e-06, + "loss": 0.748, + "step": 8592 + }, + { + "epoch": 1.23, + "grad_norm": 7.474834934776185, + "learning_rate": 3.44371742586891e-06, + "loss": 0.7915, + "step": 8593 + }, + { + "epoch": 1.23, + "grad_norm": 8.070789977869351, + "learning_rate": 3.44262008556433e-06, + "loss": 0.7451, + "step": 8594 + }, + { + "epoch": 1.23, + "grad_norm": 8.665894158560757, + "learning_rate": 3.441522828326061e-06, + "loss": 0.6981, + "step": 8595 + }, + { + "epoch": 1.23, + "grad_norm": 9.080237171184736, + "learning_rate": 3.440425654212624e-06, + "loss": 0.792, + "step": 8596 + }, + { + "epoch": 1.23, + "grad_norm": 9.891981552800775, + "learning_rate": 3.4393285632825425e-06, + "loss": 0.7377, + "step": 8597 + }, + { + "epoch": 1.23, + "grad_norm": 9.402811739197134, + "learning_rate": 3.4382315555943323e-06, + "loss": 0.6999, + "step": 8598 + }, + { + "epoch": 1.23, + "grad_norm": 10.2901107158184, + "learning_rate": 3.4371346312065024e-06, + "loss": 0.7936, + "step": 8599 + }, + { + "epoch": 1.23, + "grad_norm": 9.27893427867226, + "learning_rate": 3.436037790177562e-06, + "loss": 0.7659, + "step": 8600 + }, + { + "epoch": 1.23, + "grad_norm": 9.12059423054721, + "learning_rate": 3.4349410325660125e-06, + "loss": 
0.7925, + "step": 8601 + }, + { + "epoch": 1.23, + "grad_norm": 9.978372390918947, + "learning_rate": 3.4338443584303506e-06, + "loss": 0.7405, + "step": 8602 + }, + { + "epoch": 1.23, + "grad_norm": 9.327470103905082, + "learning_rate": 3.4327477678290732e-06, + "loss": 0.761, + "step": 8603 + }, + { + "epoch": 1.23, + "grad_norm": 6.675587464020834, + "learning_rate": 3.431651260820666e-06, + "loss": 0.7738, + "step": 8604 + }, + { + "epoch": 1.23, + "grad_norm": 7.087785909180209, + "learning_rate": 3.4305548374636165e-06, + "loss": 0.7768, + "step": 8605 + }, + { + "epoch": 1.23, + "grad_norm": 8.123290206472156, + "learning_rate": 3.429458497816401e-06, + "loss": 0.88, + "step": 8606 + }, + { + "epoch": 1.23, + "grad_norm": 7.727463076267734, + "learning_rate": 3.4283622419374984e-06, + "loss": 0.8059, + "step": 8607 + }, + { + "epoch": 1.23, + "grad_norm": 7.134543132892729, + "learning_rate": 3.427266069885381e-06, + "loss": 0.7636, + "step": 8608 + }, + { + "epoch": 1.23, + "grad_norm": 7.881051820079861, + "learning_rate": 3.426169981718511e-06, + "loss": 0.7872, + "step": 8609 + }, + { + "epoch": 1.23, + "grad_norm": 8.05535479159558, + "learning_rate": 3.4250739774953567e-06, + "loss": 0.7446, + "step": 8610 + }, + { + "epoch": 1.23, + "grad_norm": 11.447689562355572, + "learning_rate": 3.4239780572743697e-06, + "loss": 0.7469, + "step": 8611 + }, + { + "epoch": 1.23, + "grad_norm": 7.848226240326429, + "learning_rate": 3.422882221114007e-06, + "loss": 0.792, + "step": 8612 + }, + { + "epoch": 1.23, + "grad_norm": 7.5909681762965, + "learning_rate": 3.4217864690727177e-06, + "loss": 0.7964, + "step": 8613 + }, + { + "epoch": 1.23, + "grad_norm": 8.839448921102028, + "learning_rate": 3.420690801208943e-06, + "loss": 0.7093, + "step": 8614 + }, + { + "epoch": 1.23, + "grad_norm": 9.247519314359886, + "learning_rate": 3.419595217581127e-06, + "loss": 0.7729, + "step": 8615 + }, + { + "epoch": 1.23, + "grad_norm": 8.249442685529331, + "learning_rate": 
3.418499718247702e-06, + "loss": 0.748, + "step": 8616 + }, + { + "epoch": 1.23, + "grad_norm": 8.98534120877661, + "learning_rate": 3.417404303267099e-06, + "loss": 0.7173, + "step": 8617 + }, + { + "epoch": 1.23, + "grad_norm": 7.413853013430948, + "learning_rate": 3.4163089726977474e-06, + "loss": 0.7782, + "step": 8618 + }, + { + "epoch": 1.23, + "grad_norm": 7.84925723123179, + "learning_rate": 3.4152137265980658e-06, + "loss": 0.7372, + "step": 8619 + }, + { + "epoch": 1.23, + "grad_norm": 7.216155586661951, + "learning_rate": 3.414118565026474e-06, + "loss": 0.6598, + "step": 8620 + }, + { + "epoch": 1.23, + "grad_norm": 8.969092599844837, + "learning_rate": 3.4130234880413816e-06, + "loss": 0.7352, + "step": 8621 + }, + { + "epoch": 1.23, + "grad_norm": 8.795144170367335, + "learning_rate": 3.4119284957011993e-06, + "loss": 0.7422, + "step": 8622 + }, + { + "epoch": 1.23, + "grad_norm": 5.728198521961956, + "learning_rate": 3.4108335880643325e-06, + "loss": 0.7667, + "step": 8623 + }, + { + "epoch": 1.23, + "grad_norm": 8.51480865446728, + "learning_rate": 3.409738765189176e-06, + "loss": 0.787, + "step": 8624 + }, + { + "epoch": 1.23, + "grad_norm": 10.68475180907938, + "learning_rate": 3.408644027134128e-06, + "loss": 0.8132, + "step": 8625 + }, + { + "epoch": 1.23, + "grad_norm": 7.494690567667348, + "learning_rate": 3.40754937395758e-06, + "loss": 0.767, + "step": 8626 + }, + { + "epoch": 1.23, + "grad_norm": 8.150458018984757, + "learning_rate": 3.406454805717913e-06, + "loss": 0.7398, + "step": 8627 + }, + { + "epoch": 1.23, + "grad_norm": 9.232813377009073, + "learning_rate": 3.4053603224735133e-06, + "loss": 0.7438, + "step": 8628 + }, + { + "epoch": 1.23, + "grad_norm": 9.913400567165484, + "learning_rate": 3.4042659242827546e-06, + "loss": 0.7356, + "step": 8629 + }, + { + "epoch": 1.23, + "grad_norm": 9.04413428548478, + "learning_rate": 3.4031716112040082e-06, + "loss": 0.7699, + "step": 8630 + }, + { + "epoch": 1.23, + "grad_norm": 
9.077601933911533, + "learning_rate": 3.4020773832956456e-06, + "loss": 0.7306, + "step": 8631 + }, + { + "epoch": 1.23, + "grad_norm": 8.20587087622214, + "learning_rate": 3.4009832406160258e-06, + "loss": 0.6962, + "step": 8632 + }, + { + "epoch": 1.23, + "grad_norm": 7.000685479947115, + "learning_rate": 3.3998891832235113e-06, + "loss": 0.7709, + "step": 8633 + }, + { + "epoch": 1.23, + "grad_norm": 9.211479914329479, + "learning_rate": 3.398795211176451e-06, + "loss": 0.703, + "step": 8634 + }, + { + "epoch": 1.23, + "grad_norm": 11.254528246490136, + "learning_rate": 3.397701324533198e-06, + "loss": 0.7346, + "step": 8635 + }, + { + "epoch": 1.23, + "grad_norm": 10.098514556215424, + "learning_rate": 3.3966075233520977e-06, + "loss": 0.731, + "step": 8636 + }, + { + "epoch": 1.23, + "grad_norm": 8.59270204412961, + "learning_rate": 3.3955138076914874e-06, + "loss": 0.7093, + "step": 8637 + }, + { + "epoch": 1.23, + "grad_norm": 12.102797244419156, + "learning_rate": 3.3944201776097056e-06, + "loss": 0.7778, + "step": 8638 + }, + { + "epoch": 1.23, + "grad_norm": 9.79611389389473, + "learning_rate": 3.3933266331650818e-06, + "loss": 0.7835, + "step": 8639 + }, + { + "epoch": 1.23, + "grad_norm": 9.314682789994697, + "learning_rate": 3.392233174415942e-06, + "loss": 0.7849, + "step": 8640 + }, + { + "epoch": 1.23, + "grad_norm": 9.074816687502272, + "learning_rate": 3.391139801420612e-06, + "loss": 0.7289, + "step": 8641 + }, + { + "epoch": 1.23, + "grad_norm": 7.074469575614851, + "learning_rate": 3.3900465142374052e-06, + "loss": 0.7288, + "step": 8642 + }, + { + "epoch": 1.23, + "grad_norm": 8.734302283243563, + "learning_rate": 3.388953312924637e-06, + "loss": 0.7397, + "step": 8643 + }, + { + "epoch": 1.23, + "grad_norm": 10.817225579647648, + "learning_rate": 3.3878601975406134e-06, + "loss": 0.7465, + "step": 8644 + }, + { + "epoch": 1.23, + "grad_norm": 14.371983602040224, + "learning_rate": 3.3867671681436392e-06, + "loss": 0.8169, + "step": 8645 + }, 
+ { + "epoch": 1.23, + "grad_norm": 9.276519135732034, + "learning_rate": 3.385674224792016e-06, + "loss": 0.7098, + "step": 8646 + }, + { + "epoch": 1.23, + "grad_norm": 8.3898022425631, + "learning_rate": 3.384581367544033e-06, + "loss": 0.7936, + "step": 8647 + }, + { + "epoch": 1.23, + "grad_norm": 7.495050798892511, + "learning_rate": 3.383488596457986e-06, + "loss": 0.7764, + "step": 8648 + }, + { + "epoch": 1.23, + "grad_norm": 9.548276845879755, + "learning_rate": 3.3823959115921566e-06, + "loss": 0.7779, + "step": 8649 + }, + { + "epoch": 1.23, + "grad_norm": 8.933853155297474, + "learning_rate": 3.3813033130048257e-06, + "loss": 0.7595, + "step": 8650 + }, + { + "epoch": 1.23, + "grad_norm": 7.911641658675772, + "learning_rate": 3.380210800754273e-06, + "loss": 0.7588, + "step": 8651 + }, + { + "epoch": 1.23, + "grad_norm": 10.976186515871802, + "learning_rate": 3.3791183748987654e-06, + "loss": 0.8389, + "step": 8652 + }, + { + "epoch": 1.23, + "grad_norm": 9.335949778224542, + "learning_rate": 3.3780260354965733e-06, + "loss": 0.7926, + "step": 8653 + }, + { + "epoch": 1.23, + "grad_norm": 6.333661754576154, + "learning_rate": 3.3769337826059565e-06, + "loss": 0.7119, + "step": 8654 + }, + { + "epoch": 1.23, + "grad_norm": 8.706352400957874, + "learning_rate": 3.3758416162851732e-06, + "loss": 0.7696, + "step": 8655 + }, + { + "epoch": 1.23, + "grad_norm": 8.379892538317819, + "learning_rate": 3.3747495365924786e-06, + "loss": 0.7256, + "step": 8656 + }, + { + "epoch": 1.23, + "grad_norm": 7.138249218027695, + "learning_rate": 3.3736575435861195e-06, + "loss": 0.7971, + "step": 8657 + }, + { + "epoch": 1.23, + "grad_norm": 9.922603265789476, + "learning_rate": 3.372565637324341e-06, + "loss": 0.807, + "step": 8658 + }, + { + "epoch": 1.23, + "grad_norm": 6.318889072291844, + "learning_rate": 3.3714738178653793e-06, + "loss": 0.693, + "step": 8659 + }, + { + "epoch": 1.24, + "grad_norm": 7.5275981569531005, + "learning_rate": 3.370382085267472e-06, + 
"loss": 0.7671, + "step": 8660 + }, + { + "epoch": 1.24, + "grad_norm": 8.549291824379502, + "learning_rate": 3.369290439588849e-06, + "loss": 0.7531, + "step": 8661 + }, + { + "epoch": 1.24, + "grad_norm": 6.9670130200877445, + "learning_rate": 3.3681988808877337e-06, + "loss": 0.7844, + "step": 8662 + }, + { + "epoch": 1.24, + "grad_norm": 10.965800276769704, + "learning_rate": 3.3671074092223495e-06, + "loss": 0.764, + "step": 8663 + }, + { + "epoch": 1.24, + "grad_norm": 11.869754754519406, + "learning_rate": 3.36601602465091e-06, + "loss": 0.7787, + "step": 8664 + }, + { + "epoch": 1.24, + "grad_norm": 9.71156772317947, + "learning_rate": 3.364924727231626e-06, + "loss": 0.7525, + "step": 8665 + }, + { + "epoch": 1.24, + "grad_norm": 8.859731437685493, + "learning_rate": 3.3638335170227075e-06, + "loss": 0.769, + "step": 8666 + }, + { + "epoch": 1.24, + "grad_norm": 9.546385402734472, + "learning_rate": 3.362742394082355e-06, + "loss": 0.7126, + "step": 8667 + }, + { + "epoch": 1.24, + "grad_norm": 7.778311425880162, + "learning_rate": 3.3616513584687644e-06, + "loss": 0.8281, + "step": 8668 + }, + { + "epoch": 1.24, + "grad_norm": 9.700624065244243, + "learning_rate": 3.360560410240132e-06, + "loss": 0.7402, + "step": 8669 + }, + { + "epoch": 1.24, + "grad_norm": 8.546023062806267, + "learning_rate": 3.3594695494546426e-06, + "loss": 0.7606, + "step": 8670 + }, + { + "epoch": 1.24, + "grad_norm": 8.200725735129932, + "learning_rate": 3.3583787761704823e-06, + "loss": 0.7099, + "step": 8671 + }, + { + "epoch": 1.24, + "grad_norm": 8.727888381845466, + "learning_rate": 3.3572880904458267e-06, + "loss": 0.7533, + "step": 8672 + }, + { + "epoch": 1.24, + "grad_norm": 6.533925685801008, + "learning_rate": 3.356197492338853e-06, + "loss": 0.765, + "step": 8673 + }, + { + "epoch": 1.24, + "grad_norm": 9.002427376339563, + "learning_rate": 3.3551069819077297e-06, + "loss": 0.7369, + "step": 8674 + }, + { + "epoch": 1.24, + "grad_norm": 10.140587055734983, + 
"learning_rate": 3.35401655921062e-06, + "loss": 0.745, + "step": 8675 + }, + { + "epoch": 1.24, + "grad_norm": 10.54236128568077, + "learning_rate": 3.352926224305687e-06, + "loss": 0.7694, + "step": 8676 + }, + { + "epoch": 1.24, + "grad_norm": 9.639263128927054, + "learning_rate": 3.351835977251084e-06, + "loss": 0.715, + "step": 8677 + }, + { + "epoch": 1.24, + "grad_norm": 9.616913853119458, + "learning_rate": 3.3507458181049613e-06, + "loss": 0.6883, + "step": 8678 + }, + { + "epoch": 1.24, + "grad_norm": 5.329690057716211, + "learning_rate": 3.3496557469254674e-06, + "loss": 0.7739, + "step": 8679 + }, + { + "epoch": 1.24, + "grad_norm": 10.410258177357575, + "learning_rate": 3.3485657637707415e-06, + "loss": 0.799, + "step": 8680 + }, + { + "epoch": 1.24, + "grad_norm": 7.501682214615927, + "learning_rate": 3.347475868698922e-06, + "loss": 0.7537, + "step": 8681 + }, + { + "epoch": 1.24, + "grad_norm": 10.286231176814404, + "learning_rate": 3.3463860617681378e-06, + "loss": 0.6979, + "step": 8682 + }, + { + "epoch": 1.24, + "grad_norm": 9.21377228356717, + "learning_rate": 3.3452963430365193e-06, + "loss": 0.7169, + "step": 8683 + }, + { + "epoch": 1.24, + "grad_norm": 10.387865603042417, + "learning_rate": 3.344206712562189e-06, + "loss": 0.7792, + "step": 8684 + }, + { + "epoch": 1.24, + "grad_norm": 8.377330048393173, + "learning_rate": 3.343117170403261e-06, + "loss": 0.6792, + "step": 8685 + }, + { + "epoch": 1.24, + "grad_norm": 8.400866049111219, + "learning_rate": 3.342027716617854e-06, + "loss": 0.7896, + "step": 8686 + }, + { + "epoch": 1.24, + "grad_norm": 9.802128794587032, + "learning_rate": 3.3409383512640726e-06, + "loss": 0.8048, + "step": 8687 + }, + { + "epoch": 1.24, + "grad_norm": 7.132044080786985, + "learning_rate": 3.33984907440002e-06, + "loss": 0.7153, + "step": 8688 + }, + { + "epoch": 1.24, + "grad_norm": 9.429267470083802, + "learning_rate": 3.3387598860837987e-06, + "loss": 0.7226, + "step": 8689 + }, + { + "epoch": 1.24, + 
"grad_norm": 10.621159108834796, + "learning_rate": 3.3376707863735004e-06, + "loss": 0.7808, + "step": 8690 + }, + { + "epoch": 1.24, + "grad_norm": 9.978858513541008, + "learning_rate": 3.336581775327216e-06, + "loss": 0.7431, + "step": 8691 + }, + { + "epoch": 1.24, + "grad_norm": 9.894933353546934, + "learning_rate": 3.335492853003028e-06, + "loss": 0.7192, + "step": 8692 + }, + { + "epoch": 1.24, + "grad_norm": 7.529549144131811, + "learning_rate": 3.3344040194590187e-06, + "loss": 0.771, + "step": 8693 + }, + { + "epoch": 1.24, + "grad_norm": 11.972577995880577, + "learning_rate": 3.333315274753264e-06, + "loss": 0.7839, + "step": 8694 + }, + { + "epoch": 1.24, + "grad_norm": 8.568859337944055, + "learning_rate": 3.3322266189438313e-06, + "loss": 0.7366, + "step": 8695 + }, + { + "epoch": 1.24, + "grad_norm": 6.681742172823675, + "learning_rate": 3.3311380520887904e-06, + "loss": 0.729, + "step": 8696 + }, + { + "epoch": 1.24, + "grad_norm": 9.65882154983858, + "learning_rate": 3.330049574246198e-06, + "loss": 0.6713, + "step": 8697 + }, + { + "epoch": 1.24, + "grad_norm": 7.556616890767536, + "learning_rate": 3.3289611854741143e-06, + "loss": 0.7282, + "step": 8698 + }, + { + "epoch": 1.24, + "grad_norm": 10.372112402817308, + "learning_rate": 3.327872885830591e-06, + "loss": 0.7751, + "step": 8699 + }, + { + "epoch": 1.24, + "grad_norm": 7.944745975331767, + "learning_rate": 3.326784675373671e-06, + "loss": 0.8691, + "step": 8700 + }, + { + "epoch": 1.24, + "grad_norm": 10.715668273139602, + "learning_rate": 3.325696554161401e-06, + "loss": 0.7352, + "step": 8701 + }, + { + "epoch": 1.24, + "grad_norm": 10.181458015917768, + "learning_rate": 3.3246085222518154e-06, + "loss": 0.7467, + "step": 8702 + }, + { + "epoch": 1.24, + "grad_norm": 9.05327419324768, + "learning_rate": 3.323520579702947e-06, + "loss": 0.7577, + "step": 8703 + }, + { + "epoch": 1.24, + "grad_norm": 5.992780403088068, + "learning_rate": 3.3224327265728264e-06, + "loss": 0.6843, + "step": 
8704 + }, + { + "epoch": 1.24, + "grad_norm": 9.346889545064684, + "learning_rate": 3.3213449629194734e-06, + "loss": 0.7589, + "step": 8705 + }, + { + "epoch": 1.24, + "grad_norm": 10.62256432476998, + "learning_rate": 3.320257288800908e-06, + "loss": 0.7722, + "step": 8706 + }, + { + "epoch": 1.24, + "grad_norm": 7.922372140548819, + "learning_rate": 3.319169704275142e-06, + "loss": 0.6778, + "step": 8707 + }, + { + "epoch": 1.24, + "grad_norm": 8.30904638468876, + "learning_rate": 3.3180822094001864e-06, + "loss": 0.8472, + "step": 8708 + }, + { + "epoch": 1.24, + "grad_norm": 9.28218016528198, + "learning_rate": 3.3169948042340445e-06, + "loss": 0.7205, + "step": 8709 + }, + { + "epoch": 1.24, + "grad_norm": 7.765438721040624, + "learning_rate": 3.3159074888347132e-06, + "loss": 0.765, + "step": 8710 + }, + { + "epoch": 1.24, + "grad_norm": 6.672377071892402, + "learning_rate": 3.31482026326019e-06, + "loss": 0.7848, + "step": 8711 + }, + { + "epoch": 1.24, + "grad_norm": 8.493365186764212, + "learning_rate": 3.3137331275684643e-06, + "loss": 0.7668, + "step": 8712 + }, + { + "epoch": 1.24, + "grad_norm": 9.373454379247697, + "learning_rate": 3.3126460818175177e-06, + "loss": 0.7205, + "step": 8713 + }, + { + "epoch": 1.24, + "grad_norm": 10.298725954801583, + "learning_rate": 3.311559126065334e-06, + "loss": 0.8089, + "step": 8714 + }, + { + "epoch": 1.24, + "grad_norm": 10.6308756579953, + "learning_rate": 3.310472260369886e-06, + "loss": 0.733, + "step": 8715 + }, + { + "epoch": 1.24, + "grad_norm": 7.584828846455866, + "learning_rate": 3.3093854847891436e-06, + "loss": 0.7205, + "step": 8716 + }, + { + "epoch": 1.24, + "grad_norm": 9.755754696254106, + "learning_rate": 3.3082987993810756e-06, + "loss": 0.8003, + "step": 8717 + }, + { + "epoch": 1.24, + "grad_norm": 10.46320649185379, + "learning_rate": 3.3072122042036404e-06, + "loss": 0.7126, + "step": 8718 + }, + { + "epoch": 1.24, + "grad_norm": 8.234192222201493, + "learning_rate": 
3.3061256993147944e-06, + "loss": 0.7945, + "step": 8719 + }, + { + "epoch": 1.24, + "grad_norm": 11.769923908728563, + "learning_rate": 3.305039284772487e-06, + "loss": 0.6839, + "step": 8720 + }, + { + "epoch": 1.24, + "grad_norm": 7.8501800561005854, + "learning_rate": 3.3039529606346675e-06, + "loss": 0.7596, + "step": 8721 + }, + { + "epoch": 1.24, + "grad_norm": 7.859587307344062, + "learning_rate": 3.3028667269592766e-06, + "loss": 0.8085, + "step": 8722 + }, + { + "epoch": 1.24, + "grad_norm": 9.811489644188889, + "learning_rate": 3.301780583804249e-06, + "loss": 0.6753, + "step": 8723 + }, + { + "epoch": 1.24, + "grad_norm": 11.463829678696209, + "learning_rate": 3.3006945312275196e-06, + "loss": 0.838, + "step": 8724 + }, + { + "epoch": 1.24, + "grad_norm": 10.044482088824948, + "learning_rate": 3.2996085692870135e-06, + "loss": 0.7147, + "step": 8725 + }, + { + "epoch": 1.24, + "grad_norm": 7.196147206738648, + "learning_rate": 3.2985226980406515e-06, + "loss": 0.7442, + "step": 8726 + }, + { + "epoch": 1.24, + "grad_norm": 8.93597563400865, + "learning_rate": 3.297436917546355e-06, + "loss": 0.7715, + "step": 8727 + }, + { + "epoch": 1.24, + "grad_norm": 9.330240296917523, + "learning_rate": 3.2963512278620336e-06, + "loss": 0.6935, + "step": 8728 + }, + { + "epoch": 1.24, + "grad_norm": 10.565536695376856, + "learning_rate": 3.295265629045596e-06, + "loss": 0.7415, + "step": 8729 + }, + { + "epoch": 1.25, + "grad_norm": 11.233823377127088, + "learning_rate": 3.294180121154943e-06, + "loss": 0.7283, + "step": 8730 + }, + { + "epoch": 1.25, + "grad_norm": 9.00586489856888, + "learning_rate": 3.2930947042479745e-06, + "loss": 0.7792, + "step": 8731 + }, + { + "epoch": 1.25, + "grad_norm": 7.354177386694398, + "learning_rate": 3.292009378382585e-06, + "loss": 0.7043, + "step": 8732 + }, + { + "epoch": 1.25, + "grad_norm": 9.912348121395105, + "learning_rate": 3.2909241436166583e-06, + "loss": 0.7535, + "step": 8733 + }, + { + "epoch": 1.25, + "grad_norm": 
7.72533278201334, + "learning_rate": 3.289839000008084e-06, + "loss": 0.7932, + "step": 8734 + }, + { + "epoch": 1.25, + "grad_norm": 8.263160163896275, + "learning_rate": 3.288753947614734e-06, + "loss": 0.7584, + "step": 8735 + }, + { + "epoch": 1.25, + "grad_norm": 8.082687791988457, + "learning_rate": 3.2876689864944854e-06, + "loss": 0.6814, + "step": 8736 + }, + { + "epoch": 1.25, + "grad_norm": 8.388836332573979, + "learning_rate": 3.286584116705209e-06, + "loss": 0.7855, + "step": 8737 + }, + { + "epoch": 1.25, + "grad_norm": 9.488659904274439, + "learning_rate": 3.2854993383047646e-06, + "loss": 0.7159, + "step": 8738 + }, + { + "epoch": 1.25, + "grad_norm": 8.845498179286535, + "learning_rate": 3.2844146513510144e-06, + "loss": 0.704, + "step": 8739 + }, + { + "epoch": 1.25, + "grad_norm": 7.500306328269236, + "learning_rate": 3.2833300559018113e-06, + "loss": 0.767, + "step": 8740 + }, + { + "epoch": 1.25, + "grad_norm": 9.482183473226753, + "learning_rate": 3.282245552015002e-06, + "loss": 0.797, + "step": 8741 + }, + { + "epoch": 1.25, + "grad_norm": 10.75829825168409, + "learning_rate": 3.281161139748437e-06, + "loss": 0.7172, + "step": 8742 + }, + { + "epoch": 1.25, + "grad_norm": 7.83933634340602, + "learning_rate": 3.2800768191599508e-06, + "loss": 0.7662, + "step": 8743 + }, + { + "epoch": 1.25, + "grad_norm": 6.414903356359749, + "learning_rate": 3.2789925903073805e-06, + "loss": 0.7006, + "step": 8744 + }, + { + "epoch": 1.25, + "grad_norm": 6.911198879128029, + "learning_rate": 3.2779084532485534e-06, + "loss": 0.6595, + "step": 8745 + }, + { + "epoch": 1.25, + "grad_norm": 8.647615969851088, + "learning_rate": 3.276824408041297e-06, + "loss": 0.7589, + "step": 8746 + }, + { + "epoch": 1.25, + "grad_norm": 8.994680059424825, + "learning_rate": 3.27574045474343e-06, + "loss": 0.7423, + "step": 8747 + }, + { + "epoch": 1.25, + "grad_norm": 8.98771364664103, + "learning_rate": 3.2746565934127674e-06, + "loss": 0.7604, + "step": 8748 + }, + { + 
"epoch": 1.25, + "grad_norm": 7.38463594913288, + "learning_rate": 3.27357282410712e-06, + "loss": 0.7433, + "step": 8749 + }, + { + "epoch": 1.25, + "grad_norm": 6.1780274183843655, + "learning_rate": 3.2724891468842925e-06, + "loss": 0.7001, + "step": 8750 + }, + { + "epoch": 1.25, + "grad_norm": 7.282442911637532, + "learning_rate": 3.2714055618020846e-06, + "loss": 0.7314, + "step": 8751 + }, + { + "epoch": 1.25, + "grad_norm": 10.250986600435123, + "learning_rate": 3.2703220689182937e-06, + "loss": 0.7405, + "step": 8752 + }, + { + "epoch": 1.25, + "grad_norm": 10.32321604865653, + "learning_rate": 3.2692386682907085e-06, + "loss": 0.7377, + "step": 8753 + }, + { + "epoch": 1.25, + "grad_norm": 11.33205506924742, + "learning_rate": 3.2681553599771138e-06, + "loss": 0.8126, + "step": 8754 + }, + { + "epoch": 1.25, + "grad_norm": 8.690275648308795, + "learning_rate": 3.2670721440352935e-06, + "loss": 0.7331, + "step": 8755 + }, + { + "epoch": 1.25, + "grad_norm": 10.740067504289769, + "learning_rate": 3.265989020523021e-06, + "loss": 0.7251, + "step": 8756 + }, + { + "epoch": 1.25, + "grad_norm": 7.544267981092203, + "learning_rate": 3.2649059894980684e-06, + "loss": 0.739, + "step": 8757 + }, + { + "epoch": 1.25, + "grad_norm": 9.285430468810521, + "learning_rate": 3.263823051018198e-06, + "loss": 0.742, + "step": 8758 + }, + { + "epoch": 1.25, + "grad_norm": 9.12721655228059, + "learning_rate": 3.262740205141175e-06, + "loss": 0.6436, + "step": 8759 + }, + { + "epoch": 1.25, + "grad_norm": 10.647617801156766, + "learning_rate": 3.2616574519247545e-06, + "loss": 0.7769, + "step": 8760 + }, + { + "epoch": 1.25, + "grad_norm": 10.70623572333691, + "learning_rate": 3.260574791426685e-06, + "loss": 0.7645, + "step": 8761 + }, + { + "epoch": 1.25, + "grad_norm": 7.141721367214174, + "learning_rate": 3.259492223704716e-06, + "loss": 0.6746, + "step": 8762 + }, + { + "epoch": 1.25, + "grad_norm": 8.13451409920607, + "learning_rate": 3.2584097488165866e-06, + "loss": 
0.69, + "step": 8763 + }, + { + "epoch": 1.25, + "grad_norm": 8.831541607159277, + "learning_rate": 3.257327366820032e-06, + "loss": 0.735, + "step": 8764 + }, + { + "epoch": 1.25, + "grad_norm": 11.114132633734826, + "learning_rate": 3.2562450777727873e-06, + "loss": 0.6854, + "step": 8765 + }, + { + "epoch": 1.25, + "grad_norm": 7.5954674207831125, + "learning_rate": 3.2551628817325748e-06, + "loss": 0.7061, + "step": 8766 + }, + { + "epoch": 1.25, + "grad_norm": 8.65501521373747, + "learning_rate": 3.254080778757119e-06, + "loss": 0.7148, + "step": 8767 + }, + { + "epoch": 1.25, + "grad_norm": 7.260774897461294, + "learning_rate": 3.252998768904132e-06, + "loss": 0.7194, + "step": 8768 + }, + { + "epoch": 1.25, + "grad_norm": 10.231243757377126, + "learning_rate": 3.2519168522313292e-06, + "loss": 0.7724, + "step": 8769 + }, + { + "epoch": 1.25, + "grad_norm": 11.833368945835645, + "learning_rate": 3.250835028796417e-06, + "loss": 0.7353, + "step": 8770 + }, + { + "epoch": 1.25, + "grad_norm": 10.812957616146933, + "learning_rate": 3.2497532986570934e-06, + "loss": 0.6924, + "step": 8771 + }, + { + "epoch": 1.25, + "grad_norm": 8.264858346025374, + "learning_rate": 3.248671661871059e-06, + "loss": 0.7559, + "step": 8772 + }, + { + "epoch": 1.25, + "grad_norm": 9.225683825360077, + "learning_rate": 3.2475901184960017e-06, + "loss": 0.7763, + "step": 8773 + }, + { + "epoch": 1.25, + "grad_norm": 10.396722277174401, + "learning_rate": 3.2465086685896093e-06, + "loss": 0.7718, + "step": 8774 + }, + { + "epoch": 1.25, + "grad_norm": 8.217016356574264, + "learning_rate": 3.245427312209565e-06, + "loss": 0.7566, + "step": 8775 + }, + { + "epoch": 1.25, + "grad_norm": 10.232054949355703, + "learning_rate": 3.2443460494135436e-06, + "loss": 0.8071, + "step": 8776 + }, + { + "epoch": 1.25, + "grad_norm": 9.952857266463928, + "learning_rate": 3.2432648802592184e-06, + "loss": 0.7529, + "step": 8777 + }, + { + "epoch": 1.25, + "grad_norm": 8.6201001893014, + 
"learning_rate": 3.2421838048042516e-06, + "loss": 0.7435, + "step": 8778 + }, + { + "epoch": 1.25, + "grad_norm": 7.147981986327341, + "learning_rate": 3.241102823106309e-06, + "loss": 0.7078, + "step": 8779 + }, + { + "epoch": 1.25, + "grad_norm": 6.539605331666905, + "learning_rate": 3.2400219352230466e-06, + "loss": 0.7936, + "step": 8780 + }, + { + "epoch": 1.25, + "grad_norm": 13.012835684215114, + "learning_rate": 3.2389411412121153e-06, + "loss": 0.7156, + "step": 8781 + }, + { + "epoch": 1.25, + "grad_norm": 6.923998773315342, + "learning_rate": 3.237860441131162e-06, + "loss": 0.6761, + "step": 8782 + }, + { + "epoch": 1.25, + "grad_norm": 9.762569351123966, + "learning_rate": 3.2367798350378254e-06, + "loss": 0.7044, + "step": 8783 + }, + { + "epoch": 1.25, + "grad_norm": 8.38835267263443, + "learning_rate": 3.235699322989746e-06, + "loss": 0.748, + "step": 8784 + }, + { + "epoch": 1.25, + "grad_norm": 7.261054372657833, + "learning_rate": 3.234618905044554e-06, + "loss": 0.77, + "step": 8785 + }, + { + "epoch": 1.25, + "grad_norm": 8.53274558250064, + "learning_rate": 3.2335385812598745e-06, + "loss": 0.7874, + "step": 8786 + }, + { + "epoch": 1.25, + "grad_norm": 9.847464495894641, + "learning_rate": 3.232458351693332e-06, + "loss": 0.6782, + "step": 8787 + }, + { + "epoch": 1.25, + "grad_norm": 9.577263700495767, + "learning_rate": 3.23137821640254e-06, + "loss": 0.7786, + "step": 8788 + }, + { + "epoch": 1.25, + "grad_norm": 7.182083836211842, + "learning_rate": 3.2302981754451095e-06, + "loss": 0.7758, + "step": 8789 + }, + { + "epoch": 1.25, + "grad_norm": 11.033749977305508, + "learning_rate": 3.229218228878651e-06, + "loss": 0.8247, + "step": 8790 + }, + { + "epoch": 1.25, + "grad_norm": 7.820586473994385, + "learning_rate": 3.2281383767607617e-06, + "loss": 0.6807, + "step": 8791 + }, + { + "epoch": 1.25, + "grad_norm": 13.378482947370482, + "learning_rate": 3.2270586191490404e-06, + "loss": 0.8215, + "step": 8792 + }, + { + "epoch": 1.25, + 
"grad_norm": 12.500443009292535, + "learning_rate": 3.225978956101075e-06, + "loss": 0.8359, + "step": 8793 + }, + { + "epoch": 1.25, + "grad_norm": 10.451435665198224, + "learning_rate": 3.2248993876744557e-06, + "loss": 0.7463, + "step": 8794 + }, + { + "epoch": 1.25, + "grad_norm": 9.55412250436727, + "learning_rate": 3.223819913926763e-06, + "loss": 0.7153, + "step": 8795 + }, + { + "epoch": 1.25, + "grad_norm": 11.005583077911073, + "learning_rate": 3.2227405349155693e-06, + "loss": 0.7138, + "step": 8796 + }, + { + "epoch": 1.25, + "grad_norm": 5.873782341594122, + "learning_rate": 3.22166125069845e-06, + "loss": 0.8012, + "step": 8797 + }, + { + "epoch": 1.25, + "grad_norm": 11.582868646301156, + "learning_rate": 3.220582061332971e-06, + "loss": 0.7993, + "step": 8798 + }, + { + "epoch": 1.25, + "grad_norm": 8.26029466612179, + "learning_rate": 3.219502966876689e-06, + "loss": 0.7994, + "step": 8799 + }, + { + "epoch": 1.25, + "grad_norm": 10.790128759856463, + "learning_rate": 3.2184239673871654e-06, + "loss": 0.8287, + "step": 8800 + }, + { + "epoch": 1.26, + "grad_norm": 8.354099901443675, + "learning_rate": 3.2173450629219465e-06, + "loss": 0.7398, + "step": 8801 + }, + { + "epoch": 1.26, + "grad_norm": 10.144650417879669, + "learning_rate": 3.216266253538579e-06, + "loss": 0.7247, + "step": 8802 + }, + { + "epoch": 1.26, + "grad_norm": 8.826814931415283, + "learning_rate": 3.2151875392946073e-06, + "loss": 0.6695, + "step": 8803 + }, + { + "epoch": 1.26, + "grad_norm": 9.681540053693983, + "learning_rate": 3.2141089202475628e-06, + "loss": 0.7129, + "step": 8804 + }, + { + "epoch": 1.26, + "grad_norm": 12.638418731497048, + "learning_rate": 3.213030396454978e-06, + "loss": 0.7899, + "step": 8805 + }, + { + "epoch": 1.26, + "grad_norm": 10.24716072553643, + "learning_rate": 3.211951967974376e-06, + "loss": 0.7442, + "step": 8806 + }, + { + "epoch": 1.26, + "grad_norm": 8.586216124577325, + "learning_rate": 3.2108736348632806e-06, + "loss": 0.7083, + 
"step": 8807 + }, + { + "epoch": 1.26, + "grad_norm": 10.50408729829631, + "learning_rate": 3.2097953971792062e-06, + "loss": 0.7052, + "step": 8808 + }, + { + "epoch": 1.26, + "grad_norm": 9.10569670032897, + "learning_rate": 3.2087172549796597e-06, + "loss": 0.7178, + "step": 8809 + }, + { + "epoch": 1.26, + "grad_norm": 7.076892234300965, + "learning_rate": 3.2076392083221507e-06, + "loss": 0.7869, + "step": 8810 + }, + { + "epoch": 1.26, + "grad_norm": 7.34517238796353, + "learning_rate": 3.2065612572641768e-06, + "loss": 0.6883, + "step": 8811 + }, + { + "epoch": 1.26, + "grad_norm": 9.633837508175494, + "learning_rate": 3.205483401863232e-06, + "loss": 0.7779, + "step": 8812 + }, + { + "epoch": 1.26, + "grad_norm": 8.834160534422006, + "learning_rate": 3.2044056421768098e-06, + "loss": 0.7303, + "step": 8813 + }, + { + "epoch": 1.26, + "grad_norm": 8.28497854252242, + "learning_rate": 3.2033279782623917e-06, + "loss": 0.7259, + "step": 8814 + }, + { + "epoch": 1.26, + "grad_norm": 6.938367741274695, + "learning_rate": 3.2022504101774586e-06, + "loss": 0.6915, + "step": 8815 + }, + { + "epoch": 1.26, + "grad_norm": 7.12423048282314, + "learning_rate": 3.201172937979483e-06, + "loss": 0.6885, + "step": 8816 + }, + { + "epoch": 1.26, + "grad_norm": 8.77002162032933, + "learning_rate": 3.2000955617259364e-06, + "loss": 0.7674, + "step": 8817 + }, + { + "epoch": 1.26, + "grad_norm": 9.085382999340396, + "learning_rate": 3.199018281474284e-06, + "loss": 0.787, + "step": 8818 + }, + { + "epoch": 1.26, + "grad_norm": 8.972602013449773, + "learning_rate": 3.1979410972819813e-06, + "loss": 0.7418, + "step": 8819 + }, + { + "epoch": 1.26, + "grad_norm": 9.273509433118306, + "learning_rate": 3.196864009206487e-06, + "loss": 0.7046, + "step": 8820 + }, + { + "epoch": 1.26, + "grad_norm": 7.8977806295961965, + "learning_rate": 3.1957870173052446e-06, + "loss": 0.8457, + "step": 8821 + }, + { + "epoch": 1.26, + "grad_norm": 10.088799523539167, + "learning_rate": 
3.1947101216357e-06, + "loss": 0.693, + "step": 8822 + }, + { + "epoch": 1.26, + "grad_norm": 7.800363134014852, + "learning_rate": 3.1936333222552968e-06, + "loss": 0.7183, + "step": 8823 + }, + { + "epoch": 1.26, + "grad_norm": 8.749776021917189, + "learning_rate": 3.19255661922146e-06, + "loss": 0.7191, + "step": 8824 + }, + { + "epoch": 1.26, + "grad_norm": 8.700366028937967, + "learning_rate": 3.1914800125916248e-06, + "loss": 0.6984, + "step": 8825 + }, + { + "epoch": 1.26, + "grad_norm": 7.533178730419025, + "learning_rate": 3.1904035024232105e-06, + "loss": 0.7282, + "step": 8826 + }, + { + "epoch": 1.26, + "grad_norm": 7.14483542778819, + "learning_rate": 3.189327088773635e-06, + "loss": 0.767, + "step": 8827 + }, + { + "epoch": 1.26, + "grad_norm": 8.028383520167475, + "learning_rate": 3.1882507717003153e-06, + "loss": 0.8724, + "step": 8828 + }, + { + "epoch": 1.26, + "grad_norm": 8.090808508872433, + "learning_rate": 3.1871745512606543e-06, + "loss": 0.7446, + "step": 8829 + }, + { + "epoch": 1.26, + "grad_norm": 6.2250673404095, + "learning_rate": 3.1860984275120587e-06, + "loss": 0.7564, + "step": 8830 + }, + { + "epoch": 1.26, + "grad_norm": 13.102759655212141, + "learning_rate": 3.1850224005119212e-06, + "loss": 0.763, + "step": 8831 + }, + { + "epoch": 1.26, + "grad_norm": 7.201048456689397, + "learning_rate": 3.183946470317638e-06, + "loss": 0.7621, + "step": 8832 + }, + { + "epoch": 1.26, + "grad_norm": 9.4558011700291, + "learning_rate": 3.1828706369865963e-06, + "loss": 0.7628, + "step": 8833 + }, + { + "epoch": 1.26, + "grad_norm": 8.97027560054332, + "learning_rate": 3.181794900576175e-06, + "loss": 0.787, + "step": 8834 + }, + { + "epoch": 1.26, + "grad_norm": 10.18051546234774, + "learning_rate": 3.1807192611437544e-06, + "loss": 0.6897, + "step": 8835 + }, + { + "epoch": 1.26, + "grad_norm": 8.17663452657051, + "learning_rate": 3.179643718746703e-06, + "loss": 0.7493, + "step": 8836 + }, + { + "epoch": 1.26, + "grad_norm": 
7.583376692153134, + "learning_rate": 3.1785682734423883e-06, + "loss": 0.6892, + "step": 8837 + }, + { + "epoch": 1.26, + "grad_norm": 8.576678620625868, + "learning_rate": 3.1774929252881734e-06, + "loss": 0.7759, + "step": 8838 + }, + { + "epoch": 1.26, + "grad_norm": 7.988838250537783, + "learning_rate": 3.176417674341412e-06, + "loss": 0.71, + "step": 8839 + }, + { + "epoch": 1.26, + "grad_norm": 11.031043161031281, + "learning_rate": 3.175342520659456e-06, + "loss": 0.745, + "step": 8840 + }, + { + "epoch": 1.26, + "grad_norm": 11.418084386105992, + "learning_rate": 3.174267464299652e-06, + "loss": 0.779, + "step": 8841 + }, + { + "epoch": 1.26, + "grad_norm": 8.397584636446249, + "learning_rate": 3.173192505319338e-06, + "loss": 0.8231, + "step": 8842 + }, + { + "epoch": 1.26, + "grad_norm": 11.131629892103758, + "learning_rate": 3.1721176437758524e-06, + "loss": 0.7739, + "step": 8843 + }, + { + "epoch": 1.26, + "grad_norm": 10.285657043922523, + "learning_rate": 3.1710428797265213e-06, + "loss": 0.7888, + "step": 8844 + }, + { + "epoch": 1.26, + "grad_norm": 10.373643948069136, + "learning_rate": 3.1699682132286734e-06, + "loss": 0.7441, + "step": 8845 + }, + { + "epoch": 1.26, + "grad_norm": 9.035820774394173, + "learning_rate": 3.1688936443396285e-06, + "loss": 0.6746, + "step": 8846 + }, + { + "epoch": 1.26, + "grad_norm": 8.5334250505751, + "learning_rate": 3.1678191731166974e-06, + "loss": 0.7547, + "step": 8847 + }, + { + "epoch": 1.26, + "grad_norm": 6.8312522143261685, + "learning_rate": 3.1667447996171928e-06, + "loss": 0.7745, + "step": 8848 + }, + { + "epoch": 1.26, + "grad_norm": 9.084244804720905, + "learning_rate": 3.1656705238984177e-06, + "loss": 0.7184, + "step": 8849 + }, + { + "epoch": 1.26, + "grad_norm": 8.219481746864364, + "learning_rate": 3.1645963460176698e-06, + "loss": 0.7067, + "step": 8850 + }, + { + "epoch": 1.26, + "grad_norm": 9.385414968277999, + "learning_rate": 3.163522266032246e-06, + "loss": 0.8128, + "step": 8851 + }, 
+ { + "epoch": 1.26, + "grad_norm": 6.874923021491098, + "learning_rate": 3.1624482839994314e-06, + "loss": 0.7428, + "step": 8852 + }, + { + "epoch": 1.26, + "grad_norm": 7.596807246686393, + "learning_rate": 3.1613743999765116e-06, + "loss": 0.7185, + "step": 8853 + }, + { + "epoch": 1.26, + "grad_norm": 12.312296028838121, + "learning_rate": 3.1603006140207616e-06, + "loss": 0.7823, + "step": 8854 + }, + { + "epoch": 1.26, + "grad_norm": 9.07359314202201, + "learning_rate": 3.159226926189457e-06, + "loss": 0.675, + "step": 8855 + }, + { + "epoch": 1.26, + "grad_norm": 10.610397007073995, + "learning_rate": 3.1581533365398647e-06, + "loss": 0.7767, + "step": 8856 + }, + { + "epoch": 1.26, + "grad_norm": 9.672988049024658, + "learning_rate": 3.1570798451292452e-06, + "loss": 0.716, + "step": 8857 + }, + { + "epoch": 1.26, + "grad_norm": 7.721676323594637, + "learning_rate": 3.156006452014859e-06, + "loss": 0.6785, + "step": 8858 + }, + { + "epoch": 1.26, + "grad_norm": 11.135565596036043, + "learning_rate": 3.1549331572539544e-06, + "loss": 0.8172, + "step": 8859 + }, + { + "epoch": 1.26, + "grad_norm": 8.085906934054107, + "learning_rate": 3.153859960903779e-06, + "loss": 0.7874, + "step": 8860 + }, + { + "epoch": 1.26, + "grad_norm": 9.102784850499077, + "learning_rate": 3.1527868630215765e-06, + "loss": 0.7599, + "step": 8861 + }, + { + "epoch": 1.26, + "grad_norm": 13.13605253357834, + "learning_rate": 3.1517138636645793e-06, + "loss": 0.7186, + "step": 8862 + }, + { + "epoch": 1.26, + "grad_norm": 9.81762716019159, + "learning_rate": 3.1506409628900212e-06, + "loss": 0.7317, + "step": 8863 + }, + { + "epoch": 1.26, + "grad_norm": 8.018394659678062, + "learning_rate": 3.1495681607551244e-06, + "loss": 0.7504, + "step": 8864 + }, + { + "epoch": 1.26, + "grad_norm": 10.736457715802734, + "learning_rate": 3.148495457317112e-06, + "loss": 0.7317, + "step": 8865 + }, + { + "epoch": 1.26, + "grad_norm": 9.31633824653768, + "learning_rate": 3.1474228526332e-06, + 
"loss": 0.6775, + "step": 8866 + }, + { + "epoch": 1.26, + "grad_norm": 6.869142494817958, + "learning_rate": 3.146350346760595e-06, + "loss": 0.6533, + "step": 8867 + }, + { + "epoch": 1.26, + "grad_norm": 10.842969823330835, + "learning_rate": 3.1452779397565037e-06, + "loss": 0.7278, + "step": 8868 + }, + { + "epoch": 1.26, + "grad_norm": 6.831398772388119, + "learning_rate": 3.1442056316781224e-06, + "loss": 0.7036, + "step": 8869 + }, + { + "epoch": 1.26, + "grad_norm": 11.174388616774257, + "learning_rate": 3.143133422582648e-06, + "loss": 0.6937, + "step": 8870 + }, + { + "epoch": 1.27, + "grad_norm": 9.145294442357114, + "learning_rate": 3.1420613125272693e-06, + "loss": 0.6569, + "step": 8871 + }, + { + "epoch": 1.27, + "grad_norm": 10.439740812622695, + "learning_rate": 3.140989301569166e-06, + "loss": 0.7569, + "step": 8872 + }, + { + "epoch": 1.27, + "grad_norm": 10.654169109929596, + "learning_rate": 3.139917389765522e-06, + "loss": 0.7109, + "step": 8873 + }, + { + "epoch": 1.27, + "grad_norm": 9.287369722176605, + "learning_rate": 3.138845577173505e-06, + "loss": 0.8093, + "step": 8874 + }, + { + "epoch": 1.27, + "grad_norm": 7.469386173509699, + "learning_rate": 3.1377738638502834e-06, + "loss": 0.7221, + "step": 8875 + }, + { + "epoch": 1.27, + "grad_norm": 11.045618834116214, + "learning_rate": 3.136702249853023e-06, + "loss": 0.8239, + "step": 8876 + }, + { + "epoch": 1.27, + "grad_norm": 10.341965340814978, + "learning_rate": 3.135630735238876e-06, + "loss": 0.7009, + "step": 8877 + }, + { + "epoch": 1.27, + "grad_norm": 6.464319123744325, + "learning_rate": 3.134559320064998e-06, + "loss": 0.7918, + "step": 8878 + }, + { + "epoch": 1.27, + "grad_norm": 8.717859770090541, + "learning_rate": 3.133488004388531e-06, + "loss": 0.7308, + "step": 8879 + }, + { + "epoch": 1.27, + "grad_norm": 7.307643422640819, + "learning_rate": 3.1324167882666195e-06, + "loss": 0.7605, + "step": 8880 + }, + { + "epoch": 1.27, + "grad_norm": 10.862578565112583, + 
"learning_rate": 3.131345671756399e-06, + "loss": 0.7187, + "step": 8881 + }, + { + "epoch": 1.27, + "grad_norm": 6.813212966696381, + "learning_rate": 3.1302746549149964e-06, + "loss": 0.7346, + "step": 8882 + }, + { + "epoch": 1.27, + "grad_norm": 10.420812223663598, + "learning_rate": 3.129203737799541e-06, + "loss": 0.741, + "step": 8883 + }, + { + "epoch": 1.27, + "grad_norm": 10.3330180430427, + "learning_rate": 3.128132920467152e-06, + "loss": 0.8067, + "step": 8884 + }, + { + "epoch": 1.27, + "grad_norm": 7.958777130703696, + "learning_rate": 3.12706220297494e-06, + "loss": 0.6468, + "step": 8885 + }, + { + "epoch": 1.27, + "grad_norm": 9.102358884610709, + "learning_rate": 3.1259915853800193e-06, + "loss": 0.7035, + "step": 8886 + }, + { + "epoch": 1.27, + "grad_norm": 7.476810349062565, + "learning_rate": 3.1249210677394896e-06, + "loss": 0.6846, + "step": 8887 + }, + { + "epoch": 1.27, + "grad_norm": 9.55118915364762, + "learning_rate": 3.1238506501104503e-06, + "loss": 0.7738, + "step": 8888 + }, + { + "epoch": 1.27, + "grad_norm": 8.248217597442448, + "learning_rate": 3.1227803325499965e-06, + "loss": 0.8027, + "step": 8889 + }, + { + "epoch": 1.27, + "grad_norm": 8.203697061743794, + "learning_rate": 3.121710115115214e-06, + "loss": 0.7911, + "step": 8890 + }, + { + "epoch": 1.27, + "grad_norm": 7.854092533813913, + "learning_rate": 3.1206399978631862e-06, + "loss": 0.6896, + "step": 8891 + }, + { + "epoch": 1.27, + "grad_norm": 8.780184124049873, + "learning_rate": 3.1195699808509883e-06, + "loss": 0.8206, + "step": 8892 + }, + { + "epoch": 1.27, + "grad_norm": 8.952327876723382, + "learning_rate": 3.118500064135694e-06, + "loss": 0.756, + "step": 8893 + }, + { + "epoch": 1.27, + "grad_norm": 7.72827001560193, + "learning_rate": 3.11743024777437e-06, + "loss": 0.7219, + "step": 8894 + }, + { + "epoch": 1.27, + "grad_norm": 8.76926060771498, + "learning_rate": 3.116360531824074e-06, + "loss": 0.6975, + "step": 8895 + }, + { + "epoch": 1.27, + 
"grad_norm": 11.40405786234604, + "learning_rate": 3.1152909163418673e-06, + "loss": 0.743, + "step": 8896 + }, + { + "epoch": 1.27, + "grad_norm": 9.573230809098492, + "learning_rate": 3.114221401384795e-06, + "loss": 0.7078, + "step": 8897 + }, + { + "epoch": 1.27, + "grad_norm": 10.427509391302236, + "learning_rate": 3.1131519870099036e-06, + "loss": 0.7274, + "step": 8898 + }, + { + "epoch": 1.27, + "grad_norm": 8.931569290435428, + "learning_rate": 3.112082673274235e-06, + "loss": 0.7481, + "step": 8899 + }, + { + "epoch": 1.27, + "grad_norm": 11.230081649483937, + "learning_rate": 3.11101346023482e-06, + "loss": 0.7241, + "step": 8900 + }, + { + "epoch": 1.27, + "grad_norm": 10.237982697585055, + "learning_rate": 3.1099443479486906e-06, + "loss": 0.7526, + "step": 8901 + }, + { + "epoch": 1.27, + "grad_norm": 5.721658722995274, + "learning_rate": 3.108875336472866e-06, + "loss": 0.7489, + "step": 8902 + }, + { + "epoch": 1.27, + "grad_norm": 6.1930448972051755, + "learning_rate": 3.107806425864369e-06, + "loss": 0.8102, + "step": 8903 + }, + { + "epoch": 1.27, + "grad_norm": 10.32644720846958, + "learning_rate": 3.106737616180211e-06, + "loss": 0.7927, + "step": 8904 + }, + { + "epoch": 1.27, + "grad_norm": 10.443602801748685, + "learning_rate": 3.105668907477396e-06, + "loss": 0.7403, + "step": 8905 + }, + { + "epoch": 1.27, + "grad_norm": 10.24154292805218, + "learning_rate": 3.104600299812932e-06, + "loss": 0.7676, + "step": 8906 + }, + { + "epoch": 1.27, + "grad_norm": 7.6524690097484, + "learning_rate": 3.103531793243809e-06, + "loss": 0.7122, + "step": 8907 + }, + { + "epoch": 1.27, + "grad_norm": 6.89664110870498, + "learning_rate": 3.1024633878270216e-06, + "loss": 0.707, + "step": 8908 + }, + { + "epoch": 1.27, + "grad_norm": 9.114153409113861, + "learning_rate": 3.101395083619558e-06, + "loss": 0.6729, + "step": 8909 + }, + { + "epoch": 1.27, + "grad_norm": 9.764844619356285, + "learning_rate": 3.1003268806783932e-06, + "loss": 0.6812, + "step": 
8910 + }, + { + "epoch": 1.27, + "grad_norm": 10.804888019499144, + "learning_rate": 3.0992587790605067e-06, + "loss": 0.8158, + "step": 8911 + }, + { + "epoch": 1.27, + "grad_norm": 10.007886285462398, + "learning_rate": 3.098190778822865e-06, + "loss": 0.713, + "step": 8912 + }, + { + "epoch": 1.27, + "grad_norm": 8.558321069980776, + "learning_rate": 3.0971228800224323e-06, + "loss": 0.7141, + "step": 8913 + }, + { + "epoch": 1.27, + "grad_norm": 8.725759833222222, + "learning_rate": 3.096055082716171e-06, + "loss": 0.742, + "step": 8914 + }, + { + "epoch": 1.27, + "grad_norm": 7.784782198024677, + "learning_rate": 3.0949873869610302e-06, + "loss": 0.707, + "step": 8915 + }, + { + "epoch": 1.27, + "grad_norm": 10.585007528114003, + "learning_rate": 3.093919792813961e-06, + "loss": 0.7699, + "step": 8916 + }, + { + "epoch": 1.27, + "grad_norm": 9.973385153030694, + "learning_rate": 3.092852300331902e-06, + "loss": 0.8735, + "step": 8917 + }, + { + "epoch": 1.27, + "grad_norm": 7.368969308280155, + "learning_rate": 3.0917849095717947e-06, + "loss": 0.7826, + "step": 8918 + }, + { + "epoch": 1.27, + "grad_norm": 11.765200979184344, + "learning_rate": 3.0907176205905686e-06, + "loss": 0.7725, + "step": 8919 + }, + { + "epoch": 1.27, + "grad_norm": 10.946245367746728, + "learning_rate": 3.0896504334451483e-06, + "loss": 0.7729, + "step": 8920 + }, + { + "epoch": 1.27, + "grad_norm": 7.907922493506932, + "learning_rate": 3.0885833481924587e-06, + "loss": 0.747, + "step": 8921 + }, + { + "epoch": 1.27, + "grad_norm": 6.290057549601591, + "learning_rate": 3.0875163648894123e-06, + "loss": 0.7007, + "step": 8922 + }, + { + "epoch": 1.27, + "grad_norm": 6.311301091612017, + "learning_rate": 3.0864494835929182e-06, + "loss": 0.8057, + "step": 8923 + }, + { + "epoch": 1.27, + "grad_norm": 12.640075438252689, + "learning_rate": 3.085382704359884e-06, + "loss": 0.7595, + "step": 8924 + }, + { + "epoch": 1.27, + "grad_norm": 8.215791278900827, + "learning_rate": 
3.084316027247206e-06, + "loss": 0.7726, + "step": 8925 + }, + { + "epoch": 1.27, + "grad_norm": 8.556415155481119, + "learning_rate": 3.083249452311778e-06, + "loss": 0.7526, + "step": 8926 + }, + { + "epoch": 1.27, + "grad_norm": 10.759389238260438, + "learning_rate": 3.0821829796104906e-06, + "loss": 0.7155, + "step": 8927 + }, + { + "epoch": 1.27, + "grad_norm": 7.436432454472481, + "learning_rate": 3.0811166092002242e-06, + "loss": 0.8293, + "step": 8928 + }, + { + "epoch": 1.27, + "grad_norm": 6.967579368272514, + "learning_rate": 3.080050341137857e-06, + "loss": 0.7002, + "step": 8929 + }, + { + "epoch": 1.27, + "grad_norm": 9.219629157364166, + "learning_rate": 3.0789841754802583e-06, + "loss": 0.6926, + "step": 8930 + }, + { + "epoch": 1.27, + "grad_norm": 7.870996676197228, + "learning_rate": 3.0779181122842984e-06, + "loss": 0.8166, + "step": 8931 + }, + { + "epoch": 1.27, + "grad_norm": 9.345656907618553, + "learning_rate": 3.076852151606837e-06, + "loss": 0.7166, + "step": 8932 + }, + { + "epoch": 1.27, + "grad_norm": 8.005131045795657, + "learning_rate": 3.0757862935047266e-06, + "loss": 0.7674, + "step": 8933 + }, + { + "epoch": 1.27, + "grad_norm": 7.738398285249559, + "learning_rate": 3.0747205380348212e-06, + "loss": 0.7965, + "step": 8934 + }, + { + "epoch": 1.27, + "grad_norm": 8.702359398175151, + "learning_rate": 3.0736548852539617e-06, + "loss": 0.8095, + "step": 8935 + }, + { + "epoch": 1.27, + "grad_norm": 9.449930945997217, + "learning_rate": 3.0725893352189885e-06, + "loss": 0.7438, + "step": 8936 + }, + { + "epoch": 1.27, + "grad_norm": 9.240296039301713, + "learning_rate": 3.0715238879867363e-06, + "loss": 0.7384, + "step": 8937 + }, + { + "epoch": 1.27, + "grad_norm": 11.238066373734844, + "learning_rate": 3.0704585436140317e-06, + "loss": 0.7663, + "step": 8938 + }, + { + "epoch": 1.27, + "grad_norm": 9.458845751296947, + "learning_rate": 3.0693933021576983e-06, + "loss": 0.8132, + "step": 8939 + }, + { + "epoch": 1.27, + "grad_norm": 
9.862645403324263, + "learning_rate": 3.06832816367455e-06, + "loss": 0.7699, + "step": 8940 + }, + { + "epoch": 1.28, + "grad_norm": 8.27888281812696, + "learning_rate": 3.0672631282214016e-06, + "loss": 0.7408, + "step": 8941 + }, + { + "epoch": 1.28, + "grad_norm": 9.456057802139183, + "learning_rate": 3.0661981958550597e-06, + "loss": 0.7073, + "step": 8942 + }, + { + "epoch": 1.28, + "grad_norm": 8.34093440737591, + "learning_rate": 3.065133366632321e-06, + "loss": 0.6931, + "step": 8943 + }, + { + "epoch": 1.28, + "grad_norm": 6.922296307184566, + "learning_rate": 3.0640686406099846e-06, + "loss": 0.7112, + "step": 8944 + }, + { + "epoch": 1.28, + "grad_norm": 8.204771919542427, + "learning_rate": 3.0630040178448372e-06, + "loss": 0.6785, + "step": 8945 + }, + { + "epoch": 1.28, + "grad_norm": 8.7417177411559, + "learning_rate": 3.061939498393663e-06, + "loss": 0.7575, + "step": 8946 + }, + { + "epoch": 1.28, + "grad_norm": 8.105502231461244, + "learning_rate": 3.0608750823132428e-06, + "loss": 0.7812, + "step": 8947 + }, + { + "epoch": 1.28, + "grad_norm": 7.567019650938166, + "learning_rate": 3.0598107696603472e-06, + "loss": 0.7258, + "step": 8948 + }, + { + "epoch": 1.28, + "grad_norm": 7.777893813372391, + "learning_rate": 3.0587465604917466e-06, + "loss": 0.7057, + "step": 8949 + }, + { + "epoch": 1.28, + "grad_norm": 9.361724421024087, + "learning_rate": 3.0576824548641994e-06, + "loss": 0.8168, + "step": 8950 + }, + { + "epoch": 1.28, + "grad_norm": 6.224078477615744, + "learning_rate": 3.0566184528344622e-06, + "loss": 0.7436, + "step": 8951 + }, + { + "epoch": 1.28, + "grad_norm": 8.800828709012212, + "learning_rate": 3.0555545544592898e-06, + "loss": 0.7274, + "step": 8952 + }, + { + "epoch": 1.28, + "grad_norm": 7.750166167361018, + "learning_rate": 3.054490759795423e-06, + "loss": 0.7641, + "step": 8953 + }, + { + "epoch": 1.28, + "grad_norm": 10.262541555866733, + "learning_rate": 3.053427068899606e-06, + "loss": 0.7209, + "step": 8954 + }, + { 
+ "epoch": 1.28, + "grad_norm": 10.313377903980994, + "learning_rate": 3.0523634818285687e-06, + "loss": 0.7725, + "step": 8955 + }, + { + "epoch": 1.28, + "grad_norm": 6.983442382471854, + "learning_rate": 3.0512999986390424e-06, + "loss": 0.7681, + "step": 8956 + }, + { + "epoch": 1.28, + "grad_norm": 9.096028980846995, + "learning_rate": 3.050236619387751e-06, + "loss": 0.7499, + "step": 8957 + }, + { + "epoch": 1.28, + "grad_norm": 8.709719773588093, + "learning_rate": 3.04917334413141e-06, + "loss": 0.6891, + "step": 8958 + }, + { + "epoch": 1.28, + "grad_norm": 8.857461654262003, + "learning_rate": 3.048110172926734e-06, + "loss": 0.711, + "step": 8959 + }, + { + "epoch": 1.28, + "grad_norm": 8.674810533047598, + "learning_rate": 3.0470471058304263e-06, + "loss": 0.7359, + "step": 8960 + }, + { + "epoch": 1.28, + "grad_norm": 9.540993105911575, + "learning_rate": 3.04598414289919e-06, + "loss": 0.7081, + "step": 8961 + }, + { + "epoch": 1.28, + "grad_norm": 9.210466965522153, + "learning_rate": 3.0449212841897222e-06, + "loss": 0.781, + "step": 8962 + }, + { + "epoch": 1.28, + "grad_norm": 7.415392721397306, + "learning_rate": 3.0438585297587103e-06, + "loss": 0.7556, + "step": 8963 + }, + { + "epoch": 1.28, + "grad_norm": 8.729108237087264, + "learning_rate": 3.0427958796628404e-06, + "loss": 0.756, + "step": 8964 + }, + { + "epoch": 1.28, + "grad_norm": 8.060685383248147, + "learning_rate": 3.0417333339587885e-06, + "loss": 0.723, + "step": 8965 + }, + { + "epoch": 1.28, + "grad_norm": 7.444522919707669, + "learning_rate": 3.0406708927032303e-06, + "loss": 0.7898, + "step": 8966 + }, + { + "epoch": 1.28, + "grad_norm": 11.458352008552849, + "learning_rate": 3.0396085559528336e-06, + "loss": 0.7297, + "step": 8967 + }, + { + "epoch": 1.28, + "grad_norm": 9.251330341838282, + "learning_rate": 3.0385463237642586e-06, + "loss": 0.7591, + "step": 8968 + }, + { + "epoch": 1.28, + "grad_norm": 10.121146598828709, + "learning_rate": 3.0374841961941637e-06, + 
"loss": 0.724, + "step": 8969 + }, + { + "epoch": 1.28, + "grad_norm": 11.070046882269096, + "learning_rate": 3.0364221732991993e-06, + "loss": 0.7659, + "step": 8970 + }, + { + "epoch": 1.28, + "grad_norm": 8.043768657104307, + "learning_rate": 3.0353602551360095e-06, + "loss": 0.693, + "step": 8971 + }, + { + "epoch": 1.28, + "grad_norm": 8.420179126604253, + "learning_rate": 3.0342984417612366e-06, + "loss": 0.7696, + "step": 8972 + }, + { + "epoch": 1.28, + "grad_norm": 7.829730016628538, + "learning_rate": 3.0332367332315125e-06, + "loss": 0.8009, + "step": 8973 + }, + { + "epoch": 1.28, + "grad_norm": 10.0234215219936, + "learning_rate": 3.032175129603465e-06, + "loss": 0.805, + "step": 8974 + }, + { + "epoch": 1.28, + "grad_norm": 6.313565886753272, + "learning_rate": 3.0311136309337214e-06, + "loss": 0.7357, + "step": 8975 + }, + { + "epoch": 1.28, + "grad_norm": 8.97751871923598, + "learning_rate": 3.0300522372788948e-06, + "loss": 0.6771, + "step": 8976 + }, + { + "epoch": 1.28, + "grad_norm": 9.053697817023687, + "learning_rate": 3.0289909486955995e-06, + "loss": 0.707, + "step": 8977 + }, + { + "epoch": 1.28, + "grad_norm": 8.81061122502166, + "learning_rate": 3.027929765240439e-06, + "loss": 0.7516, + "step": 8978 + }, + { + "epoch": 1.28, + "grad_norm": 10.993888253633749, + "learning_rate": 3.026868686970016e-06, + "loss": 0.7575, + "step": 8979 + }, + { + "epoch": 1.28, + "grad_norm": 9.632762431388173, + "learning_rate": 3.025807713940926e-06, + "loss": 0.7244, + "step": 8980 + }, + { + "epoch": 1.28, + "grad_norm": 9.475606024876717, + "learning_rate": 3.0247468462097558e-06, + "loss": 0.7511, + "step": 8981 + }, + { + "epoch": 1.28, + "grad_norm": 11.968494306969655, + "learning_rate": 3.0236860838330925e-06, + "loss": 0.7904, + "step": 8982 + }, + { + "epoch": 1.28, + "grad_norm": 8.509104438348439, + "learning_rate": 3.0226254268675114e-06, + "loss": 0.7292, + "step": 8983 + }, + { + "epoch": 1.28, + "grad_norm": 6.856828049312834, + 
"learning_rate": 3.021564875369585e-06, + "loss": 0.6959, + "step": 8984 + }, + { + "epoch": 1.28, + "grad_norm": 9.095456500188098, + "learning_rate": 3.020504429395883e-06, + "loss": 0.7218, + "step": 8985 + }, + { + "epoch": 1.28, + "grad_norm": 11.484030256841583, + "learning_rate": 3.0194440890029638e-06, + "loss": 0.8226, + "step": 8986 + }, + { + "epoch": 1.28, + "grad_norm": 9.341519275495497, + "learning_rate": 3.0183838542473847e-06, + "loss": 0.8226, + "step": 8987 + }, + { + "epoch": 1.28, + "grad_norm": 10.692636994202278, + "learning_rate": 3.0173237251856936e-06, + "loss": 0.7519, + "step": 8988 + }, + { + "epoch": 1.28, + "grad_norm": 8.178812278219349, + "learning_rate": 3.0162637018744368e-06, + "loss": 0.7222, + "step": 8989 + }, + { + "epoch": 1.28, + "grad_norm": 9.143762570490246, + "learning_rate": 3.015203784370153e-06, + "loss": 0.7563, + "step": 8990 + }, + { + "epoch": 1.28, + "grad_norm": 13.346563533708903, + "learning_rate": 3.0141439727293732e-06, + "loss": 0.7068, + "step": 8991 + }, + { + "epoch": 1.28, + "grad_norm": 8.950816011328557, + "learning_rate": 3.013084267008629e-06, + "loss": 0.7083, + "step": 8992 + }, + { + "epoch": 1.28, + "grad_norm": 9.572560995990973, + "learning_rate": 3.012024667264436e-06, + "loss": 0.7092, + "step": 8993 + }, + { + "epoch": 1.28, + "grad_norm": 9.826829873260218, + "learning_rate": 3.010965173553314e-06, + "loss": 0.7563, + "step": 8994 + }, + { + "epoch": 1.28, + "grad_norm": 7.808806070725379, + "learning_rate": 3.0099057859317764e-06, + "loss": 0.693, + "step": 8995 + }, + { + "epoch": 1.28, + "grad_norm": 8.122817363650167, + "learning_rate": 3.008846504456322e-06, + "loss": 0.7745, + "step": 8996 + }, + { + "epoch": 1.28, + "grad_norm": 8.52044486049908, + "learning_rate": 3.0077873291834537e-06, + "loss": 0.7617, + "step": 8997 + }, + { + "epoch": 1.28, + "grad_norm": 9.499889242552426, + "learning_rate": 3.0067282601696636e-06, + "loss": 0.7463, + "step": 8998 + }, + { + "epoch": 1.28, + 
"grad_norm": 8.580169659906087, + "learning_rate": 3.0056692974714384e-06, + "loss": 0.746, + "step": 8999 + }, + { + "epoch": 1.28, + "grad_norm": 8.355154379903219, + "learning_rate": 3.0046104411452637e-06, + "loss": 0.7047, + "step": 9000 + }, + { + "epoch": 1.28, + "grad_norm": 10.08830932697903, + "learning_rate": 3.0035516912476133e-06, + "loss": 0.737, + "step": 9001 + }, + { + "epoch": 1.28, + "grad_norm": 10.55860258282361, + "learning_rate": 3.002493047834959e-06, + "loss": 0.747, + "step": 9002 + }, + { + "epoch": 1.28, + "grad_norm": 11.754550819229907, + "learning_rate": 3.001434510963763e-06, + "loss": 0.7201, + "step": 9003 + }, + { + "epoch": 1.28, + "grad_norm": 9.221683512399894, + "learning_rate": 3.000376080690489e-06, + "loss": 0.7729, + "step": 9004 + }, + { + "epoch": 1.28, + "grad_norm": 7.577793977780224, + "learning_rate": 2.9993177570715897e-06, + "loss": 0.8277, + "step": 9005 + }, + { + "epoch": 1.28, + "grad_norm": 8.913334122835296, + "learning_rate": 2.9982595401635107e-06, + "loss": 0.7378, + "step": 9006 + }, + { + "epoch": 1.28, + "grad_norm": 9.027487120335579, + "learning_rate": 2.997201430022697e-06, + "loss": 0.7456, + "step": 9007 + }, + { + "epoch": 1.28, + "grad_norm": 8.286731089632141, + "learning_rate": 2.9961434267055837e-06, + "loss": 0.7004, + "step": 9008 + }, + { + "epoch": 1.28, + "grad_norm": 8.71087362000458, + "learning_rate": 2.995085530268601e-06, + "loss": 0.6962, + "step": 9009 + }, + { + "epoch": 1.28, + "grad_norm": 8.869743985354338, + "learning_rate": 2.994027740768178e-06, + "loss": 0.7742, + "step": 9010 + }, + { + "epoch": 1.29, + "grad_norm": 10.134307373866685, + "learning_rate": 2.9929700582607303e-06, + "loss": 0.7161, + "step": 9011 + }, + { + "epoch": 1.29, + "grad_norm": 8.86041758873591, + "learning_rate": 2.991912482802672e-06, + "loss": 0.8051, + "step": 9012 + }, + { + "epoch": 1.29, + "grad_norm": 7.57806723452211, + "learning_rate": 2.9908550144504145e-06, + "loss": 0.7236, + "step": 
9013 + }, + { + "epoch": 1.29, + "grad_norm": 7.54536977143201, + "learning_rate": 2.9897976532603568e-06, + "loss": 0.6754, + "step": 9014 + }, + { + "epoch": 1.29, + "grad_norm": 8.947367231612684, + "learning_rate": 2.9887403992888975e-06, + "loss": 0.7662, + "step": 9015 + }, + { + "epoch": 1.29, + "grad_norm": 9.217632162737392, + "learning_rate": 2.9876832525924255e-06, + "loss": 0.7526, + "step": 9016 + }, + { + "epoch": 1.29, + "grad_norm": 8.068072950948878, + "learning_rate": 2.9866262132273284e-06, + "loss": 0.7677, + "step": 9017 + }, + { + "epoch": 1.29, + "grad_norm": 11.384395778468107, + "learning_rate": 2.985569281249986e-06, + "loss": 0.7212, + "step": 9018 + }, + { + "epoch": 1.29, + "grad_norm": 6.764999945129379, + "learning_rate": 2.9845124567167684e-06, + "loss": 0.7818, + "step": 9019 + }, + { + "epoch": 1.29, + "grad_norm": 10.04160003414662, + "learning_rate": 2.9834557396840485e-06, + "loss": 0.7547, + "step": 9020 + }, + { + "epoch": 1.29, + "grad_norm": 7.566006274660438, + "learning_rate": 2.9823991302081855e-06, + "loss": 0.696, + "step": 9021 + }, + { + "epoch": 1.29, + "grad_norm": 10.255495842104452, + "learning_rate": 2.9813426283455354e-06, + "loss": 0.7533, + "step": 9022 + }, + { + "epoch": 1.29, + "grad_norm": 6.7646225636395485, + "learning_rate": 2.980286234152453e-06, + "loss": 0.6737, + "step": 9023 + }, + { + "epoch": 1.29, + "grad_norm": 8.379054882425908, + "learning_rate": 2.97922994768528e-06, + "loss": 0.7598, + "step": 9024 + }, + { + "epoch": 1.29, + "grad_norm": 8.673530474764629, + "learning_rate": 2.9781737690003576e-06, + "loss": 0.7828, + "step": 9025 + }, + { + "epoch": 1.29, + "grad_norm": 7.254560977988973, + "learning_rate": 2.9771176981540163e-06, + "loss": 0.6869, + "step": 9026 + }, + { + "epoch": 1.29, + "grad_norm": 7.729041449071971, + "learning_rate": 2.9760617352025874e-06, + "loss": 0.8055, + "step": 9027 + }, + { + "epoch": 1.29, + "grad_norm": 7.84230683818612, + "learning_rate": 
2.975005880202393e-06, + "loss": 0.7008, + "step": 9028 + }, + { + "epoch": 1.29, + "grad_norm": 9.990674690915924, + "learning_rate": 2.9739501332097466e-06, + "loss": 0.7445, + "step": 9029 + }, + { + "epoch": 1.29, + "grad_norm": 8.092220129935937, + "learning_rate": 2.9728944942809618e-06, + "loss": 0.8, + "step": 9030 + }, + { + "epoch": 1.29, + "grad_norm": 10.36735946745136, + "learning_rate": 2.9718389634723416e-06, + "loss": 0.7427, + "step": 9031 + }, + { + "epoch": 1.29, + "grad_norm": 8.774384818296358, + "learning_rate": 2.9707835408401847e-06, + "loss": 0.717, + "step": 9032 + }, + { + "epoch": 1.29, + "grad_norm": 11.369674505265605, + "learning_rate": 2.9697282264407873e-06, + "loss": 0.8243, + "step": 9033 + }, + { + "epoch": 1.29, + "grad_norm": 10.380572104707051, + "learning_rate": 2.9686730203304335e-06, + "loss": 0.7686, + "step": 9034 + }, + { + "epoch": 1.29, + "grad_norm": 9.181733811812046, + "learning_rate": 2.967617922565409e-06, + "loss": 0.6224, + "step": 9035 + }, + { + "epoch": 1.29, + "grad_norm": 7.184215273323491, + "learning_rate": 2.9665629332019854e-06, + "loss": 0.7225, + "step": 9036 + }, + { + "epoch": 1.29, + "grad_norm": 7.173610962689225, + "learning_rate": 2.9655080522964334e-06, + "loss": 0.7582, + "step": 9037 + }, + { + "epoch": 1.29, + "grad_norm": 7.601201173890774, + "learning_rate": 2.964453279905022e-06, + "loss": 0.7577, + "step": 9038 + }, + { + "epoch": 1.29, + "grad_norm": 7.254760913037231, + "learning_rate": 2.9633986160840044e-06, + "loss": 0.7705, + "step": 9039 + }, + { + "epoch": 1.29, + "grad_norm": 6.352540256819367, + "learning_rate": 2.962344060889637e-06, + "loss": 0.7169, + "step": 9040 + }, + { + "epoch": 1.29, + "grad_norm": 8.234849161514271, + "learning_rate": 2.9612896143781644e-06, + "loss": 0.728, + "step": 9041 + }, + { + "epoch": 1.29, + "grad_norm": 5.062105478845386, + "learning_rate": 2.960235276605829e-06, + "loss": 0.7153, + "step": 9042 + }, + { + "epoch": 1.29, + "grad_norm": 
10.395500842931309, + "learning_rate": 2.9591810476288673e-06, + "loss": 0.748, + "step": 9043 + }, + { + "epoch": 1.29, + "grad_norm": 9.702181915094418, + "learning_rate": 2.958126927503506e-06, + "loss": 0.7197, + "step": 9044 + }, + { + "epoch": 1.29, + "grad_norm": 9.216639953277491, + "learning_rate": 2.9570729162859736e-06, + "loss": 0.7677, + "step": 9045 + }, + { + "epoch": 1.29, + "grad_norm": 6.894392426007648, + "learning_rate": 2.956019014032484e-06, + "loss": 0.8245, + "step": 9046 + }, + { + "epoch": 1.29, + "grad_norm": 6.213472391579411, + "learning_rate": 2.954965220799249e-06, + "loss": 0.7017, + "step": 9047 + }, + { + "epoch": 1.29, + "grad_norm": 7.814913032890481, + "learning_rate": 2.9539115366424793e-06, + "loss": 0.7402, + "step": 9048 + }, + { + "epoch": 1.29, + "grad_norm": 8.793884150898949, + "learning_rate": 2.9528579616183717e-06, + "loss": 0.6468, + "step": 9049 + }, + { + "epoch": 1.29, + "grad_norm": 8.827948070647842, + "learning_rate": 2.951804495783123e-06, + "loss": 0.7611, + "step": 9050 + }, + { + "epoch": 1.29, + "grad_norm": 8.64748529039222, + "learning_rate": 2.95075113919292e-06, + "loss": 0.7368, + "step": 9051 + }, + { + "epoch": 1.29, + "grad_norm": 7.719199748494742, + "learning_rate": 2.9496978919039476e-06, + "loss": 0.7985, + "step": 9052 + }, + { + "epoch": 1.29, + "grad_norm": 10.194708852018353, + "learning_rate": 2.9486447539723833e-06, + "loss": 0.7447, + "step": 9053 + }, + { + "epoch": 1.29, + "grad_norm": 7.939867995670364, + "learning_rate": 2.9475917254543962e-06, + "loss": 0.7035, + "step": 9054 + }, + { + "epoch": 1.29, + "grad_norm": 6.278479025591817, + "learning_rate": 2.9465388064061553e-06, + "loss": 0.7291, + "step": 9055 + }, + { + "epoch": 1.29, + "grad_norm": 6.372527134212043, + "learning_rate": 2.945485996883818e-06, + "loss": 0.7842, + "step": 9056 + }, + { + "epoch": 1.29, + "grad_norm": 6.269371486719782, + "learning_rate": 2.944433296943538e-06, + "loss": 0.6966, + "step": 9057 + }, + { 
+ "epoch": 1.29, + "grad_norm": 8.487161245697328, + "learning_rate": 2.9433807066414654e-06, + "loss": 0.7718, + "step": 9058 + }, + { + "epoch": 1.29, + "grad_norm": 8.634283852077237, + "learning_rate": 2.9423282260337414e-06, + "loss": 0.756, + "step": 9059 + }, + { + "epoch": 1.29, + "grad_norm": 9.549720855183143, + "learning_rate": 2.941275855176501e-06, + "loss": 0.7814, + "step": 9060 + }, + { + "epoch": 1.29, + "grad_norm": 11.583001109385137, + "learning_rate": 2.940223594125877e-06, + "loss": 0.7503, + "step": 9061 + }, + { + "epoch": 1.29, + "grad_norm": 7.738305870410057, + "learning_rate": 2.9391714429379925e-06, + "loss": 0.7208, + "step": 9062 + }, + { + "epoch": 1.29, + "grad_norm": 9.497323699331433, + "learning_rate": 2.9381194016689687e-06, + "loss": 0.7415, + "step": 9063 + }, + { + "epoch": 1.29, + "grad_norm": 9.065003718879366, + "learning_rate": 2.9370674703749135e-06, + "loss": 0.7855, + "step": 9064 + }, + { + "epoch": 1.29, + "grad_norm": 6.318279210371414, + "learning_rate": 2.936015649111939e-06, + "loss": 0.7244, + "step": 9065 + }, + { + "epoch": 1.29, + "grad_norm": 8.91412660064053, + "learning_rate": 2.9349639379361457e-06, + "loss": 0.6842, + "step": 9066 + }, + { + "epoch": 1.29, + "grad_norm": 8.024654078523866, + "learning_rate": 2.9339123369036256e-06, + "loss": 0.7071, + "step": 9067 + }, + { + "epoch": 1.29, + "grad_norm": 10.043608077377316, + "learning_rate": 2.9328608460704723e-06, + "loss": 0.7853, + "step": 9068 + }, + { + "epoch": 1.29, + "grad_norm": 8.365581079120378, + "learning_rate": 2.9318094654927665e-06, + "loss": 0.7152, + "step": 9069 + }, + { + "epoch": 1.29, + "grad_norm": 10.322059348259414, + "learning_rate": 2.9307581952265864e-06, + "loss": 0.7226, + "step": 9070 + }, + { + "epoch": 1.29, + "grad_norm": 7.719045897243739, + "learning_rate": 2.9297070353280066e-06, + "loss": 0.7228, + "step": 9071 + }, + { + "epoch": 1.29, + "grad_norm": 10.867794022440917, + "learning_rate": 2.9286559858530896e-06, + 
"loss": 0.7217, + "step": 9072 + }, + { + "epoch": 1.29, + "grad_norm": 10.979731163458782, + "learning_rate": 2.927605046857897e-06, + "loss": 0.7545, + "step": 9073 + }, + { + "epoch": 1.29, + "grad_norm": 11.288243067904563, + "learning_rate": 2.9265542183984814e-06, + "loss": 0.7877, + "step": 9074 + }, + { + "epoch": 1.29, + "grad_norm": 9.231622012797988, + "learning_rate": 2.925503500530893e-06, + "loss": 0.7554, + "step": 9075 + }, + { + "epoch": 1.29, + "grad_norm": 6.406762995692955, + "learning_rate": 2.9244528933111747e-06, + "loss": 0.6608, + "step": 9076 + }, + { + "epoch": 1.29, + "grad_norm": 7.772263976477105, + "learning_rate": 2.9234023967953593e-06, + "loss": 0.7717, + "step": 9077 + }, + { + "epoch": 1.29, + "grad_norm": 7.015643393717041, + "learning_rate": 2.922352011039483e-06, + "loss": 0.7624, + "step": 9078 + }, + { + "epoch": 1.29, + "grad_norm": 8.300820282304377, + "learning_rate": 2.9213017360995643e-06, + "loss": 0.7393, + "step": 9079 + }, + { + "epoch": 1.29, + "grad_norm": 10.35960780754141, + "learning_rate": 2.920251572031626e-06, + "loss": 0.7317, + "step": 9080 + }, + { + "epoch": 1.3, + "grad_norm": 7.446328431314612, + "learning_rate": 2.919201518891681e-06, + "loss": 0.7064, + "step": 9081 + }, + { + "epoch": 1.3, + "grad_norm": 8.72696635792398, + "learning_rate": 2.9181515767357328e-06, + "loss": 0.701, + "step": 9082 + }, + { + "epoch": 1.3, + "grad_norm": 7.886455678165742, + "learning_rate": 2.917101745619787e-06, + "loss": 0.739, + "step": 9083 + }, + { + "epoch": 1.3, + "grad_norm": 6.802074782118995, + "learning_rate": 2.916052025599834e-06, + "loss": 0.7851, + "step": 9084 + }, + { + "epoch": 1.3, + "grad_norm": 9.749698405928171, + "learning_rate": 2.915002416731867e-06, + "loss": 0.7857, + "step": 9085 + }, + { + "epoch": 1.3, + "grad_norm": 8.04464907548604, + "learning_rate": 2.9139529190718673e-06, + "loss": 0.7713, + "step": 9086 + }, + { + "epoch": 1.3, + "grad_norm": 10.235674265854016, + "learning_rate": 
2.9129035326758114e-06, + "loss": 0.6876, + "step": 9087 + }, + { + "epoch": 1.3, + "grad_norm": 12.275196928358303, + "learning_rate": 2.9118542575996726e-06, + "loss": 0.7585, + "step": 9088 + }, + { + "epoch": 1.3, + "grad_norm": 10.480462341343511, + "learning_rate": 2.9108050938994138e-06, + "loss": 0.7023, + "step": 9089 + }, + { + "epoch": 1.3, + "grad_norm": 7.935639334541856, + "learning_rate": 2.9097560416309957e-06, + "loss": 0.7243, + "step": 9090 + }, + { + "epoch": 1.3, + "grad_norm": 12.849326498674621, + "learning_rate": 2.908707100850374e-06, + "loss": 0.7855, + "step": 9091 + }, + { + "epoch": 1.3, + "grad_norm": 9.94892730642203, + "learning_rate": 2.907658271613495e-06, + "loss": 0.7319, + "step": 9092 + }, + { + "epoch": 1.3, + "grad_norm": 12.70043370613187, + "learning_rate": 2.9066095539762996e-06, + "loss": 0.7494, + "step": 9093 + }, + { + "epoch": 1.3, + "grad_norm": 10.89192216295692, + "learning_rate": 2.9055609479947213e-06, + "loss": 0.7335, + "step": 9094 + }, + { + "epoch": 1.3, + "grad_norm": 8.451580382627158, + "learning_rate": 2.9045124537246914e-06, + "loss": 0.6516, + "step": 9095 + }, + { + "epoch": 1.3, + "grad_norm": 8.830784015140287, + "learning_rate": 2.9034640712221374e-06, + "loss": 0.6888, + "step": 9096 + }, + { + "epoch": 1.3, + "grad_norm": 11.022813511358429, + "learning_rate": 2.902415800542972e-06, + "loss": 0.8293, + "step": 9097 + }, + { + "epoch": 1.3, + "grad_norm": 11.062869418511006, + "learning_rate": 2.9013676417431114e-06, + "loss": 0.7313, + "step": 9098 + }, + { + "epoch": 1.3, + "grad_norm": 8.580026164944725, + "learning_rate": 2.900319594878458e-06, + "loss": 0.7408, + "step": 9099 + }, + { + "epoch": 1.3, + "grad_norm": 8.399891906531227, + "learning_rate": 2.899271660004912e-06, + "loss": 0.714, + "step": 9100 + }, + { + "epoch": 1.3, + "grad_norm": 8.339715236244844, + "learning_rate": 2.89822383717837e-06, + "loss": 0.7292, + "step": 9101 + }, + { + "epoch": 1.3, + "grad_norm": 
8.20172687015135, + "learning_rate": 2.897176126454716e-06, + "loss": 0.7527, + "step": 9102 + }, + { + "epoch": 1.3, + "grad_norm": 7.898378435754703, + "learning_rate": 2.8961285278898343e-06, + "loss": 0.7611, + "step": 9103 + }, + { + "epoch": 1.3, + "grad_norm": 9.618703532968667, + "learning_rate": 2.8950810415396025e-06, + "loss": 0.6831, + "step": 9104 + }, + { + "epoch": 1.3, + "grad_norm": 11.255419539776618, + "learning_rate": 2.894033667459889e-06, + "loss": 0.7541, + "step": 9105 + }, + { + "epoch": 1.3, + "grad_norm": 6.831470141846742, + "learning_rate": 2.8929864057065583e-06, + "loss": 0.7034, + "step": 9106 + }, + { + "epoch": 1.3, + "grad_norm": 8.183864485160397, + "learning_rate": 2.891939256335465e-06, + "loss": 0.7689, + "step": 9107 + }, + { + "epoch": 1.3, + "grad_norm": 11.085268445453973, + "learning_rate": 2.8908922194024647e-06, + "loss": 0.748, + "step": 9108 + }, + { + "epoch": 1.3, + "grad_norm": 8.462508756366528, + "learning_rate": 2.889845294963404e-06, + "loss": 0.6775, + "step": 9109 + }, + { + "epoch": 1.3, + "grad_norm": 9.045387521643383, + "learning_rate": 2.8887984830741195e-06, + "loss": 0.6545, + "step": 9110 + }, + { + "epoch": 1.3, + "grad_norm": 8.187147151234708, + "learning_rate": 2.88775178379045e-06, + "loss": 0.767, + "step": 9111 + }, + { + "epoch": 1.3, + "grad_norm": 8.432172353060892, + "learning_rate": 2.8867051971682203e-06, + "loss": 0.7287, + "step": 9112 + }, + { + "epoch": 1.3, + "grad_norm": 8.058430458503134, + "learning_rate": 2.885658723263251e-06, + "loss": 0.7719, + "step": 9113 + }, + { + "epoch": 1.3, + "grad_norm": 10.69904441929106, + "learning_rate": 2.8846123621313626e-06, + "loss": 0.7589, + "step": 9114 + }, + { + "epoch": 1.3, + "grad_norm": 8.418270858451558, + "learning_rate": 2.88356611382836e-06, + "loss": 0.6847, + "step": 9115 + }, + { + "epoch": 1.3, + "grad_norm": 8.500470342598828, + "learning_rate": 2.8825199784100522e-06, + "loss": 0.7807, + "step": 9116 + }, + { + "epoch": 1.3, 
+ "grad_norm": 10.457857936612621, + "learning_rate": 2.8814739559322325e-06, + "loss": 0.7114, + "step": 9117 + }, + { + "epoch": 1.3, + "grad_norm": 9.647094733296107, + "learning_rate": 2.880428046450697e-06, + "loss": 0.7402, + "step": 9118 + }, + { + "epoch": 1.3, + "grad_norm": 11.018036663299284, + "learning_rate": 2.8793822500212297e-06, + "loss": 0.7817, + "step": 9119 + }, + { + "epoch": 1.3, + "grad_norm": 7.314177542778715, + "learning_rate": 2.878336566699609e-06, + "loss": 0.7411, + "step": 9120 + }, + { + "epoch": 1.3, + "grad_norm": 10.73079251208904, + "learning_rate": 2.8772909965416106e-06, + "loss": 0.7589, + "step": 9121 + }, + { + "epoch": 1.3, + "grad_norm": 7.332010676615851, + "learning_rate": 2.876245539603001e-06, + "loss": 0.673, + "step": 9122 + }, + { + "epoch": 1.3, + "grad_norm": 10.468440565936815, + "learning_rate": 2.8752001959395446e-06, + "loss": 0.7572, + "step": 9123 + }, + { + "epoch": 1.3, + "grad_norm": 9.42621879492352, + "learning_rate": 2.8741549656069956e-06, + "loss": 0.6745, + "step": 9124 + }, + { + "epoch": 1.3, + "grad_norm": 10.635782188555584, + "learning_rate": 2.873109848661101e-06, + "loss": 0.7802, + "step": 9125 + }, + { + "epoch": 1.3, + "grad_norm": 8.982019636244164, + "learning_rate": 2.8720648451576094e-06, + "loss": 0.7688, + "step": 9126 + }, + { + "epoch": 1.3, + "grad_norm": 10.060150502448412, + "learning_rate": 2.871019955152253e-06, + "loss": 0.7276, + "step": 9127 + }, + { + "epoch": 1.3, + "grad_norm": 8.17386078831568, + "learning_rate": 2.869975178700767e-06, + "loss": 0.705, + "step": 9128 + }, + { + "epoch": 1.3, + "grad_norm": 7.121991214490479, + "learning_rate": 2.868930515858877e-06, + "loss": 0.7085, + "step": 9129 + }, + { + "epoch": 1.3, + "grad_norm": 9.787285477997125, + "learning_rate": 2.867885966682302e-06, + "loss": 0.7275, + "step": 9130 + }, + { + "epoch": 1.3, + "grad_norm": 7.01101244584911, + "learning_rate": 2.866841531226755e-06, + "loss": 0.7605, + "step": 9131 + }, + { 
+ "epoch": 1.3, + "grad_norm": 8.159232041469492, + "learning_rate": 2.86579720954794e-06, + "loss": 0.7319, + "step": 9132 + }, + { + "epoch": 1.3, + "grad_norm": 8.982878341478242, + "learning_rate": 2.8647530017015624e-06, + "loss": 0.769, + "step": 9133 + }, + { + "epoch": 1.3, + "grad_norm": 10.043683854571523, + "learning_rate": 2.8637089077433176e-06, + "loss": 0.6928, + "step": 9134 + }, + { + "epoch": 1.3, + "grad_norm": 10.54157784698128, + "learning_rate": 2.8626649277288914e-06, + "loss": 0.7317, + "step": 9135 + }, + { + "epoch": 1.3, + "grad_norm": 8.441549318787342, + "learning_rate": 2.8616210617139707e-06, + "loss": 0.6508, + "step": 9136 + }, + { + "epoch": 1.3, + "grad_norm": 11.927379352447325, + "learning_rate": 2.8605773097542307e-06, + "loss": 0.7178, + "step": 9137 + }, + { + "epoch": 1.3, + "grad_norm": 6.107624196956458, + "learning_rate": 2.859533671905339e-06, + "loss": 0.8414, + "step": 9138 + }, + { + "epoch": 1.3, + "grad_norm": 7.996270514705099, + "learning_rate": 2.858490148222966e-06, + "loss": 0.7626, + "step": 9139 + }, + { + "epoch": 1.3, + "grad_norm": 7.056972270070685, + "learning_rate": 2.8574467387627653e-06, + "loss": 0.6681, + "step": 9140 + }, + { + "epoch": 1.3, + "grad_norm": 11.095974812691349, + "learning_rate": 2.856403443580391e-06, + "loss": 0.6998, + "step": 9141 + }, + { + "epoch": 1.3, + "grad_norm": 9.776055406528585, + "learning_rate": 2.8553602627314937e-06, + "loss": 0.77, + "step": 9142 + }, + { + "epoch": 1.3, + "grad_norm": 7.308498055127787, + "learning_rate": 2.854317196271709e-06, + "loss": 0.6899, + "step": 9143 + }, + { + "epoch": 1.3, + "grad_norm": 8.571619689804093, + "learning_rate": 2.8532742442566735e-06, + "loss": 0.717, + "step": 9144 + }, + { + "epoch": 1.3, + "grad_norm": 7.528919056732952, + "learning_rate": 2.852231406742012e-06, + "loss": 0.7758, + "step": 9145 + }, + { + "epoch": 1.3, + "grad_norm": 6.293350390714188, + "learning_rate": 2.8511886837833492e-06, + "loss": 0.6767, + 
"step": 9146 + }, + { + "epoch": 1.3, + "grad_norm": 8.405970500864921, + "learning_rate": 2.850146075436302e-06, + "loss": 0.6828, + "step": 9147 + }, + { + "epoch": 1.3, + "grad_norm": 7.6854170629018554, + "learning_rate": 2.8491035817564772e-06, + "loss": 0.7, + "step": 9148 + }, + { + "epoch": 1.3, + "grad_norm": 9.307959152639405, + "learning_rate": 2.8480612027994826e-06, + "loss": 0.753, + "step": 9149 + }, + { + "epoch": 1.3, + "grad_norm": 11.175059656353751, + "learning_rate": 2.8470189386209136e-06, + "loss": 0.735, + "step": 9150 + }, + { + "epoch": 1.31, + "grad_norm": 11.309320466071192, + "learning_rate": 2.845976789276359e-06, + "loss": 0.708, + "step": 9151 + }, + { + "epoch": 1.31, + "grad_norm": 8.59982731424622, + "learning_rate": 2.844934754821409e-06, + "loss": 0.6911, + "step": 9152 + }, + { + "epoch": 1.31, + "grad_norm": 7.00908856455747, + "learning_rate": 2.8438928353116386e-06, + "loss": 0.7653, + "step": 9153 + }, + { + "epoch": 1.31, + "grad_norm": 8.804300731039922, + "learning_rate": 2.8428510308026247e-06, + "loss": 0.753, + "step": 9154 + }, + { + "epoch": 1.31, + "grad_norm": 8.640562047396804, + "learning_rate": 2.8418093413499304e-06, + "loss": 0.721, + "step": 9155 + }, + { + "epoch": 1.31, + "grad_norm": 7.444122650791536, + "learning_rate": 2.8407677670091205e-06, + "loss": 0.767, + "step": 9156 + }, + { + "epoch": 1.31, + "grad_norm": 7.156981763590822, + "learning_rate": 2.8397263078357474e-06, + "loss": 0.7008, + "step": 9157 + }, + { + "epoch": 1.31, + "grad_norm": 7.788189207307159, + "learning_rate": 2.838684963885358e-06, + "loss": 0.7197, + "step": 9158 + }, + { + "epoch": 1.31, + "grad_norm": 7.0211237516861305, + "learning_rate": 2.837643735213499e-06, + "loss": 0.6937, + "step": 9159 + }, + { + "epoch": 1.31, + "grad_norm": 10.303142119950436, + "learning_rate": 2.8366026218757014e-06, + "loss": 0.734, + "step": 9160 + }, + { + "epoch": 1.31, + "grad_norm": 6.575625083238769, + "learning_rate": 
2.8355616239274988e-06, + "loss": 0.7119, + "step": 9161 + }, + { + "epoch": 1.31, + "grad_norm": 6.898106508943606, + "learning_rate": 2.834520741424418e-06, + "loss": 0.7112, + "step": 9162 + }, + { + "epoch": 1.31, + "grad_norm": 8.351224831035006, + "learning_rate": 2.8334799744219694e-06, + "loss": 0.7869, + "step": 9163 + }, + { + "epoch": 1.31, + "grad_norm": 9.870249522729974, + "learning_rate": 2.8324393229756705e-06, + "loss": 0.7323, + "step": 9164 + }, + { + "epoch": 1.31, + "grad_norm": 8.15311719159732, + "learning_rate": 2.831398787141022e-06, + "loss": 0.7023, + "step": 9165 + }, + { + "epoch": 1.31, + "grad_norm": 8.381848810982266, + "learning_rate": 2.8303583669735267e-06, + "loss": 0.7292, + "step": 9166 + }, + { + "epoch": 1.31, + "grad_norm": 10.14360301537256, + "learning_rate": 2.829318062528679e-06, + "loss": 0.776, + "step": 9167 + }, + { + "epoch": 1.31, + "grad_norm": 10.394781792618533, + "learning_rate": 2.8282778738619625e-06, + "loss": 0.7582, + "step": 9168 + }, + { + "epoch": 1.31, + "grad_norm": 9.384186597588837, + "learning_rate": 2.8272378010288606e-06, + "loss": 0.7357, + "step": 9169 + }, + { + "epoch": 1.31, + "grad_norm": 6.43051978501926, + "learning_rate": 2.826197844084844e-06, + "loss": 0.7481, + "step": 9170 + }, + { + "epoch": 1.31, + "grad_norm": 6.285769997307404, + "learning_rate": 2.825158003085383e-06, + "loss": 0.6724, + "step": 9171 + }, + { + "epoch": 1.31, + "grad_norm": 7.635400293808513, + "learning_rate": 2.8241182780859423e-06, + "loss": 0.7384, + "step": 9172 + }, + { + "epoch": 1.31, + "grad_norm": 8.056607160751607, + "learning_rate": 2.8230786691419747e-06, + "loss": 0.6765, + "step": 9173 + }, + { + "epoch": 1.31, + "grad_norm": 9.503812726851136, + "learning_rate": 2.8220391763089333e-06, + "loss": 0.6892, + "step": 9174 + }, + { + "epoch": 1.31, + "grad_norm": 9.669587887733678, + "learning_rate": 2.82099979964226e-06, + "loss": 0.7611, + "step": 9175 + }, + { + "epoch": 1.31, + "grad_norm": 
11.686467626913473, + "learning_rate": 2.8199605391973894e-06, + "loss": 0.7506, + "step": 9176 + }, + { + "epoch": 1.31, + "grad_norm": 8.748818812621039, + "learning_rate": 2.818921395029758e-06, + "loss": 0.7086, + "step": 9177 + }, + { + "epoch": 1.31, + "grad_norm": 9.559102269862858, + "learning_rate": 2.8178823671947865e-06, + "loss": 0.7249, + "step": 9178 + }, + { + "epoch": 1.31, + "grad_norm": 7.508189551515116, + "learning_rate": 2.8168434557478967e-06, + "loss": 0.7558, + "step": 9179 + }, + { + "epoch": 1.31, + "grad_norm": 9.540417468771109, + "learning_rate": 2.815804660744499e-06, + "loss": 0.7711, + "step": 9180 + }, + { + "epoch": 1.31, + "grad_norm": 12.144804918403016, + "learning_rate": 2.8147659822400023e-06, + "loss": 0.7702, + "step": 9181 + }, + { + "epoch": 1.31, + "grad_norm": 9.263408957807373, + "learning_rate": 2.8137274202898058e-06, + "loss": 0.7143, + "step": 9182 + }, + { + "epoch": 1.31, + "grad_norm": 12.029054879943626, + "learning_rate": 2.8126889749493014e-06, + "loss": 0.7738, + "step": 9183 + }, + { + "epoch": 1.31, + "grad_norm": 10.179420638730939, + "learning_rate": 2.8116506462738784e-06, + "loss": 0.761, + "step": 9184 + }, + { + "epoch": 1.31, + "grad_norm": 9.56900505715674, + "learning_rate": 2.8106124343189204e-06, + "loss": 0.7007, + "step": 9185 + }, + { + "epoch": 1.31, + "grad_norm": 11.212423476023089, + "learning_rate": 2.8095743391397993e-06, + "loss": 0.698, + "step": 9186 + }, + { + "epoch": 1.31, + "grad_norm": 10.821136232669518, + "learning_rate": 2.808536360791888e-06, + "loss": 0.7458, + "step": 9187 + }, + { + "epoch": 1.31, + "grad_norm": 10.534537815343116, + "learning_rate": 2.8074984993305465e-06, + "loss": 0.7002, + "step": 9188 + }, + { + "epoch": 1.31, + "grad_norm": 7.141411358171329, + "learning_rate": 2.806460754811131e-06, + "loss": 0.7847, + "step": 9189 + }, + { + "epoch": 1.31, + "grad_norm": 7.331619080473323, + "learning_rate": 2.805423127288995e-06, + "loss": 0.7396, + "step": 9190 + 
}, + { + "epoch": 1.31, + "grad_norm": 10.025634815497533, + "learning_rate": 2.8043856168194782e-06, + "loss": 0.6941, + "step": 9191 + }, + { + "epoch": 1.31, + "grad_norm": 9.531402145009546, + "learning_rate": 2.8033482234579234e-06, + "loss": 0.6888, + "step": 9192 + }, + { + "epoch": 1.31, + "grad_norm": 7.813521643520956, + "learning_rate": 2.802310947259658e-06, + "loss": 0.684, + "step": 9193 + }, + { + "epoch": 1.31, + "grad_norm": 7.602323064594567, + "learning_rate": 2.8012737882800113e-06, + "loss": 0.7089, + "step": 9194 + }, + { + "epoch": 1.31, + "grad_norm": 14.419132229682289, + "learning_rate": 2.8002367465743006e-06, + "loss": 0.771, + "step": 9195 + }, + { + "epoch": 1.31, + "grad_norm": 8.476717541023591, + "learning_rate": 2.7991998221978366e-06, + "loss": 0.8195, + "step": 9196 + }, + { + "epoch": 1.31, + "grad_norm": 8.156595549787554, + "learning_rate": 2.79816301520593e-06, + "loss": 0.7679, + "step": 9197 + }, + { + "epoch": 1.31, + "grad_norm": 12.05876824623934, + "learning_rate": 2.7971263256538772e-06, + "loss": 0.7327, + "step": 9198 + }, + { + "epoch": 1.31, + "grad_norm": 9.412614975147894, + "learning_rate": 2.796089753596975e-06, + "loss": 0.7758, + "step": 9199 + }, + { + "epoch": 1.31, + "grad_norm": 8.825624826632653, + "learning_rate": 2.795053299090512e-06, + "loss": 0.7381, + "step": 9200 + }, + { + "epoch": 1.31, + "grad_norm": 9.736244769819262, + "learning_rate": 2.7940169621897694e-06, + "loss": 0.692, + "step": 9201 + }, + { + "epoch": 1.31, + "grad_norm": 9.095864098397746, + "learning_rate": 2.792980742950021e-06, + "loss": 0.7631, + "step": 9202 + }, + { + "epoch": 1.31, + "grad_norm": 11.87775492771383, + "learning_rate": 2.791944641426535e-06, + "loss": 0.7767, + "step": 9203 + }, + { + "epoch": 1.31, + "grad_norm": 7.060535274467395, + "learning_rate": 2.790908657674575e-06, + "loss": 0.7486, + "step": 9204 + }, + { + "epoch": 1.31, + "grad_norm": 8.902909149774155, + "learning_rate": 2.789872791749401e-06, + 
"loss": 0.7313, + "step": 9205 + }, + { + "epoch": 1.31, + "grad_norm": 9.005314956147593, + "learning_rate": 2.7888370437062574e-06, + "loss": 0.7171, + "step": 9206 + }, + { + "epoch": 1.31, + "grad_norm": 7.56334268274144, + "learning_rate": 2.7878014136003958e-06, + "loss": 0.8116, + "step": 9207 + }, + { + "epoch": 1.31, + "grad_norm": 7.290348981628766, + "learning_rate": 2.786765901487045e-06, + "loss": 0.7649, + "step": 9208 + }, + { + "epoch": 1.31, + "grad_norm": 8.911379130336208, + "learning_rate": 2.78573050742144e-06, + "loss": 0.7322, + "step": 9209 + }, + { + "epoch": 1.31, + "grad_norm": 8.069026707402607, + "learning_rate": 2.7846952314588085e-06, + "loss": 0.7449, + "step": 9210 + }, + { + "epoch": 1.31, + "grad_norm": 11.319175530864182, + "learning_rate": 2.783660073654365e-06, + "loss": 0.7052, + "step": 9211 + }, + { + "epoch": 1.31, + "grad_norm": 6.675614532035263, + "learning_rate": 2.782625034063326e-06, + "loss": 0.6681, + "step": 9212 + }, + { + "epoch": 1.31, + "grad_norm": 9.105930007737939, + "learning_rate": 2.781590112740896e-06, + "loss": 0.7857, + "step": 9213 + }, + { + "epoch": 1.31, + "grad_norm": 8.134572752522793, + "learning_rate": 2.7805553097422716e-06, + "loss": 0.7276, + "step": 9214 + }, + { + "epoch": 1.31, + "grad_norm": 8.03131162057318, + "learning_rate": 2.779520625122651e-06, + "loss": 0.7228, + "step": 9215 + }, + { + "epoch": 1.31, + "grad_norm": 6.6639307427961745, + "learning_rate": 2.778486058937219e-06, + "loss": 0.801, + "step": 9216 + }, + { + "epoch": 1.31, + "grad_norm": 7.768913770054748, + "learning_rate": 2.7774516112411575e-06, + "loss": 0.6818, + "step": 9217 + }, + { + "epoch": 1.31, + "grad_norm": 9.606931284537033, + "learning_rate": 2.776417282089639e-06, + "loss": 0.7111, + "step": 9218 + }, + { + "epoch": 1.31, + "grad_norm": 8.376065610698804, + "learning_rate": 2.7753830715378357e-06, + "loss": 0.7824, + "step": 9219 + }, + { + "epoch": 1.31, + "grad_norm": 11.204284704230725, + 
"learning_rate": 2.7743489796409064e-06, + "loss": 0.7473, + "step": 9220 + }, + { + "epoch": 1.32, + "grad_norm": 8.211617967314494, + "learning_rate": 2.773315006454006e-06, + "loss": 0.8194, + "step": 9221 + }, + { + "epoch": 1.32, + "grad_norm": 8.957384934425237, + "learning_rate": 2.7722811520322867e-06, + "loss": 0.6878, + "step": 9222 + }, + { + "epoch": 1.32, + "grad_norm": 10.579415800304387, + "learning_rate": 2.7712474164308877e-06, + "loss": 0.7119, + "step": 9223 + }, + { + "epoch": 1.32, + "grad_norm": 8.944354244912219, + "learning_rate": 2.770213799704948e-06, + "loss": 0.7571, + "step": 9224 + }, + { + "epoch": 1.32, + "grad_norm": 5.517061666944554, + "learning_rate": 2.7691803019095996e-06, + "loss": 0.6941, + "step": 9225 + }, + { + "epoch": 1.32, + "grad_norm": 8.866046614132031, + "learning_rate": 2.7681469230999646e-06, + "loss": 0.7535, + "step": 9226 + }, + { + "epoch": 1.32, + "grad_norm": 10.474258350229933, + "learning_rate": 2.767113663331158e-06, + "loss": 0.7906, + "step": 9227 + }, + { + "epoch": 1.32, + "grad_norm": 10.44532664208959, + "learning_rate": 2.7660805226582953e-06, + "loss": 0.6611, + "step": 9228 + }, + { + "epoch": 1.32, + "grad_norm": 7.433865671822537, + "learning_rate": 2.7650475011364774e-06, + "loss": 0.761, + "step": 9229 + }, + { + "epoch": 1.32, + "grad_norm": 10.711782941564342, + "learning_rate": 2.7640145988208066e-06, + "loss": 0.7704, + "step": 9230 + }, + { + "epoch": 1.32, + "grad_norm": 6.033095723546049, + "learning_rate": 2.762981815766372e-06, + "loss": 0.7649, + "step": 9231 + }, + { + "epoch": 1.32, + "grad_norm": 7.98183653796919, + "learning_rate": 2.7619491520282614e-06, + "loss": 0.6906, + "step": 9232 + }, + { + "epoch": 1.32, + "grad_norm": 5.412699000995384, + "learning_rate": 2.7609166076615547e-06, + "loss": 0.7159, + "step": 9233 + }, + { + "epoch": 1.32, + "grad_norm": 9.276506142650288, + "learning_rate": 2.7598841827213206e-06, + "loss": 0.7507, + "step": 9234 + }, + { + "epoch": 
1.32, + "grad_norm": 8.334909772443524, + "learning_rate": 2.7588518772626312e-06, + "loss": 0.7836, + "step": 9235 + }, + { + "epoch": 1.32, + "grad_norm": 8.521575104473662, + "learning_rate": 2.757819691340542e-06, + "loss": 0.6633, + "step": 9236 + }, + { + "epoch": 1.32, + "grad_norm": 10.77330022168892, + "learning_rate": 2.75678762501011e-06, + "loss": 0.7463, + "step": 9237 + }, + { + "epoch": 1.32, + "grad_norm": 9.532728090980717, + "learning_rate": 2.7557556783263833e-06, + "loss": 0.7246, + "step": 9238 + }, + { + "epoch": 1.32, + "grad_norm": 10.581167092400689, + "learning_rate": 2.7547238513444018e-06, + "loss": 0.7084, + "step": 9239 + }, + { + "epoch": 1.32, + "grad_norm": 7.852236032242897, + "learning_rate": 2.753692144119201e-06, + "loss": 0.7294, + "step": 9240 + }, + { + "epoch": 1.32, + "grad_norm": 10.408956789470599, + "learning_rate": 2.7526605567058063e-06, + "loss": 0.7476, + "step": 9241 + }, + { + "epoch": 1.32, + "grad_norm": 10.38532213861741, + "learning_rate": 2.7516290891592418e-06, + "loss": 0.7264, + "step": 9242 + }, + { + "epoch": 1.32, + "grad_norm": 7.477320552727444, + "learning_rate": 2.7505977415345254e-06, + "loss": 0.7258, + "step": 9243 + }, + { + "epoch": 1.32, + "grad_norm": 9.9271087912453, + "learning_rate": 2.749566513886662e-06, + "loss": 0.7418, + "step": 9244 + }, + { + "epoch": 1.32, + "grad_norm": 7.878477331627323, + "learning_rate": 2.748535406270659e-06, + "loss": 0.7547, + "step": 9245 + }, + { + "epoch": 1.32, + "grad_norm": 9.955398536679919, + "learning_rate": 2.74750441874151e-06, + "loss": 0.7412, + "step": 9246 + }, + { + "epoch": 1.32, + "grad_norm": 7.409428896362243, + "learning_rate": 2.746473551354204e-06, + "loss": 0.6834, + "step": 9247 + }, + { + "epoch": 1.32, + "grad_norm": 10.56707195193563, + "learning_rate": 2.7454428041637284e-06, + "loss": 0.7903, + "step": 9248 + }, + { + "epoch": 1.32, + "grad_norm": 8.92358780599877, + "learning_rate": 2.744412177225055e-06, + "loss": 0.7701, + 
"step": 9249 + }, + { + "epoch": 1.32, + "grad_norm": 8.49150556587325, + "learning_rate": 2.74338167059316e-06, + "loss": 0.6953, + "step": 9250 + }, + { + "epoch": 1.32, + "grad_norm": 10.889158633122834, + "learning_rate": 2.742351284323005e-06, + "loss": 0.7298, + "step": 9251 + }, + { + "epoch": 1.32, + "grad_norm": 6.1803973292175725, + "learning_rate": 2.741321018469547e-06, + "loss": 0.6749, + "step": 9252 + }, + { + "epoch": 1.32, + "grad_norm": 10.726601292449464, + "learning_rate": 2.7402908730877404e-06, + "loss": 0.7092, + "step": 9253 + }, + { + "epoch": 1.32, + "grad_norm": 10.01818304529812, + "learning_rate": 2.7392608482325267e-06, + "loss": 0.697, + "step": 9254 + }, + { + "epoch": 1.32, + "grad_norm": 8.996508296776971, + "learning_rate": 2.7382309439588485e-06, + "loss": 0.7609, + "step": 9255 + }, + { + "epoch": 1.32, + "grad_norm": 8.364464052947069, + "learning_rate": 2.7372011603216343e-06, + "loss": 0.7283, + "step": 9256 + }, + { + "epoch": 1.32, + "grad_norm": 9.692113712914725, + "learning_rate": 2.7361714973758137e-06, + "loss": 0.6747, + "step": 9257 + }, + { + "epoch": 1.32, + "grad_norm": 8.322463581351297, + "learning_rate": 2.7351419551763037e-06, + "loss": 0.793, + "step": 9258 + }, + { + "epoch": 1.32, + "grad_norm": 6.91031373820738, + "learning_rate": 2.734112533778015e-06, + "loss": 0.7482, + "step": 9259 + }, + { + "epoch": 1.32, + "grad_norm": 7.441300090640494, + "learning_rate": 2.73308323323586e-06, + "loss": 0.7263, + "step": 9260 + }, + { + "epoch": 1.32, + "grad_norm": 10.18821478384967, + "learning_rate": 2.7320540536047325e-06, + "loss": 0.7666, + "step": 9261 + }, + { + "epoch": 1.32, + "grad_norm": 10.815745352073735, + "learning_rate": 2.7310249949395285e-06, + "loss": 0.7362, + "step": 9262 + }, + { + "epoch": 1.32, + "grad_norm": 6.140081740990183, + "learning_rate": 2.7299960572951386e-06, + "loss": 0.7264, + "step": 9263 + }, + { + "epoch": 1.32, + "grad_norm": 7.938849412964146, + "learning_rate": 
2.7289672407264394e-06, + "loss": 0.7073, + "step": 9264 + }, + { + "epoch": 1.32, + "grad_norm": 5.572973084089924, + "learning_rate": 2.727938545288307e-06, + "loss": 0.8004, + "step": 9265 + }, + { + "epoch": 1.32, + "grad_norm": 7.514844105738841, + "learning_rate": 2.726909971035606e-06, + "loss": 0.7103, + "step": 9266 + }, + { + "epoch": 1.32, + "grad_norm": 8.298275063001762, + "learning_rate": 2.725881518023199e-06, + "loss": 0.6766, + "step": 9267 + }, + { + "epoch": 1.32, + "grad_norm": 10.894584572042607, + "learning_rate": 2.7248531863059447e-06, + "loss": 0.7502, + "step": 9268 + }, + { + "epoch": 1.32, + "grad_norm": 9.156661186727476, + "learning_rate": 2.7238249759386863e-06, + "loss": 0.6989, + "step": 9269 + }, + { + "epoch": 1.32, + "grad_norm": 8.423650717072247, + "learning_rate": 2.7227968869762698e-06, + "loss": 0.7321, + "step": 9270 + }, + { + "epoch": 1.32, + "grad_norm": 8.670378751570869, + "learning_rate": 2.72176891947353e-06, + "loss": 0.7083, + "step": 9271 + }, + { + "epoch": 1.32, + "grad_norm": 11.094360754139288, + "learning_rate": 2.720741073485291e-06, + "loss": 0.7615, + "step": 9272 + }, + { + "epoch": 1.32, + "grad_norm": 7.758447453025755, + "learning_rate": 2.7197133490663807e-06, + "loss": 0.7865, + "step": 9273 + }, + { + "epoch": 1.32, + "grad_norm": 8.737643848573919, + "learning_rate": 2.718685746271612e-06, + "loss": 0.6806, + "step": 9274 + }, + { + "epoch": 1.32, + "grad_norm": 11.544387512307061, + "learning_rate": 2.7176582651557947e-06, + "loss": 0.7651, + "step": 9275 + }, + { + "epoch": 1.32, + "grad_norm": 7.5224647780948635, + "learning_rate": 2.7166309057737355e-06, + "loss": 0.8758, + "step": 9276 + }, + { + "epoch": 1.32, + "grad_norm": 9.504485435967304, + "learning_rate": 2.715603668180228e-06, + "loss": 0.7308, + "step": 9277 + }, + { + "epoch": 1.32, + "grad_norm": 8.996576878657475, + "learning_rate": 2.7145765524300614e-06, + "loss": 0.7265, + "step": 9278 + }, + { + "epoch": 1.32, + "grad_norm": 
8.598089177545802, + "learning_rate": 2.713549558578018e-06, + "loss": 0.7336, + "step": 9279 + }, + { + "epoch": 1.32, + "grad_norm": 9.99560179877916, + "learning_rate": 2.712522686678877e-06, + "loss": 0.7141, + "step": 9280 + }, + { + "epoch": 1.32, + "grad_norm": 9.4203341921673, + "learning_rate": 2.711495936787411e-06, + "loss": 0.7567, + "step": 9281 + }, + { + "epoch": 1.32, + "grad_norm": 9.286066732715339, + "learning_rate": 2.7104693089583794e-06, + "loss": 0.7489, + "step": 9282 + }, + { + "epoch": 1.32, + "grad_norm": 10.563997939414788, + "learning_rate": 2.7094428032465435e-06, + "loss": 0.8006, + "step": 9283 + }, + { + "epoch": 1.32, + "grad_norm": 8.543300483688878, + "learning_rate": 2.708416419706653e-06, + "loss": 0.7782, + "step": 9284 + }, + { + "epoch": 1.32, + "grad_norm": 10.388478771088472, + "learning_rate": 2.7073901583934504e-06, + "loss": 0.7549, + "step": 9285 + }, + { + "epoch": 1.32, + "grad_norm": 12.331219842424582, + "learning_rate": 2.7063640193616768e-06, + "loss": 0.7818, + "step": 9286 + }, + { + "epoch": 1.32, + "grad_norm": 8.054811386463157, + "learning_rate": 2.7053380026660603e-06, + "loss": 0.717, + "step": 9287 + }, + { + "epoch": 1.32, + "grad_norm": 9.978433793956315, + "learning_rate": 2.704312108361329e-06, + "loss": 0.704, + "step": 9288 + }, + { + "epoch": 1.32, + "grad_norm": 10.274237827944969, + "learning_rate": 2.7032863365021982e-06, + "loss": 0.7445, + "step": 9289 + }, + { + "epoch": 1.32, + "grad_norm": 11.431124766561434, + "learning_rate": 2.7022606871433834e-06, + "loss": 0.7599, + "step": 9290 + }, + { + "epoch": 1.33, + "grad_norm": 8.062759183888405, + "learning_rate": 2.701235160339587e-06, + "loss": 0.7012, + "step": 9291 + }, + { + "epoch": 1.33, + "grad_norm": 11.05607020652283, + "learning_rate": 2.700209756145507e-06, + "loss": 0.75, + "step": 9292 + }, + { + "epoch": 1.33, + "grad_norm": 9.042054500899685, + "learning_rate": 2.6991844746158393e-06, + "loss": 0.7616, + "step": 9293 + }, + { 
+ "epoch": 1.33, + "grad_norm": 6.642580790077184, + "learning_rate": 2.6981593158052655e-06, + "loss": 0.6683, + "step": 9294 + }, + { + "epoch": 1.33, + "grad_norm": 7.5310668940134065, + "learning_rate": 2.6971342797684676e-06, + "loss": 0.7681, + "step": 9295 + }, + { + "epoch": 1.33, + "grad_norm": 9.884842781828027, + "learning_rate": 2.696109366560118e-06, + "loss": 0.738, + "step": 9296 + }, + { + "epoch": 1.33, + "grad_norm": 9.518273678107255, + "learning_rate": 2.69508457623488e-06, + "loss": 0.7247, + "step": 9297 + }, + { + "epoch": 1.33, + "grad_norm": 9.5869566378569, + "learning_rate": 2.6940599088474162e-06, + "loss": 0.802, + "step": 9298 + }, + { + "epoch": 1.33, + "grad_norm": 8.905344585814156, + "learning_rate": 2.6930353644523764e-06, + "loss": 0.7703, + "step": 9299 + }, + { + "epoch": 1.33, + "grad_norm": 13.430348940383817, + "learning_rate": 2.692010943104409e-06, + "loss": 0.7506, + "step": 9300 + }, + { + "epoch": 1.33, + "grad_norm": 12.135261113778784, + "learning_rate": 2.6909866448581557e-06, + "loss": 0.6455, + "step": 9301 + }, + { + "epoch": 1.33, + "grad_norm": 7.429817750527201, + "learning_rate": 2.689962469768247e-06, + "loss": 0.7102, + "step": 9302 + }, + { + "epoch": 1.33, + "grad_norm": 10.619806563740116, + "learning_rate": 2.6889384178893106e-06, + "loss": 0.7756, + "step": 9303 + }, + { + "epoch": 1.33, + "grad_norm": 9.862871287767828, + "learning_rate": 2.687914489275964e-06, + "loss": 0.7175, + "step": 9304 + }, + { + "epoch": 1.33, + "grad_norm": 8.253269381894597, + "learning_rate": 2.6868906839828227e-06, + "loss": 0.7356, + "step": 9305 + }, + { + "epoch": 1.33, + "grad_norm": 7.080531354538172, + "learning_rate": 2.685867002064496e-06, + "loss": 0.7047, + "step": 9306 + }, + { + "epoch": 1.33, + "grad_norm": 9.43419542261013, + "learning_rate": 2.68484344357558e-06, + "loss": 0.7957, + "step": 9307 + }, + { + "epoch": 1.33, + "grad_norm": 9.009976298920359, + "learning_rate": 2.683820008570672e-06, + "loss": 
0.7695, + "step": 9308 + }, + { + "epoch": 1.33, + "grad_norm": 9.062562711615358, + "learning_rate": 2.6827966971043584e-06, + "loss": 0.685, + "step": 9309 + }, + { + "epoch": 1.33, + "grad_norm": 10.807337719875308, + "learning_rate": 2.6817735092312157e-06, + "loss": 0.7666, + "step": 9310 + }, + { + "epoch": 1.33, + "grad_norm": 9.111702084654612, + "learning_rate": 2.680750445005824e-06, + "loss": 0.7123, + "step": 9311 + }, + { + "epoch": 1.33, + "grad_norm": 7.61184550100747, + "learning_rate": 2.6797275044827463e-06, + "loss": 0.6417, + "step": 9312 + }, + { + "epoch": 1.33, + "grad_norm": 9.646989030283972, + "learning_rate": 2.6787046877165446e-06, + "loss": 0.7211, + "step": 9313 + }, + { + "epoch": 1.33, + "grad_norm": 9.577308415419408, + "learning_rate": 2.6776819947617756e-06, + "loss": 0.765, + "step": 9314 + }, + { + "epoch": 1.33, + "grad_norm": 9.536145243628066, + "learning_rate": 2.6766594256729844e-06, + "loss": 0.7437, + "step": 9315 + }, + { + "epoch": 1.33, + "grad_norm": 8.670029125100253, + "learning_rate": 2.6756369805047124e-06, + "loss": 0.7477, + "step": 9316 + }, + { + "epoch": 1.33, + "grad_norm": 9.046818539562114, + "learning_rate": 2.6746146593114922e-06, + "loss": 0.7547, + "step": 9317 + }, + { + "epoch": 1.33, + "grad_norm": 7.281969671893365, + "learning_rate": 2.673592462147854e-06, + "loss": 0.7645, + "step": 9318 + }, + { + "epoch": 1.33, + "grad_norm": 11.015856118379558, + "learning_rate": 2.6725703890683184e-06, + "loss": 0.7028, + "step": 9319 + }, + { + "epoch": 1.33, + "grad_norm": 13.002274302170171, + "learning_rate": 2.6715484401273994e-06, + "loss": 0.7119, + "step": 9320 + }, + { + "epoch": 1.33, + "grad_norm": 10.782326580249586, + "learning_rate": 2.6705266153796073e-06, + "loss": 0.6957, + "step": 9321 + }, + { + "epoch": 1.33, + "grad_norm": 7.387550822494561, + "learning_rate": 2.6695049148794407e-06, + "loss": 0.6925, + "step": 9322 + }, + { + "epoch": 1.33, + "grad_norm": 9.238729868865159, + 
"learning_rate": 2.668483338681393e-06, + "loss": 0.7763, + "step": 9323 + }, + { + "epoch": 1.33, + "grad_norm": 9.441230865920737, + "learning_rate": 2.6674618868399567e-06, + "loss": 0.7351, + "step": 9324 + }, + { + "epoch": 1.33, + "grad_norm": 11.727941617217022, + "learning_rate": 2.666440559409608e-06, + "loss": 0.6718, + "step": 9325 + }, + { + "epoch": 1.33, + "grad_norm": 9.965595363789905, + "learning_rate": 2.6654193564448272e-06, + "loss": 0.7435, + "step": 9326 + }, + { + "epoch": 1.33, + "grad_norm": 11.02577156727273, + "learning_rate": 2.6643982780000764e-06, + "loss": 0.7397, + "step": 9327 + }, + { + "epoch": 1.33, + "grad_norm": 6.4625327266291555, + "learning_rate": 2.663377324129823e-06, + "loss": 0.7823, + "step": 9328 + }, + { + "epoch": 1.33, + "grad_norm": 7.795209638232543, + "learning_rate": 2.6623564948885194e-06, + "loss": 0.7134, + "step": 9329 + }, + { + "epoch": 1.33, + "grad_norm": 8.520980622015095, + "learning_rate": 2.661335790330611e-06, + "loss": 0.788, + "step": 9330 + }, + { + "epoch": 1.33, + "grad_norm": 9.844478967510065, + "learning_rate": 2.6603152105105445e-06, + "loss": 0.6818, + "step": 9331 + }, + { + "epoch": 1.33, + "grad_norm": 8.132415583299723, + "learning_rate": 2.6592947554827497e-06, + "loss": 0.7068, + "step": 9332 + }, + { + "epoch": 1.33, + "grad_norm": 8.28447073433404, + "learning_rate": 2.6582744253016567e-06, + "loss": 0.7023, + "step": 9333 + }, + { + "epoch": 1.33, + "grad_norm": 5.697905786667871, + "learning_rate": 2.657254220021692e-06, + "loss": 0.6881, + "step": 9334 + }, + { + "epoch": 1.33, + "grad_norm": 7.3457144664666965, + "learning_rate": 2.656234139697262e-06, + "loss": 0.6954, + "step": 9335 + }, + { + "epoch": 1.33, + "grad_norm": 6.3093148227787825, + "learning_rate": 2.6552141843827816e-06, + "loss": 0.7735, + "step": 9336 + }, + { + "epoch": 1.33, + "grad_norm": 7.53985676613315, + "learning_rate": 2.6541943541326475e-06, + "loss": 0.7735, + "step": 9337 + }, + { + "epoch": 1.33, 
+ "grad_norm": 8.83178475178237, + "learning_rate": 2.653174649001257e-06, + "loss": 0.6916, + "step": 9338 + }, + { + "epoch": 1.33, + "grad_norm": 8.263498947507744, + "learning_rate": 2.6521550690430002e-06, + "loss": 0.6836, + "step": 9339 + }, + { + "epoch": 1.33, + "grad_norm": 91.85896798835057, + "learning_rate": 2.6511356143122567e-06, + "loss": 0.841, + "step": 9340 + }, + { + "epoch": 1.33, + "grad_norm": 8.106600691329948, + "learning_rate": 2.6501162848634023e-06, + "loss": 0.8057, + "step": 9341 + }, + { + "epoch": 1.33, + "grad_norm": 10.910253547490436, + "learning_rate": 2.649097080750801e-06, + "loss": 0.7343, + "step": 9342 + }, + { + "epoch": 1.33, + "grad_norm": 21.243969699781896, + "learning_rate": 2.6480780020288183e-06, + "loss": 0.7292, + "step": 9343 + }, + { + "epoch": 1.33, + "grad_norm": 7.779349953840036, + "learning_rate": 2.64705904875181e-06, + "loss": 0.7554, + "step": 9344 + }, + { + "epoch": 1.33, + "grad_norm": 8.809870503030451, + "learning_rate": 2.646040220974121e-06, + "loss": 0.7513, + "step": 9345 + }, + { + "epoch": 1.33, + "grad_norm": 8.793764854247057, + "learning_rate": 2.645021518750096e-06, + "loss": 0.7741, + "step": 9346 + }, + { + "epoch": 1.33, + "grad_norm": 10.890960901151479, + "learning_rate": 2.6440029421340687e-06, + "loss": 0.7667, + "step": 9347 + }, + { + "epoch": 1.33, + "grad_norm": 10.262334245451484, + "learning_rate": 2.642984491180364e-06, + "loss": 0.7712, + "step": 9348 + }, + { + "epoch": 1.33, + "grad_norm": 7.0962605949335, + "learning_rate": 2.641966165943308e-06, + "loss": 0.7411, + "step": 9349 + }, + { + "epoch": 1.33, + "grad_norm": 8.98552291146463, + "learning_rate": 2.6409479664772104e-06, + "loss": 0.7516, + "step": 9350 + }, + { + "epoch": 1.33, + "grad_norm": 6.994857098650199, + "learning_rate": 2.6399298928363844e-06, + "loss": 0.6861, + "step": 9351 + }, + { + "epoch": 1.33, + "grad_norm": 11.879390500952233, + "learning_rate": 2.6389119450751266e-06, + "loss": 0.7635, + 
"step": 9352 + }, + { + "epoch": 1.33, + "grad_norm": 10.226849431010058, + "learning_rate": 2.6378941232477353e-06, + "loss": 0.745, + "step": 9353 + }, + { + "epoch": 1.33, + "grad_norm": 6.37065179732074, + "learning_rate": 2.6368764274084963e-06, + "loss": 0.7194, + "step": 9354 + }, + { + "epoch": 1.33, + "grad_norm": 10.890449838969086, + "learning_rate": 2.6358588576116893e-06, + "loss": 0.7865, + "step": 9355 + }, + { + "epoch": 1.33, + "grad_norm": 9.97379575980349, + "learning_rate": 2.6348414139115895e-06, + "loss": 0.7256, + "step": 9356 + }, + { + "epoch": 1.33, + "grad_norm": 10.557765975181507, + "learning_rate": 2.6338240963624685e-06, + "loss": 0.724, + "step": 9357 + }, + { + "epoch": 1.33, + "grad_norm": 12.176770232870249, + "learning_rate": 2.6328069050185813e-06, + "loss": 0.7585, + "step": 9358 + }, + { + "epoch": 1.33, + "grad_norm": 7.876036486298675, + "learning_rate": 2.631789839934186e-06, + "loss": 0.7003, + "step": 9359 + }, + { + "epoch": 1.33, + "grad_norm": 9.29050168608121, + "learning_rate": 2.630772901163529e-06, + "loss": 0.6684, + "step": 9360 + }, + { + "epoch": 1.33, + "grad_norm": 8.065951930213654, + "learning_rate": 2.629756088760849e-06, + "loss": 0.6545, + "step": 9361 + }, + { + "epoch": 1.34, + "grad_norm": 8.467138139637854, + "learning_rate": 2.6287394027803837e-06, + "loss": 0.7053, + "step": 9362 + }, + { + "epoch": 1.34, + "grad_norm": 11.410030232219347, + "learning_rate": 2.6277228432763557e-06, + "loss": 0.7061, + "step": 9363 + }, + { + "epoch": 1.34, + "grad_norm": 8.239114725139943, + "learning_rate": 2.6267064103029904e-06, + "loss": 0.6731, + "step": 9364 + }, + { + "epoch": 1.34, + "grad_norm": 7.305266561846734, + "learning_rate": 2.6256901039144965e-06, + "loss": 0.7729, + "step": 9365 + }, + { + "epoch": 1.34, + "grad_norm": 6.837593416459985, + "learning_rate": 2.6246739241650866e-06, + "loss": 0.7387, + "step": 9366 + }, + { + "epoch": 1.34, + "grad_norm": 10.779791557436507, + "learning_rate": 
2.6236578711089565e-06, + "loss": 0.7407, + "step": 9367 + }, + { + "epoch": 1.34, + "grad_norm": 10.182024435655888, + "learning_rate": 2.6226419448003e-06, + "loss": 0.7039, + "step": 9368 + }, + { + "epoch": 1.34, + "grad_norm": 7.501370268813522, + "learning_rate": 2.6216261452933062e-06, + "loss": 0.7365, + "step": 9369 + }, + { + "epoch": 1.34, + "grad_norm": 7.817113240287696, + "learning_rate": 2.620610472642152e-06, + "loss": 0.698, + "step": 9370 + }, + { + "epoch": 1.34, + "grad_norm": 6.0718063364632835, + "learning_rate": 2.6195949269010113e-06, + "loss": 0.7052, + "step": 9371 + }, + { + "epoch": 1.34, + "grad_norm": 12.530993427168662, + "learning_rate": 2.618579508124054e-06, + "loss": 0.7481, + "step": 9372 + }, + { + "epoch": 1.34, + "grad_norm": 9.256786673560935, + "learning_rate": 2.617564216365437e-06, + "loss": 0.7572, + "step": 9373 + }, + { + "epoch": 1.34, + "grad_norm": 8.173299810944876, + "learning_rate": 2.616549051679313e-06, + "loss": 0.67, + "step": 9374 + }, + { + "epoch": 1.34, + "grad_norm": 10.668928027826446, + "learning_rate": 2.615534014119826e-06, + "loss": 0.7499, + "step": 9375 + }, + { + "epoch": 1.34, + "grad_norm": 8.491103862718829, + "learning_rate": 2.614519103741118e-06, + "loss": 0.7548, + "step": 9376 + }, + { + "epoch": 1.34, + "grad_norm": 8.900090982894282, + "learning_rate": 2.613504320597322e-06, + "loss": 0.7898, + "step": 9377 + }, + { + "epoch": 1.34, + "grad_norm": 8.503803290306541, + "learning_rate": 2.6124896647425635e-06, + "loss": 0.7125, + "step": 9378 + }, + { + "epoch": 1.34, + "grad_norm": 7.419644260844053, + "learning_rate": 2.6114751362309604e-06, + "loss": 0.6827, + "step": 9379 + }, + { + "epoch": 1.34, + "grad_norm": 11.450432932661457, + "learning_rate": 2.6104607351166235e-06, + "loss": 0.7165, + "step": 9380 + }, + { + "epoch": 1.34, + "grad_norm": 9.831222585192387, + "learning_rate": 2.6094464614536584e-06, + "loss": 0.7427, + "step": 9381 + }, + { + "epoch": 1.34, + "grad_norm": 
10.040372524925083, + "learning_rate": 2.6084323152961684e-06, + "loss": 0.7519, + "step": 9382 + }, + { + "epoch": 1.34, + "grad_norm": 10.020369404900258, + "learning_rate": 2.6074182966982386e-06, + "loss": 0.7438, + "step": 9383 + }, + { + "epoch": 1.34, + "grad_norm": 6.983308899813829, + "learning_rate": 2.6064044057139598e-06, + "loss": 0.7818, + "step": 9384 + }, + { + "epoch": 1.34, + "grad_norm": 9.063665522250064, + "learning_rate": 2.605390642397407e-06, + "loss": 0.7281, + "step": 9385 + }, + { + "epoch": 1.34, + "grad_norm": 8.845540193966082, + "learning_rate": 2.6043770068026495e-06, + "loss": 0.7942, + "step": 9386 + }, + { + "epoch": 1.34, + "grad_norm": 7.3777837690030665, + "learning_rate": 2.603363498983757e-06, + "loss": 0.7511, + "step": 9387 + }, + { + "epoch": 1.34, + "grad_norm": 10.328739064053543, + "learning_rate": 2.602350118994782e-06, + "loss": 0.7767, + "step": 9388 + }, + { + "epoch": 1.34, + "grad_norm": 7.53446347482629, + "learning_rate": 2.60133686688978e-06, + "loss": 0.7116, + "step": 9389 + }, + { + "epoch": 1.34, + "grad_norm": 8.04282905366094, + "learning_rate": 2.60032374272279e-06, + "loss": 0.8103, + "step": 9390 + }, + { + "epoch": 1.34, + "grad_norm": 6.3379280846360135, + "learning_rate": 2.599310746547855e-06, + "loss": 0.6563, + "step": 9391 + }, + { + "epoch": 1.34, + "grad_norm": 10.577693423705048, + "learning_rate": 2.5982978784190017e-06, + "loss": 0.8216, + "step": 9392 + }, + { + "epoch": 1.34, + "grad_norm": 7.587180717555581, + "learning_rate": 2.5972851383902533e-06, + "loss": 0.7068, + "step": 9393 + }, + { + "epoch": 1.34, + "grad_norm": 8.404803574047742, + "learning_rate": 2.5962725265156284e-06, + "loss": 0.7426, + "step": 9394 + }, + { + "epoch": 1.34, + "grad_norm": 7.9398075975810745, + "learning_rate": 2.595260042849135e-06, + "loss": 0.6865, + "step": 9395 + }, + { + "epoch": 1.34, + "grad_norm": 10.19269029917816, + "learning_rate": 2.594247687444777e-06, + "loss": 0.7549, + "step": 9396 + }, 
+ { + "epoch": 1.34, + "grad_norm": 8.952962206389577, + "learning_rate": 2.593235460356553e-06, + "loss": 0.8182, + "step": 9397 + }, + { + "epoch": 1.34, + "grad_norm": 8.436859958502337, + "learning_rate": 2.59222336163845e-06, + "loss": 0.7089, + "step": 9398 + }, + { + "epoch": 1.34, + "grad_norm": 6.7847566559095664, + "learning_rate": 2.591211391344449e-06, + "loss": 0.7125, + "step": 9399 + }, + { + "epoch": 1.34, + "grad_norm": 7.46712724618885, + "learning_rate": 2.5901995495285292e-06, + "loss": 0.6571, + "step": 9400 + }, + { + "epoch": 1.34, + "grad_norm": 9.339850894789334, + "learning_rate": 2.5891878362446566e-06, + "loss": 0.7545, + "step": 9401 + }, + { + "epoch": 1.34, + "grad_norm": 11.293776359966296, + "learning_rate": 2.588176251546795e-06, + "loss": 0.762, + "step": 9402 + }, + { + "epoch": 1.34, + "grad_norm": 7.857672006933083, + "learning_rate": 2.587164795488898e-06, + "loss": 0.6994, + "step": 9403 + }, + { + "epoch": 1.34, + "grad_norm": 9.737094744472882, + "learning_rate": 2.5861534681249154e-06, + "loss": 0.7772, + "step": 9404 + }, + { + "epoch": 1.34, + "grad_norm": 7.406514867357246, + "learning_rate": 2.5851422695087885e-06, + "loss": 0.7401, + "step": 9405 + }, + { + "epoch": 1.34, + "grad_norm": 10.272452339604307, + "learning_rate": 2.5841311996944494e-06, + "loss": 0.7696, + "step": 9406 + }, + { + "epoch": 1.34, + "grad_norm": 7.96904284754169, + "learning_rate": 2.583120258735829e-06, + "loss": 0.7754, + "step": 9407 + }, + { + "epoch": 1.34, + "grad_norm": 7.783082005931166, + "learning_rate": 2.5821094466868446e-06, + "loss": 0.6981, + "step": 9408 + }, + { + "epoch": 1.34, + "grad_norm": 7.963962367282493, + "learning_rate": 2.581098763601413e-06, + "loss": 0.7422, + "step": 9409 + }, + { + "epoch": 1.34, + "grad_norm": 9.42100229755454, + "learning_rate": 2.580088209533441e-06, + "loss": 0.7495, + "step": 9410 + }, + { + "epoch": 1.34, + "grad_norm": 7.229846165779198, + "learning_rate": 2.5790777845368286e-06, + 
"loss": 0.7625, + "step": 9411 + }, + { + "epoch": 1.34, + "grad_norm": 11.650228772698867, + "learning_rate": 2.578067488665468e-06, + "loss": 0.7023, + "step": 9412 + }, + { + "epoch": 1.34, + "grad_norm": 8.095589259008932, + "learning_rate": 2.5770573219732446e-06, + "loss": 0.6869, + "step": 9413 + }, + { + "epoch": 1.34, + "grad_norm": 8.936564090191762, + "learning_rate": 2.5760472845140393e-06, + "loss": 0.761, + "step": 9414 + }, + { + "epoch": 1.34, + "grad_norm": 8.888220189924574, + "learning_rate": 2.575037376341727e-06, + "loss": 0.7339, + "step": 9415 + }, + { + "epoch": 1.34, + "grad_norm": 7.201872922266538, + "learning_rate": 2.5740275975101687e-06, + "loss": 0.6814, + "step": 9416 + }, + { + "epoch": 1.34, + "grad_norm": 8.898039090194578, + "learning_rate": 2.57301794807323e-06, + "loss": 0.7011, + "step": 9417 + }, + { + "epoch": 1.34, + "grad_norm": 9.717928475407628, + "learning_rate": 2.5720084280847535e-06, + "loss": 0.7518, + "step": 9418 + }, + { + "epoch": 1.34, + "grad_norm": 11.865744753957955, + "learning_rate": 2.57099903759859e-06, + "loss": 0.6644, + "step": 9419 + }, + { + "epoch": 1.34, + "grad_norm": 11.8909586132912, + "learning_rate": 2.5699897766685785e-06, + "loss": 0.7666, + "step": 9420 + }, + { + "epoch": 1.34, + "grad_norm": 10.373672593202418, + "learning_rate": 2.5689806453485455e-06, + "loss": 0.7916, + "step": 9421 + }, + { + "epoch": 1.34, + "grad_norm": 7.435161619056009, + "learning_rate": 2.5679716436923206e-06, + "loss": 0.7875, + "step": 9422 + }, + { + "epoch": 1.34, + "grad_norm": 9.42253822747142, + "learning_rate": 2.5669627717537195e-06, + "loss": 0.6906, + "step": 9423 + }, + { + "epoch": 1.34, + "grad_norm": 6.732247540802854, + "learning_rate": 2.5659540295865492e-06, + "loss": 0.7205, + "step": 9424 + }, + { + "epoch": 1.34, + "grad_norm": 10.49587961512358, + "learning_rate": 2.5649454172446177e-06, + "loss": 0.7904, + "step": 9425 + }, + { + "epoch": 1.34, + "grad_norm": 11.234440379877261, + 
"learning_rate": 2.5639369347817166e-06, + "loss": 0.7561, + "step": 9426 + }, + { + "epoch": 1.34, + "grad_norm": 7.827392468899084, + "learning_rate": 2.562928582251642e-06, + "loss": 0.6989, + "step": 9427 + }, + { + "epoch": 1.34, + "grad_norm": 5.268597756469174, + "learning_rate": 2.561920359708171e-06, + "loss": 0.718, + "step": 9428 + }, + { + "epoch": 1.34, + "grad_norm": 10.953325293234888, + "learning_rate": 2.560912267205083e-06, + "loss": 0.7227, + "step": 9429 + }, + { + "epoch": 1.34, + "grad_norm": 9.883569484340992, + "learning_rate": 2.559904304796146e-06, + "loss": 0.7407, + "step": 9430 + }, + { + "epoch": 1.34, + "grad_norm": 8.01391278470941, + "learning_rate": 2.5588964725351182e-06, + "loss": 0.7471, + "step": 9431 + }, + { + "epoch": 1.35, + "grad_norm": 12.63392451432201, + "learning_rate": 2.5578887704757604e-06, + "loss": 0.7539, + "step": 9432 + }, + { + "epoch": 1.35, + "grad_norm": 8.603398704952152, + "learning_rate": 2.5568811986718168e-06, + "loss": 0.6962, + "step": 9433 + }, + { + "epoch": 1.35, + "grad_norm": 8.121936696083688, + "learning_rate": 2.5558737571770286e-06, + "loss": 0.674, + "step": 9434 + }, + { + "epoch": 1.35, + "grad_norm": 9.391586033205687, + "learning_rate": 2.554866446045133e-06, + "loss": 0.7135, + "step": 9435 + }, + { + "epoch": 1.35, + "grad_norm": 11.69972184474983, + "learning_rate": 2.553859265329855e-06, + "loss": 0.7502, + "step": 9436 + }, + { + "epoch": 1.35, + "grad_norm": 11.354137536103774, + "learning_rate": 2.5528522150849156e-06, + "loss": 0.7302, + "step": 9437 + }, + { + "epoch": 1.35, + "grad_norm": 6.121088059665159, + "learning_rate": 2.5518452953640242e-06, + "loss": 0.7556, + "step": 9438 + }, + { + "epoch": 1.35, + "grad_norm": 9.857161589618135, + "learning_rate": 2.5508385062208913e-06, + "loss": 0.6941, + "step": 9439 + }, + { + "epoch": 1.35, + "grad_norm": 7.713319632170615, + "learning_rate": 2.5498318477092166e-06, + "loss": 0.7494, + "step": 9440 + }, + { + "epoch": 1.35, + 
"grad_norm": 10.505559913062923, + "learning_rate": 2.5488253198826898e-06, + "loss": 0.7321, + "step": 9441 + }, + { + "epoch": 1.35, + "grad_norm": 7.322197640610701, + "learning_rate": 2.547818922794999e-06, + "loss": 0.786, + "step": 9442 + }, + { + "epoch": 1.35, + "grad_norm": 9.363314235679484, + "learning_rate": 2.5468126564998207e-06, + "loss": 0.6893, + "step": 9443 + }, + { + "epoch": 1.35, + "grad_norm": 9.471510070882967, + "learning_rate": 2.5458065210508255e-06, + "loss": 0.7336, + "step": 9444 + }, + { + "epoch": 1.35, + "grad_norm": 11.365861880416665, + "learning_rate": 2.544800516501681e-06, + "loss": 0.7819, + "step": 9445 + }, + { + "epoch": 1.35, + "grad_norm": 10.111356656961737, + "learning_rate": 2.543794642906041e-06, + "loss": 0.6789, + "step": 9446 + }, + { + "epoch": 1.35, + "grad_norm": 6.58710448603633, + "learning_rate": 2.5427889003175577e-06, + "loss": 0.7336, + "step": 9447 + }, + { + "epoch": 1.35, + "grad_norm": 8.941728652644155, + "learning_rate": 2.5417832887898763e-06, + "loss": 0.8058, + "step": 9448 + }, + { + "epoch": 1.35, + "grad_norm": 7.982670542317304, + "learning_rate": 2.540777808376632e-06, + "loss": 0.6641, + "step": 9449 + }, + { + "epoch": 1.35, + "grad_norm": 12.516998198428368, + "learning_rate": 2.539772459131455e-06, + "loss": 0.7365, + "step": 9450 + }, + { + "epoch": 1.35, + "grad_norm": 11.39050052728362, + "learning_rate": 2.5387672411079643e-06, + "loss": 0.757, + "step": 9451 + }, + { + "epoch": 1.35, + "grad_norm": 6.603603557079071, + "learning_rate": 2.537762154359778e-06, + "loss": 0.7153, + "step": 9452 + }, + { + "epoch": 1.35, + "grad_norm": 7.725638151112929, + "learning_rate": 2.5367571989405064e-06, + "loss": 0.7088, + "step": 9453 + }, + { + "epoch": 1.35, + "grad_norm": 7.328607056844982, + "learning_rate": 2.5357523749037483e-06, + "loss": 0.808, + "step": 9454 + }, + { + "epoch": 1.35, + "grad_norm": 6.884672780662176, + "learning_rate": 2.5347476823031e-06, + "loss": 0.7244, + "step": 
9455 + }, + { + "epoch": 1.35, + "grad_norm": 10.831590561993707, + "learning_rate": 2.5337431211921494e-06, + "loss": 0.7504, + "step": 9456 + }, + { + "epoch": 1.35, + "grad_norm": 8.823647847762992, + "learning_rate": 2.5327386916244735e-06, + "loss": 0.7731, + "step": 9457 + }, + { + "epoch": 1.35, + "grad_norm": 10.288688581971487, + "learning_rate": 2.53173439365365e-06, + "loss": 0.7075, + "step": 9458 + }, + { + "epoch": 1.35, + "grad_norm": 10.033099745780484, + "learning_rate": 2.5307302273332412e-06, + "loss": 0.6983, + "step": 9459 + }, + { + "epoch": 1.35, + "grad_norm": 9.450245877842233, + "learning_rate": 2.5297261927168104e-06, + "loss": 0.7122, + "step": 9460 + }, + { + "epoch": 1.35, + "grad_norm": 8.227392928399109, + "learning_rate": 2.528722289857908e-06, + "loss": 0.7442, + "step": 9461 + }, + { + "epoch": 1.35, + "grad_norm": 8.88899712225775, + "learning_rate": 2.5277185188100807e-06, + "loss": 0.7572, + "step": 9462 + }, + { + "epoch": 1.35, + "grad_norm": 9.037915618176317, + "learning_rate": 2.526714879626866e-06, + "loss": 0.7694, + "step": 9463 + }, + { + "epoch": 1.35, + "grad_norm": 7.604858787934742, + "learning_rate": 2.525711372361793e-06, + "loss": 0.6756, + "step": 9464 + }, + { + "epoch": 1.35, + "grad_norm": 8.51803856667725, + "learning_rate": 2.524707997068391e-06, + "loss": 0.7069, + "step": 9465 + }, + { + "epoch": 1.35, + "grad_norm": 8.253345721405125, + "learning_rate": 2.523704753800171e-06, + "loss": 0.7365, + "step": 9466 + }, + { + "epoch": 1.35, + "grad_norm": 10.418771601837232, + "learning_rate": 2.52270164261065e-06, + "loss": 0.7656, + "step": 9467 + }, + { + "epoch": 1.35, + "grad_norm": 7.4224085045337045, + "learning_rate": 2.521698663553327e-06, + "loss": 0.7337, + "step": 9468 + }, + { + "epoch": 1.35, + "grad_norm": 8.646652334154847, + "learning_rate": 2.5206958166816975e-06, + "loss": 0.7577, + "step": 9469 + }, + { + "epoch": 1.35, + "grad_norm": 9.097983770207218, + "learning_rate": 
2.5196931020492532e-06, + "loss": 0.7749, + "step": 9470 + }, + { + "epoch": 1.35, + "grad_norm": 10.443420335648094, + "learning_rate": 2.5186905197094737e-06, + "loss": 0.6804, + "step": 9471 + }, + { + "epoch": 1.35, + "grad_norm": 9.24296675860412, + "learning_rate": 2.5176880697158346e-06, + "loss": 0.7044, + "step": 9472 + }, + { + "epoch": 1.35, + "grad_norm": 9.274347693774045, + "learning_rate": 2.5166857521218063e-06, + "loss": 0.7953, + "step": 9473 + }, + { + "epoch": 1.35, + "grad_norm": 8.141447835336349, + "learning_rate": 2.515683566980848e-06, + "loss": 0.6935, + "step": 9474 + }, + { + "epoch": 1.35, + "grad_norm": 8.292050260528146, + "learning_rate": 2.5146815143464127e-06, + "loss": 0.7507, + "step": 9475 + }, + { + "epoch": 1.35, + "grad_norm": 8.297055848911368, + "learning_rate": 2.5136795942719455e-06, + "loss": 0.7331, + "step": 9476 + }, + { + "epoch": 1.35, + "grad_norm": 9.539181304564721, + "learning_rate": 2.512677806810888e-06, + "loss": 0.6898, + "step": 9477 + }, + { + "epoch": 1.35, + "grad_norm": 10.596471233906358, + "learning_rate": 2.511676152016675e-06, + "loss": 0.8208, + "step": 9478 + }, + { + "epoch": 1.35, + "grad_norm": 9.087908300827968, + "learning_rate": 2.5106746299427275e-06, + "loss": 0.772, + "step": 9479 + }, + { + "epoch": 1.35, + "grad_norm": 10.440697331543122, + "learning_rate": 2.5096732406424683e-06, + "loss": 0.7134, + "step": 9480 + }, + { + "epoch": 1.35, + "grad_norm": 7.965148639922299, + "learning_rate": 2.5086719841693065e-06, + "loss": 0.7304, + "step": 9481 + }, + { + "epoch": 1.35, + "grad_norm": 6.577670988716495, + "learning_rate": 2.5076708605766447e-06, + "loss": 0.7331, + "step": 9482 + }, + { + "epoch": 1.35, + "grad_norm": 10.90598085719998, + "learning_rate": 2.506669869917884e-06, + "loss": 0.7147, + "step": 9483 + }, + { + "epoch": 1.35, + "grad_norm": 6.595699057621559, + "learning_rate": 2.5056690122464093e-06, + "loss": 0.7625, + "step": 9484 + }, + { + "epoch": 1.35, + "grad_norm": 
8.241231684417649, + "learning_rate": 2.504668287615607e-06, + "loss": 0.6947, + "step": 9485 + }, + { + "epoch": 1.35, + "grad_norm": 7.461312073654729, + "learning_rate": 2.503667696078854e-06, + "loss": 0.8213, + "step": 9486 + }, + { + "epoch": 1.35, + "grad_norm": 8.156784606611938, + "learning_rate": 2.5026672376895177e-06, + "loss": 0.7022, + "step": 9487 + }, + { + "epoch": 1.35, + "grad_norm": 10.028640783630607, + "learning_rate": 2.5016669125009602e-06, + "loss": 0.6844, + "step": 9488 + }, + { + "epoch": 1.35, + "grad_norm": 9.911538596109203, + "learning_rate": 2.500666720566533e-06, + "loss": 0.6563, + "step": 9489 + }, + { + "epoch": 1.35, + "grad_norm": 7.49954972954348, + "learning_rate": 2.4996666619395857e-06, + "loss": 0.723, + "step": 9490 + }, + { + "epoch": 1.35, + "grad_norm": 8.779642963456642, + "learning_rate": 2.4986667366734614e-06, + "loss": 0.7677, + "step": 9491 + }, + { + "epoch": 1.35, + "grad_norm": 10.993599206013256, + "learning_rate": 2.4976669448214883e-06, + "loss": 0.8499, + "step": 9492 + }, + { + "epoch": 1.35, + "grad_norm": 9.094632134991349, + "learning_rate": 2.4966672864369977e-06, + "loss": 0.6807, + "step": 9493 + }, + { + "epoch": 1.35, + "grad_norm": 9.506629701312415, + "learning_rate": 2.4956677615733053e-06, + "loss": 0.7583, + "step": 9494 + }, + { + "epoch": 1.35, + "grad_norm": 9.871977288309003, + "learning_rate": 2.494668370283722e-06, + "loss": 0.7933, + "step": 9495 + }, + { + "epoch": 1.35, + "grad_norm": 7.869981312439749, + "learning_rate": 2.4936691126215567e-06, + "loss": 0.7488, + "step": 9496 + }, + { + "epoch": 1.35, + "grad_norm": 8.61270509755907, + "learning_rate": 2.492669988640102e-06, + "loss": 0.7139, + "step": 9497 + }, + { + "epoch": 1.35, + "grad_norm": 11.501264015240823, + "learning_rate": 2.491670998392653e-06, + "loss": 0.7818, + "step": 9498 + }, + { + "epoch": 1.35, + "grad_norm": 8.356524266409354, + "learning_rate": 2.4906721419324886e-06, + "loss": 0.7077, + "step": 9499 + }, + 
{ + "epoch": 1.35, + "grad_norm": 7.3899696272056135, + "learning_rate": 2.4896734193128896e-06, + "loss": 0.7221, + "step": 9500 + }, + { + "epoch": 1.35, + "grad_norm": 12.449808062608836, + "learning_rate": 2.488674830587123e-06, + "loss": 0.7267, + "step": 9501 + }, + { + "epoch": 1.36, + "grad_norm": 8.48778733383069, + "learning_rate": 2.4876763758084488e-06, + "loss": 0.6842, + "step": 9502 + }, + { + "epoch": 1.36, + "grad_norm": 8.912905102846741, + "learning_rate": 2.486678055030125e-06, + "loss": 0.7482, + "step": 9503 + }, + { + "epoch": 1.36, + "grad_norm": 10.817061459607794, + "learning_rate": 2.485679868305396e-06, + "loss": 0.7631, + "step": 9504 + }, + { + "epoch": 1.36, + "grad_norm": 7.847684358969064, + "learning_rate": 2.4846818156875048e-06, + "loss": 0.7085, + "step": 9505 + }, + { + "epoch": 1.36, + "grad_norm": 9.660136603470836, + "learning_rate": 2.4836838972296873e-06, + "loss": 0.7405, + "step": 9506 + }, + { + "epoch": 1.36, + "grad_norm": 7.543075822247619, + "learning_rate": 2.4826861129851633e-06, + "loss": 0.7336, + "step": 9507 + }, + { + "epoch": 1.36, + "grad_norm": 8.27234691222656, + "learning_rate": 2.4816884630071562e-06, + "loss": 0.7401, + "step": 9508 + }, + { + "epoch": 1.36, + "grad_norm": 8.197663046526293, + "learning_rate": 2.4806909473488756e-06, + "loss": 0.7769, + "step": 9509 + }, + { + "epoch": 1.36, + "grad_norm": 9.87638229500616, + "learning_rate": 2.479693566063527e-06, + "loss": 0.7707, + "step": 9510 + }, + { + "epoch": 1.36, + "grad_norm": 8.298702533962965, + "learning_rate": 2.4786963192043102e-06, + "loss": 0.7728, + "step": 9511 + }, + { + "epoch": 1.36, + "grad_norm": 8.715667332153494, + "learning_rate": 2.4776992068244132e-06, + "loss": 0.8148, + "step": 9512 + }, + { + "epoch": 1.36, + "grad_norm": 7.875459155881116, + "learning_rate": 2.4767022289770203e-06, + "loss": 0.7169, + "step": 9513 + }, + { + "epoch": 1.36, + "grad_norm": 12.259376756292781, + "learning_rate": 2.4757053857153047e-06, + 
"loss": 0.6585, + "step": 9514 + }, + { + "epoch": 1.36, + "grad_norm": 8.347481993200313, + "learning_rate": 2.4747086770924374e-06, + "loss": 0.7578, + "step": 9515 + }, + { + "epoch": 1.36, + "grad_norm": 10.803213617439583, + "learning_rate": 2.473712103161582e-06, + "loss": 0.8145, + "step": 9516 + }, + { + "epoch": 1.36, + "grad_norm": 8.116252212969679, + "learning_rate": 2.4727156639758885e-06, + "loss": 0.7198, + "step": 9517 + }, + { + "epoch": 1.36, + "grad_norm": 10.986827886949966, + "learning_rate": 2.4717193595885096e-06, + "loss": 0.7041, + "step": 9518 + }, + { + "epoch": 1.36, + "grad_norm": 8.3835197358953, + "learning_rate": 2.470723190052582e-06, + "loss": 0.7016, + "step": 9519 + }, + { + "epoch": 1.36, + "grad_norm": 6.24855270612064, + "learning_rate": 2.4697271554212377e-06, + "loss": 0.7747, + "step": 9520 + }, + { + "epoch": 1.36, + "grad_norm": 7.543093787153777, + "learning_rate": 2.468731255747605e-06, + "loss": 0.788, + "step": 9521 + }, + { + "epoch": 1.36, + "grad_norm": 8.399879192266505, + "learning_rate": 2.4677354910848e-06, + "loss": 0.804, + "step": 9522 + }, + { + "epoch": 1.36, + "grad_norm": 8.173176812399817, + "learning_rate": 2.4667398614859374e-06, + "loss": 0.6935, + "step": 9523 + }, + { + "epoch": 1.36, + "grad_norm": 9.870628259133847, + "learning_rate": 2.4657443670041166e-06, + "loss": 0.762, + "step": 9524 + }, + { + "epoch": 1.36, + "grad_norm": 5.552760456635638, + "learning_rate": 2.4647490076924396e-06, + "loss": 0.7238, + "step": 9525 + }, + { + "epoch": 1.36, + "grad_norm": 9.446291700521227, + "learning_rate": 2.463753783603993e-06, + "loss": 0.6856, + "step": 9526 + }, + { + "epoch": 1.36, + "grad_norm": 9.253770472831045, + "learning_rate": 2.4627586947918587e-06, + "loss": 0.6992, + "step": 9527 + }, + { + "epoch": 1.36, + "grad_norm": 10.277927674135707, + "learning_rate": 2.4617637413091128e-06, + "loss": 0.7358, + "step": 9528 + }, + { + "epoch": 1.36, + "grad_norm": 8.936083479914982, + 
"learning_rate": 2.4607689232088256e-06, + "loss": 0.8636, + "step": 9529 + }, + { + "epoch": 1.36, + "grad_norm": 9.643770219862983, + "learning_rate": 2.4597742405440546e-06, + "loss": 0.7423, + "step": 9530 + }, + { + "epoch": 1.36, + "grad_norm": 9.577899387976945, + "learning_rate": 2.458779693367856e-06, + "loss": 0.6699, + "step": 9531 + }, + { + "epoch": 1.36, + "grad_norm": 9.002583579323248, + "learning_rate": 2.4577852817332754e-06, + "loss": 0.7712, + "step": 9532 + }, + { + "epoch": 1.36, + "grad_norm": 8.253431892359297, + "learning_rate": 2.4567910056933496e-06, + "loss": 0.7051, + "step": 9533 + }, + { + "epoch": 1.36, + "grad_norm": 8.698711849898542, + "learning_rate": 2.4557968653011145e-06, + "loss": 0.7139, + "step": 9534 + }, + { + "epoch": 1.36, + "grad_norm": 8.897865698581821, + "learning_rate": 2.454802860609591e-06, + "loss": 0.7003, + "step": 9535 + }, + { + "epoch": 1.36, + "grad_norm": 9.231815219301469, + "learning_rate": 2.4538089916717994e-06, + "loss": 0.6594, + "step": 9536 + }, + { + "epoch": 1.36, + "grad_norm": 8.239404215925743, + "learning_rate": 2.4528152585407477e-06, + "loss": 0.705, + "step": 9537 + }, + { + "epoch": 1.36, + "grad_norm": 11.522904653486556, + "learning_rate": 2.4518216612694417e-06, + "loss": 0.8026, + "step": 9538 + }, + { + "epoch": 1.36, + "grad_norm": 10.749222199477579, + "learning_rate": 2.4508281999108753e-06, + "loss": 0.8098, + "step": 9539 + }, + { + "epoch": 1.36, + "grad_norm": 8.66953662350101, + "learning_rate": 2.449834874518035e-06, + "loss": 0.7685, + "step": 9540 + }, + { + "epoch": 1.36, + "grad_norm": 8.086491531625393, + "learning_rate": 2.4488416851439063e-06, + "loss": 0.6933, + "step": 9541 + }, + { + "epoch": 1.36, + "grad_norm": 9.429437836322396, + "learning_rate": 2.447848631841459e-06, + "loss": 0.7121, + "step": 9542 + }, + { + "epoch": 1.36, + "grad_norm": 11.665227209468437, + "learning_rate": 2.4468557146636616e-06, + "loss": 0.7257, + "step": 9543 + }, + { + "epoch": 
1.36, + "grad_norm": 8.610997839994534, + "learning_rate": 2.4458629336634753e-06, + "loss": 0.7525, + "step": 9544 + }, + { + "epoch": 1.36, + "grad_norm": 7.761473814511371, + "learning_rate": 2.444870288893851e-06, + "loss": 0.7307, + "step": 9545 + }, + { + "epoch": 1.36, + "grad_norm": 10.689129854179297, + "learning_rate": 2.4438777804077327e-06, + "loss": 0.7547, + "step": 9546 + }, + { + "epoch": 1.36, + "grad_norm": 8.159911625169057, + "learning_rate": 2.442885408258058e-06, + "loss": 0.6655, + "step": 9547 + }, + { + "epoch": 1.36, + "grad_norm": 10.064368434476712, + "learning_rate": 2.4418931724977565e-06, + "loss": 0.7028, + "step": 9548 + }, + { + "epoch": 1.36, + "grad_norm": 6.661643443580049, + "learning_rate": 2.440901073179755e-06, + "loss": 0.7683, + "step": 9549 + }, + { + "epoch": 1.36, + "grad_norm": 9.415664489882838, + "learning_rate": 2.439909110356967e-06, + "loss": 0.7711, + "step": 9550 + }, + { + "epoch": 1.36, + "grad_norm": 9.563014008820007, + "learning_rate": 2.438917284082301e-06, + "loss": 0.6586, + "step": 9551 + }, + { + "epoch": 1.36, + "grad_norm": 11.154276123484287, + "learning_rate": 2.437925594408656e-06, + "loss": 0.7707, + "step": 9552 + }, + { + "epoch": 1.36, + "grad_norm": 10.412197364987177, + "learning_rate": 2.4369340413889288e-06, + "loss": 0.7295, + "step": 9553 + }, + { + "epoch": 1.36, + "grad_norm": 6.730324072707755, + "learning_rate": 2.435942625076007e-06, + "loss": 0.7249, + "step": 9554 + }, + { + "epoch": 1.36, + "grad_norm": 14.288860883222899, + "learning_rate": 2.4349513455227663e-06, + "loss": 0.6893, + "step": 9555 + }, + { + "epoch": 1.36, + "grad_norm": 10.665595639498216, + "learning_rate": 2.4339602027820835e-06, + "loss": 0.6928, + "step": 9556 + }, + { + "epoch": 1.36, + "grad_norm": 10.125481890142318, + "learning_rate": 2.4329691969068202e-06, + "loss": 0.7572, + "step": 9557 + }, + { + "epoch": 1.36, + "grad_norm": 9.996994638815687, + "learning_rate": 2.431978327949833e-06, + "loss": 
0.7898, + "step": 9558 + }, + { + "epoch": 1.36, + "grad_norm": 11.367796115885353, + "learning_rate": 2.430987595963976e-06, + "loss": 0.7961, + "step": 9559 + }, + { + "epoch": 1.36, + "grad_norm": 7.165219687700218, + "learning_rate": 2.429997001002087e-06, + "loss": 0.7098, + "step": 9560 + }, + { + "epoch": 1.36, + "grad_norm": 7.321547877003067, + "learning_rate": 2.4290065431170067e-06, + "loss": 0.7416, + "step": 9561 + }, + { + "epoch": 1.36, + "grad_norm": 9.499745854214417, + "learning_rate": 2.4280162223615595e-06, + "loss": 0.6832, + "step": 9562 + }, + { + "epoch": 1.36, + "grad_norm": 8.1444251561859, + "learning_rate": 2.427026038788569e-06, + "loss": 0.7722, + "step": 9563 + }, + { + "epoch": 1.36, + "grad_norm": 7.81831148305728, + "learning_rate": 2.426035992450848e-06, + "loss": 0.7415, + "step": 9564 + }, + { + "epoch": 1.36, + "grad_norm": 9.998118182027186, + "learning_rate": 2.4250460834012006e-06, + "loss": 0.7091, + "step": 9565 + }, + { + "epoch": 1.36, + "grad_norm": 7.596272677125787, + "learning_rate": 2.4240563116924293e-06, + "loss": 0.6755, + "step": 9566 + }, + { + "epoch": 1.36, + "grad_norm": 12.483566477758123, + "learning_rate": 2.4230666773773225e-06, + "loss": 0.8264, + "step": 9567 + }, + { + "epoch": 1.36, + "grad_norm": 8.686394609042235, + "learning_rate": 2.4220771805086665e-06, + "loss": 0.6739, + "step": 9568 + }, + { + "epoch": 1.36, + "grad_norm": 12.002941812410283, + "learning_rate": 2.4210878211392393e-06, + "loss": 0.6577, + "step": 9569 + }, + { + "epoch": 1.36, + "grad_norm": 9.228913453265434, + "learning_rate": 2.4200985993218094e-06, + "loss": 0.7435, + "step": 9570 + }, + { + "epoch": 1.36, + "grad_norm": 8.47684874915138, + "learning_rate": 2.419109515109137e-06, + "loss": 0.7617, + "step": 9571 + }, + { + "epoch": 1.37, + "grad_norm": 8.908099752526102, + "learning_rate": 2.4181205685539814e-06, + "loss": 0.7573, + "step": 9572 + }, + { + "epoch": 1.37, + "grad_norm": 9.882754187885432, + "learning_rate": 
2.417131759709086e-06, + "loss": 0.7323, + "step": 9573 + }, + { + "epoch": 1.37, + "grad_norm": 8.55807789729796, + "learning_rate": 2.4161430886271945e-06, + "loss": 0.727, + "step": 9574 + }, + { + "epoch": 1.37, + "grad_norm": 7.032261014545914, + "learning_rate": 2.4151545553610374e-06, + "loss": 0.7827, + "step": 9575 + }, + { + "epoch": 1.37, + "grad_norm": 10.981966715861489, + "learning_rate": 2.4141661599633427e-06, + "loss": 0.7793, + "step": 9576 + }, + { + "epoch": 1.37, + "grad_norm": 10.297829264658043, + "learning_rate": 2.413177902486827e-06, + "loss": 0.7253, + "step": 9577 + }, + { + "epoch": 1.37, + "grad_norm": 11.499009213545074, + "learning_rate": 2.4121897829842e-06, + "loss": 0.6869, + "step": 9578 + }, + { + "epoch": 1.37, + "grad_norm": 11.043599107880391, + "learning_rate": 2.4112018015081685e-06, + "loss": 0.646, + "step": 9579 + }, + { + "epoch": 1.37, + "grad_norm": 9.754902599867481, + "learning_rate": 2.4102139581114253e-06, + "loss": 0.7481, + "step": 9580 + }, + { + "epoch": 1.37, + "grad_norm": 8.13277730454056, + "learning_rate": 2.4092262528466608e-06, + "loss": 0.7062, + "step": 9581 + }, + { + "epoch": 1.37, + "grad_norm": 9.720566961528753, + "learning_rate": 2.4082386857665577e-06, + "loss": 0.877, + "step": 9582 + }, + { + "epoch": 1.37, + "grad_norm": 13.707441073180798, + "learning_rate": 2.40725125692379e-06, + "loss": 0.7023, + "step": 9583 + }, + { + "epoch": 1.37, + "grad_norm": 8.335409572045416, + "learning_rate": 2.4062639663710225e-06, + "loss": 0.7177, + "step": 9584 + }, + { + "epoch": 1.37, + "grad_norm": 6.819823613451512, + "learning_rate": 2.405276814160914e-06, + "loss": 0.6819, + "step": 9585 + }, + { + "epoch": 1.37, + "grad_norm": 10.177250536761553, + "learning_rate": 2.4042898003461174e-06, + "loss": 0.7355, + "step": 9586 + }, + { + "epoch": 1.37, + "grad_norm": 9.627841860093652, + "learning_rate": 2.4033029249792793e-06, + "loss": 0.7469, + "step": 9587 + }, + { + "epoch": 1.37, + "grad_norm": 
9.039528561369014, + "learning_rate": 2.402316188113034e-06, + "loss": 0.7422, + "step": 9588 + }, + { + "epoch": 1.37, + "grad_norm": 10.958876805987705, + "learning_rate": 2.401329589800016e-06, + "loss": 0.7069, + "step": 9589 + }, + { + "epoch": 1.37, + "grad_norm": 9.270510334630675, + "learning_rate": 2.4003431300928404e-06, + "loss": 0.7411, + "step": 9590 + }, + { + "epoch": 1.37, + "grad_norm": 13.337429872906341, + "learning_rate": 2.399356809044126e-06, + "loss": 0.7181, + "step": 9591 + }, + { + "epoch": 1.37, + "grad_norm": 8.473466047043951, + "learning_rate": 2.398370626706482e-06, + "loss": 0.7337, + "step": 9592 + }, + { + "epoch": 1.37, + "grad_norm": 10.022879321779662, + "learning_rate": 2.397384583132506e-06, + "loss": 0.7252, + "step": 9593 + }, + { + "epoch": 1.37, + "grad_norm": 8.84386919315735, + "learning_rate": 2.396398678374793e-06, + "loss": 0.732, + "step": 9594 + }, + { + "epoch": 1.37, + "grad_norm": 8.927994538787813, + "learning_rate": 2.395412912485928e-06, + "loss": 0.7335, + "step": 9595 + }, + { + "epoch": 1.37, + "grad_norm": 10.393084860766942, + "learning_rate": 2.3944272855184863e-06, + "loss": 0.6898, + "step": 9596 + }, + { + "epoch": 1.37, + "grad_norm": 8.295105994826969, + "learning_rate": 2.3934417975250424e-06, + "loss": 0.7589, + "step": 9597 + }, + { + "epoch": 1.37, + "grad_norm": 9.109951971756564, + "learning_rate": 2.3924564485581553e-06, + "loss": 0.7243, + "step": 9598 + }, + { + "epoch": 1.37, + "grad_norm": 10.297974632536393, + "learning_rate": 2.3914712386703854e-06, + "loss": 0.773, + "step": 9599 + }, + { + "epoch": 1.37, + "grad_norm": 8.873223501136994, + "learning_rate": 2.3904861679142765e-06, + "loss": 0.6982, + "step": 9600 + }, + { + "epoch": 1.37, + "grad_norm": 9.779055334407978, + "learning_rate": 2.389501236342374e-06, + "loss": 0.7793, + "step": 9601 + }, + { + "epoch": 1.37, + "grad_norm": 7.545399948493497, + "learning_rate": 2.388516444007209e-06, + "loss": 0.7709, + "step": 9602 + }, + 
{ + "epoch": 1.37, + "grad_norm": 11.65663492438529, + "learning_rate": 2.387531790961306e-06, + "loss": 0.6699, + "step": 9603 + }, + { + "epoch": 1.37, + "grad_norm": 9.340312809787024, + "learning_rate": 2.3865472772571874e-06, + "loss": 0.8135, + "step": 9604 + }, + { + "epoch": 1.37, + "grad_norm": 10.920790176076503, + "learning_rate": 2.38556290294736e-06, + "loss": 0.7518, + "step": 9605 + }, + { + "epoch": 1.37, + "grad_norm": 10.206058586062783, + "learning_rate": 2.3845786680843307e-06, + "loss": 0.7796, + "step": 9606 + }, + { + "epoch": 1.37, + "grad_norm": 8.63280509071469, + "learning_rate": 2.3835945727205965e-06, + "loss": 0.6871, + "step": 9607 + }, + { + "epoch": 1.37, + "grad_norm": 9.970660155171743, + "learning_rate": 2.3826106169086455e-06, + "loss": 0.6733, + "step": 9608 + }, + { + "epoch": 1.37, + "grad_norm": 7.5829599345207, + "learning_rate": 2.3816268007009587e-06, + "loss": 0.843, + "step": 9609 + }, + { + "epoch": 1.37, + "grad_norm": 12.731971577005247, + "learning_rate": 2.3806431241500077e-06, + "loss": 0.7576, + "step": 9610 + }, + { + "epoch": 1.37, + "grad_norm": 7.377877741533909, + "learning_rate": 2.379659587308261e-06, + "loss": 0.7962, + "step": 9611 + }, + { + "epoch": 1.37, + "grad_norm": 7.146583182023128, + "learning_rate": 2.378676190228181e-06, + "loss": 0.6736, + "step": 9612 + }, + { + "epoch": 1.37, + "grad_norm": 8.468467216609461, + "learning_rate": 2.3776929329622145e-06, + "loss": 0.6998, + "step": 9613 + }, + { + "epoch": 1.37, + "grad_norm": 7.9198619766254525, + "learning_rate": 2.3767098155628095e-06, + "loss": 0.7461, + "step": 9614 + }, + { + "epoch": 1.37, + "grad_norm": 14.459934214559848, + "learning_rate": 2.3757268380824008e-06, + "loss": 0.7293, + "step": 9615 + }, + { + "epoch": 1.37, + "grad_norm": 9.422378393450852, + "learning_rate": 2.3747440005734157e-06, + "loss": 0.6893, + "step": 9616 + }, + { + "epoch": 1.37, + "grad_norm": 7.498125316312843, + "learning_rate": 2.37376130308828e-06, + 
"loss": 0.7295, + "step": 9617 + }, + { + "epoch": 1.37, + "grad_norm": 8.462390375145374, + "learning_rate": 2.372778745679405e-06, + "loss": 0.7946, + "step": 9618 + }, + { + "epoch": 1.37, + "grad_norm": 9.923685783623844, + "learning_rate": 2.371796328399198e-06, + "loss": 0.8276, + "step": 9619 + }, + { + "epoch": 1.37, + "grad_norm": 8.613294210148489, + "learning_rate": 2.3708140513000615e-06, + "loss": 0.6993, + "step": 9620 + }, + { + "epoch": 1.37, + "grad_norm": 10.399345201320877, + "learning_rate": 2.3698319144343853e-06, + "loss": 0.7506, + "step": 9621 + }, + { + "epoch": 1.37, + "grad_norm": 9.026269932631484, + "learning_rate": 2.368849917854553e-06, + "loss": 0.7274, + "step": 9622 + }, + { + "epoch": 1.37, + "grad_norm": 7.10673951859786, + "learning_rate": 2.3678680616129413e-06, + "loss": 0.7219, + "step": 9623 + }, + { + "epoch": 1.37, + "grad_norm": 11.898985153447764, + "learning_rate": 2.36688634576192e-06, + "loss": 0.7428, + "step": 9624 + }, + { + "epoch": 1.37, + "grad_norm": 11.910139502260865, + "learning_rate": 2.3659047703538533e-06, + "loss": 0.8146, + "step": 9625 + }, + { + "epoch": 1.37, + "grad_norm": 7.397991526518098, + "learning_rate": 2.364923335441093e-06, + "loss": 0.8208, + "step": 9626 + }, + { + "epoch": 1.37, + "grad_norm": 12.148352569341846, + "learning_rate": 2.363942041075989e-06, + "loss": 0.753, + "step": 9627 + }, + { + "epoch": 1.37, + "grad_norm": 8.146117030079878, + "learning_rate": 2.362960887310878e-06, + "loss": 0.7231, + "step": 9628 + }, + { + "epoch": 1.37, + "grad_norm": 7.0032913746543, + "learning_rate": 2.3619798741980923e-06, + "loss": 0.7122, + "step": 9629 + }, + { + "epoch": 1.37, + "grad_norm": 11.983475552314198, + "learning_rate": 2.3609990017899586e-06, + "loss": 0.8002, + "step": 9630 + }, + { + "epoch": 1.37, + "grad_norm": 9.139401247963564, + "learning_rate": 2.3600182701387904e-06, + "loss": 0.7315, + "step": 9631 + }, + { + "epoch": 1.37, + "grad_norm": 10.499517431103703, + 
"learning_rate": 2.359037679296901e-06, + "loss": 0.7417, + "step": 9632 + }, + { + "epoch": 1.37, + "grad_norm": 8.822930780509965, + "learning_rate": 2.358057229316591e-06, + "loss": 0.7278, + "step": 9633 + }, + { + "epoch": 1.37, + "grad_norm": 7.89529350771528, + "learning_rate": 2.3570769202501525e-06, + "loss": 0.7589, + "step": 9634 + }, + { + "epoch": 1.37, + "grad_norm": 8.025600843498868, + "learning_rate": 2.3560967521498764e-06, + "loss": 0.6689, + "step": 9635 + }, + { + "epoch": 1.37, + "grad_norm": 10.022004107554867, + "learning_rate": 2.355116725068038e-06, + "loss": 0.7199, + "step": 9636 + }, + { + "epoch": 1.37, + "grad_norm": 7.665649616416645, + "learning_rate": 2.354136839056914e-06, + "loss": 0.6865, + "step": 9637 + }, + { + "epoch": 1.37, + "grad_norm": 11.424846470544514, + "learning_rate": 2.353157094168763e-06, + "loss": 0.7269, + "step": 9638 + }, + { + "epoch": 1.37, + "grad_norm": 9.295597839033734, + "learning_rate": 2.352177490455848e-06, + "loss": 0.726, + "step": 9639 + }, + { + "epoch": 1.37, + "grad_norm": 12.227874000204022, + "learning_rate": 2.351198027970415e-06, + "loss": 0.6853, + "step": 9640 + }, + { + "epoch": 1.37, + "grad_norm": 10.296166035958285, + "learning_rate": 2.350218706764704e-06, + "loss": 0.7149, + "step": 9641 + }, + { + "epoch": 1.38, + "grad_norm": 10.690306297555896, + "learning_rate": 2.349239526890954e-06, + "loss": 0.6936, + "step": 9642 + }, + { + "epoch": 1.38, + "grad_norm": 7.888181672794081, + "learning_rate": 2.348260488401387e-06, + "loss": 0.7412, + "step": 9643 + }, + { + "epoch": 1.38, + "grad_norm": 10.27937864957461, + "learning_rate": 2.347281591348225e-06, + "loss": 0.7612, + "step": 9644 + }, + { + "epoch": 1.38, + "grad_norm": 12.005465054341178, + "learning_rate": 2.3463028357836805e-06, + "loss": 0.7545, + "step": 9645 + }, + { + "epoch": 1.38, + "grad_norm": 7.418632707357741, + "learning_rate": 2.3453242217599553e-06, + "loss": 0.7784, + "step": 9646 + }, + { + "epoch": 1.38, + 
"grad_norm": 9.108159043401864, + "learning_rate": 2.3443457493292476e-06, + "loss": 0.813, + "step": 9647 + }, + { + "epoch": 1.38, + "grad_norm": 7.895455780820668, + "learning_rate": 2.343367418543744e-06, + "loss": 0.746, + "step": 9648 + }, + { + "epoch": 1.38, + "grad_norm": 8.70836871612515, + "learning_rate": 2.3423892294556267e-06, + "loss": 0.7166, + "step": 9649 + }, + { + "epoch": 1.38, + "grad_norm": 9.598440234473788, + "learning_rate": 2.3414111821170725e-06, + "loss": 0.8086, + "step": 9650 + }, + { + "epoch": 1.38, + "grad_norm": 9.93555831745068, + "learning_rate": 2.340433276580244e-06, + "loss": 0.6729, + "step": 9651 + }, + { + "epoch": 1.38, + "grad_norm": 8.081019452074397, + "learning_rate": 2.3394555128973033e-06, + "loss": 0.8006, + "step": 9652 + }, + { + "epoch": 1.38, + "grad_norm": 9.060059722656442, + "learning_rate": 2.3384778911204e-06, + "loss": 0.6575, + "step": 9653 + }, + { + "epoch": 1.38, + "grad_norm": 10.989814689660664, + "learning_rate": 2.3375004113016754e-06, + "loss": 0.7195, + "step": 9654 + }, + { + "epoch": 1.38, + "grad_norm": 13.136328516866016, + "learning_rate": 2.3365230734932698e-06, + "loss": 0.6853, + "step": 9655 + }, + { + "epoch": 1.38, + "grad_norm": 10.816225264497929, + "learning_rate": 2.3355458777473077e-06, + "loss": 0.7651, + "step": 9656 + }, + { + "epoch": 1.38, + "grad_norm": 6.904335787331648, + "learning_rate": 2.334568824115912e-06, + "loss": 0.6777, + "step": 9657 + }, + { + "epoch": 1.38, + "grad_norm": 9.25015979279504, + "learning_rate": 2.333591912651198e-06, + "loss": 0.7453, + "step": 9658 + }, + { + "epoch": 1.38, + "grad_norm": 8.963060001369245, + "learning_rate": 2.332615143405269e-06, + "loss": 0.7078, + "step": 9659 + }, + { + "epoch": 1.38, + "grad_norm": 10.589782797847707, + "learning_rate": 2.3316385164302235e-06, + "loss": 0.7371, + "step": 9660 + }, + { + "epoch": 1.38, + "grad_norm": 6.950519128256542, + "learning_rate": 2.330662031778151e-06, + "loss": 0.7347, + "step": 
9661 + }, + { + "epoch": 1.38, + "grad_norm": 8.00145505211979, + "learning_rate": 2.3296856895011347e-06, + "loss": 0.7027, + "step": 9662 + }, + { + "epoch": 1.38, + "grad_norm": 7.872928086845069, + "learning_rate": 2.328709489651253e-06, + "loss": 0.7145, + "step": 9663 + }, + { + "epoch": 1.38, + "grad_norm": 6.545943294941191, + "learning_rate": 2.3277334322805694e-06, + "loss": 0.6996, + "step": 9664 + }, + { + "epoch": 1.38, + "grad_norm": 7.601787126050356, + "learning_rate": 2.326757517441149e-06, + "loss": 0.7608, + "step": 9665 + }, + { + "epoch": 1.38, + "grad_norm": 7.6104928771771565, + "learning_rate": 2.3257817451850407e-06, + "loss": 0.634, + "step": 9666 + }, + { + "epoch": 1.38, + "grad_norm": 7.915187198923805, + "learning_rate": 2.3248061155642887e-06, + "loss": 0.7366, + "step": 9667 + }, + { + "epoch": 1.38, + "grad_norm": 9.523960964317089, + "learning_rate": 2.3238306286309343e-06, + "loss": 0.7906, + "step": 9668 + }, + { + "epoch": 1.38, + "grad_norm": 11.033456227992732, + "learning_rate": 2.3228552844370027e-06, + "loss": 0.7494, + "step": 9669 + }, + { + "epoch": 1.38, + "grad_norm": 8.80443923904244, + "learning_rate": 2.32188008303452e-06, + "loss": 0.7296, + "step": 9670 + }, + { + "epoch": 1.38, + "grad_norm": 9.35361341382433, + "learning_rate": 2.3209050244754977e-06, + "loss": 0.6503, + "step": 9671 + }, + { + "epoch": 1.38, + "grad_norm": 7.807220499237261, + "learning_rate": 2.319930108811946e-06, + "loss": 0.6482, + "step": 9672 + }, + { + "epoch": 1.38, + "grad_norm": 11.079558795725575, + "learning_rate": 2.3189553360958616e-06, + "loss": 0.7168, + "step": 9673 + }, + { + "epoch": 1.38, + "grad_norm": 7.214211385719454, + "learning_rate": 2.317980706379236e-06, + "loss": 0.7053, + "step": 9674 + }, + { + "epoch": 1.38, + "grad_norm": 6.783368324726936, + "learning_rate": 2.3170062197140554e-06, + "loss": 0.7231, + "step": 9675 + }, + { + "epoch": 1.38, + "grad_norm": 10.643538435938574, + "learning_rate": 
2.3160318761522927e-06, + "loss": 0.7104, + "step": 9676 + }, + { + "epoch": 1.38, + "grad_norm": 9.903560421290566, + "learning_rate": 2.315057675745921e-06, + "loss": 0.7296, + "step": 9677 + }, + { + "epoch": 1.38, + "grad_norm": 10.127506502847949, + "learning_rate": 2.314083618546899e-06, + "loss": 0.8616, + "step": 9678 + }, + { + "epoch": 1.38, + "grad_norm": 7.558051169759395, + "learning_rate": 2.3131097046071795e-06, + "loss": 0.7507, + "step": 9679 + }, + { + "epoch": 1.38, + "grad_norm": 7.235567650832474, + "learning_rate": 2.3121359339787102e-06, + "loss": 0.7602, + "step": 9680 + }, + { + "epoch": 1.38, + "grad_norm": 8.590430511903357, + "learning_rate": 2.311162306713427e-06, + "loss": 0.8074, + "step": 9681 + }, + { + "epoch": 1.38, + "grad_norm": 7.248111759657715, + "learning_rate": 2.310188822863262e-06, + "loss": 0.6481, + "step": 9682 + }, + { + "epoch": 1.38, + "grad_norm": 7.327933057392281, + "learning_rate": 2.30921548248014e-06, + "loss": 0.7608, + "step": 9683 + }, + { + "epoch": 1.38, + "grad_norm": 10.127562297375434, + "learning_rate": 2.3082422856159744e-06, + "loss": 0.7974, + "step": 9684 + }, + { + "epoch": 1.38, + "grad_norm": 10.412482894000064, + "learning_rate": 2.307269232322672e-06, + "loss": 0.8113, + "step": 9685 + }, + { + "epoch": 1.38, + "grad_norm": 9.004421057608255, + "learning_rate": 2.306296322652132e-06, + "loss": 0.7714, + "step": 9686 + }, + { + "epoch": 1.38, + "grad_norm": 9.69325824433573, + "learning_rate": 2.3053235566562486e-06, + "loss": 0.762, + "step": 9687 + }, + { + "epoch": 1.38, + "grad_norm": 8.18503109901243, + "learning_rate": 2.3043509343869073e-06, + "loss": 0.7226, + "step": 9688 + }, + { + "epoch": 1.38, + "grad_norm": 8.788096155375445, + "learning_rate": 2.3033784558959823e-06, + "loss": 0.7318, + "step": 9689 + }, + { + "epoch": 1.38, + "grad_norm": 7.290914073370226, + "learning_rate": 2.302406121235346e-06, + "loss": 0.6954, + "step": 9690 + }, + { + "epoch": 1.38, + "grad_norm": 
6.593383090081499, + "learning_rate": 2.3014339304568584e-06, + "loss": 0.667, + "step": 9691 + }, + { + "epoch": 1.38, + "grad_norm": 9.594046806548173, + "learning_rate": 2.300461883612372e-06, + "loss": 0.7131, + "step": 9692 + }, + { + "epoch": 1.38, + "grad_norm": 10.900524846891715, + "learning_rate": 2.299489980753736e-06, + "loss": 0.7097, + "step": 9693 + }, + { + "epoch": 1.38, + "grad_norm": 7.36901637665662, + "learning_rate": 2.2985182219327857e-06, + "loss": 0.6941, + "step": 9694 + }, + { + "epoch": 1.38, + "grad_norm": 8.410624032708423, + "learning_rate": 2.297546607201355e-06, + "loss": 0.7232, + "step": 9695 + }, + { + "epoch": 1.38, + "grad_norm": 8.577072761397627, + "learning_rate": 2.2965751366112653e-06, + "loss": 0.7245, + "step": 9696 + }, + { + "epoch": 1.38, + "grad_norm": 10.233494128933954, + "learning_rate": 2.2956038102143342e-06, + "loss": 0.7389, + "step": 9697 + }, + { + "epoch": 1.38, + "grad_norm": 11.011933965109007, + "learning_rate": 2.2946326280623677e-06, + "loss": 0.7768, + "step": 9698 + }, + { + "epoch": 1.38, + "grad_norm": 9.88848828148342, + "learning_rate": 2.293661590207165e-06, + "loss": 0.7465, + "step": 9699 + }, + { + "epoch": 1.38, + "grad_norm": 7.814086199734139, + "learning_rate": 2.2926906967005198e-06, + "loss": 0.7764, + "step": 9700 + }, + { + "epoch": 1.38, + "grad_norm": 5.735377292244082, + "learning_rate": 2.2917199475942187e-06, + "loss": 0.752, + "step": 9701 + }, + { + "epoch": 1.38, + "grad_norm": 8.395419453082111, + "learning_rate": 2.290749342940035e-06, + "loss": 0.6666, + "step": 9702 + }, + { + "epoch": 1.38, + "grad_norm": 10.480815635155459, + "learning_rate": 2.2897788827897425e-06, + "loss": 0.7646, + "step": 9703 + }, + { + "epoch": 1.38, + "grad_norm": 9.546474550965149, + "learning_rate": 2.2888085671951e-06, + "loss": 0.7762, + "step": 9704 + }, + { + "epoch": 1.38, + "grad_norm": 9.938853863937746, + "learning_rate": 2.2878383962078605e-06, + "loss": 0.7865, + "step": 9705 + }, + { 
+ "epoch": 1.38, + "grad_norm": 8.546639913024084, + "learning_rate": 2.286868369879773e-06, + "loss": 0.7782, + "step": 9706 + }, + { + "epoch": 1.38, + "grad_norm": 9.499305928419618, + "learning_rate": 2.285898488262573e-06, + "loss": 0.7127, + "step": 9707 + }, + { + "epoch": 1.38, + "grad_norm": 9.077131365033594, + "learning_rate": 2.2849287514079943e-06, + "loss": 0.6964, + "step": 9708 + }, + { + "epoch": 1.38, + "grad_norm": 11.166865079128833, + "learning_rate": 2.2839591593677574e-06, + "loss": 0.7308, + "step": 9709 + }, + { + "epoch": 1.38, + "grad_norm": 8.038738716284858, + "learning_rate": 2.28298971219358e-06, + "loss": 0.6656, + "step": 9710 + }, + { + "epoch": 1.38, + "grad_norm": 8.559071255074356, + "learning_rate": 2.282020409937169e-06, + "loss": 0.6883, + "step": 9711 + }, + { + "epoch": 1.39, + "grad_norm": 9.624579149374519, + "learning_rate": 2.2810512526502217e-06, + "loss": 0.7485, + "step": 9712 + }, + { + "epoch": 1.39, + "grad_norm": 10.51332553729323, + "learning_rate": 2.2800822403844343e-06, + "loss": 0.7517, + "step": 9713 + }, + { + "epoch": 1.39, + "grad_norm": 8.2757994545335, + "learning_rate": 2.279113373191487e-06, + "loss": 0.7189, + "step": 9714 + }, + { + "epoch": 1.39, + "grad_norm": 8.821905776212928, + "learning_rate": 2.2781446511230588e-06, + "loss": 0.7409, + "step": 9715 + }, + { + "epoch": 1.39, + "grad_norm": 9.738439334054123, + "learning_rate": 2.2771760742308226e-06, + "loss": 0.7491, + "step": 9716 + }, + { + "epoch": 1.39, + "grad_norm": 8.364393408362957, + "learning_rate": 2.276207642566432e-06, + "loss": 0.7078, + "step": 9717 + }, + { + "epoch": 1.39, + "grad_norm": 11.10358388696504, + "learning_rate": 2.2752393561815457e-06, + "loss": 0.6811, + "step": 9718 + }, + { + "epoch": 1.39, + "grad_norm": 9.34515462282109, + "learning_rate": 2.2742712151278057e-06, + "loss": 0.7813, + "step": 9719 + }, + { + "epoch": 1.39, + "grad_norm": 11.081480851774598, + "learning_rate": 2.273303219456852e-06, + "loss": 
0.7653, + "step": 9720 + }, + { + "epoch": 1.39, + "grad_norm": 7.885525330966052, + "learning_rate": 2.2723353692203164e-06, + "loss": 0.7648, + "step": 9721 + }, + { + "epoch": 1.39, + "grad_norm": 8.8371905599849, + "learning_rate": 2.2713676644698204e-06, + "loss": 0.7493, + "step": 9722 + }, + { + "epoch": 1.39, + "grad_norm": 8.649955571694814, + "learning_rate": 2.2704001052569773e-06, + "loss": 0.7699, + "step": 9723 + }, + { + "epoch": 1.39, + "grad_norm": 10.627964295955325, + "learning_rate": 2.269432691633393e-06, + "loss": 0.7038, + "step": 9724 + }, + { + "epoch": 1.39, + "grad_norm": 10.229540223791826, + "learning_rate": 2.2684654236506687e-06, + "loss": 0.7418, + "step": 9725 + }, + { + "epoch": 1.39, + "grad_norm": 8.693489336707112, + "learning_rate": 2.267498301360397e-06, + "loss": 0.6642, + "step": 9726 + }, + { + "epoch": 1.39, + "grad_norm": 9.24007670247442, + "learning_rate": 2.266531324814158e-06, + "loss": 0.7472, + "step": 9727 + }, + { + "epoch": 1.39, + "grad_norm": 12.131912384954015, + "learning_rate": 2.2655644940635314e-06, + "loss": 0.721, + "step": 9728 + }, + { + "epoch": 1.39, + "grad_norm": 10.363426951221223, + "learning_rate": 2.2645978091600836e-06, + "loss": 0.717, + "step": 9729 + }, + { + "epoch": 1.39, + "grad_norm": 8.919550359440866, + "learning_rate": 2.2636312701553725e-06, + "loss": 0.6558, + "step": 9730 + }, + { + "epoch": 1.39, + "grad_norm": 5.499710999615047, + "learning_rate": 2.262664877100955e-06, + "loss": 0.7671, + "step": 9731 + }, + { + "epoch": 1.39, + "grad_norm": 10.574156385148184, + "learning_rate": 2.2616986300483704e-06, + "loss": 0.7397, + "step": 9732 + }, + { + "epoch": 1.39, + "grad_norm": 7.03768935244224, + "learning_rate": 2.2607325290491617e-06, + "loss": 0.7537, + "step": 9733 + }, + { + "epoch": 1.39, + "grad_norm": 10.073460503258918, + "learning_rate": 2.259766574154853e-06, + "loss": 0.7175, + "step": 9734 + }, + { + "epoch": 1.39, + "grad_norm": 6.952915889806521, + 
"learning_rate": 2.2588007654169698e-06, + "loss": 0.7732, + "step": 9735 + }, + { + "epoch": 1.39, + "grad_norm": 8.61471132381047, + "learning_rate": 2.2578351028870234e-06, + "loss": 0.7195, + "step": 9736 + }, + { + "epoch": 1.39, + "grad_norm": 10.11851541202579, + "learning_rate": 2.256869586616518e-06, + "loss": 0.743, + "step": 9737 + }, + { + "epoch": 1.39, + "grad_norm": 7.2225026794984055, + "learning_rate": 2.255904216656955e-06, + "loss": 0.6947, + "step": 9738 + }, + { + "epoch": 1.39, + "grad_norm": 10.177668462223757, + "learning_rate": 2.254938993059821e-06, + "loss": 0.663, + "step": 9739 + }, + { + "epoch": 1.39, + "grad_norm": 8.957411363609584, + "learning_rate": 2.2539739158765998e-06, + "loss": 0.6919, + "step": 9740 + }, + { + "epoch": 1.39, + "grad_norm": 9.402960511441757, + "learning_rate": 2.253008985158769e-06, + "loss": 0.7946, + "step": 9741 + }, + { + "epoch": 1.39, + "grad_norm": 12.721930913811192, + "learning_rate": 2.252044200957792e-06, + "loss": 0.6668, + "step": 9742 + }, + { + "epoch": 1.39, + "grad_norm": 7.48751086458298, + "learning_rate": 2.2510795633251264e-06, + "loss": 0.7117, + "step": 9743 + }, + { + "epoch": 1.39, + "grad_norm": 7.946540767822302, + "learning_rate": 2.2501150723122277e-06, + "loss": 0.7772, + "step": 9744 + }, + { + "epoch": 1.39, + "grad_norm": 7.219921374206485, + "learning_rate": 2.2491507279705343e-06, + "loss": 0.7568, + "step": 9745 + }, + { + "epoch": 1.39, + "grad_norm": 9.784739370737304, + "learning_rate": 2.248186530351486e-06, + "loss": 0.6838, + "step": 9746 + }, + { + "epoch": 1.39, + "grad_norm": 9.829379577294812, + "learning_rate": 2.247222479506507e-06, + "loss": 0.7229, + "step": 9747 + }, + { + "epoch": 1.39, + "grad_norm": 8.193984195511756, + "learning_rate": 2.2462585754870196e-06, + "loss": 0.7109, + "step": 9748 + }, + { + "epoch": 1.39, + "grad_norm": 9.876630588454804, + "learning_rate": 2.2452948183444357e-06, + "loss": 0.7729, + "step": 9749 + }, + { + "epoch": 1.39, + 
"grad_norm": 9.470449424302052, + "learning_rate": 2.244331208130156e-06, + "loss": 0.7432, + "step": 9750 + }, + { + "epoch": 1.39, + "grad_norm": 7.431551254260799, + "learning_rate": 2.243367744895581e-06, + "loss": 0.7038, + "step": 9751 + }, + { + "epoch": 1.39, + "grad_norm": 6.656597395198737, + "learning_rate": 2.2424044286920953e-06, + "loss": 0.7374, + "step": 9752 + }, + { + "epoch": 1.39, + "grad_norm": 8.209054545838745, + "learning_rate": 2.241441259571082e-06, + "loss": 0.7121, + "step": 9753 + }, + { + "epoch": 1.39, + "grad_norm": 10.382376363087847, + "learning_rate": 2.240478237583915e-06, + "loss": 0.7339, + "step": 9754 + }, + { + "epoch": 1.39, + "grad_norm": 13.040044261463105, + "learning_rate": 2.239515362781958e-06, + "loss": 0.692, + "step": 9755 + }, + { + "epoch": 1.39, + "grad_norm": 8.615027968192855, + "learning_rate": 2.238552635216567e-06, + "loss": 0.7402, + "step": 9756 + }, + { + "epoch": 1.39, + "grad_norm": 7.146925831531433, + "learning_rate": 2.2375900549390907e-06, + "loss": 0.7344, + "step": 9757 + }, + { + "epoch": 1.39, + "grad_norm": 10.713009682078964, + "learning_rate": 2.2366276220008715e-06, + "loss": 0.6638, + "step": 9758 + }, + { + "epoch": 1.39, + "grad_norm": 11.61427536028361, + "learning_rate": 2.2356653364532444e-06, + "loss": 0.7981, + "step": 9759 + }, + { + "epoch": 1.39, + "grad_norm": 7.635389926032065, + "learning_rate": 2.2347031983475325e-06, + "loss": 0.6897, + "step": 9760 + }, + { + "epoch": 1.39, + "grad_norm": 12.025090817542486, + "learning_rate": 2.2337412077350577e-06, + "loss": 0.7113, + "step": 9761 + }, + { + "epoch": 1.39, + "grad_norm": 8.974665685555895, + "learning_rate": 2.232779364667124e-06, + "loss": 0.7361, + "step": 9762 + }, + { + "epoch": 1.39, + "grad_norm": 9.494143766125022, + "learning_rate": 2.231817669195036e-06, + "loss": 0.7054, + "step": 9763 + }, + { + "epoch": 1.39, + "grad_norm": 7.151104154373321, + "learning_rate": 2.2308561213700904e-06, + "loss": 0.7011, + 
"step": 9764 + }, + { + "epoch": 1.39, + "grad_norm": 7.891586810111664, + "learning_rate": 2.229894721243569e-06, + "loss": 0.7157, + "step": 9765 + }, + { + "epoch": 1.39, + "grad_norm": 11.157220667033025, + "learning_rate": 2.2289334688667546e-06, + "loss": 0.6253, + "step": 9766 + }, + { + "epoch": 1.39, + "grad_norm": 8.846822456842377, + "learning_rate": 2.227972364290916e-06, + "loss": 0.7661, + "step": 9767 + }, + { + "epoch": 1.39, + "grad_norm": 20.787510787903862, + "learning_rate": 2.2270114075673134e-06, + "loss": 0.7503, + "step": 9768 + }, + { + "epoch": 1.39, + "grad_norm": 9.07000054662947, + "learning_rate": 2.2260505987472054e-06, + "loss": 0.7246, + "step": 9769 + }, + { + "epoch": 1.39, + "grad_norm": 9.001734886148721, + "learning_rate": 2.2250899378818353e-06, + "loss": 0.7736, + "step": 9770 + }, + { + "epoch": 1.39, + "grad_norm": 6.441288945689715, + "learning_rate": 2.2241294250224455e-06, + "loss": 0.7741, + "step": 9771 + }, + { + "epoch": 1.39, + "grad_norm": 9.675972142579667, + "learning_rate": 2.223169060220264e-06, + "loss": 0.7331, + "step": 9772 + }, + { + "epoch": 1.39, + "grad_norm": 8.852183565291389, + "learning_rate": 2.222208843526517e-06, + "loss": 0.7715, + "step": 9773 + }, + { + "epoch": 1.39, + "grad_norm": 11.555386089009575, + "learning_rate": 2.2212487749924182e-06, + "loss": 0.7441, + "step": 9774 + }, + { + "epoch": 1.39, + "grad_norm": 9.137223748099965, + "learning_rate": 2.2202888546691736e-06, + "loss": 0.6636, + "step": 9775 + }, + { + "epoch": 1.39, + "grad_norm": 11.176860530829353, + "learning_rate": 2.219329082607986e-06, + "loss": 0.649, + "step": 9776 + }, + { + "epoch": 1.39, + "grad_norm": 11.470336566333952, + "learning_rate": 2.218369458860043e-06, + "loss": 0.6807, + "step": 9777 + }, + { + "epoch": 1.39, + "grad_norm": 10.537445460389662, + "learning_rate": 2.21740998347653e-06, + "loss": 0.6322, + "step": 9778 + }, + { + "epoch": 1.39, + "grad_norm": 11.699856922973884, + "learning_rate": 
2.216450656508625e-06, + "loss": 0.6737, + "step": 9779 + }, + { + "epoch": 1.39, + "grad_norm": 9.454277311826361, + "learning_rate": 2.215491478007495e-06, + "loss": 0.6999, + "step": 9780 + }, + { + "epoch": 1.39, + "grad_norm": 11.955127316828305, + "learning_rate": 2.214532448024298e-06, + "loss": 0.721, + "step": 9781 + }, + { + "epoch": 1.4, + "grad_norm": 6.593426664275623, + "learning_rate": 2.2135735666101846e-06, + "loss": 0.677, + "step": 9782 + }, + { + "epoch": 1.4, + "grad_norm": 11.749123809504576, + "learning_rate": 2.212614833816302e-06, + "loss": 0.7221, + "step": 9783 + }, + { + "epoch": 1.4, + "grad_norm": 10.387136016487375, + "learning_rate": 2.2116562496937864e-06, + "loss": 0.6455, + "step": 9784 + }, + { + "epoch": 1.4, + "grad_norm": 8.477775633233893, + "learning_rate": 2.2106978142937635e-06, + "loss": 0.7052, + "step": 9785 + }, + { + "epoch": 1.4, + "grad_norm": 9.697528498124836, + "learning_rate": 2.209739527667357e-06, + "loss": 0.7397, + "step": 9786 + }, + { + "epoch": 1.4, + "grad_norm": 8.236724334773823, + "learning_rate": 2.2087813898656775e-06, + "loss": 0.7586, + "step": 9787 + }, + { + "epoch": 1.4, + "grad_norm": 5.761632729126619, + "learning_rate": 2.2078234009398276e-06, + "loss": 0.7525, + "step": 9788 + }, + { + "epoch": 1.4, + "grad_norm": 8.57029840683827, + "learning_rate": 2.2068655609409066e-06, + "loss": 0.7282, + "step": 9789 + }, + { + "epoch": 1.4, + "grad_norm": 9.228464027776486, + "learning_rate": 2.2059078699200006e-06, + "loss": 0.6824, + "step": 9790 + }, + { + "epoch": 1.4, + "grad_norm": 8.266361582316536, + "learning_rate": 2.204950327928191e-06, + "loss": 0.6874, + "step": 9791 + }, + { + "epoch": 1.4, + "grad_norm": 9.064285315938335, + "learning_rate": 2.203992935016553e-06, + "loss": 0.7555, + "step": 9792 + }, + { + "epoch": 1.4, + "grad_norm": 11.540509627596752, + "learning_rate": 2.203035691236149e-06, + "loss": 0.714, + "step": 9793 + }, + { + "epoch": 1.4, + "grad_norm": 9.875116531026004, 
+ "learning_rate": 2.2020785966380355e-06, + "loss": 0.7692, + "step": 9794 + }, + { + "epoch": 1.4, + "grad_norm": 9.290806298935612, + "learning_rate": 2.20112165127326e-06, + "loss": 0.7343, + "step": 9795 + }, + { + "epoch": 1.4, + "grad_norm": 7.298078255702233, + "learning_rate": 2.200164855192865e-06, + "loss": 0.7232, + "step": 9796 + }, + { + "epoch": 1.4, + "grad_norm": 9.377093027445332, + "learning_rate": 2.199208208447885e-06, + "loss": 0.7044, + "step": 9797 + }, + { + "epoch": 1.4, + "grad_norm": 11.32324372472913, + "learning_rate": 2.198251711089341e-06, + "loss": 0.7284, + "step": 9798 + }, + { + "epoch": 1.4, + "grad_norm": 8.180734828767214, + "learning_rate": 2.197295363168254e-06, + "loss": 0.7493, + "step": 9799 + }, + { + "epoch": 1.4, + "grad_norm": 8.45906212616308, + "learning_rate": 2.1963391647356314e-06, + "loss": 0.7497, + "step": 9800 + }, + { + "epoch": 1.4, + "grad_norm": 11.998412950395164, + "learning_rate": 2.195383115842471e-06, + "loss": 0.7272, + "step": 9801 + }, + { + "epoch": 1.4, + "grad_norm": 8.088206329398727, + "learning_rate": 2.194427216539771e-06, + "loss": 0.7552, + "step": 9802 + }, + { + "epoch": 1.4, + "grad_norm": 8.382803510880107, + "learning_rate": 2.193471466878512e-06, + "loss": 0.7376, + "step": 9803 + }, + { + "epoch": 1.4, + "grad_norm": 10.289746347995454, + "learning_rate": 2.1925158669096748e-06, + "loss": 0.7178, + "step": 9804 + }, + { + "epoch": 1.4, + "grad_norm": 8.166331260998795, + "learning_rate": 2.1915604166842263e-06, + "loss": 0.7315, + "step": 9805 + }, + { + "epoch": 1.4, + "grad_norm": 7.425097685977613, + "learning_rate": 2.1906051162531262e-06, + "loss": 0.744, + "step": 9806 + }, + { + "epoch": 1.4, + "grad_norm": 7.00623816497269, + "learning_rate": 2.1896499656673307e-06, + "loss": 0.6786, + "step": 9807 + }, + { + "epoch": 1.4, + "grad_norm": 8.107624470300395, + "learning_rate": 2.188694964977782e-06, + "loss": 0.7638, + "step": 9808 + }, + { + "epoch": 1.4, + "grad_norm": 
8.394230632564534, + "learning_rate": 2.18774011423542e-06, + "loss": 0.6672, + "step": 9809 + }, + { + "epoch": 1.4, + "grad_norm": 8.063389246521162, + "learning_rate": 2.186785413491171e-06, + "loss": 0.8202, + "step": 9810 + }, + { + "epoch": 1.4, + "grad_norm": 6.080338941040792, + "learning_rate": 2.1858308627959584e-06, + "loss": 0.7368, + "step": 9811 + }, + { + "epoch": 1.4, + "grad_norm": 9.960090186204026, + "learning_rate": 2.184876462200695e-06, + "loss": 0.7074, + "step": 9812 + }, + { + "epoch": 1.4, + "grad_norm": 6.587161501926792, + "learning_rate": 2.183922211756283e-06, + "loss": 0.7228, + "step": 9813 + }, + { + "epoch": 1.4, + "grad_norm": 7.112867688612561, + "learning_rate": 2.1829681115136236e-06, + "loss": 0.7492, + "step": 9814 + }, + { + "epoch": 1.4, + "grad_norm": 7.467500111521647, + "learning_rate": 2.1820141615236024e-06, + "loss": 0.7179, + "step": 9815 + }, + { + "epoch": 1.4, + "grad_norm": 9.19774651526475, + "learning_rate": 2.1810603618371013e-06, + "loss": 0.8071, + "step": 9816 + }, + { + "epoch": 1.4, + "grad_norm": 9.546015639597838, + "learning_rate": 2.1801067125049963e-06, + "loss": 0.6985, + "step": 9817 + }, + { + "epoch": 1.4, + "grad_norm": 10.534509954278587, + "learning_rate": 2.1791532135781494e-06, + "loss": 0.7045, + "step": 9818 + }, + { + "epoch": 1.4, + "grad_norm": 6.837269144685454, + "learning_rate": 2.1781998651074186e-06, + "loss": 0.6993, + "step": 9819 + }, + { + "epoch": 1.4, + "grad_norm": 8.31031339605422, + "learning_rate": 2.1772466671436505e-06, + "loss": 0.7764, + "step": 9820 + }, + { + "epoch": 1.4, + "grad_norm": 7.934714414916593, + "learning_rate": 2.1762936197376883e-06, + "loss": 0.6583, + "step": 9821 + }, + { + "epoch": 1.4, + "grad_norm": 9.271254816219622, + "learning_rate": 2.1753407229403657e-06, + "loss": 0.7131, + "step": 9822 + }, + { + "epoch": 1.4, + "grad_norm": 9.011247580314869, + "learning_rate": 2.174387976802505e-06, + "loss": 0.7354, + "step": 9823 + }, + { + "epoch": 
1.4, + "grad_norm": 9.341588242045137, + "learning_rate": 2.1734353813749266e-06, + "loss": 0.8204, + "step": 9824 + }, + { + "epoch": 1.4, + "grad_norm": 13.210596254031383, + "learning_rate": 2.172482936708437e-06, + "loss": 0.7434, + "step": 9825 + }, + { + "epoch": 1.4, + "grad_norm": 8.392227733619178, + "learning_rate": 2.1715306428538353e-06, + "loss": 0.7103, + "step": 9826 + }, + { + "epoch": 1.4, + "grad_norm": 10.59067394438276, + "learning_rate": 2.170578499861917e-06, + "loss": 0.6835, + "step": 9827 + }, + { + "epoch": 1.4, + "grad_norm": 8.628339296500325, + "learning_rate": 2.169626507783465e-06, + "loss": 0.7429, + "step": 9828 + }, + { + "epoch": 1.4, + "grad_norm": 8.820673061815345, + "learning_rate": 2.1686746666692567e-06, + "loss": 0.7732, + "step": 9829 + }, + { + "epoch": 1.4, + "grad_norm": 6.643071913173641, + "learning_rate": 2.167722976570062e-06, + "loss": 0.7587, + "step": 9830 + }, + { + "epoch": 1.4, + "grad_norm": 9.50653639527464, + "learning_rate": 2.16677143753664e-06, + "loss": 0.7539, + "step": 9831 + }, + { + "epoch": 1.4, + "grad_norm": 9.51353837778765, + "learning_rate": 2.1658200496197434e-06, + "loss": 0.7456, + "step": 9832 + }, + { + "epoch": 1.4, + "grad_norm": 9.885927130103617, + "learning_rate": 2.1648688128701143e-06, + "loss": 0.7382, + "step": 9833 + }, + { + "epoch": 1.4, + "grad_norm": 10.947223165424003, + "learning_rate": 2.1639177273384915e-06, + "loss": 0.7576, + "step": 9834 + }, + { + "epoch": 1.4, + "grad_norm": 7.829929249692346, + "learning_rate": 2.1629667930756045e-06, + "loss": 0.7244, + "step": 9835 + }, + { + "epoch": 1.4, + "grad_norm": 9.34575200960599, + "learning_rate": 2.16201601013217e-06, + "loss": 0.7564, + "step": 9836 + }, + { + "epoch": 1.4, + "grad_norm": 8.015613745795704, + "learning_rate": 2.1610653785589036e-06, + "loss": 0.7417, + "step": 9837 + }, + { + "epoch": 1.4, + "grad_norm": 7.271036838972332, + "learning_rate": 2.1601148984065075e-06, + "loss": 0.6988, + "step": 9838 + 
}, + { + "epoch": 1.4, + "grad_norm": 7.321114844605173, + "learning_rate": 2.1591645697256765e-06, + "loss": 0.7244, + "step": 9839 + }, + { + "epoch": 1.4, + "grad_norm": 9.817730927181795, + "learning_rate": 2.1582143925671013e-06, + "loss": 0.7489, + "step": 9840 + }, + { + "epoch": 1.4, + "grad_norm": 7.187540083663607, + "learning_rate": 2.1572643669814585e-06, + "loss": 0.7674, + "step": 9841 + }, + { + "epoch": 1.4, + "grad_norm": 8.90495922803602, + "learning_rate": 2.1563144930194237e-06, + "loss": 0.6731, + "step": 9842 + }, + { + "epoch": 1.4, + "grad_norm": 9.732751261086495, + "learning_rate": 2.155364770731656e-06, + "loss": 0.7837, + "step": 9843 + }, + { + "epoch": 1.4, + "grad_norm": 10.199304365965855, + "learning_rate": 2.154415200168815e-06, + "loss": 0.7499, + "step": 9844 + }, + { + "epoch": 1.4, + "grad_norm": 10.698816240379557, + "learning_rate": 2.1534657813815473e-06, + "loss": 0.7932, + "step": 9845 + }, + { + "epoch": 1.4, + "grad_norm": 10.483325313300648, + "learning_rate": 2.152516514420489e-06, + "loss": 0.7531, + "step": 9846 + }, + { + "epoch": 1.4, + "grad_norm": 9.372927440981867, + "learning_rate": 2.151567399336276e-06, + "loss": 0.755, + "step": 9847 + }, + { + "epoch": 1.4, + "grad_norm": 7.319357682252092, + "learning_rate": 2.1506184361795275e-06, + "loss": 0.7215, + "step": 9848 + }, + { + "epoch": 1.4, + "grad_norm": 9.186804228459932, + "learning_rate": 2.149669625000862e-06, + "loss": 0.7764, + "step": 9849 + }, + { + "epoch": 1.4, + "grad_norm": 6.940145643393257, + "learning_rate": 2.1487209658508844e-06, + "loss": 0.8082, + "step": 9850 + }, + { + "epoch": 1.4, + "grad_norm": 8.176294981336525, + "learning_rate": 2.1477724587801926e-06, + "loss": 0.6914, + "step": 9851 + }, + { + "epoch": 1.41, + "grad_norm": 10.591998078382431, + "learning_rate": 2.14682410383938e-06, + "loss": 0.7269, + "step": 9852 + }, + { + "epoch": 1.41, + "grad_norm": 7.495321800921314, + "learning_rate": 2.145875901079026e-06, + "loss": 
0.727, + "step": 9853 + }, + { + "epoch": 1.41, + "grad_norm": 11.006730361953279, + "learning_rate": 2.1449278505497075e-06, + "loss": 0.6746, + "step": 9854 + }, + { + "epoch": 1.41, + "grad_norm": 10.02253475192692, + "learning_rate": 2.1439799523019916e-06, + "loss": 0.7061, + "step": 9855 + }, + { + "epoch": 1.41, + "grad_norm": 8.415337692892665, + "learning_rate": 2.143032206386435e-06, + "loss": 0.7215, + "step": 9856 + }, + { + "epoch": 1.41, + "grad_norm": 6.893356887471796, + "learning_rate": 2.142084612853589e-06, + "loss": 0.6601, + "step": 9857 + }, + { + "epoch": 1.41, + "grad_norm": 11.69228117951787, + "learning_rate": 2.141137171753992e-06, + "loss": 0.7636, + "step": 9858 + }, + { + "epoch": 1.41, + "grad_norm": 9.732791377274165, + "learning_rate": 2.140189883138181e-06, + "loss": 0.7757, + "step": 9859 + }, + { + "epoch": 1.41, + "grad_norm": 8.787658901499817, + "learning_rate": 2.139242747056683e-06, + "loss": 0.751, + "step": 9860 + }, + { + "epoch": 1.41, + "grad_norm": 9.22474502954249, + "learning_rate": 2.138295763560012e-06, + "loss": 0.8004, + "step": 9861 + }, + { + "epoch": 1.41, + "grad_norm": 10.918543791523255, + "learning_rate": 2.1373489326986814e-06, + "loss": 0.7411, + "step": 9862 + }, + { + "epoch": 1.41, + "grad_norm": 10.247026764610968, + "learning_rate": 2.1364022545231906e-06, + "loss": 0.7443, + "step": 9863 + }, + { + "epoch": 1.41, + "grad_norm": 7.965271794991339, + "learning_rate": 2.1354557290840307e-06, + "loss": 0.7161, + "step": 9864 + }, + { + "epoch": 1.41, + "grad_norm": 9.461027761962912, + "learning_rate": 2.13450935643169e-06, + "loss": 0.7153, + "step": 9865 + }, + { + "epoch": 1.41, + "grad_norm": 10.008659479581551, + "learning_rate": 2.1335631366166424e-06, + "loss": 0.6723, + "step": 9866 + }, + { + "epoch": 1.41, + "grad_norm": 10.52018167985928, + "learning_rate": 2.1326170696893604e-06, + "loss": 0.7612, + "step": 9867 + }, + { + "epoch": 1.41, + "grad_norm": 9.109358156681795, + "learning_rate": 
2.131671155700301e-06, + "loss": 0.7146, + "step": 9868 + }, + { + "epoch": 1.41, + "grad_norm": 10.740264239871705, + "learning_rate": 2.1307253946999196e-06, + "loss": 0.7078, + "step": 9869 + }, + { + "epoch": 1.41, + "grad_norm": 11.552516147279773, + "learning_rate": 2.1297797867386594e-06, + "loss": 0.7212, + "step": 9870 + }, + { + "epoch": 1.41, + "grad_norm": 7.412783721609897, + "learning_rate": 2.1288343318669534e-06, + "loss": 0.7461, + "step": 9871 + }, + { + "epoch": 1.41, + "grad_norm": 8.344552342373232, + "learning_rate": 2.127889030135233e-06, + "loss": 0.7168, + "step": 9872 + }, + { + "epoch": 1.41, + "grad_norm": 11.278572837307307, + "learning_rate": 2.1269438815939185e-06, + "loss": 0.7017, + "step": 9873 + }, + { + "epoch": 1.41, + "grad_norm": 7.492643819528241, + "learning_rate": 2.125998886293418e-06, + "loss": 0.6879, + "step": 9874 + }, + { + "epoch": 1.41, + "grad_norm": 10.3376598881211, + "learning_rate": 2.1250540442841393e-06, + "loss": 0.738, + "step": 9875 + }, + { + "epoch": 1.41, + "grad_norm": 12.096206475588296, + "learning_rate": 2.124109355616476e-06, + "loss": 0.6949, + "step": 9876 + }, + { + "epoch": 1.41, + "grad_norm": 9.74023907150305, + "learning_rate": 2.123164820340812e-06, + "loss": 0.7326, + "step": 9877 + }, + { + "epoch": 1.41, + "grad_norm": 9.885643044370928, + "learning_rate": 2.1222204385075307e-06, + "loss": 0.7048, + "step": 9878 + }, + { + "epoch": 1.41, + "grad_norm": 9.66844278310753, + "learning_rate": 2.121276210166999e-06, + "loss": 0.8167, + "step": 9879 + }, + { + "epoch": 1.41, + "grad_norm": 7.44092153712696, + "learning_rate": 2.1203321353695834e-06, + "loss": 0.65, + "step": 9880 + }, + { + "epoch": 1.41, + "grad_norm": 9.08943426157351, + "learning_rate": 2.119388214165634e-06, + "loss": 0.7106, + "step": 9881 + }, + { + "epoch": 1.41, + "grad_norm": 11.423341084718892, + "learning_rate": 2.118444446605501e-06, + "loss": 0.7555, + "step": 9882 + }, + { + "epoch": 1.41, + "grad_norm": 
7.5137106799405755, + "learning_rate": 2.1175008327395204e-06, + "loss": 0.7669, + "step": 9883 + }, + { + "epoch": 1.41, + "grad_norm": 13.206530171218652, + "learning_rate": 2.1165573726180195e-06, + "loss": 0.736, + "step": 9884 + }, + { + "epoch": 1.41, + "grad_norm": 9.595307252039868, + "learning_rate": 2.1156140662913244e-06, + "loss": 0.7157, + "step": 9885 + }, + { + "epoch": 1.41, + "grad_norm": 9.029928464494416, + "learning_rate": 2.114670913809745e-06, + "loss": 0.6872, + "step": 9886 + }, + { + "epoch": 1.41, + "grad_norm": 7.826304027846353, + "learning_rate": 2.113727915223587e-06, + "loss": 0.7619, + "step": 9887 + }, + { + "epoch": 1.41, + "grad_norm": 9.169947626645977, + "learning_rate": 2.1127850705831514e-06, + "loss": 0.7854, + "step": 9888 + }, + { + "epoch": 1.41, + "grad_norm": 9.733543047274939, + "learning_rate": 2.1118423799387204e-06, + "loss": 0.7496, + "step": 9889 + }, + { + "epoch": 1.41, + "grad_norm": 9.372098020718777, + "learning_rate": 2.1108998433405796e-06, + "loss": 0.6383, + "step": 9890 + }, + { + "epoch": 1.41, + "grad_norm": 9.648591363403733, + "learning_rate": 2.109957460838997e-06, + "loss": 0.7359, + "step": 9891 + }, + { + "epoch": 1.41, + "grad_norm": 7.828445735548058, + "learning_rate": 2.1090152324842392e-06, + "loss": 0.7262, + "step": 9892 + }, + { + "epoch": 1.41, + "grad_norm": 9.172244034277066, + "learning_rate": 2.108073158326564e-06, + "loss": 0.6924, + "step": 9893 + }, + { + "epoch": 1.41, + "grad_norm": 7.924219128364171, + "learning_rate": 2.107131238416216e-06, + "loss": 0.7136, + "step": 9894 + }, + { + "epoch": 1.41, + "grad_norm": 8.064897478636688, + "learning_rate": 2.106189472803436e-06, + "loss": 0.779, + "step": 9895 + }, + { + "epoch": 1.41, + "grad_norm": 9.117019630719163, + "learning_rate": 2.1052478615384525e-06, + "loss": 0.7008, + "step": 9896 + }, + { + "epoch": 1.41, + "grad_norm": 7.5191700212764445, + "learning_rate": 2.1043064046714907e-06, + "loss": 0.8258, + "step": 9897 + }, 
+ { + "epoch": 1.41, + "grad_norm": 8.248284793832783, + "learning_rate": 2.1033651022527664e-06, + "loss": 0.68, + "step": 9898 + }, + { + "epoch": 1.41, + "grad_norm": 12.626249225804857, + "learning_rate": 2.102423954332483e-06, + "loss": 0.7546, + "step": 9899 + }, + { + "epoch": 1.41, + "grad_norm": 11.421669985935715, + "learning_rate": 2.101482960960842e-06, + "loss": 0.7848, + "step": 9900 + }, + { + "epoch": 1.41, + "grad_norm": 10.386516722406874, + "learning_rate": 2.100542122188032e-06, + "loss": 0.735, + "step": 9901 + }, + { + "epoch": 1.41, + "grad_norm": 8.974698474894971, + "learning_rate": 2.099601438064233e-06, + "loss": 0.7237, + "step": 9902 + }, + { + "epoch": 1.41, + "grad_norm": 10.07059720065949, + "learning_rate": 2.098660908639622e-06, + "loss": 0.7347, + "step": 9903 + }, + { + "epoch": 1.41, + "grad_norm": 9.289611036359638, + "learning_rate": 2.0977205339643602e-06, + "loss": 0.6943, + "step": 9904 + }, + { + "epoch": 1.41, + "grad_norm": 7.700490417650762, + "learning_rate": 2.0967803140886076e-06, + "loss": 0.694, + "step": 9905 + }, + { + "epoch": 1.41, + "grad_norm": 9.63317045740633, + "learning_rate": 2.095840249062511e-06, + "loss": 0.739, + "step": 9906 + }, + { + "epoch": 1.41, + "grad_norm": 11.917761183739925, + "learning_rate": 2.0949003389362135e-06, + "loss": 0.694, + "step": 9907 + }, + { + "epoch": 1.41, + "grad_norm": 6.3400947083607635, + "learning_rate": 2.0939605837598453e-06, + "loss": 0.7345, + "step": 9908 + }, + { + "epoch": 1.41, + "grad_norm": 6.869868459575964, + "learning_rate": 2.0930209835835287e-06, + "loss": 0.7069, + "step": 9909 + }, + { + "epoch": 1.41, + "grad_norm": 6.780909994599697, + "learning_rate": 2.092081538457383e-06, + "loss": 0.704, + "step": 9910 + }, + { + "epoch": 1.41, + "grad_norm": 8.98314270995785, + "learning_rate": 2.0911422484315117e-06, + "loss": 0.6762, + "step": 9911 + }, + { + "epoch": 1.41, + "grad_norm": 6.93405352780622, + "learning_rate": 2.0902031135560162e-06, + "loss": 
0.6756, + "step": 9912 + }, + { + "epoch": 1.41, + "grad_norm": 7.469430836200024, + "learning_rate": 2.0892641338809895e-06, + "loss": 0.7172, + "step": 9913 + }, + { + "epoch": 1.41, + "grad_norm": 6.508598817585141, + "learning_rate": 2.0883253094565107e-06, + "loss": 0.8047, + "step": 9914 + }, + { + "epoch": 1.41, + "grad_norm": 8.80125047998443, + "learning_rate": 2.0873866403326543e-06, + "loss": 0.6868, + "step": 9915 + }, + { + "epoch": 1.41, + "grad_norm": 6.831865988952068, + "learning_rate": 2.086448126559489e-06, + "loss": 0.7061, + "step": 9916 + }, + { + "epoch": 1.41, + "grad_norm": 7.57731341244196, + "learning_rate": 2.0855097681870685e-06, + "loss": 0.6788, + "step": 9917 + }, + { + "epoch": 1.41, + "grad_norm": 9.951603777015753, + "learning_rate": 2.084571565265446e-06, + "loss": 0.6895, + "step": 9918 + }, + { + "epoch": 1.41, + "grad_norm": 9.30404222410879, + "learning_rate": 2.0836335178446597e-06, + "loss": 0.763, + "step": 9919 + }, + { + "epoch": 1.41, + "grad_norm": 8.915630817779128, + "learning_rate": 2.0826956259747457e-06, + "loss": 0.7534, + "step": 9920 + }, + { + "epoch": 1.41, + "grad_norm": 7.3318501366365485, + "learning_rate": 2.081757889705726e-06, + "loss": 0.7164, + "step": 9921 + }, + { + "epoch": 1.42, + "grad_norm": 8.69549061870394, + "learning_rate": 2.0808203090876166e-06, + "loss": 0.7146, + "step": 9922 + }, + { + "epoch": 1.42, + "grad_norm": 9.224854569266585, + "learning_rate": 2.0798828841704276e-06, + "loss": 0.6639, + "step": 9923 + }, + { + "epoch": 1.42, + "grad_norm": 6.751124025039645, + "learning_rate": 2.0789456150041552e-06, + "loss": 0.69, + "step": 9924 + }, + { + "epoch": 1.42, + "grad_norm": 8.944870392581391, + "learning_rate": 2.0780085016387934e-06, + "loss": 0.7245, + "step": 9925 + }, + { + "epoch": 1.42, + "grad_norm": 7.072837407893516, + "learning_rate": 2.0770715441243267e-06, + "loss": 0.6831, + "step": 9926 + }, + { + "epoch": 1.42, + "grad_norm": 8.612107820354082, + "learning_rate": 
2.0761347425107273e-06, + "loss": 0.6887, + "step": 9927 + }, + { + "epoch": 1.42, + "grad_norm": 7.861478714959245, + "learning_rate": 2.0751980968479625e-06, + "loss": 0.7322, + "step": 9928 + }, + { + "epoch": 1.42, + "grad_norm": 8.034484821710999, + "learning_rate": 2.0742616071859876e-06, + "loss": 0.7993, + "step": 9929 + }, + { + "epoch": 1.42, + "grad_norm": 9.818901710711463, + "learning_rate": 2.0733252735747557e-06, + "loss": 0.73, + "step": 9930 + }, + { + "epoch": 1.42, + "grad_norm": 12.25362879091874, + "learning_rate": 2.0723890960642077e-06, + "loss": 0.7356, + "step": 9931 + }, + { + "epoch": 1.42, + "grad_norm": 9.378118828335973, + "learning_rate": 2.071453074704277e-06, + "loss": 0.7307, + "step": 9932 + }, + { + "epoch": 1.42, + "grad_norm": 7.957113706212601, + "learning_rate": 2.070517209544888e-06, + "loss": 0.7309, + "step": 9933 + }, + { + "epoch": 1.42, + "grad_norm": 11.177379643747798, + "learning_rate": 2.0695815006359547e-06, + "loss": 0.7736, + "step": 9934 + }, + { + "epoch": 1.42, + "grad_norm": 11.263417078912147, + "learning_rate": 2.068645948027387e-06, + "loss": 0.7167, + "step": 9935 + }, + { + "epoch": 1.42, + "grad_norm": 10.193195676244049, + "learning_rate": 2.0677105517690872e-06, + "loss": 0.7176, + "step": 9936 + }, + { + "epoch": 1.42, + "grad_norm": 8.405322195710063, + "learning_rate": 2.066775311910942e-06, + "loss": 0.7618, + "step": 9937 + }, + { + "epoch": 1.42, + "grad_norm": 8.86686905532498, + "learning_rate": 2.0658402285028394e-06, + "loss": 0.7015, + "step": 9938 + }, + { + "epoch": 1.42, + "grad_norm": 8.642735103727896, + "learning_rate": 2.0649053015946514e-06, + "loss": 0.7064, + "step": 9939 + }, + { + "epoch": 1.42, + "grad_norm": 8.686205383812316, + "learning_rate": 2.0639705312362434e-06, + "loss": 0.6874, + "step": 9940 + }, + { + "epoch": 1.42, + "grad_norm": 10.042362643218762, + "learning_rate": 2.063035917477476e-06, + "loss": 0.7129, + "step": 9941 + }, + { + "epoch": 1.42, + "grad_norm": 
9.334356602014516, + "learning_rate": 2.062101460368196e-06, + "loss": 0.7632, + "step": 9942 + }, + { + "epoch": 1.42, + "grad_norm": 9.071908459264344, + "learning_rate": 2.061167159958248e-06, + "loss": 0.7462, + "step": 9943 + }, + { + "epoch": 1.42, + "grad_norm": 8.434849624936694, + "learning_rate": 2.060233016297462e-06, + "loss": 0.6626, + "step": 9944 + }, + { + "epoch": 1.42, + "grad_norm": 9.097286960136765, + "learning_rate": 2.0592990294356656e-06, + "loss": 0.7273, + "step": 9945 + }, + { + "epoch": 1.42, + "grad_norm": 9.081327036985238, + "learning_rate": 2.058365199422674e-06, + "loss": 0.6696, + "step": 9946 + }, + { + "epoch": 1.42, + "grad_norm": 9.60576324459388, + "learning_rate": 2.0574315263082923e-06, + "loss": 0.7279, + "step": 9947 + }, + { + "epoch": 1.42, + "grad_norm": 7.199522822487098, + "learning_rate": 2.056498010142324e-06, + "loss": 0.7631, + "step": 9948 + }, + { + "epoch": 1.42, + "grad_norm": 13.427458621886961, + "learning_rate": 2.055564650974557e-06, + "loss": 0.7087, + "step": 9949 + }, + { + "epoch": 1.42, + "grad_norm": 10.516005522147786, + "learning_rate": 2.054631448854775e-06, + "loss": 0.7778, + "step": 9950 + }, + { + "epoch": 1.42, + "grad_norm": 12.45751185980535, + "learning_rate": 2.0536984038327555e-06, + "loss": 0.7353, + "step": 9951 + }, + { + "epoch": 1.42, + "grad_norm": 10.318998417912516, + "learning_rate": 2.052765515958262e-06, + "loss": 0.7315, + "step": 9952 + }, + { + "epoch": 1.42, + "grad_norm": 11.878571565018552, + "learning_rate": 2.051832785281052e-06, + "loss": 0.7785, + "step": 9953 + }, + { + "epoch": 1.42, + "grad_norm": 8.313822845944404, + "learning_rate": 2.0509002118508736e-06, + "loss": 0.7093, + "step": 9954 + }, + { + "epoch": 1.42, + "grad_norm": 9.311306689090896, + "learning_rate": 2.0499677957174687e-06, + "loss": 0.7486, + "step": 9955 + }, + { + "epoch": 1.42, + "grad_norm": 8.069501056560643, + "learning_rate": 2.0490355369305715e-06, + "loss": 0.7337, + "step": 9956 + }, + 
{ + "epoch": 1.42, + "grad_norm": 8.620797303815326, + "learning_rate": 2.0481034355399032e-06, + "loss": 0.7211, + "step": 9957 + }, + { + "epoch": 1.42, + "grad_norm": 9.280951739868025, + "learning_rate": 2.047171491595183e-06, + "loss": 0.7181, + "step": 9958 + }, + { + "epoch": 1.42, + "grad_norm": 7.801079987967712, + "learning_rate": 2.0462397051461156e-06, + "loss": 0.7328, + "step": 9959 + }, + { + "epoch": 1.42, + "grad_norm": 9.165882194008335, + "learning_rate": 2.0453080762423994e-06, + "loss": 0.8013, + "step": 9960 + }, + { + "epoch": 1.42, + "grad_norm": 9.996931409702281, + "learning_rate": 2.0443766049337277e-06, + "loss": 0.6809, + "step": 9961 + }, + { + "epoch": 1.42, + "grad_norm": 9.815291962695975, + "learning_rate": 2.043445291269779e-06, + "loss": 0.7281, + "step": 9962 + }, + { + "epoch": 1.42, + "grad_norm": 9.621306750273215, + "learning_rate": 2.042514135300228e-06, + "loss": 0.7336, + "step": 9963 + }, + { + "epoch": 1.42, + "grad_norm": 8.497942722820595, + "learning_rate": 2.0415831370747437e-06, + "loss": 0.6951, + "step": 9964 + }, + { + "epoch": 1.42, + "grad_norm": 9.451713338876614, + "learning_rate": 2.0406522966429793e-06, + "loss": 0.7203, + "step": 9965 + }, + { + "epoch": 1.42, + "grad_norm": 8.726444554110827, + "learning_rate": 2.0397216140545845e-06, + "loss": 0.7247, + "step": 9966 + }, + { + "epoch": 1.42, + "grad_norm": 7.658287927802694, + "learning_rate": 2.0387910893591966e-06, + "loss": 0.7011, + "step": 9967 + }, + { + "epoch": 1.42, + "grad_norm": 9.366403082141858, + "learning_rate": 2.0378607226064502e-06, + "loss": 0.7282, + "step": 9968 + }, + { + "epoch": 1.42, + "grad_norm": 8.902594178217294, + "learning_rate": 2.036930513845969e-06, + "loss": 0.7393, + "step": 9969 + }, + { + "epoch": 1.42, + "grad_norm": 7.995730362768462, + "learning_rate": 2.0360004631273643e-06, + "loss": 0.7615, + "step": 9970 + }, + { + "epoch": 1.42, + "grad_norm": 8.85708706890939, + "learning_rate": 2.0350705705002483e-06, + 
"loss": 0.8927, + "step": 9971 + }, + { + "epoch": 1.42, + "grad_norm": 13.262824398424742, + "learning_rate": 2.0341408360142116e-06, + "loss": 0.6938, + "step": 9972 + }, + { + "epoch": 1.42, + "grad_norm": 7.745932333394445, + "learning_rate": 2.0332112597188474e-06, + "loss": 0.7174, + "step": 9973 + }, + { + "epoch": 1.42, + "grad_norm": 10.28911441679881, + "learning_rate": 2.032281841663738e-06, + "loss": 0.7252, + "step": 9974 + }, + { + "epoch": 1.42, + "grad_norm": 10.484454040213418, + "learning_rate": 2.031352581898453e-06, + "loss": 0.6908, + "step": 9975 + }, + { + "epoch": 1.42, + "grad_norm": 7.565709359264896, + "learning_rate": 2.0304234804725594e-06, + "loss": 0.7101, + "step": 9976 + }, + { + "epoch": 1.42, + "grad_norm": 9.062739524028041, + "learning_rate": 2.029494537435612e-06, + "loss": 0.7724, + "step": 9977 + }, + { + "epoch": 1.42, + "grad_norm": 7.43648583442739, + "learning_rate": 2.028565752837155e-06, + "loss": 0.7357, + "step": 9978 + }, + { + "epoch": 1.42, + "grad_norm": 8.281178391661234, + "learning_rate": 2.027637126726732e-06, + "loss": 0.6988, + "step": 9979 + }, + { + "epoch": 1.42, + "grad_norm": 9.96133392506994, + "learning_rate": 2.0267086591538692e-06, + "loss": 0.6477, + "step": 9980 + }, + { + "epoch": 1.42, + "grad_norm": 9.621654167004097, + "learning_rate": 2.025780350168092e-06, + "loss": 0.754, + "step": 9981 + }, + { + "epoch": 1.42, + "grad_norm": 9.487589185934466, + "learning_rate": 2.0248521998189104e-06, + "loss": 0.7417, + "step": 9982 + }, + { + "epoch": 1.42, + "grad_norm": 7.704302914316853, + "learning_rate": 2.0239242081558324e-06, + "loss": 0.7427, + "step": 9983 + }, + { + "epoch": 1.42, + "grad_norm": 9.039192845241937, + "learning_rate": 2.0229963752283538e-06, + "loss": 0.762, + "step": 9984 + }, + { + "epoch": 1.42, + "grad_norm": 11.078158064150891, + "learning_rate": 2.0220687010859602e-06, + "loss": 0.7539, + "step": 9985 + }, + { + "epoch": 1.42, + "grad_norm": 7.381406083593044, + 
"learning_rate": 2.0211411857781342e-06, + "loss": 0.7611, + "step": 9986 + }, + { + "epoch": 1.42, + "grad_norm": 9.410326764050927, + "learning_rate": 2.020213829354344e-06, + "loss": 0.7159, + "step": 9987 + }, + { + "epoch": 1.42, + "grad_norm": 7.018496029736788, + "learning_rate": 2.0192866318640543e-06, + "loss": 0.749, + "step": 9988 + }, + { + "epoch": 1.42, + "grad_norm": 9.441991432536241, + "learning_rate": 2.01835959335672e-06, + "loss": 0.6698, + "step": 9989 + }, + { + "epoch": 1.42, + "grad_norm": 8.177335215088755, + "learning_rate": 2.017432713881786e-06, + "loss": 0.6824, + "step": 9990 + }, + { + "epoch": 1.42, + "grad_norm": 9.120290638002821, + "learning_rate": 2.0165059934886894e-06, + "loss": 0.7502, + "step": 9991 + }, + { + "epoch": 1.42, + "grad_norm": 9.288627160414055, + "learning_rate": 2.015579432226856e-06, + "loss": 0.7033, + "step": 9992 + }, + { + "epoch": 1.43, + "grad_norm": 9.714475982176861, + "learning_rate": 2.0146530301457083e-06, + "loss": 0.7477, + "step": 9993 + }, + { + "epoch": 1.43, + "grad_norm": 8.556653261678758, + "learning_rate": 2.013726787294661e-06, + "loss": 0.7137, + "step": 9994 + }, + { + "epoch": 1.43, + "grad_norm": 8.759344773591469, + "learning_rate": 2.0128007037231117e-06, + "loss": 0.7466, + "step": 9995 + }, + { + "epoch": 1.43, + "grad_norm": 11.612783351925168, + "learning_rate": 2.01187477948046e-06, + "loss": 0.7563, + "step": 9996 + }, + { + "epoch": 1.43, + "grad_norm": 10.695842686049478, + "learning_rate": 2.0109490146160905e-06, + "loss": 0.7284, + "step": 9997 + }, + { + "epoch": 1.43, + "grad_norm": 9.69786633687846, + "learning_rate": 2.0100234091793778e-06, + "loss": 0.7837, + "step": 9998 + }, + { + "epoch": 1.43, + "grad_norm": 14.020886190653075, + "learning_rate": 2.0090979632196965e-06, + "loss": 0.7546, + "step": 9999 + }, + { + "epoch": 1.43, + "grad_norm": 8.756878995551558, + "learning_rate": 2.0081726767864024e-06, + "loss": 0.668, + "step": 10000 + }, + { + "epoch": 1.43, + 
"grad_norm": 15.188812519082811, + "learning_rate": 2.00724754992885e-06, + "loss": 0.7615, + "step": 10001 + }, + { + "epoch": 1.43, + "grad_norm": 13.038964861882768, + "learning_rate": 2.006322582696385e-06, + "loss": 0.7667, + "step": 10002 + }, + { + "epoch": 1.43, + "grad_norm": 11.352527509985963, + "learning_rate": 2.0053977751383395e-06, + "loss": 0.7315, + "step": 10003 + }, + { + "epoch": 1.43, + "grad_norm": 10.56036553265538, + "learning_rate": 2.0044731273040425e-06, + "loss": 0.7776, + "step": 10004 + }, + { + "epoch": 1.43, + "grad_norm": 7.4028805517800995, + "learning_rate": 2.003548639242808e-06, + "loss": 0.764, + "step": 10005 + }, + { + "epoch": 1.43, + "grad_norm": 8.503792477819115, + "learning_rate": 2.0026243110039488e-06, + "loss": 0.6969, + "step": 10006 + }, + { + "epoch": 1.43, + "grad_norm": 12.504289175275538, + "learning_rate": 2.0017001426367673e-06, + "loss": 0.7573, + "step": 10007 + }, + { + "epoch": 1.43, + "grad_norm": 9.166717851922153, + "learning_rate": 2.0007761341905523e-06, + "loss": 0.7095, + "step": 10008 + }, + { + "epoch": 1.43, + "grad_norm": 7.364403411620755, + "learning_rate": 1.999852285714592e-06, + "loss": 0.764, + "step": 10009 + }, + { + "epoch": 1.43, + "grad_norm": 9.2106756670336, + "learning_rate": 1.9989285972581595e-06, + "loss": 0.6937, + "step": 10010 + }, + { + "epoch": 1.43, + "grad_norm": 7.659650557076245, + "learning_rate": 1.998005068870521e-06, + "loss": 0.7188, + "step": 10011 + }, + { + "epoch": 1.43, + "grad_norm": 9.437303669127099, + "learning_rate": 1.9970817006009373e-06, + "loss": 0.739, + "step": 10012 + }, + { + "epoch": 1.43, + "grad_norm": 8.23467761970438, + "learning_rate": 1.9961584924986553e-06, + "loss": 0.7461, + "step": 10013 + }, + { + "epoch": 1.43, + "grad_norm": 13.023460146397703, + "learning_rate": 1.9952354446129196e-06, + "loss": 0.7689, + "step": 10014 + }, + { + "epoch": 1.43, + "grad_norm": 10.221217212776315, + "learning_rate": 1.99431255699296e-06, + "loss": 
0.7006, + "step": 10015 + }, + { + "epoch": 1.43, + "grad_norm": 9.194733219020291, + "learning_rate": 1.993389829688004e-06, + "loss": 0.799, + "step": 10016 + }, + { + "epoch": 1.43, + "grad_norm": 7.785744483728715, + "learning_rate": 1.9924672627472656e-06, + "loss": 0.7635, + "step": 10017 + }, + { + "epoch": 1.43, + "grad_norm": 10.05551525336431, + "learning_rate": 1.9915448562199495e-06, + "loss": 0.8118, + "step": 10018 + }, + { + "epoch": 1.43, + "grad_norm": 7.74606519035776, + "learning_rate": 1.9906226101552596e-06, + "loss": 0.7584, + "step": 10019 + }, + { + "epoch": 1.43, + "grad_norm": 9.204646482393816, + "learning_rate": 1.9897005246023806e-06, + "loss": 0.7363, + "step": 10020 + }, + { + "epoch": 1.43, + "grad_norm": 7.695530824760758, + "learning_rate": 1.9887785996104976e-06, + "loss": 0.7319, + "step": 10021 + }, + { + "epoch": 1.43, + "grad_norm": 9.985980501974263, + "learning_rate": 1.987856835228783e-06, + "loss": 0.7119, + "step": 10022 + }, + { + "epoch": 1.43, + "grad_norm": 8.502996800442752, + "learning_rate": 1.9869352315063982e-06, + "loss": 0.6923, + "step": 10023 + }, + { + "epoch": 1.43, + "grad_norm": 9.042543238657, + "learning_rate": 1.9860137884925037e-06, + "loss": 0.7029, + "step": 10024 + }, + { + "epoch": 1.43, + "grad_norm": 6.956601638370662, + "learning_rate": 1.985092506236242e-06, + "loss": 0.7311, + "step": 10025 + }, + { + "epoch": 1.43, + "grad_norm": 10.46731480187949, + "learning_rate": 1.984171384786754e-06, + "loss": 0.7119, + "step": 10026 + }, + { + "epoch": 1.43, + "grad_norm": 7.635299882131645, + "learning_rate": 1.9832504241931717e-06, + "loss": 0.715, + "step": 10027 + }, + { + "epoch": 1.43, + "grad_norm": 8.61874087365446, + "learning_rate": 1.9823296245046143e-06, + "loss": 0.7064, + "step": 10028 + }, + { + "epoch": 1.43, + "grad_norm": 10.407742778937227, + "learning_rate": 1.9814089857701953e-06, + "loss": 0.7243, + "step": 10029 + }, + { + "epoch": 1.43, + "grad_norm": 9.947650654869742, + 
"learning_rate": 1.980488508039016e-06, + "loss": 0.7402, + "step": 10030 + }, + { + "epoch": 1.43, + "grad_norm": 7.094516662233248, + "learning_rate": 1.979568191360176e-06, + "loss": 0.782, + "step": 10031 + }, + { + "epoch": 1.43, + "grad_norm": 10.881087218093537, + "learning_rate": 1.9786480357827625e-06, + "loss": 0.7334, + "step": 10032 + }, + { + "epoch": 1.43, + "grad_norm": 6.794376596127091, + "learning_rate": 1.977728041355851e-06, + "loss": 0.6587, + "step": 10033 + }, + { + "epoch": 1.43, + "grad_norm": 8.644027852472954, + "learning_rate": 1.9768082081285152e-06, + "loss": 0.7503, + "step": 10034 + }, + { + "epoch": 1.43, + "grad_norm": 8.11631976750064, + "learning_rate": 1.9758885361498142e-06, + "loss": 0.694, + "step": 10035 + }, + { + "epoch": 1.43, + "grad_norm": 13.24759255777801, + "learning_rate": 1.974969025468799e-06, + "loss": 0.7263, + "step": 10036 + }, + { + "epoch": 1.43, + "grad_norm": 8.416117664994983, + "learning_rate": 1.974049676134518e-06, + "loss": 0.6756, + "step": 10037 + }, + { + "epoch": 1.43, + "grad_norm": 12.396413407311178, + "learning_rate": 1.973130488196003e-06, + "loss": 0.7329, + "step": 10038 + }, + { + "epoch": 1.43, + "grad_norm": 7.875419048892471, + "learning_rate": 1.972211461702284e-06, + "loss": 0.7563, + "step": 10039 + }, + { + "epoch": 1.43, + "grad_norm": 9.282140009919006, + "learning_rate": 1.971292596702375e-06, + "loss": 0.736, + "step": 10040 + }, + { + "epoch": 1.43, + "grad_norm": 9.426783335275607, + "learning_rate": 1.970373893245291e-06, + "loss": 0.7205, + "step": 10041 + }, + { + "epoch": 1.43, + "grad_norm": 11.010381934231386, + "learning_rate": 1.969455351380031e-06, + "loss": 0.7531, + "step": 10042 + }, + { + "epoch": 1.43, + "grad_norm": 11.387218424004693, + "learning_rate": 1.968536971155584e-06, + "loss": 0.7594, + "step": 10043 + }, + { + "epoch": 1.43, + "grad_norm": 11.635089291454415, + "learning_rate": 1.9676187526209377e-06, + "loss": 0.7394, + "step": 10044 + }, + { + 
"epoch": 1.43, + "grad_norm": 7.2753028311008645, + "learning_rate": 1.966700695825068e-06, + "loss": 0.7355, + "step": 10045 + }, + { + "epoch": 1.43, + "grad_norm": 5.348532520079208, + "learning_rate": 1.9657828008169385e-06, + "loss": 0.7446, + "step": 10046 + }, + { + "epoch": 1.43, + "grad_norm": 9.55440561630214, + "learning_rate": 1.9648650676455095e-06, + "loss": 0.7596, + "step": 10047 + }, + { + "epoch": 1.43, + "grad_norm": 10.414931803358684, + "learning_rate": 1.96394749635973e-06, + "loss": 0.7979, + "step": 10048 + }, + { + "epoch": 1.43, + "grad_norm": 9.979015174446758, + "learning_rate": 1.963030087008538e-06, + "loss": 0.7678, + "step": 10049 + }, + { + "epoch": 1.43, + "grad_norm": 8.049211061211782, + "learning_rate": 1.96211283964087e-06, + "loss": 0.6855, + "step": 10050 + }, + { + "epoch": 1.43, + "grad_norm": 9.207277422837656, + "learning_rate": 1.9611957543056455e-06, + "loss": 0.784, + "step": 10051 + }, + { + "epoch": 1.43, + "grad_norm": 8.360639847470267, + "learning_rate": 1.9602788310517817e-06, + "loss": 0.7623, + "step": 10052 + }, + { + "epoch": 1.43, + "grad_norm": 8.006330454891572, + "learning_rate": 1.9593620699281827e-06, + "loss": 0.7955, + "step": 10053 + }, + { + "epoch": 1.43, + "grad_norm": 11.099143899715486, + "learning_rate": 1.9584454709837493e-06, + "loss": 0.7176, + "step": 10054 + }, + { + "epoch": 1.43, + "grad_norm": 7.376136179233835, + "learning_rate": 1.957529034267368e-06, + "loss": 0.7697, + "step": 10055 + }, + { + "epoch": 1.43, + "grad_norm": 10.027743791521067, + "learning_rate": 1.956612759827918e-06, + "loss": 0.698, + "step": 10056 + }, + { + "epoch": 1.43, + "grad_norm": 9.930178165013464, + "learning_rate": 1.9556966477142732e-06, + "loss": 0.7081, + "step": 10057 + }, + { + "epoch": 1.43, + "grad_norm": 13.109151278495071, + "learning_rate": 1.9547806979752933e-06, + "loss": 0.7074, + "step": 10058 + }, + { + "epoch": 1.43, + "grad_norm": 8.99061969085558, + "learning_rate": 
1.953864910659835e-06, + "loss": 0.7017, + "step": 10059 + }, + { + "epoch": 1.43, + "grad_norm": 9.042806623784141, + "learning_rate": 1.952949285816747e-06, + "loss": 0.7751, + "step": 10060 + }, + { + "epoch": 1.43, + "grad_norm": 9.732918918442815, + "learning_rate": 1.9520338234948588e-06, + "loss": 0.7297, + "step": 10061 + }, + { + "epoch": 1.43, + "grad_norm": 10.482634051916747, + "learning_rate": 1.951118523743004e-06, + "loss": 0.7538, + "step": 10062 + }, + { + "epoch": 1.44, + "grad_norm": 10.633053871574013, + "learning_rate": 1.950203386609999e-06, + "loss": 0.7279, + "step": 10063 + }, + { + "epoch": 1.44, + "grad_norm": 13.72785432426422, + "learning_rate": 1.9492884121446563e-06, + "loss": 0.683, + "step": 10064 + }, + { + "epoch": 1.44, + "grad_norm": 5.961562575782102, + "learning_rate": 1.9483736003957793e-06, + "loss": 0.7331, + "step": 10065 + }, + { + "epoch": 1.44, + "grad_norm": 13.168632866222538, + "learning_rate": 1.9474589514121606e-06, + "loss": 0.6729, + "step": 10066 + }, + { + "epoch": 1.44, + "grad_norm": 6.752370405206598, + "learning_rate": 1.9465444652425845e-06, + "loss": 0.7472, + "step": 10067 + }, + { + "epoch": 1.44, + "grad_norm": 8.04420248582406, + "learning_rate": 1.9456301419358257e-06, + "loss": 0.7295, + "step": 10068 + }, + { + "epoch": 1.44, + "grad_norm": 8.969953268993615, + "learning_rate": 1.944715981540653e-06, + "loss": 0.735, + "step": 10069 + }, + { + "epoch": 1.44, + "grad_norm": 6.159755560259338, + "learning_rate": 1.9438019841058274e-06, + "loss": 0.7391, + "step": 10070 + }, + { + "epoch": 1.44, + "grad_norm": 11.325305966543342, + "learning_rate": 1.9428881496800948e-06, + "loss": 0.7094, + "step": 10071 + }, + { + "epoch": 1.44, + "grad_norm": 9.91216074227896, + "learning_rate": 1.941974478312201e-06, + "loss": 0.7121, + "step": 10072 + }, + { + "epoch": 1.44, + "grad_norm": 7.589366173932995, + "learning_rate": 1.9410609700508766e-06, + "loss": 0.6921, + "step": 10073 + }, + { + "epoch": 1.44, + 
"grad_norm": 9.222166686248064, + "learning_rate": 1.940147624944843e-06, + "loss": 0.7948, + "step": 10074 + }, + { + "epoch": 1.44, + "grad_norm": 7.943163188388288, + "learning_rate": 1.93923444304282e-06, + "loss": 0.7264, + "step": 10075 + }, + { + "epoch": 1.44, + "grad_norm": 10.965829908291568, + "learning_rate": 1.9383214243935107e-06, + "loss": 0.7326, + "step": 10076 + }, + { + "epoch": 1.44, + "grad_norm": 9.631230200533597, + "learning_rate": 1.9374085690456156e-06, + "loss": 0.8186, + "step": 10077 + }, + { + "epoch": 1.44, + "grad_norm": 8.231100725256628, + "learning_rate": 1.936495877047821e-06, + "loss": 0.6727, + "step": 10078 + }, + { + "epoch": 1.44, + "grad_norm": 8.19828368916126, + "learning_rate": 1.935583348448811e-06, + "loss": 0.6887, + "step": 10079 + }, + { + "epoch": 1.44, + "grad_norm": 11.115159048953371, + "learning_rate": 1.934670983297255e-06, + "loss": 0.8189, + "step": 10080 + }, + { + "epoch": 1.44, + "grad_norm": 8.10518692939226, + "learning_rate": 1.9337587816418145e-06, + "loss": 0.7669, + "step": 10081 + }, + { + "epoch": 1.44, + "grad_norm": 11.773447184495021, + "learning_rate": 1.9328467435311483e-06, + "loss": 0.7768, + "step": 10082 + }, + { + "epoch": 1.44, + "grad_norm": 8.866885471438623, + "learning_rate": 1.9319348690138967e-06, + "loss": 0.6511, + "step": 10083 + }, + { + "epoch": 1.44, + "grad_norm": 12.36097692292687, + "learning_rate": 1.9310231581386997e-06, + "loss": 0.7301, + "step": 10084 + }, + { + "epoch": 1.44, + "grad_norm": 9.570533780427573, + "learning_rate": 1.9301116109541864e-06, + "loss": 0.839, + "step": 10085 + }, + { + "epoch": 1.44, + "grad_norm": 10.89165640506962, + "learning_rate": 1.9292002275089742e-06, + "loss": 0.6446, + "step": 10086 + }, + { + "epoch": 1.44, + "grad_norm": 10.408335701450545, + "learning_rate": 1.928289007851673e-06, + "loss": 0.7556, + "step": 10087 + }, + { + "epoch": 1.44, + "grad_norm": 8.165303769009457, + "learning_rate": 1.927377952030888e-06, + "loss": 
0.7002, + "step": 10088 + }, + { + "epoch": 1.44, + "grad_norm": 7.466566303882468, + "learning_rate": 1.926467060095208e-06, + "loss": 0.7991, + "step": 10089 + }, + { + "epoch": 1.44, + "grad_norm": 10.859507766079702, + "learning_rate": 1.9255563320932213e-06, + "loss": 0.7391, + "step": 10090 + }, + { + "epoch": 1.44, + "grad_norm": 8.43365595056486, + "learning_rate": 1.924645768073501e-06, + "loss": 0.7245, + "step": 10091 + }, + { + "epoch": 1.44, + "grad_norm": 9.915673487426861, + "learning_rate": 1.9237353680846168e-06, + "loss": 0.6878, + "step": 10092 + }, + { + "epoch": 1.44, + "grad_norm": 8.826631794285312, + "learning_rate": 1.9228251321751246e-06, + "loss": 0.7062, + "step": 10093 + }, + { + "epoch": 1.44, + "grad_norm": 9.179610139908897, + "learning_rate": 1.921915060393573e-06, + "loss": 0.7481, + "step": 10094 + }, + { + "epoch": 1.44, + "grad_norm": 10.568466560736798, + "learning_rate": 1.9210051527885063e-06, + "loss": 0.7973, + "step": 10095 + }, + { + "epoch": 1.44, + "grad_norm": 7.025367867886568, + "learning_rate": 1.9200954094084522e-06, + "loss": 0.7086, + "step": 10096 + }, + { + "epoch": 1.44, + "grad_norm": 6.192804603784425, + "learning_rate": 1.9191858303019366e-06, + "loss": 0.7405, + "step": 10097 + }, + { + "epoch": 1.44, + "grad_norm": 11.827187077162924, + "learning_rate": 1.9182764155174745e-06, + "loss": 0.7623, + "step": 10098 + }, + { + "epoch": 1.44, + "grad_norm": 11.56243123436488, + "learning_rate": 1.9173671651035704e-06, + "loss": 0.6679, + "step": 10099 + }, + { + "epoch": 1.44, + "grad_norm": 8.645605654812147, + "learning_rate": 1.916458079108721e-06, + "loss": 0.7032, + "step": 10100 + }, + { + "epoch": 1.44, + "grad_norm": 10.482977497371921, + "learning_rate": 1.915549157581413e-06, + "loss": 0.8004, + "step": 10101 + }, + { + "epoch": 1.44, + "grad_norm": 8.419004856140377, + "learning_rate": 1.9146404005701276e-06, + "loss": 0.756, + "step": 10102 + }, + { + "epoch": 1.44, + "grad_norm": 7.544439455062723, 
+ "learning_rate": 1.9137318081233362e-06, + "loss": 0.7502, + "step": 10103 + }, + { + "epoch": 1.44, + "grad_norm": 10.366963500770362, + "learning_rate": 1.9128233802895e-06, + "loss": 0.7514, + "step": 10104 + }, + { + "epoch": 1.44, + "grad_norm": 10.967121273486418, + "learning_rate": 1.9119151171170713e-06, + "loss": 0.7707, + "step": 10105 + }, + { + "epoch": 1.44, + "grad_norm": 10.586055935284252, + "learning_rate": 1.9110070186544933e-06, + "loss": 0.6913, + "step": 10106 + }, + { + "epoch": 1.44, + "grad_norm": 10.380650981584798, + "learning_rate": 1.910099084950202e-06, + "loss": 0.6772, + "step": 10107 + }, + { + "epoch": 1.44, + "grad_norm": 11.878328248800772, + "learning_rate": 1.9091913160526267e-06, + "loss": 0.6876, + "step": 10108 + }, + { + "epoch": 1.44, + "grad_norm": 8.183407293508063, + "learning_rate": 1.908283712010181e-06, + "loss": 0.6966, + "step": 10109 + }, + { + "epoch": 1.44, + "grad_norm": 9.030689116339785, + "learning_rate": 1.9073762728712785e-06, + "loss": 0.7086, + "step": 10110 + }, + { + "epoch": 1.44, + "grad_norm": 9.871098026950763, + "learning_rate": 1.9064689986843171e-06, + "loss": 0.7431, + "step": 10111 + }, + { + "epoch": 1.44, + "grad_norm": 11.360204630290097, + "learning_rate": 1.9055618894976869e-06, + "loss": 0.7433, + "step": 10112 + }, + { + "epoch": 1.44, + "grad_norm": 7.9916651633649884, + "learning_rate": 1.9046549453597735e-06, + "loss": 0.6608, + "step": 10113 + }, + { + "epoch": 1.44, + "grad_norm": 10.081034071096655, + "learning_rate": 1.9037481663189478e-06, + "loss": 0.7001, + "step": 10114 + }, + { + "epoch": 1.44, + "grad_norm": 10.55512087949143, + "learning_rate": 1.9028415524235783e-06, + "loss": 0.7698, + "step": 10115 + }, + { + "epoch": 1.44, + "grad_norm": 7.965621569465214, + "learning_rate": 1.9019351037220174e-06, + "loss": 0.6842, + "step": 10116 + }, + { + "epoch": 1.44, + "grad_norm": 11.446371551354474, + "learning_rate": 1.9010288202626165e-06, + "loss": 0.732, + "step": 10117 + 
}, + { + "epoch": 1.44, + "grad_norm": 7.327859277961924, + "learning_rate": 1.900122702093712e-06, + "loss": 0.7133, + "step": 10118 + }, + { + "epoch": 1.44, + "grad_norm": 8.849536466905937, + "learning_rate": 1.899216749263632e-06, + "loss": 0.7814, + "step": 10119 + }, + { + "epoch": 1.44, + "grad_norm": 7.765757217769672, + "learning_rate": 1.8983109618207018e-06, + "loss": 0.6717, + "step": 10120 + }, + { + "epoch": 1.44, + "grad_norm": 8.807139580038568, + "learning_rate": 1.8974053398132298e-06, + "loss": 0.7136, + "step": 10121 + }, + { + "epoch": 1.44, + "grad_norm": 8.35907207228691, + "learning_rate": 1.8964998832895204e-06, + "loss": 0.6525, + "step": 10122 + }, + { + "epoch": 1.44, + "grad_norm": 6.9996920510843355, + "learning_rate": 1.8955945922978703e-06, + "loss": 0.6911, + "step": 10123 + }, + { + "epoch": 1.44, + "grad_norm": 8.848824210305098, + "learning_rate": 1.8946894668865635e-06, + "loss": 0.7629, + "step": 10124 + }, + { + "epoch": 1.44, + "grad_norm": 8.132377045836888, + "learning_rate": 1.893784507103877e-06, + "loss": 0.711, + "step": 10125 + }, + { + "epoch": 1.44, + "grad_norm": 6.933717399874757, + "learning_rate": 1.8928797129980763e-06, + "loss": 0.7571, + "step": 10126 + }, + { + "epoch": 1.44, + "grad_norm": 7.253984517232828, + "learning_rate": 1.891975084617424e-06, + "loss": 0.6901, + "step": 10127 + }, + { + "epoch": 1.44, + "grad_norm": 10.52220993997512, + "learning_rate": 1.8910706220101705e-06, + "loss": 0.6873, + "step": 10128 + }, + { + "epoch": 1.44, + "grad_norm": 10.364664460522183, + "learning_rate": 1.8901663252245544e-06, + "loss": 0.7027, + "step": 10129 + }, + { + "epoch": 1.44, + "grad_norm": 7.602427712467556, + "learning_rate": 1.8892621943088125e-06, + "loss": 0.8063, + "step": 10130 + }, + { + "epoch": 1.44, + "grad_norm": 7.767998519848462, + "learning_rate": 1.8883582293111662e-06, + "loss": 0.7232, + "step": 10131 + }, + { + "epoch": 1.44, + "grad_norm": 10.639153452811199, + "learning_rate": 
1.8874544302798286e-06, + "loss": 0.7108, + "step": 10132 + }, + { + "epoch": 1.45, + "grad_norm": 7.731244152044305, + "learning_rate": 1.8865507972630099e-06, + "loss": 0.688, + "step": 10133 + }, + { + "epoch": 1.45, + "grad_norm": 9.23196505092645, + "learning_rate": 1.8856473303089035e-06, + "loss": 0.6568, + "step": 10134 + }, + { + "epoch": 1.45, + "grad_norm": 7.848763912440526, + "learning_rate": 1.8847440294656999e-06, + "loss": 0.7167, + "step": 10135 + }, + { + "epoch": 1.45, + "grad_norm": 10.17313834793678, + "learning_rate": 1.8838408947815801e-06, + "loss": 0.711, + "step": 10136 + }, + { + "epoch": 1.45, + "grad_norm": 9.403822634970862, + "learning_rate": 1.8829379263047126e-06, + "loss": 0.7738, + "step": 10137 + }, + { + "epoch": 1.45, + "grad_norm": 7.969827115142427, + "learning_rate": 1.8820351240832602e-06, + "loss": 0.669, + "step": 10138 + }, + { + "epoch": 1.45, + "grad_norm": 7.989807765235495, + "learning_rate": 1.881132488165373e-06, + "loss": 0.7307, + "step": 10139 + }, + { + "epoch": 1.45, + "grad_norm": 9.456459187823032, + "learning_rate": 1.880230018599198e-06, + "loss": 0.6846, + "step": 10140 + }, + { + "epoch": 1.45, + "grad_norm": 7.331180143540538, + "learning_rate": 1.8793277154328715e-06, + "loss": 0.6945, + "step": 10141 + }, + { + "epoch": 1.45, + "grad_norm": 10.373926931077058, + "learning_rate": 1.8784255787145167e-06, + "loss": 0.7119, + "step": 10142 + }, + { + "epoch": 1.45, + "grad_norm": 10.11151912864614, + "learning_rate": 1.8775236084922554e-06, + "loss": 0.7124, + "step": 10143 + }, + { + "epoch": 1.45, + "grad_norm": 12.690229172551714, + "learning_rate": 1.8766218048141904e-06, + "loss": 0.7043, + "step": 10144 + }, + { + "epoch": 1.45, + "grad_norm": 10.536025541172524, + "learning_rate": 1.8757201677284243e-06, + "loss": 0.7706, + "step": 10145 + }, + { + "epoch": 1.45, + "grad_norm": 11.807389921670813, + "learning_rate": 1.8748186972830496e-06, + "loss": 0.7601, + "step": 10146 + }, + { + "epoch": 1.45, 
+ "grad_norm": 7.414725930968538, + "learning_rate": 1.8739173935261445e-06, + "loss": 0.7092, + "step": 10147 + }, + { + "epoch": 1.45, + "grad_norm": 8.757184035246942, + "learning_rate": 1.8730162565057862e-06, + "loss": 0.708, + "step": 10148 + }, + { + "epoch": 1.45, + "grad_norm": 8.730463135200653, + "learning_rate": 1.872115286270037e-06, + "loss": 0.748, + "step": 10149 + }, + { + "epoch": 1.45, + "grad_norm": 8.015637826585419, + "learning_rate": 1.8712144828669499e-06, + "loss": 0.7259, + "step": 10150 + }, + { + "epoch": 1.45, + "grad_norm": 9.916137868852978, + "learning_rate": 1.8703138463445753e-06, + "loss": 0.6235, + "step": 10151 + }, + { + "epoch": 1.45, + "grad_norm": 9.587675565891047, + "learning_rate": 1.8694133767509465e-06, + "loss": 0.7021, + "step": 10152 + }, + { + "epoch": 1.45, + "grad_norm": 8.329706272613777, + "learning_rate": 1.8685130741340957e-06, + "loss": 0.6977, + "step": 10153 + }, + { + "epoch": 1.45, + "grad_norm": 8.525867294618267, + "learning_rate": 1.8676129385420393e-06, + "loss": 0.7239, + "step": 10154 + }, + { + "epoch": 1.45, + "grad_norm": 6.721039140236087, + "learning_rate": 1.866712970022792e-06, + "loss": 0.7101, + "step": 10155 + }, + { + "epoch": 1.45, + "grad_norm": 8.365673779846244, + "learning_rate": 1.865813168624353e-06, + "loss": 0.7674, + "step": 10156 + }, + { + "epoch": 1.45, + "grad_norm": 10.160842209808536, + "learning_rate": 1.8649135343947134e-06, + "loss": 0.7852, + "step": 10157 + }, + { + "epoch": 1.45, + "grad_norm": 9.846925066981743, + "learning_rate": 1.8640140673818613e-06, + "loss": 0.7191, + "step": 10158 + }, + { + "epoch": 1.45, + "grad_norm": 11.69584085023398, + "learning_rate": 1.8631147676337686e-06, + "loss": 0.734, + "step": 10159 + }, + { + "epoch": 1.45, + "grad_norm": 9.140312443659836, + "learning_rate": 1.8622156351984016e-06, + "loss": 0.7162, + "step": 10160 + }, + { + "epoch": 1.45, + "grad_norm": 9.32320532103343, + "learning_rate": 1.8613166701237212e-06, + "loss": 
0.7611, + "step": 10161 + }, + { + "epoch": 1.45, + "grad_norm": 9.431851116509735, + "learning_rate": 1.8604178724576727e-06, + "loss": 0.7937, + "step": 10162 + }, + { + "epoch": 1.45, + "grad_norm": 8.601181918759004, + "learning_rate": 1.8595192422481956e-06, + "loss": 0.8209, + "step": 10163 + }, + { + "epoch": 1.45, + "grad_norm": 7.774624941210122, + "learning_rate": 1.8586207795432187e-06, + "loss": 0.7212, + "step": 10164 + }, + { + "epoch": 1.45, + "grad_norm": 6.872580121364043, + "learning_rate": 1.8577224843906654e-06, + "loss": 0.6889, + "step": 10165 + }, + { + "epoch": 1.45, + "grad_norm": 12.04196813430701, + "learning_rate": 1.8568243568384504e-06, + "loss": 0.7466, + "step": 10166 + }, + { + "epoch": 1.45, + "grad_norm": 10.423482952360937, + "learning_rate": 1.8559263969344727e-06, + "loss": 0.6423, + "step": 10167 + }, + { + "epoch": 1.45, + "grad_norm": 5.763673060518287, + "learning_rate": 1.855028604726632e-06, + "loss": 0.7073, + "step": 10168 + }, + { + "epoch": 1.45, + "grad_norm": 8.736428215283194, + "learning_rate": 1.8541309802628105e-06, + "loss": 0.7542, + "step": 10169 + }, + { + "epoch": 1.45, + "grad_norm": 10.275855743391613, + "learning_rate": 1.853233523590885e-06, + "loss": 0.7728, + "step": 10170 + }, + { + "epoch": 1.45, + "grad_norm": 7.624913381867235, + "learning_rate": 1.8523362347587253e-06, + "loss": 0.7352, + "step": 10171 + }, + { + "epoch": 1.45, + "grad_norm": 10.12711730531842, + "learning_rate": 1.851439113814188e-06, + "loss": 0.6801, + "step": 10172 + }, + { + "epoch": 1.45, + "grad_norm": 9.394498326517494, + "learning_rate": 1.8505421608051238e-06, + "loss": 0.664, + "step": 10173 + }, + { + "epoch": 1.45, + "grad_norm": 8.762419808818336, + "learning_rate": 1.8496453757793764e-06, + "loss": 0.6955, + "step": 10174 + }, + { + "epoch": 1.45, + "grad_norm": 8.03065557092096, + "learning_rate": 1.8487487587847752e-06, + "loss": 0.7772, + "step": 10175 + }, + { + "epoch": 1.45, + "grad_norm": 10.847921798159426, 
+ "learning_rate": 1.8478523098691441e-06, + "loss": 0.6733, + "step": 10176 + }, + { + "epoch": 1.45, + "grad_norm": 12.403042477769262, + "learning_rate": 1.8469560290802946e-06, + "loss": 0.732, + "step": 10177 + }, + { + "epoch": 1.45, + "grad_norm": 7.6359153359703065, + "learning_rate": 1.8460599164660342e-06, + "loss": 0.7663, + "step": 10178 + }, + { + "epoch": 1.45, + "grad_norm": 8.647482075388064, + "learning_rate": 1.8451639720741604e-06, + "loss": 0.7353, + "step": 10179 + }, + { + "epoch": 1.45, + "grad_norm": 11.42187454060188, + "learning_rate": 1.8442681959524572e-06, + "loss": 0.666, + "step": 10180 + }, + { + "epoch": 1.45, + "grad_norm": 8.87141931376874, + "learning_rate": 1.8433725881487058e-06, + "loss": 0.7175, + "step": 10181 + }, + { + "epoch": 1.45, + "grad_norm": 9.592607825447782, + "learning_rate": 1.842477148710674e-06, + "loss": 0.6757, + "step": 10182 + }, + { + "epoch": 1.45, + "grad_norm": 10.89204604244309, + "learning_rate": 1.8415818776861205e-06, + "loss": 0.7374, + "step": 10183 + }, + { + "epoch": 1.45, + "grad_norm": 10.907973732575787, + "learning_rate": 1.8406867751228003e-06, + "loss": 0.7326, + "step": 10184 + }, + { + "epoch": 1.45, + "grad_norm": 6.774837138918958, + "learning_rate": 1.8397918410684512e-06, + "loss": 0.7666, + "step": 10185 + }, + { + "epoch": 1.45, + "grad_norm": 8.214832215416115, + "learning_rate": 1.8388970755708108e-06, + "loss": 0.7343, + "step": 10186 + }, + { + "epoch": 1.45, + "grad_norm": 7.88678160224227, + "learning_rate": 1.8380024786776013e-06, + "loss": 0.7075, + "step": 10187 + }, + { + "epoch": 1.45, + "grad_norm": 8.753516188474402, + "learning_rate": 1.8371080504365364e-06, + "loss": 0.75, + "step": 10188 + }, + { + "epoch": 1.45, + "grad_norm": 7.724846184466848, + "learning_rate": 1.8362137908953258e-06, + "loss": 0.6772, + "step": 10189 + }, + { + "epoch": 1.45, + "grad_norm": 11.211492369951298, + "learning_rate": 1.8353197001016631e-06, + "loss": 0.7551, + "step": 10190 + }, + 
{ + "epoch": 1.45, + "grad_norm": 8.981102783071627, + "learning_rate": 1.834425778103241e-06, + "loss": 0.7171, + "step": 10191 + }, + { + "epoch": 1.45, + "grad_norm": 7.199576776860094, + "learning_rate": 1.8335320249477346e-06, + "loss": 0.7366, + "step": 10192 + }, + { + "epoch": 1.45, + "grad_norm": 8.298303537017194, + "learning_rate": 1.8326384406828173e-06, + "loss": 0.7311, + "step": 10193 + }, + { + "epoch": 1.45, + "grad_norm": 8.112904589354663, + "learning_rate": 1.8317450253561498e-06, + "loss": 0.7033, + "step": 10194 + }, + { + "epoch": 1.45, + "grad_norm": 10.31605324914565, + "learning_rate": 1.8308517790153817e-06, + "loss": 0.707, + "step": 10195 + }, + { + "epoch": 1.45, + "grad_norm": 7.774890312024882, + "learning_rate": 1.8299587017081598e-06, + "loss": 0.7491, + "step": 10196 + }, + { + "epoch": 1.45, + "grad_norm": 7.787316210820329, + "learning_rate": 1.8290657934821154e-06, + "loss": 0.6605, + "step": 10197 + }, + { + "epoch": 1.45, + "grad_norm": 6.697637036873376, + "learning_rate": 1.8281730543848753e-06, + "loss": 0.7435, + "step": 10198 + }, + { + "epoch": 1.45, + "grad_norm": 9.849709620561093, + "learning_rate": 1.8272804844640574e-06, + "loss": 0.7365, + "step": 10199 + }, + { + "epoch": 1.45, + "grad_norm": 10.73038355038761, + "learning_rate": 1.826388083767267e-06, + "loss": 0.7572, + "step": 10200 + }, + { + "epoch": 1.45, + "grad_norm": 7.77958992873557, + "learning_rate": 1.8254958523421023e-06, + "loss": 0.6947, + "step": 10201 + }, + { + "epoch": 1.45, + "grad_norm": 10.062918771437717, + "learning_rate": 1.8246037902361514e-06, + "loss": 0.7208, + "step": 10202 + }, + { + "epoch": 1.46, + "grad_norm": 12.750420688394767, + "learning_rate": 1.8237118974969947e-06, + "loss": 0.7586, + "step": 10203 + }, + { + "epoch": 1.46, + "grad_norm": 9.092803888105543, + "learning_rate": 1.8228201741722067e-06, + "loss": 0.7232, + "step": 10204 + }, + { + "epoch": 1.46, + "grad_norm": 7.873310326382482, + "learning_rate": 
1.8219286203093444e-06, + "loss": 0.7596, + "step": 10205 + }, + { + "epoch": 1.46, + "grad_norm": 9.054271944591282, + "learning_rate": 1.8210372359559653e-06, + "loss": 0.7352, + "step": 10206 + }, + { + "epoch": 1.46, + "grad_norm": 8.707650923248341, + "learning_rate": 1.8201460211596116e-06, + "loss": 0.7208, + "step": 10207 + }, + { + "epoch": 1.46, + "grad_norm": 11.411814334927174, + "learning_rate": 1.8192549759678158e-06, + "loss": 0.7346, + "step": 10208 + }, + { + "epoch": 1.46, + "grad_norm": 10.025072530842916, + "learning_rate": 1.8183641004281076e-06, + "loss": 0.7225, + "step": 10209 + }, + { + "epoch": 1.46, + "grad_norm": 10.5054099399215, + "learning_rate": 1.8174733945880008e-06, + "loss": 0.6791, + "step": 10210 + }, + { + "epoch": 1.46, + "grad_norm": 9.053881532879023, + "learning_rate": 1.8165828584950062e-06, + "loss": 0.7297, + "step": 10211 + }, + { + "epoch": 1.46, + "grad_norm": 10.62612077646856, + "learning_rate": 1.8156924921966191e-06, + "loss": 0.7454, + "step": 10212 + }, + { + "epoch": 1.46, + "grad_norm": 10.18568377837126, + "learning_rate": 1.8148022957403321e-06, + "loss": 0.7259, + "step": 10213 + }, + { + "epoch": 1.46, + "grad_norm": 9.175558783306625, + "learning_rate": 1.8139122691736249e-06, + "loss": 0.758, + "step": 10214 + }, + { + "epoch": 1.46, + "grad_norm": 10.91106504035204, + "learning_rate": 1.8130224125439671e-06, + "loss": 0.6909, + "step": 10215 + }, + { + "epoch": 1.46, + "grad_norm": 8.069569395114673, + "learning_rate": 1.812132725898823e-06, + "loss": 0.7383, + "step": 10216 + }, + { + "epoch": 1.46, + "grad_norm": 6.038482996812674, + "learning_rate": 1.8112432092856475e-06, + "loss": 0.7027, + "step": 10217 + }, + { + "epoch": 1.46, + "grad_norm": 8.826372161507063, + "learning_rate": 1.8103538627518813e-06, + "loss": 0.6772, + "step": 10218 + }, + { + "epoch": 1.46, + "grad_norm": 6.975308603657583, + "learning_rate": 1.8094646863449633e-06, + "loss": 0.6962, + "step": 10219 + }, + { + "epoch": 
1.46, + "grad_norm": 8.165004241407956, + "learning_rate": 1.8085756801123179e-06, + "loss": 0.7256, + "step": 10220 + }, + { + "epoch": 1.46, + "grad_norm": 9.543789657771883, + "learning_rate": 1.8076868441013606e-06, + "loss": 0.6754, + "step": 10221 + }, + { + "epoch": 1.46, + "grad_norm": 8.844060145395732, + "learning_rate": 1.8067981783595034e-06, + "loss": 0.7471, + "step": 10222 + }, + { + "epoch": 1.46, + "grad_norm": 11.306479288430985, + "learning_rate": 1.805909682934141e-06, + "loss": 0.7179, + "step": 10223 + }, + { + "epoch": 1.46, + "grad_norm": 8.215053731872315, + "learning_rate": 1.8050213578726673e-06, + "loss": 0.7331, + "step": 10224 + }, + { + "epoch": 1.46, + "grad_norm": 7.744875399359473, + "learning_rate": 1.804133203222459e-06, + "loss": 0.7584, + "step": 10225 + }, + { + "epoch": 1.46, + "grad_norm": 12.165995925993434, + "learning_rate": 1.8032452190308913e-06, + "loss": 0.6973, + "step": 10226 + }, + { + "epoch": 1.46, + "grad_norm": 8.76326239626247, + "learning_rate": 1.8023574053453259e-06, + "loss": 0.7227, + "step": 10227 + }, + { + "epoch": 1.46, + "grad_norm": 12.06484864898554, + "learning_rate": 1.801469762213114e-06, + "loss": 0.7702, + "step": 10228 + }, + { + "epoch": 1.46, + "grad_norm": 10.486254203928263, + "learning_rate": 1.8005822896816034e-06, + "loss": 0.7334, + "step": 10229 + }, + { + "epoch": 1.46, + "grad_norm": 7.722079751864297, + "learning_rate": 1.7996949877981261e-06, + "loss": 0.7556, + "step": 10230 + }, + { + "epoch": 1.46, + "grad_norm": 7.36617313629217, + "learning_rate": 1.7988078566100104e-06, + "loss": 0.7178, + "step": 10231 + }, + { + "epoch": 1.46, + "grad_norm": 7.9911555233438065, + "learning_rate": 1.7979208961645761e-06, + "loss": 0.6962, + "step": 10232 + }, + { + "epoch": 1.46, + "grad_norm": 9.541285077840604, + "learning_rate": 1.7970341065091246e-06, + "loss": 0.7396, + "step": 10233 + }, + { + "epoch": 1.46, + "grad_norm": 9.862814452466717, + "learning_rate": 1.7961474876909606e-06, 
+ "loss": 0.7039, + "step": 10234 + }, + { + "epoch": 1.46, + "grad_norm": 12.824066858832035, + "learning_rate": 1.7952610397573694e-06, + "loss": 0.7559, + "step": 10235 + }, + { + "epoch": 1.46, + "grad_norm": 7.470689246353335, + "learning_rate": 1.7943747627556341e-06, + "loss": 0.7762, + "step": 10236 + }, + { + "epoch": 1.46, + "grad_norm": 8.888795015738742, + "learning_rate": 1.7934886567330278e-06, + "loss": 0.75, + "step": 10237 + }, + { + "epoch": 1.46, + "grad_norm": 10.795146275560143, + "learning_rate": 1.7926027217368113e-06, + "loss": 0.6015, + "step": 10238 + }, + { + "epoch": 1.46, + "grad_norm": 10.865855513274925, + "learning_rate": 1.7917169578142375e-06, + "loss": 0.76, + "step": 10239 + }, + { + "epoch": 1.46, + "grad_norm": 7.684654979783849, + "learning_rate": 1.7908313650125492e-06, + "loss": 0.682, + "step": 10240 + }, + { + "epoch": 1.46, + "grad_norm": 12.01128098428654, + "learning_rate": 1.7899459433789828e-06, + "loss": 0.7627, + "step": 10241 + }, + { + "epoch": 1.46, + "grad_norm": 8.162989448411672, + "learning_rate": 1.7890606929607667e-06, + "loss": 0.7266, + "step": 10242 + }, + { + "epoch": 1.46, + "grad_norm": 9.601071919949868, + "learning_rate": 1.7881756138051137e-06, + "loss": 0.691, + "step": 10243 + }, + { + "epoch": 1.46, + "grad_norm": 8.764897893733382, + "learning_rate": 1.7872907059592355e-06, + "loss": 0.6828, + "step": 10244 + }, + { + "epoch": 1.46, + "grad_norm": 41.24971907578253, + "learning_rate": 1.7864059694703272e-06, + "loss": 0.9444, + "step": 10245 + }, + { + "epoch": 1.46, + "grad_norm": 11.521362888731286, + "learning_rate": 1.7855214043855783e-06, + "loss": 0.6503, + "step": 10246 + }, + { + "epoch": 1.46, + "grad_norm": 11.845463070526437, + "learning_rate": 1.784637010752172e-06, + "loss": 0.6983, + "step": 10247 + }, + { + "epoch": 1.46, + "grad_norm": 11.21632193575307, + "learning_rate": 1.7837527886172761e-06, + "loss": 0.6859, + "step": 10248 + }, + { + "epoch": 1.46, + "grad_norm": 
11.947414909366604, + "learning_rate": 1.7828687380280552e-06, + "loss": 0.7001, + "step": 10249 + }, + { + "epoch": 1.46, + "grad_norm": 8.661688552188986, + "learning_rate": 1.781984859031659e-06, + "loss": 0.7097, + "step": 10250 + }, + { + "epoch": 1.46, + "grad_norm": 10.746462258302772, + "learning_rate": 1.7811011516752352e-06, + "loss": 0.6799, + "step": 10251 + }, + { + "epoch": 1.46, + "grad_norm": 11.76363936434726, + "learning_rate": 1.780217616005916e-06, + "loss": 0.6827, + "step": 10252 + }, + { + "epoch": 1.46, + "grad_norm": 10.389737078332663, + "learning_rate": 1.779334252070825e-06, + "loss": 0.7865, + "step": 10253 + }, + { + "epoch": 1.46, + "grad_norm": 9.543956508243726, + "learning_rate": 1.7784510599170817e-06, + "loss": 0.703, + "step": 10254 + }, + { + "epoch": 1.46, + "grad_norm": 9.869754325910527, + "learning_rate": 1.7775680395917905e-06, + "loss": 0.7137, + "step": 10255 + }, + { + "epoch": 1.46, + "grad_norm": 10.876227590229531, + "learning_rate": 1.7766851911420496e-06, + "loss": 0.7094, + "step": 10256 + }, + { + "epoch": 1.46, + "grad_norm": 9.491798039989256, + "learning_rate": 1.775802514614951e-06, + "loss": 0.6792, + "step": 10257 + }, + { + "epoch": 1.46, + "grad_norm": 9.728330198424262, + "learning_rate": 1.7749200100575708e-06, + "loss": 0.729, + "step": 10258 + }, + { + "epoch": 1.46, + "grad_norm": 7.700384919207882, + "learning_rate": 1.7740376775169792e-06, + "loss": 0.7293, + "step": 10259 + }, + { + "epoch": 1.46, + "grad_norm": 8.399906905016465, + "learning_rate": 1.77315551704024e-06, + "loss": 0.7468, + "step": 10260 + }, + { + "epoch": 1.46, + "grad_norm": 9.989258455821087, + "learning_rate": 1.7722735286744013e-06, + "loss": 0.7446, + "step": 10261 + }, + { + "epoch": 1.46, + "grad_norm": 8.478776288663813, + "learning_rate": 1.7713917124665103e-06, + "loss": 0.7, + "step": 10262 + }, + { + "epoch": 1.46, + "grad_norm": 9.144143949167672, + "learning_rate": 1.770510068463596e-06, + "loss": 0.7839, + "step": 
10263 + }, + { + "epoch": 1.46, + "grad_norm": 10.70455235410555, + "learning_rate": 1.769628596712688e-06, + "loss": 0.7041, + "step": 10264 + }, + { + "epoch": 1.46, + "grad_norm": 11.516222957550191, + "learning_rate": 1.7687472972607983e-06, + "loss": 0.7207, + "step": 10265 + }, + { + "epoch": 1.46, + "grad_norm": 10.549974706037087, + "learning_rate": 1.7678661701549322e-06, + "loss": 0.711, + "step": 10266 + }, + { + "epoch": 1.46, + "grad_norm": 11.938424937116132, + "learning_rate": 1.7669852154420892e-06, + "loss": 0.7265, + "step": 10267 + }, + { + "epoch": 1.46, + "grad_norm": 7.326658174396286, + "learning_rate": 1.766104433169254e-06, + "loss": 0.6893, + "step": 10268 + }, + { + "epoch": 1.46, + "grad_norm": 11.120102936594522, + "learning_rate": 1.7652238233834069e-06, + "loss": 0.7411, + "step": 10269 + }, + { + "epoch": 1.46, + "grad_norm": 9.590899379807976, + "learning_rate": 1.7643433861315184e-06, + "loss": 0.7184, + "step": 10270 + }, + { + "epoch": 1.46, + "grad_norm": 9.99465821355083, + "learning_rate": 1.7634631214605474e-06, + "loss": 0.7574, + "step": 10271 + }, + { + "epoch": 1.46, + "grad_norm": 10.159488673411476, + "learning_rate": 1.7625830294174452e-06, + "loss": 0.7124, + "step": 10272 + }, + { + "epoch": 1.47, + "grad_norm": 11.426501987697979, + "learning_rate": 1.76170311004915e-06, + "loss": 0.7469, + "step": 10273 + }, + { + "epoch": 1.47, + "grad_norm": 9.72590615074022, + "learning_rate": 1.7608233634025974e-06, + "loss": 0.67, + "step": 10274 + }, + { + "epoch": 1.47, + "grad_norm": 6.818172786637211, + "learning_rate": 1.7599437895247123e-06, + "loss": 0.8009, + "step": 10275 + }, + { + "epoch": 1.47, + "grad_norm": 11.338992017973908, + "learning_rate": 1.7590643884624065e-06, + "loss": 0.7323, + "step": 10276 + }, + { + "epoch": 1.47, + "grad_norm": 10.790344664661681, + "learning_rate": 1.758185160262585e-06, + "loss": 0.6545, + "step": 10277 + }, + { + "epoch": 1.47, + "grad_norm": 7.97466057300658, + "learning_rate": 
1.7573061049721418e-06, + "loss": 0.6919, + "step": 10278 + }, + { + "epoch": 1.47, + "grad_norm": 6.552698226637488, + "learning_rate": 1.7564272226379648e-06, + "loss": 0.7049, + "step": 10279 + }, + { + "epoch": 1.47, + "grad_norm": 6.495617294071356, + "learning_rate": 1.7555485133069328e-06, + "loss": 0.7933, + "step": 10280 + }, + { + "epoch": 1.47, + "grad_norm": 10.898567312919793, + "learning_rate": 1.75466997702591e-06, + "loss": 0.7238, + "step": 10281 + }, + { + "epoch": 1.47, + "grad_norm": 8.735784918505178, + "learning_rate": 1.7537916138417593e-06, + "loss": 0.7623, + "step": 10282 + }, + { + "epoch": 1.47, + "grad_norm": 8.701621852443628, + "learning_rate": 1.7529134238013284e-06, + "loss": 0.7495, + "step": 10283 + }, + { + "epoch": 1.47, + "grad_norm": 9.70157432048399, + "learning_rate": 1.752035406951455e-06, + "loss": 0.7356, + "step": 10284 + }, + { + "epoch": 1.47, + "grad_norm": 8.887992097481014, + "learning_rate": 1.7511575633389743e-06, + "loss": 0.6766, + "step": 10285 + }, + { + "epoch": 1.47, + "grad_norm": 8.666356755200278, + "learning_rate": 1.750279893010704e-06, + "loss": 0.7423, + "step": 10286 + }, + { + "epoch": 1.47, + "grad_norm": 9.672715940113212, + "learning_rate": 1.74940239601346e-06, + "loss": 0.6778, + "step": 10287 + }, + { + "epoch": 1.47, + "grad_norm": 7.563030811116717, + "learning_rate": 1.7485250723940429e-06, + "loss": 0.647, + "step": 10288 + }, + { + "epoch": 1.47, + "grad_norm": 9.278371791284746, + "learning_rate": 1.7476479221992492e-06, + "loss": 0.6821, + "step": 10289 + }, + { + "epoch": 1.47, + "grad_norm": 8.63002809312353, + "learning_rate": 1.7467709454758624e-06, + "loss": 0.6977, + "step": 10290 + }, + { + "epoch": 1.47, + "grad_norm": 13.0229532094559, + "learning_rate": 1.7458941422706565e-06, + "loss": 0.6919, + "step": 10291 + }, + { + "epoch": 1.47, + "grad_norm": 8.527060552394502, + "learning_rate": 1.7450175126304003e-06, + "loss": 0.7168, + "step": 10292 + }, + { + "epoch": 1.47, + 
"grad_norm": 11.523668637159961, + "learning_rate": 1.7441410566018485e-06, + "loss": 0.6647, + "step": 10293 + }, + { + "epoch": 1.47, + "grad_norm": 8.877597404440063, + "learning_rate": 1.74326477423175e-06, + "loss": 0.8305, + "step": 10294 + }, + { + "epoch": 1.47, + "grad_norm": 9.68376453211504, + "learning_rate": 1.742388665566845e-06, + "loss": 0.7085, + "step": 10295 + }, + { + "epoch": 1.47, + "grad_norm": 13.127540505199075, + "learning_rate": 1.741512730653861e-06, + "loss": 0.7488, + "step": 10296 + }, + { + "epoch": 1.47, + "grad_norm": 9.584934913730613, + "learning_rate": 1.7406369695395175e-06, + "loss": 0.8003, + "step": 10297 + }, + { + "epoch": 1.47, + "grad_norm": 9.900414994915192, + "learning_rate": 1.7397613822705238e-06, + "loss": 0.6964, + "step": 10298 + }, + { + "epoch": 1.47, + "grad_norm": 10.944542600515256, + "learning_rate": 1.7388859688935832e-06, + "loss": 0.7064, + "step": 10299 + }, + { + "epoch": 1.47, + "grad_norm": 8.919242323536334, + "learning_rate": 1.7380107294553895e-06, + "loss": 0.6775, + "step": 10300 + }, + { + "epoch": 1.47, + "grad_norm": 9.000530473912306, + "learning_rate": 1.737135664002622e-06, + "loss": 0.6897, + "step": 10301 + }, + { + "epoch": 1.47, + "grad_norm": 10.260911533245626, + "learning_rate": 1.7362607725819574e-06, + "loss": 0.7311, + "step": 10302 + }, + { + "epoch": 1.47, + "grad_norm": 8.440534134383743, + "learning_rate": 1.7353860552400587e-06, + "loss": 0.6914, + "step": 10303 + }, + { + "epoch": 1.47, + "grad_norm": 11.206082410736531, + "learning_rate": 1.734511512023579e-06, + "loss": 0.6829, + "step": 10304 + }, + { + "epoch": 1.47, + "grad_norm": 9.125520487628032, + "learning_rate": 1.733637142979167e-06, + "loss": 0.7666, + "step": 10305 + }, + { + "epoch": 1.47, + "grad_norm": 10.443768694062268, + "learning_rate": 1.732762948153457e-06, + "loss": 0.6507, + "step": 10306 + }, + { + "epoch": 1.47, + "grad_norm": 12.863266077504736, + "learning_rate": 1.7318889275930762e-06, + 
"loss": 0.6693, + "step": 10307 + }, + { + "epoch": 1.47, + "grad_norm": 10.287231930546481, + "learning_rate": 1.7310150813446452e-06, + "loss": 0.7623, + "step": 10308 + }, + { + "epoch": 1.47, + "grad_norm": 9.77129024315608, + "learning_rate": 1.7301414094547702e-06, + "loss": 0.7213, + "step": 10309 + }, + { + "epoch": 1.47, + "grad_norm": 8.188148433735169, + "learning_rate": 1.7292679119700506e-06, + "loss": 0.6961, + "step": 10310 + }, + { + "epoch": 1.47, + "grad_norm": 7.340561362582836, + "learning_rate": 1.7283945889370752e-06, + "loss": 0.7022, + "step": 10311 + }, + { + "epoch": 1.47, + "grad_norm": 6.851432572442761, + "learning_rate": 1.7275214404024254e-06, + "loss": 0.6703, + "step": 10312 + }, + { + "epoch": 1.47, + "grad_norm": 10.490922721004221, + "learning_rate": 1.7266484664126748e-06, + "loss": 0.6798, + "step": 10313 + }, + { + "epoch": 1.47, + "grad_norm": 7.702030090861966, + "learning_rate": 1.7257756670143822e-06, + "loss": 0.7768, + "step": 10314 + }, + { + "epoch": 1.47, + "grad_norm": 9.648986408378969, + "learning_rate": 1.7249030422541046e-06, + "loss": 0.669, + "step": 10315 + }, + { + "epoch": 1.47, + "grad_norm": 7.275468576352901, + "learning_rate": 1.724030592178379e-06, + "loss": 0.7144, + "step": 10316 + }, + { + "epoch": 1.47, + "grad_norm": 8.567772914496842, + "learning_rate": 1.7231583168337435e-06, + "loss": 0.7067, + "step": 10317 + }, + { + "epoch": 1.47, + "grad_norm": 7.982254344915568, + "learning_rate": 1.722286216266723e-06, + "loss": 0.7189, + "step": 10318 + }, + { + "epoch": 1.47, + "grad_norm": 12.174690240216078, + "learning_rate": 1.7214142905238313e-06, + "loss": 0.6393, + "step": 10319 + }, + { + "epoch": 1.47, + "grad_norm": 7.679723710888404, + "learning_rate": 1.7205425396515768e-06, + "loss": 0.7282, + "step": 10320 + }, + { + "epoch": 1.47, + "grad_norm": 11.574574745159914, + "learning_rate": 1.7196709636964549e-06, + "loss": 0.6406, + "step": 10321 + }, + { + "epoch": 1.47, + "grad_norm": 
10.106329197745984, + "learning_rate": 1.7187995627049509e-06, + "loss": 0.6689, + "step": 10322 + }, + { + "epoch": 1.47, + "grad_norm": 11.588186013033981, + "learning_rate": 1.7179283367235467e-06, + "loss": 0.7928, + "step": 10323 + }, + { + "epoch": 1.47, + "grad_norm": 8.024824528618618, + "learning_rate": 1.7170572857987078e-06, + "loss": 0.6943, + "step": 10324 + }, + { + "epoch": 1.47, + "grad_norm": 9.892418772257994, + "learning_rate": 1.7161864099768966e-06, + "loss": 0.7575, + "step": 10325 + }, + { + "epoch": 1.47, + "grad_norm": 9.014744585904163, + "learning_rate": 1.7153157093045603e-06, + "loss": 0.7082, + "step": 10326 + }, + { + "epoch": 1.47, + "grad_norm": 8.765189098687804, + "learning_rate": 1.7144451838281428e-06, + "loss": 0.6945, + "step": 10327 + }, + { + "epoch": 1.47, + "grad_norm": 7.376868888369238, + "learning_rate": 1.7135748335940732e-06, + "loss": 0.7219, + "step": 10328 + }, + { + "epoch": 1.47, + "grad_norm": 10.034678943243657, + "learning_rate": 1.7127046586487733e-06, + "loss": 0.7551, + "step": 10329 + }, + { + "epoch": 1.47, + "grad_norm": 8.792977009654827, + "learning_rate": 1.711834659038658e-06, + "loss": 0.7163, + "step": 10330 + }, + { + "epoch": 1.47, + "grad_norm": 10.324823449884066, + "learning_rate": 1.7109648348101276e-06, + "loss": 0.7494, + "step": 10331 + }, + { + "epoch": 1.47, + "grad_norm": 14.137355274501708, + "learning_rate": 1.7100951860095777e-06, + "loss": 0.702, + "step": 10332 + }, + { + "epoch": 1.47, + "grad_norm": 8.876821858300545, + "learning_rate": 1.709225712683395e-06, + "loss": 0.5941, + "step": 10333 + }, + { + "epoch": 1.47, + "grad_norm": 9.738717734075266, + "learning_rate": 1.7083564148779525e-06, + "loss": 0.701, + "step": 10334 + }, + { + "epoch": 1.47, + "grad_norm": 9.620027736609991, + "learning_rate": 1.7074872926396168e-06, + "loss": 0.7387, + "step": 10335 + }, + { + "epoch": 1.47, + "grad_norm": 6.9546510321704025, + "learning_rate": 1.7066183460147423e-06, + "loss": 0.7686, 
+ "step": 10336 + }, + { + "epoch": 1.47, + "grad_norm": 7.7601151785127405, + "learning_rate": 1.7057495750496778e-06, + "loss": 0.734, + "step": 10337 + }, + { + "epoch": 1.47, + "grad_norm": 9.212199740510439, + "learning_rate": 1.7048809797907634e-06, + "loss": 0.7349, + "step": 10338 + }, + { + "epoch": 1.47, + "grad_norm": 8.241415007843274, + "learning_rate": 1.704012560284324e-06, + "loss": 0.7544, + "step": 10339 + }, + { + "epoch": 1.47, + "grad_norm": 10.349791307940423, + "learning_rate": 1.7031443165766815e-06, + "loss": 0.688, + "step": 10340 + }, + { + "epoch": 1.47, + "grad_norm": 11.03219305847622, + "learning_rate": 1.7022762487141443e-06, + "loss": 0.6938, + "step": 10341 + }, + { + "epoch": 1.47, + "grad_norm": 8.974240211501016, + "learning_rate": 1.7014083567430111e-06, + "loss": 0.6819, + "step": 10342 + }, + { + "epoch": 1.48, + "grad_norm": 8.459317051335763, + "learning_rate": 1.7005406407095765e-06, + "loss": 0.7258, + "step": 10343 + }, + { + "epoch": 1.48, + "grad_norm": 6.463710476479418, + "learning_rate": 1.699673100660118e-06, + "loss": 0.7867, + "step": 10344 + }, + { + "epoch": 1.48, + "grad_norm": 13.67635498704899, + "learning_rate": 1.6988057366409094e-06, + "loss": 0.7274, + "step": 10345 + }, + { + "epoch": 1.48, + "grad_norm": 10.657943817376266, + "learning_rate": 1.6979385486982164e-06, + "loss": 0.7953, + "step": 10346 + }, + { + "epoch": 1.48, + "grad_norm": 9.406281805354734, + "learning_rate": 1.6970715368782892e-06, + "loss": 0.7581, + "step": 10347 + }, + { + "epoch": 1.48, + "grad_norm": 10.660836812620325, + "learning_rate": 1.6962047012273724e-06, + "loss": 0.6746, + "step": 10348 + }, + { + "epoch": 1.48, + "grad_norm": 9.686861984752309, + "learning_rate": 1.6953380417916998e-06, + "loss": 0.7153, + "step": 10349 + }, + { + "epoch": 1.48, + "grad_norm": 6.068460652431607, + "learning_rate": 1.694471558617497e-06, + "loss": 0.7655, + "step": 10350 + }, + { + "epoch": 1.48, + "grad_norm": 8.807415013533845, + 
"learning_rate": 1.6936052517509816e-06, + "loss": 0.7357, + "step": 10351 + }, + { + "epoch": 1.48, + "grad_norm": 7.8599262865506425, + "learning_rate": 1.692739121238357e-06, + "loss": 0.759, + "step": 10352 + }, + { + "epoch": 1.48, + "grad_norm": 8.883682279675197, + "learning_rate": 1.6918731671258242e-06, + "loss": 0.7027, + "step": 10353 + }, + { + "epoch": 1.48, + "grad_norm": 9.644155126831762, + "learning_rate": 1.691007389459568e-06, + "loss": 0.6683, + "step": 10354 + }, + { + "epoch": 1.48, + "grad_norm": 9.749423347733577, + "learning_rate": 1.690141788285765e-06, + "loss": 0.7308, + "step": 10355 + }, + { + "epoch": 1.48, + "grad_norm": 11.46901808545949, + "learning_rate": 1.6892763636505882e-06, + "loss": 0.6781, + "step": 10356 + }, + { + "epoch": 1.48, + "grad_norm": 9.17581124908413, + "learning_rate": 1.6884111156001925e-06, + "loss": 0.7042, + "step": 10357 + }, + { + "epoch": 1.48, + "grad_norm": 10.419606260173733, + "learning_rate": 1.6875460441807324e-06, + "loss": 0.7163, + "step": 10358 + }, + { + "epoch": 1.48, + "grad_norm": 9.03336072230115, + "learning_rate": 1.6866811494383457e-06, + "loss": 0.7366, + "step": 10359 + }, + { + "epoch": 1.48, + "grad_norm": 7.697678888737562, + "learning_rate": 1.6858164314191617e-06, + "loss": 0.7591, + "step": 10360 + }, + { + "epoch": 1.48, + "grad_norm": 10.65876124857534, + "learning_rate": 1.6849518901693064e-06, + "loss": 0.7754, + "step": 10361 + }, + { + "epoch": 1.48, + "grad_norm": 10.097832087157371, + "learning_rate": 1.6840875257348878e-06, + "loss": 0.7123, + "step": 10362 + }, + { + "epoch": 1.48, + "grad_norm": 7.438130212700045, + "learning_rate": 1.6832233381620122e-06, + "loss": 0.7688, + "step": 10363 + }, + { + "epoch": 1.48, + "grad_norm": 8.700259450460793, + "learning_rate": 1.6823593274967703e-06, + "loss": 0.6627, + "step": 10364 + }, + { + "epoch": 1.48, + "grad_norm": 7.049940378228011, + "learning_rate": 1.6814954937852484e-06, + "loss": 0.6942, + "step": 10365 + }, + { 
+ "epoch": 1.48, + "grad_norm": 9.033915373675514, + "learning_rate": 1.6806318370735197e-06, + "loss": 0.7984, + "step": 10366 + }, + { + "epoch": 1.48, + "grad_norm": 7.779895973809439, + "learning_rate": 1.6797683574076478e-06, + "loss": 0.7639, + "step": 10367 + }, + { + "epoch": 1.48, + "grad_norm": 7.037184124344659, + "learning_rate": 1.6789050548336916e-06, + "loss": 0.7409, + "step": 10368 + }, + { + "epoch": 1.48, + "grad_norm": 9.325290123289452, + "learning_rate": 1.6780419293976934e-06, + "loss": 0.7473, + "step": 10369 + }, + { + "epoch": 1.48, + "grad_norm": 13.61673478608413, + "learning_rate": 1.6771789811456923e-06, + "loss": 0.6951, + "step": 10370 + }, + { + "epoch": 1.48, + "grad_norm": 10.45566615397047, + "learning_rate": 1.6763162101237169e-06, + "loss": 0.6876, + "step": 10371 + }, + { + "epoch": 1.48, + "grad_norm": 8.461188200136231, + "learning_rate": 1.675453616377783e-06, + "loss": 0.6869, + "step": 10372 + }, + { + "epoch": 1.48, + "grad_norm": 8.341205156506765, + "learning_rate": 1.6745911999538995e-06, + "loss": 0.707, + "step": 10373 + }, + { + "epoch": 1.48, + "grad_norm": 8.660330513663736, + "learning_rate": 1.6737289608980633e-06, + "loss": 0.7534, + "step": 10374 + }, + { + "epoch": 1.48, + "grad_norm": 9.541352100665659, + "learning_rate": 1.6728668992562652e-06, + "loss": 0.8057, + "step": 10375 + }, + { + "epoch": 1.48, + "grad_norm": 7.493914286823319, + "learning_rate": 1.6720050150744876e-06, + "loss": 0.7234, + "step": 10376 + }, + { + "epoch": 1.48, + "grad_norm": 10.643086013685316, + "learning_rate": 1.6711433083986973e-06, + "loss": 0.6525, + "step": 10377 + }, + { + "epoch": 1.48, + "grad_norm": 8.572318173931572, + "learning_rate": 1.6702817792748588e-06, + "loss": 0.6576, + "step": 10378 + }, + { + "epoch": 1.48, + "grad_norm": 9.073176506266854, + "learning_rate": 1.6694204277489217e-06, + "loss": 0.6618, + "step": 10379 + }, + { + "epoch": 1.48, + "grad_norm": 10.465841304159207, + "learning_rate": 
1.6685592538668265e-06, + "loss": 0.6699, + "step": 10380 + }, + { + "epoch": 1.48, + "grad_norm": 7.790332751644198, + "learning_rate": 1.6676982576745093e-06, + "loss": 0.6773, + "step": 10381 + }, + { + "epoch": 1.48, + "grad_norm": 11.908217890255013, + "learning_rate": 1.66683743921789e-06, + "loss": 0.776, + "step": 10382 + }, + { + "epoch": 1.48, + "grad_norm": 6.367210066187105, + "learning_rate": 1.6659767985428854e-06, + "loss": 0.6957, + "step": 10383 + }, + { + "epoch": 1.48, + "grad_norm": 9.114862020652662, + "learning_rate": 1.6651163356953965e-06, + "loss": 0.6675, + "step": 10384 + }, + { + "epoch": 1.48, + "grad_norm": 8.297560284879099, + "learning_rate": 1.6642560507213207e-06, + "loss": 0.7202, + "step": 10385 + }, + { + "epoch": 1.48, + "grad_norm": 10.743283309666767, + "learning_rate": 1.6633959436665425e-06, + "loss": 0.7046, + "step": 10386 + }, + { + "epoch": 1.48, + "grad_norm": 9.218871184071316, + "learning_rate": 1.6625360145769353e-06, + "loss": 0.7301, + "step": 10387 + }, + { + "epoch": 1.48, + "grad_norm": 7.874423737153295, + "learning_rate": 1.6616762634983674e-06, + "loss": 0.719, + "step": 10388 + }, + { + "epoch": 1.48, + "grad_norm": 7.177828614539127, + "learning_rate": 1.6608166904766964e-06, + "loss": 0.7619, + "step": 10389 + }, + { + "epoch": 1.48, + "grad_norm": 10.056139891525982, + "learning_rate": 1.6599572955577676e-06, + "loss": 0.7691, + "step": 10390 + }, + { + "epoch": 1.48, + "grad_norm": 9.808893527324084, + "learning_rate": 1.6590980787874206e-06, + "loss": 0.7538, + "step": 10391 + }, + { + "epoch": 1.48, + "grad_norm": 11.254028404323932, + "learning_rate": 1.6582390402114828e-06, + "loss": 0.7633, + "step": 10392 + }, + { + "epoch": 1.48, + "grad_norm": 8.98875455874762, + "learning_rate": 1.6573801798757715e-06, + "loss": 0.754, + "step": 10393 + }, + { + "epoch": 1.48, + "grad_norm": 8.761962430043793, + "learning_rate": 1.6565214978260984e-06, + "loss": 0.6764, + "step": 10394 + }, + { + "epoch": 1.48, 
+ "grad_norm": 11.709105129167103, + "learning_rate": 1.6556629941082602e-06, + "loss": 0.8071, + "step": 10395 + }, + { + "epoch": 1.48, + "grad_norm": 8.482088797945746, + "learning_rate": 1.6548046687680507e-06, + "loss": 0.7217, + "step": 10396 + }, + { + "epoch": 1.48, + "grad_norm": 8.69891185382017, + "learning_rate": 1.6539465218512474e-06, + "loss": 0.6492, + "step": 10397 + }, + { + "epoch": 1.48, + "grad_norm": 11.301214628592026, + "learning_rate": 1.6530885534036239e-06, + "loss": 0.7137, + "step": 10398 + }, + { + "epoch": 1.48, + "grad_norm": 6.721911901392342, + "learning_rate": 1.6522307634709411e-06, + "loss": 0.6671, + "step": 10399 + }, + { + "epoch": 1.48, + "grad_norm": 6.118945375760563, + "learning_rate": 1.651373152098949e-06, + "loss": 0.7282, + "step": 10400 + }, + { + "epoch": 1.48, + "grad_norm": 13.748109015656993, + "learning_rate": 1.6505157193333937e-06, + "loss": 0.7556, + "step": 10401 + }, + { + "epoch": 1.48, + "grad_norm": 10.697570567767144, + "learning_rate": 1.6496584652200054e-06, + "loss": 0.7415, + "step": 10402 + }, + { + "epoch": 1.48, + "grad_norm": 6.730456568657257, + "learning_rate": 1.6488013898045096e-06, + "loss": 0.7884, + "step": 10403 + }, + { + "epoch": 1.48, + "grad_norm": 11.52281563129008, + "learning_rate": 1.6479444931326204e-06, + "loss": 0.6604, + "step": 10404 + }, + { + "epoch": 1.48, + "grad_norm": 9.149082543819194, + "learning_rate": 1.6470877752500391e-06, + "loss": 0.7542, + "step": 10405 + }, + { + "epoch": 1.48, + "grad_norm": 10.585579038041525, + "learning_rate": 1.6462312362024653e-06, + "loss": 0.6602, + "step": 10406 + }, + { + "epoch": 1.48, + "grad_norm": 8.781130495778322, + "learning_rate": 1.6453748760355797e-06, + "loss": 0.769, + "step": 10407 + }, + { + "epoch": 1.48, + "grad_norm": 9.663482071987398, + "learning_rate": 1.6445186947950608e-06, + "loss": 0.7376, + "step": 10408 + }, + { + "epoch": 1.48, + "grad_norm": 11.538032559643877, + "learning_rate": 1.6436626925265765e-06, + 
"loss": 0.7814, + "step": 10409 + }, + { + "epoch": 1.48, + "grad_norm": 9.398964634622589, + "learning_rate": 1.6428068692757814e-06, + "loss": 0.7039, + "step": 10410 + }, + { + "epoch": 1.48, + "grad_norm": 9.747222508484716, + "learning_rate": 1.6419512250883234e-06, + "loss": 0.7054, + "step": 10411 + }, + { + "epoch": 1.48, + "grad_norm": 8.065272887669101, + "learning_rate": 1.6410957600098382e-06, + "loss": 0.6682, + "step": 10412 + }, + { + "epoch": 1.49, + "grad_norm": 8.712698035821617, + "learning_rate": 1.6402404740859557e-06, + "loss": 0.736, + "step": 10413 + }, + { + "epoch": 1.49, + "grad_norm": 10.065597103081108, + "learning_rate": 1.6393853673622956e-06, + "loss": 0.7065, + "step": 10414 + }, + { + "epoch": 1.49, + "grad_norm": 9.125548458205845, + "learning_rate": 1.6385304398844642e-06, + "loss": 0.7829, + "step": 10415 + }, + { + "epoch": 1.49, + "grad_norm": 11.453758584179177, + "learning_rate": 1.6376756916980647e-06, + "loss": 0.7045, + "step": 10416 + }, + { + "epoch": 1.49, + "grad_norm": 9.239099200079167, + "learning_rate": 1.6368211228486835e-06, + "loss": 0.7631, + "step": 10417 + }, + { + "epoch": 1.49, + "grad_norm": 9.096909521610506, + "learning_rate": 1.6359667333819013e-06, + "loss": 0.7332, + "step": 10418 + }, + { + "epoch": 1.49, + "grad_norm": 10.729198415283895, + "learning_rate": 1.6351125233432913e-06, + "loss": 0.7325, + "step": 10419 + }, + { + "epoch": 1.49, + "grad_norm": 8.897253205554323, + "learning_rate": 1.6342584927784112e-06, + "loss": 0.693, + "step": 10420 + }, + { + "epoch": 1.49, + "grad_norm": 7.585733146382969, + "learning_rate": 1.6334046417328164e-06, + "loss": 0.6872, + "step": 10421 + }, + { + "epoch": 1.49, + "grad_norm": 9.640689157708113, + "learning_rate": 1.6325509702520453e-06, + "loss": 0.7447, + "step": 10422 + }, + { + "epoch": 1.49, + "grad_norm": 9.32232433061989, + "learning_rate": 1.6316974783816336e-06, + "loss": 0.758, + "step": 10423 + }, + { + "epoch": 1.49, + "grad_norm": 
9.253950052208763, + "learning_rate": 1.6308441661671027e-06, + "loss": 0.6871, + "step": 10424 + }, + { + "epoch": 1.49, + "grad_norm": 6.123298044848222, + "learning_rate": 1.6299910336539643e-06, + "loss": 0.6913, + "step": 10425 + }, + { + "epoch": 1.49, + "grad_norm": 9.419749906429681, + "learning_rate": 1.629138080887725e-06, + "loss": 0.6944, + "step": 10426 + }, + { + "epoch": 1.49, + "grad_norm": 8.867520216744708, + "learning_rate": 1.628285307913876e-06, + "loss": 0.7074, + "step": 10427 + }, + { + "epoch": 1.49, + "grad_norm": 10.429596778791172, + "learning_rate": 1.627432714777904e-06, + "loss": 0.7146, + "step": 10428 + }, + { + "epoch": 1.49, + "grad_norm": 11.583557981667989, + "learning_rate": 1.6265803015252845e-06, + "loss": 0.6753, + "step": 10429 + }, + { + "epoch": 1.49, + "grad_norm": 8.703731423735817, + "learning_rate": 1.625728068201482e-06, + "loss": 0.7268, + "step": 10430 + }, + { + "epoch": 1.49, + "grad_norm": 9.407656241562728, + "learning_rate": 1.6248760148519505e-06, + "loss": 0.7097, + "step": 10431 + }, + { + "epoch": 1.49, + "grad_norm": 5.892389127563389, + "learning_rate": 1.6240241415221391e-06, + "loss": 0.7118, + "step": 10432 + }, + { + "epoch": 1.49, + "grad_norm": 9.198962972055487, + "learning_rate": 1.6231724482574813e-06, + "loss": 0.6984, + "step": 10433 + }, + { + "epoch": 1.49, + "grad_norm": 9.55051916787097, + "learning_rate": 1.622320935103407e-06, + "loss": 0.6929, + "step": 10434 + }, + { + "epoch": 1.49, + "grad_norm": 8.597237027787816, + "learning_rate": 1.6214696021053312e-06, + "loss": 0.7431, + "step": 10435 + }, + { + "epoch": 1.49, + "grad_norm": 11.432561881811466, + "learning_rate": 1.6206184493086636e-06, + "loss": 0.7835, + "step": 10436 + }, + { + "epoch": 1.49, + "grad_norm": 8.125476675245633, + "learning_rate": 1.6197674767588012e-06, + "loss": 0.6549, + "step": 10437 + }, + { + "epoch": 1.49, + "grad_norm": 10.745750678897423, + "learning_rate": 1.6189166845011316e-06, + "loss": 0.7188, + 
"step": 10438 + }, + { + "epoch": 1.49, + "grad_norm": 12.459578830938629, + "learning_rate": 1.618066072581036e-06, + "loss": 0.71, + "step": 10439 + }, + { + "epoch": 1.49, + "grad_norm": 12.398542786402166, + "learning_rate": 1.617215641043881e-06, + "loss": 0.6803, + "step": 10440 + }, + { + "epoch": 1.49, + "grad_norm": 11.076275140721837, + "learning_rate": 1.6163653899350278e-06, + "loss": 0.7097, + "step": 10441 + }, + { + "epoch": 1.49, + "grad_norm": 7.451800235292537, + "learning_rate": 1.6155153192998291e-06, + "loss": 0.7366, + "step": 10442 + }, + { + "epoch": 1.49, + "grad_norm": 9.301166681313246, + "learning_rate": 1.6146654291836194e-06, + "loss": 0.6738, + "step": 10443 + }, + { + "epoch": 1.49, + "grad_norm": 8.026438917921112, + "learning_rate": 1.6138157196317344e-06, + "loss": 0.7064, + "step": 10444 + }, + { + "epoch": 1.49, + "grad_norm": 8.187390045050638, + "learning_rate": 1.6129661906894916e-06, + "loss": 0.7145, + "step": 10445 + }, + { + "epoch": 1.49, + "grad_norm": 9.461862329697821, + "learning_rate": 1.6121168424022044e-06, + "loss": 0.7495, + "step": 10446 + }, + { + "epoch": 1.49, + "grad_norm": 9.8249650519997, + "learning_rate": 1.6112676748151762e-06, + "loss": 0.7776, + "step": 10447 + }, + { + "epoch": 1.49, + "grad_norm": 9.115876181541939, + "learning_rate": 1.6104186879736976e-06, + "loss": 0.696, + "step": 10448 + }, + { + "epoch": 1.49, + "grad_norm": 9.58128627052607, + "learning_rate": 1.6095698819230515e-06, + "loss": 0.7257, + "step": 10449 + }, + { + "epoch": 1.49, + "grad_norm": 9.465040115059349, + "learning_rate": 1.6087212567085087e-06, + "loss": 0.6967, + "step": 10450 + }, + { + "epoch": 1.49, + "grad_norm": 8.064993007359957, + "learning_rate": 1.607872812375334e-06, + "loss": 0.6551, + "step": 10451 + }, + { + "epoch": 1.49, + "grad_norm": 8.980225066546486, + "learning_rate": 1.6070245489687835e-06, + "loss": 0.7012, + "step": 10452 + }, + { + "epoch": 1.49, + "grad_norm": 9.307167069711507, + 
"learning_rate": 1.6061764665340974e-06, + "loss": 0.7243, + "step": 10453 + }, + { + "epoch": 1.49, + "grad_norm": 12.327198272189726, + "learning_rate": 1.6053285651165135e-06, + "loss": 0.7544, + "step": 10454 + }, + { + "epoch": 1.49, + "grad_norm": 8.730138545937711, + "learning_rate": 1.6044808447612548e-06, + "loss": 0.791, + "step": 10455 + }, + { + "epoch": 1.49, + "grad_norm": 8.782286567788141, + "learning_rate": 1.6036333055135345e-06, + "loss": 0.7716, + "step": 10456 + }, + { + "epoch": 1.49, + "grad_norm": 8.4499114566649, + "learning_rate": 1.602785947418562e-06, + "loss": 0.6876, + "step": 10457 + }, + { + "epoch": 1.49, + "grad_norm": 8.408978208622983, + "learning_rate": 1.6019387705215289e-06, + "loss": 0.7408, + "step": 10458 + }, + { + "epoch": 1.49, + "grad_norm": 10.100795140957564, + "learning_rate": 1.6010917748676248e-06, + "loss": 0.7712, + "step": 10459 + }, + { + "epoch": 1.49, + "grad_norm": 9.093083495876586, + "learning_rate": 1.6002449605020226e-06, + "loss": 0.6822, + "step": 10460 + }, + { + "epoch": 1.49, + "grad_norm": 10.096345708878824, + "learning_rate": 1.5993983274698933e-06, + "loss": 0.7223, + "step": 10461 + }, + { + "epoch": 1.49, + "grad_norm": 8.565845821642746, + "learning_rate": 1.598551875816391e-06, + "loss": 0.7707, + "step": 10462 + }, + { + "epoch": 1.49, + "grad_norm": 11.801130356604839, + "learning_rate": 1.5977056055866619e-06, + "loss": 0.7338, + "step": 10463 + }, + { + "epoch": 1.49, + "grad_norm": 7.911738153610584, + "learning_rate": 1.5968595168258472e-06, + "loss": 0.6769, + "step": 10464 + }, + { + "epoch": 1.49, + "grad_norm": 7.479259869570981, + "learning_rate": 1.5960136095790717e-06, + "loss": 0.6966, + "step": 10465 + }, + { + "epoch": 1.49, + "grad_norm": 8.184842692709784, + "learning_rate": 1.5951678838914552e-06, + "loss": 0.7013, + "step": 10466 + }, + { + "epoch": 1.49, + "grad_norm": 9.719295771165754, + "learning_rate": 1.5943223398081081e-06, + "loss": 0.6862, + "step": 10467 + }, + 
{ + "epoch": 1.49, + "grad_norm": 8.396058129355687, + "learning_rate": 1.5934769773741271e-06, + "loss": 0.7385, + "step": 10468 + }, + { + "epoch": 1.49, + "grad_norm": 9.908616203019575, + "learning_rate": 1.5926317966346027e-06, + "loss": 0.7618, + "step": 10469 + }, + { + "epoch": 1.49, + "grad_norm": 10.92712748750722, + "learning_rate": 1.591786797634612e-06, + "loss": 0.6519, + "step": 10470 + }, + { + "epoch": 1.49, + "grad_norm": 8.535192724063771, + "learning_rate": 1.5909419804192262e-06, + "loss": 0.6956, + "step": 10471 + }, + { + "epoch": 1.49, + "grad_norm": 8.749791349943186, + "learning_rate": 1.5900973450335078e-06, + "loss": 0.6565, + "step": 10472 + }, + { + "epoch": 1.49, + "grad_norm": 10.792397780155131, + "learning_rate": 1.5892528915225042e-06, + "loss": 0.7378, + "step": 10473 + }, + { + "epoch": 1.49, + "grad_norm": 8.495758504206966, + "learning_rate": 1.5884086199312587e-06, + "loss": 0.7176, + "step": 10474 + }, + { + "epoch": 1.49, + "grad_norm": 7.623739339070099, + "learning_rate": 1.5875645303048016e-06, + "loss": 0.7736, + "step": 10475 + }, + { + "epoch": 1.49, + "grad_norm": 8.931365772596804, + "learning_rate": 1.586720622688152e-06, + "loss": 0.7346, + "step": 10476 + }, + { + "epoch": 1.49, + "grad_norm": 8.12145017338862, + "learning_rate": 1.5858768971263249e-06, + "loss": 0.7515, + "step": 10477 + }, + { + "epoch": 1.49, + "grad_norm": 11.669529084090431, + "learning_rate": 1.5850333536643197e-06, + "loss": 0.7182, + "step": 10478 + }, + { + "epoch": 1.49, + "grad_norm": 8.847994996033723, + "learning_rate": 1.5841899923471293e-06, + "loss": 0.7202, + "step": 10479 + }, + { + "epoch": 1.49, + "grad_norm": 8.569228163894605, + "learning_rate": 1.5833468132197388e-06, + "loss": 0.6951, + "step": 10480 + }, + { + "epoch": 1.49, + "grad_norm": 9.245149980221244, + "learning_rate": 1.582503816327119e-06, + "loss": 0.7981, + "step": 10481 + }, + { + "epoch": 1.49, + "grad_norm": 10.531482145411742, + "learning_rate": 
1.5816610017142325e-06, + "loss": 0.7092, + "step": 10482 + }, + { + "epoch": 1.5, + "grad_norm": 7.978048105152736, + "learning_rate": 1.5808183694260315e-06, + "loss": 0.769, + "step": 10483 + }, + { + "epoch": 1.5, + "grad_norm": 11.486747577827904, + "learning_rate": 1.5799759195074616e-06, + "loss": 0.6613, + "step": 10484 + }, + { + "epoch": 1.5, + "grad_norm": 9.332341952245592, + "learning_rate": 1.5791336520034584e-06, + "loss": 0.793, + "step": 10485 + }, + { + "epoch": 1.5, + "grad_norm": 7.263520369167084, + "learning_rate": 1.5782915669589422e-06, + "loss": 0.7359, + "step": 10486 + }, + { + "epoch": 1.5, + "grad_norm": 9.241912296949877, + "learning_rate": 1.5774496644188325e-06, + "loss": 0.7125, + "step": 10487 + }, + { + "epoch": 1.5, + "grad_norm": 7.755439785196375, + "learning_rate": 1.576607944428028e-06, + "loss": 0.7347, + "step": 10488 + }, + { + "epoch": 1.5, + "grad_norm": 11.010363264351534, + "learning_rate": 1.5757664070314266e-06, + "loss": 0.696, + "step": 10489 + }, + { + "epoch": 1.5, + "grad_norm": 8.389323515958678, + "learning_rate": 1.5749250522739152e-06, + "loss": 0.7414, + "step": 10490 + }, + { + "epoch": 1.5, + "grad_norm": 9.037726650961618, + "learning_rate": 1.5740838802003661e-06, + "loss": 0.7285, + "step": 10491 + }, + { + "epoch": 1.5, + "grad_norm": 11.667394945896238, + "learning_rate": 1.5732428908556479e-06, + "loss": 0.7181, + "step": 10492 + }, + { + "epoch": 1.5, + "grad_norm": 7.792707621291376, + "learning_rate": 1.5724020842846161e-06, + "loss": 0.7146, + "step": 10493 + }, + { + "epoch": 1.5, + "grad_norm": 9.77588010902468, + "learning_rate": 1.5715614605321144e-06, + "loss": 0.7803, + "step": 10494 + }, + { + "epoch": 1.5, + "grad_norm": 9.140845530952895, + "learning_rate": 1.5707210196429833e-06, + "loss": 0.7763, + "step": 10495 + }, + { + "epoch": 1.5, + "grad_norm": 12.162457501718693, + "learning_rate": 1.5698807616620454e-06, + "loss": 0.7286, + "step": 10496 + }, + { + "epoch": 1.5, + 
"grad_norm": 6.73475111233441, + "learning_rate": 1.5690406866341218e-06, + "loss": 0.6742, + "step": 10497 + }, + { + "epoch": 1.5, + "grad_norm": 10.838108788199804, + "learning_rate": 1.5682007946040162e-06, + "loss": 0.7348, + "step": 10498 + }, + { + "epoch": 1.5, + "grad_norm": 10.693386870837678, + "learning_rate": 1.5673610856165294e-06, + "loss": 0.7502, + "step": 10499 + }, + { + "epoch": 1.5, + "grad_norm": 7.19092942393038, + "learning_rate": 1.5665215597164473e-06, + "loss": 0.7364, + "step": 10500 + }, + { + "epoch": 1.5, + "grad_norm": 13.193357207071683, + "learning_rate": 1.5656822169485464e-06, + "loss": 0.6804, + "step": 10501 + }, + { + "epoch": 1.5, + "grad_norm": 11.251990482182725, + "learning_rate": 1.5648430573575985e-06, + "loss": 0.7186, + "step": 10502 + }, + { + "epoch": 1.5, + "grad_norm": 8.859737728394254, + "learning_rate": 1.5640040809883582e-06, + "loss": 0.6676, + "step": 10503 + }, + { + "epoch": 1.5, + "grad_norm": 8.98332124113175, + "learning_rate": 1.5631652878855762e-06, + "loss": 0.7852, + "step": 10504 + }, + { + "epoch": 1.5, + "grad_norm": 11.618080095239176, + "learning_rate": 1.5623266780939928e-06, + "loss": 0.7536, + "step": 10505 + }, + { + "epoch": 1.5, + "grad_norm": 9.002056156489807, + "learning_rate": 1.5614882516583357e-06, + "loss": 0.7274, + "step": 10506 + }, + { + "epoch": 1.5, + "grad_norm": 9.9697691665643, + "learning_rate": 1.560650008623324e-06, + "loss": 0.6818, + "step": 10507 + }, + { + "epoch": 1.5, + "grad_norm": 9.782814873999593, + "learning_rate": 1.5598119490336654e-06, + "loss": 0.7087, + "step": 10508 + }, + { + "epoch": 1.5, + "grad_norm": 7.520092907565181, + "learning_rate": 1.5589740729340624e-06, + "loss": 0.7189, + "step": 10509 + }, + { + "epoch": 1.5, + "grad_norm": 7.105172569829954, + "learning_rate": 1.5581363803692051e-06, + "loss": 0.6842, + "step": 10510 + }, + { + "epoch": 1.5, + "grad_norm": 9.271528091206056, + "learning_rate": 1.557298871383771e-06, + "loss": 0.7352, + 
"step": 10511 + }, + { + "epoch": 1.5, + "grad_norm": 10.084426119567299, + "learning_rate": 1.5564615460224346e-06, + "loss": 0.7769, + "step": 10512 + }, + { + "epoch": 1.5, + "grad_norm": 8.278616742445358, + "learning_rate": 1.555624404329854e-06, + "loss": 0.7651, + "step": 10513 + }, + { + "epoch": 1.5, + "grad_norm": 10.011257131267811, + "learning_rate": 1.5547874463506785e-06, + "loss": 0.6836, + "step": 10514 + }, + { + "epoch": 1.5, + "grad_norm": 9.080412001894988, + "learning_rate": 1.5539506721295523e-06, + "loss": 0.7597, + "step": 10515 + }, + { + "epoch": 1.5, + "grad_norm": 10.10535344003263, + "learning_rate": 1.5531140817111034e-06, + "loss": 0.7724, + "step": 10516 + }, + { + "epoch": 1.5, + "grad_norm": 10.634057990201555, + "learning_rate": 1.5522776751399548e-06, + "loss": 0.6586, + "step": 10517 + }, + { + "epoch": 1.5, + "grad_norm": 10.206503449108775, + "learning_rate": 1.5514414524607202e-06, + "loss": 0.7374, + "step": 10518 + }, + { + "epoch": 1.5, + "grad_norm": 7.3984193753272365, + "learning_rate": 1.5506054137179994e-06, + "loss": 0.7498, + "step": 10519 + }, + { + "epoch": 1.5, + "grad_norm": 10.332337080914387, + "learning_rate": 1.5497695589563843e-06, + "loss": 0.737, + "step": 10520 + }, + { + "epoch": 1.5, + "grad_norm": 8.598511663808162, + "learning_rate": 1.5489338882204552e-06, + "loss": 0.6968, + "step": 10521 + }, + { + "epoch": 1.5, + "grad_norm": 6.650416085403498, + "learning_rate": 1.5480984015547867e-06, + "loss": 0.7886, + "step": 10522 + }, + { + "epoch": 1.5, + "grad_norm": 10.833277580375862, + "learning_rate": 1.5472630990039428e-06, + "loss": 0.7704, + "step": 10523 + }, + { + "epoch": 1.5, + "grad_norm": 10.112381348401081, + "learning_rate": 1.5464279806124728e-06, + "loss": 0.7145, + "step": 10524 + }, + { + "epoch": 1.5, + "grad_norm": 7.636199969473726, + "learning_rate": 1.5455930464249224e-06, + "loss": 0.6656, + "step": 10525 + }, + { + "epoch": 1.5, + "grad_norm": 11.107621457148488, + 
"learning_rate": 1.5447582964858237e-06, + "loss": 0.7752, + "step": 10526 + }, + { + "epoch": 1.5, + "grad_norm": 8.507935948348994, + "learning_rate": 1.5439237308396981e-06, + "loss": 0.6383, + "step": 10527 + }, + { + "epoch": 1.5, + "grad_norm": 8.915722419401689, + "learning_rate": 1.5430893495310622e-06, + "loss": 0.7424, + "step": 10528 + }, + { + "epoch": 1.5, + "grad_norm": 7.7433158805143085, + "learning_rate": 1.5422551526044166e-06, + "loss": 0.7573, + "step": 10529 + }, + { + "epoch": 1.5, + "grad_norm": 12.784075869460219, + "learning_rate": 1.5414211401042583e-06, + "loss": 0.7131, + "step": 10530 + }, + { + "epoch": 1.5, + "grad_norm": 8.026130508644204, + "learning_rate": 1.5405873120750692e-06, + "loss": 0.684, + "step": 10531 + }, + { + "epoch": 1.5, + "grad_norm": 10.916955359306632, + "learning_rate": 1.5397536685613219e-06, + "loss": 0.6653, + "step": 10532 + }, + { + "epoch": 1.5, + "grad_norm": 8.294722920400671, + "learning_rate": 1.5389202096074835e-06, + "loss": 0.7278, + "step": 10533 + }, + { + "epoch": 1.5, + "grad_norm": 8.924384238510806, + "learning_rate": 1.5380869352580057e-06, + "loss": 0.698, + "step": 10534 + }, + { + "epoch": 1.5, + "grad_norm": 11.20869747047229, + "learning_rate": 1.537253845557336e-06, + "loss": 0.7251, + "step": 10535 + }, + { + "epoch": 1.5, + "grad_norm": 8.938683339072826, + "learning_rate": 1.5364209405499065e-06, + "loss": 0.7149, + "step": 10536 + }, + { + "epoch": 1.5, + "grad_norm": 10.25576259201502, + "learning_rate": 1.5355882202801443e-06, + "loss": 0.7099, + "step": 10537 + }, + { + "epoch": 1.5, + "grad_norm": 10.088700107373807, + "learning_rate": 1.5347556847924634e-06, + "loss": 0.7127, + "step": 10538 + }, + { + "epoch": 1.5, + "grad_norm": 10.069623527757981, + "learning_rate": 1.5339233341312666e-06, + "loss": 0.6308, + "step": 10539 + }, + { + "epoch": 1.5, + "grad_norm": 7.8073599099994935, + "learning_rate": 1.5330911683409533e-06, + "loss": 0.7862, + "step": 10540 + }, + { + 
"epoch": 1.5, + "grad_norm": 7.099961280734558, + "learning_rate": 1.5322591874659048e-06, + "loss": 0.7757, + "step": 10541 + }, + { + "epoch": 1.5, + "grad_norm": 13.063080638406367, + "learning_rate": 1.5314273915504996e-06, + "loss": 0.6587, + "step": 10542 + }, + { + "epoch": 1.5, + "grad_norm": 13.441185363976855, + "learning_rate": 1.530595780639103e-06, + "loss": 0.6711, + "step": 10543 + }, + { + "epoch": 1.5, + "grad_norm": 9.406433342286164, + "learning_rate": 1.529764354776071e-06, + "loss": 0.7411, + "step": 10544 + }, + { + "epoch": 1.5, + "grad_norm": 10.348855862635036, + "learning_rate": 1.528933114005749e-06, + "loss": 0.7484, + "step": 10545 + }, + { + "epoch": 1.5, + "grad_norm": 8.654792341389383, + "learning_rate": 1.528102058372471e-06, + "loss": 0.7114, + "step": 10546 + }, + { + "epoch": 1.5, + "grad_norm": 9.448753579491761, + "learning_rate": 1.5272711879205649e-06, + "loss": 0.7568, + "step": 10547 + }, + { + "epoch": 1.5, + "grad_norm": 9.825052348626354, + "learning_rate": 1.526440502694349e-06, + "loss": 0.7272, + "step": 10548 + }, + { + "epoch": 1.5, + "grad_norm": 7.316481720038184, + "learning_rate": 1.5256100027381264e-06, + "loss": 0.6806, + "step": 10549 + }, + { + "epoch": 1.5, + "grad_norm": 8.813432168743095, + "learning_rate": 1.5247796880961973e-06, + "loss": 0.664, + "step": 10550 + }, + { + "epoch": 1.5, + "grad_norm": 7.494793015437367, + "learning_rate": 1.5239495588128456e-06, + "loss": 0.8225, + "step": 10551 + }, + { + "epoch": 1.5, + "grad_norm": 8.437031143722347, + "learning_rate": 1.5231196149323475e-06, + "loss": 0.7319, + "step": 10552 + }, + { + "epoch": 1.5, + "grad_norm": 8.401869704261397, + "learning_rate": 1.522289856498973e-06, + "loss": 0.7691, + "step": 10553 + }, + { + "epoch": 1.51, + "grad_norm": 11.500480850590892, + "learning_rate": 1.5214602835569752e-06, + "loss": 0.6627, + "step": 10554 + }, + { + "epoch": 1.51, + "grad_norm": 9.070373863197775, + "learning_rate": 1.520630896150605e-06, + 
"loss": 0.7548, + "step": 10555 + }, + { + "epoch": 1.51, + "grad_norm": 7.550728611591801, + "learning_rate": 1.5198016943240967e-06, + "loss": 0.7082, + "step": 10556 + }, + { + "epoch": 1.51, + "grad_norm": 8.288807074319555, + "learning_rate": 1.51897267812168e-06, + "loss": 0.6972, + "step": 10557 + }, + { + "epoch": 1.51, + "grad_norm": 9.010133736076298, + "learning_rate": 1.5181438475875716e-06, + "loss": 0.6622, + "step": 10558 + }, + { + "epoch": 1.51, + "grad_norm": 9.456790566606875, + "learning_rate": 1.5173152027659765e-06, + "loss": 0.6726, + "step": 10559 + }, + { + "epoch": 1.51, + "grad_norm": 6.985674676297755, + "learning_rate": 1.5164867437010943e-06, + "loss": 0.7342, + "step": 10560 + }, + { + "epoch": 1.51, + "grad_norm": 11.450985071741894, + "learning_rate": 1.5156584704371146e-06, + "loss": 0.736, + "step": 10561 + }, + { + "epoch": 1.51, + "grad_norm": 12.267771608924333, + "learning_rate": 1.5148303830182115e-06, + "loss": 0.6933, + "step": 10562 + }, + { + "epoch": 1.51, + "grad_norm": 10.751971317318313, + "learning_rate": 1.5140024814885568e-06, + "loss": 0.736, + "step": 10563 + }, + { + "epoch": 1.51, + "grad_norm": 11.724114862380743, + "learning_rate": 1.5131747658923062e-06, + "loss": 0.7478, + "step": 10564 + }, + { + "epoch": 1.51, + "grad_norm": 7.044332764436344, + "learning_rate": 1.5123472362736064e-06, + "loss": 0.7568, + "step": 10565 + }, + { + "epoch": 1.51, + "grad_norm": 10.733538649135697, + "learning_rate": 1.5115198926765983e-06, + "loss": 0.7367, + "step": 10566 + }, + { + "epoch": 1.51, + "grad_norm": 8.300552931212055, + "learning_rate": 1.5106927351454076e-06, + "loss": 0.7602, + "step": 10567 + }, + { + "epoch": 1.51, + "grad_norm": 7.137719272874587, + "learning_rate": 1.5098657637241553e-06, + "loss": 0.6679, + "step": 10568 + }, + { + "epoch": 1.51, + "grad_norm": 8.70743154961033, + "learning_rate": 1.509038978456947e-06, + "loss": 0.7027, + "step": 10569 + }, + { + "epoch": 1.51, + "grad_norm": 
8.779301419325895, + "learning_rate": 1.5082123793878838e-06, + "loss": 0.7133, + "step": 10570 + }, + { + "epoch": 1.51, + "grad_norm": 10.768128646844032, + "learning_rate": 1.507385966561053e-06, + "loss": 0.6707, + "step": 10571 + }, + { + "epoch": 1.51, + "grad_norm": 8.05250891233069, + "learning_rate": 1.5065597400205312e-06, + "loss": 0.7511, + "step": 10572 + }, + { + "epoch": 1.51, + "grad_norm": 10.342128970026637, + "learning_rate": 1.5057336998103905e-06, + "loss": 0.6472, + "step": 10573 + }, + { + "epoch": 1.51, + "grad_norm": 11.336905852762019, + "learning_rate": 1.5049078459746859e-06, + "loss": 0.7387, + "step": 10574 + }, + { + "epoch": 1.51, + "grad_norm": 11.447666728756973, + "learning_rate": 1.50408217855747e-06, + "loss": 0.7595, + "step": 10575 + }, + { + "epoch": 1.51, + "grad_norm": 7.839553956698706, + "learning_rate": 1.5032566976027802e-06, + "loss": 0.6763, + "step": 10576 + }, + { + "epoch": 1.51, + "grad_norm": 8.63162871426254, + "learning_rate": 1.5024314031546427e-06, + "loss": 0.7173, + "step": 10577 + }, + { + "epoch": 1.51, + "grad_norm": 8.154256251706508, + "learning_rate": 1.5016062952570804e-06, + "loss": 0.7345, + "step": 10578 + }, + { + "epoch": 1.51, + "grad_norm": 7.133385374464046, + "learning_rate": 1.500781373954099e-06, + "loss": 0.7743, + "step": 10579 + }, + { + "epoch": 1.51, + "grad_norm": 9.240578144378427, + "learning_rate": 1.4999566392896992e-06, + "loss": 0.7494, + "step": 10580 + }, + { + "epoch": 1.51, + "grad_norm": 6.871674664485673, + "learning_rate": 1.499132091307871e-06, + "loss": 0.6943, + "step": 10581 + }, + { + "epoch": 1.51, + "grad_norm": 6.006162364540266, + "learning_rate": 1.4983077300525923e-06, + "loss": 0.6976, + "step": 10582 + }, + { + "epoch": 1.51, + "grad_norm": 7.618295492595491, + "learning_rate": 1.4974835555678325e-06, + "loss": 0.7444, + "step": 10583 + }, + { + "epoch": 1.51, + "grad_norm": 6.381245839334471, + "learning_rate": 1.4966595678975487e-06, + "loss": 0.6997, + 
"step": 10584 + }, + { + "epoch": 1.51, + "grad_norm": 9.73600999727683, + "learning_rate": 1.4958357670856922e-06, + "loss": 0.7228, + "step": 10585 + }, + { + "epoch": 1.51, + "grad_norm": 15.22673176655961, + "learning_rate": 1.495012153176203e-06, + "loss": 0.6932, + "step": 10586 + }, + { + "epoch": 1.51, + "grad_norm": 11.350123318839893, + "learning_rate": 1.4941887262130085e-06, + "loss": 0.7501, + "step": 10587 + }, + { + "epoch": 1.51, + "grad_norm": 10.707057798340138, + "learning_rate": 1.4933654862400298e-06, + "loss": 0.7168, + "step": 10588 + }, + { + "epoch": 1.51, + "grad_norm": 9.664310140881344, + "learning_rate": 1.4925424333011757e-06, + "loss": 0.6546, + "step": 10589 + }, + { + "epoch": 1.51, + "grad_norm": 9.399166594474028, + "learning_rate": 1.4917195674403434e-06, + "loss": 0.6786, + "step": 10590 + }, + { + "epoch": 1.51, + "grad_norm": 11.60626038152543, + "learning_rate": 1.4908968887014258e-06, + "loss": 0.6523, + "step": 10591 + }, + { + "epoch": 1.51, + "grad_norm": 7.932022941039479, + "learning_rate": 1.4900743971282983e-06, + "loss": 0.7874, + "step": 10592 + }, + { + "epoch": 1.51, + "grad_norm": 8.070073885930919, + "learning_rate": 1.4892520927648347e-06, + "loss": 0.7787, + "step": 10593 + }, + { + "epoch": 1.51, + "grad_norm": 10.857767015694225, + "learning_rate": 1.4884299756548903e-06, + "loss": 0.6969, + "step": 10594 + }, + { + "epoch": 1.51, + "grad_norm": 9.545136322028766, + "learning_rate": 1.4876080458423186e-06, + "loss": 0.7444, + "step": 10595 + }, + { + "epoch": 1.51, + "grad_norm": 7.518770505118086, + "learning_rate": 1.4867863033709563e-06, + "loss": 0.6719, + "step": 10596 + }, + { + "epoch": 1.51, + "grad_norm": 6.927662563785277, + "learning_rate": 1.4859647482846324e-06, + "loss": 0.688, + "step": 10597 + }, + { + "epoch": 1.51, + "grad_norm": 6.723153883428273, + "learning_rate": 1.4851433806271686e-06, + "loss": 0.75, + "step": 10598 + }, + { + "epoch": 1.51, + "grad_norm": 13.427913907003973, + 
"learning_rate": 1.4843222004423718e-06, + "loss": 0.7274, + "step": 10599 + }, + { + "epoch": 1.51, + "grad_norm": 9.15440156636099, + "learning_rate": 1.483501207774043e-06, + "loss": 0.8226, + "step": 10600 + }, + { + "epoch": 1.51, + "grad_norm": 7.525145404912625, + "learning_rate": 1.482680402665973e-06, + "loss": 0.7452, + "step": 10601 + }, + { + "epoch": 1.51, + "grad_norm": 10.532274633920409, + "learning_rate": 1.4818597851619399e-06, + "loss": 0.6836, + "step": 10602 + }, + { + "epoch": 1.51, + "grad_norm": 6.54184144218212, + "learning_rate": 1.4810393553057112e-06, + "loss": 0.6945, + "step": 10603 + }, + { + "epoch": 1.51, + "grad_norm": 8.79760239510216, + "learning_rate": 1.4802191131410498e-06, + "loss": 0.6668, + "step": 10604 + }, + { + "epoch": 1.51, + "grad_norm": 9.17628258504169, + "learning_rate": 1.4793990587117024e-06, + "loss": 0.7358, + "step": 10605 + }, + { + "epoch": 1.51, + "grad_norm": 7.868557527483049, + "learning_rate": 1.4785791920614106e-06, + "loss": 0.7346, + "step": 10606 + }, + { + "epoch": 1.51, + "grad_norm": 6.4901713656071225, + "learning_rate": 1.4777595132339018e-06, + "loss": 0.7038, + "step": 10607 + }, + { + "epoch": 1.51, + "grad_norm": 11.815146503892395, + "learning_rate": 1.4769400222728974e-06, + "loss": 0.743, + "step": 10608 + }, + { + "epoch": 1.51, + "grad_norm": 8.530153171913494, + "learning_rate": 1.4761207192221056e-06, + "loss": 0.6534, + "step": 10609 + }, + { + "epoch": 1.51, + "grad_norm": 10.587005050372387, + "learning_rate": 1.4753016041252245e-06, + "loss": 0.6638, + "step": 10610 + }, + { + "epoch": 1.51, + "grad_norm": 10.280102872635041, + "learning_rate": 1.4744826770259463e-06, + "loss": 0.7209, + "step": 10611 + }, + { + "epoch": 1.51, + "grad_norm": 9.073720911435425, + "learning_rate": 1.4736639379679474e-06, + "loss": 0.7557, + "step": 10612 + }, + { + "epoch": 1.51, + "grad_norm": 11.498523265323504, + "learning_rate": 1.4728453869948982e-06, + "loss": 0.7261, + "step": 10613 + }, + 
{ + "epoch": 1.51, + "grad_norm": 9.010608392676948, + "learning_rate": 1.4720270241504615e-06, + "loss": 0.7308, + "step": 10614 + }, + { + "epoch": 1.51, + "grad_norm": 11.558359085309812, + "learning_rate": 1.47120884947828e-06, + "loss": 0.7045, + "step": 10615 + }, + { + "epoch": 1.51, + "grad_norm": 8.820178355394225, + "learning_rate": 1.4703908630219972e-06, + "loss": 0.6896, + "step": 10616 + }, + { + "epoch": 1.51, + "grad_norm": 13.540694355691262, + "learning_rate": 1.4695730648252398e-06, + "loss": 0.7425, + "step": 10617 + }, + { + "epoch": 1.51, + "grad_norm": 11.006149867090448, + "learning_rate": 1.468755454931628e-06, + "loss": 0.6873, + "step": 10618 + }, + { + "epoch": 1.51, + "grad_norm": 9.415609195281787, + "learning_rate": 1.4679380333847725e-06, + "loss": 0.7061, + "step": 10619 + }, + { + "epoch": 1.51, + "grad_norm": 8.546611165332031, + "learning_rate": 1.4671208002282705e-06, + "loss": 0.6254, + "step": 10620 + }, + { + "epoch": 1.51, + "grad_norm": 6.679296486355674, + "learning_rate": 1.466303755505712e-06, + "loss": 0.7705, + "step": 10621 + }, + { + "epoch": 1.51, + "grad_norm": 10.803451624593679, + "learning_rate": 1.4654868992606725e-06, + "loss": 0.7106, + "step": 10622 + }, + { + "epoch": 1.51, + "grad_norm": 7.420619742078818, + "learning_rate": 1.464670231536724e-06, + "loss": 0.7757, + "step": 10623 + }, + { + "epoch": 1.52, + "grad_norm": 8.147331360987149, + "learning_rate": 1.463853752377426e-06, + "loss": 0.73, + "step": 10624 + }, + { + "epoch": 1.52, + "grad_norm": 8.598819337922436, + "learning_rate": 1.463037461826325e-06, + "loss": 0.7266, + "step": 10625 + }, + { + "epoch": 1.52, + "grad_norm": 7.6467235376602405, + "learning_rate": 1.4622213599269613e-06, + "loss": 0.6876, + "step": 10626 + }, + { + "epoch": 1.52, + "grad_norm": 9.558877056056106, + "learning_rate": 1.4614054467228634e-06, + "loss": 0.7328, + "step": 10627 + }, + { + "epoch": 1.52, + "grad_norm": 7.7669435361147405, + "learning_rate": 
1.4605897222575472e-06, + "loss": 0.6879, + "step": 10628 + }, + { + "epoch": 1.52, + "grad_norm": 7.293613859994767, + "learning_rate": 1.4597741865745246e-06, + "loss": 0.7328, + "step": 10629 + }, + { + "epoch": 1.52, + "grad_norm": 9.483451002365856, + "learning_rate": 1.4589588397172916e-06, + "loss": 0.6894, + "step": 10630 + }, + { + "epoch": 1.52, + "grad_norm": 9.786601487284594, + "learning_rate": 1.4581436817293388e-06, + "loss": 0.7165, + "step": 10631 + }, + { + "epoch": 1.52, + "grad_norm": 6.561716998639803, + "learning_rate": 1.457328712654142e-06, + "loss": 0.6411, + "step": 10632 + }, + { + "epoch": 1.52, + "grad_norm": 5.930044958001408, + "learning_rate": 1.4565139325351718e-06, + "loss": 0.6884, + "step": 10633 + }, + { + "epoch": 1.52, + "grad_norm": 12.820142010236237, + "learning_rate": 1.4556993414158848e-06, + "loss": 0.7207, + "step": 10634 + }, + { + "epoch": 1.52, + "grad_norm": 10.643075548160764, + "learning_rate": 1.454884939339728e-06, + "loss": 0.7519, + "step": 10635 + }, + { + "epoch": 1.52, + "grad_norm": 7.682382652022918, + "learning_rate": 1.4540707263501425e-06, + "loss": 0.7461, + "step": 10636 + }, + { + "epoch": 1.52, + "grad_norm": 11.338431495156481, + "learning_rate": 1.4532567024905525e-06, + "loss": 0.753, + "step": 10637 + }, + { + "epoch": 1.52, + "grad_norm": 10.293441504509653, + "learning_rate": 1.4524428678043772e-06, + "loss": 0.7479, + "step": 10638 + }, + { + "epoch": 1.52, + "grad_norm": 9.53483549438769, + "learning_rate": 1.451629222335027e-06, + "loss": 0.6696, + "step": 10639 + }, + { + "epoch": 1.52, + "grad_norm": 9.14233453064876, + "learning_rate": 1.4508157661258964e-06, + "loss": 0.6555, + "step": 10640 + }, + { + "epoch": 1.52, + "grad_norm": 9.94535489775749, + "learning_rate": 1.4500024992203742e-06, + "loss": 0.7012, + "step": 10641 + }, + { + "epoch": 1.52, + "grad_norm": 8.261050203997026, + "learning_rate": 1.4491894216618352e-06, + "loss": 0.6957, + "step": 10642 + }, + { + "epoch": 1.52, 
+ "grad_norm": 6.953660353374422, + "learning_rate": 1.448376533493649e-06, + "loss": 0.692, + "step": 10643 + }, + { + "epoch": 1.52, + "grad_norm": 11.547602736184553, + "learning_rate": 1.4475638347591736e-06, + "loss": 0.6833, + "step": 10644 + }, + { + "epoch": 1.52, + "grad_norm": 9.895147219381581, + "learning_rate": 1.446751325501754e-06, + "loss": 0.6877, + "step": 10645 + }, + { + "epoch": 1.52, + "grad_norm": 7.40175507194628, + "learning_rate": 1.4459390057647294e-06, + "loss": 0.7065, + "step": 10646 + }, + { + "epoch": 1.52, + "grad_norm": 10.210448225449422, + "learning_rate": 1.4451268755914254e-06, + "loss": 0.6248, + "step": 10647 + }, + { + "epoch": 1.52, + "grad_norm": 8.94403741399471, + "learning_rate": 1.4443149350251573e-06, + "loss": 0.7367, + "step": 10648 + }, + { + "epoch": 1.52, + "grad_norm": 10.973424341871704, + "learning_rate": 1.4435031841092346e-06, + "loss": 0.7178, + "step": 10649 + }, + { + "epoch": 1.52, + "grad_norm": 9.868458329175338, + "learning_rate": 1.4426916228869514e-06, + "loss": 0.6511, + "step": 10650 + }, + { + "epoch": 1.52, + "grad_norm": 6.2758398110647, + "learning_rate": 1.441880251401595e-06, + "loss": 0.6663, + "step": 10651 + }, + { + "epoch": 1.52, + "grad_norm": 11.090009216854618, + "learning_rate": 1.441069069696443e-06, + "loss": 0.7952, + "step": 10652 + }, + { + "epoch": 1.52, + "grad_norm": 10.556091015302679, + "learning_rate": 1.4402580778147613e-06, + "loss": 0.7387, + "step": 10653 + }, + { + "epoch": 1.52, + "grad_norm": 9.527193595436177, + "learning_rate": 1.4394472757998045e-06, + "loss": 0.6834, + "step": 10654 + }, + { + "epoch": 1.52, + "grad_norm": 9.226179027460471, + "learning_rate": 1.438636663694818e-06, + "loss": 0.7071, + "step": 10655 + }, + { + "epoch": 1.52, + "grad_norm": 9.048293391268228, + "learning_rate": 1.4378262415430384e-06, + "loss": 0.7456, + "step": 10656 + }, + { + "epoch": 1.52, + "grad_norm": 10.630467561268368, + "learning_rate": 1.4370160093876935e-06, + 
"loss": 0.671, + "step": 10657 + }, + { + "epoch": 1.52, + "grad_norm": 9.544883636909443, + "learning_rate": 1.436205967271997e-06, + "loss": 0.6609, + "step": 10658 + }, + { + "epoch": 1.52, + "grad_norm": 12.025137898234213, + "learning_rate": 1.4353961152391539e-06, + "loss": 0.7304, + "step": 10659 + }, + { + "epoch": 1.52, + "grad_norm": 9.407463524058121, + "learning_rate": 1.4345864533323585e-06, + "loss": 0.6898, + "step": 10660 + }, + { + "epoch": 1.52, + "grad_norm": 8.517997133685919, + "learning_rate": 1.4337769815947977e-06, + "loss": 0.6476, + "step": 10661 + }, + { + "epoch": 1.52, + "grad_norm": 9.396598184554362, + "learning_rate": 1.4329677000696469e-06, + "loss": 0.7208, + "step": 10662 + }, + { + "epoch": 1.52, + "grad_norm": 8.006152378953985, + "learning_rate": 1.4321586088000688e-06, + "loss": 0.7064, + "step": 10663 + }, + { + "epoch": 1.52, + "grad_norm": 12.720234854604357, + "learning_rate": 1.4313497078292215e-06, + "loss": 0.7532, + "step": 10664 + }, + { + "epoch": 1.52, + "grad_norm": 7.686834747239978, + "learning_rate": 1.4305409972002465e-06, + "loss": 0.6651, + "step": 10665 + }, + { + "epoch": 1.52, + "grad_norm": 11.786612853747586, + "learning_rate": 1.429732476956278e-06, + "loss": 0.7155, + "step": 10666 + }, + { + "epoch": 1.52, + "grad_norm": 13.900916926076619, + "learning_rate": 1.428924147140443e-06, + "loss": 0.7222, + "step": 10667 + }, + { + "epoch": 1.52, + "grad_norm": 8.31286479634872, + "learning_rate": 1.4281160077958517e-06, + "loss": 0.6714, + "step": 10668 + }, + { + "epoch": 1.52, + "grad_norm": 10.87714262978844, + "learning_rate": 1.4273080589656124e-06, + "loss": 0.7536, + "step": 10669 + }, + { + "epoch": 1.52, + "grad_norm": 11.043113943572395, + "learning_rate": 1.4265003006928146e-06, + "loss": 0.6741, + "step": 10670 + }, + { + "epoch": 1.52, + "grad_norm": 9.076604259939264, + "learning_rate": 1.4256927330205456e-06, + "loss": 0.6549, + "step": 10671 + }, + { + "epoch": 1.52, + "grad_norm": 
9.657206909287423, + "learning_rate": 1.4248853559918769e-06, + "loss": 0.6759, + "step": 10672 + }, + { + "epoch": 1.52, + "grad_norm": 9.822107630044657, + "learning_rate": 1.42407816964987e-06, + "loss": 0.7459, + "step": 10673 + }, + { + "epoch": 1.52, + "grad_norm": 10.107036753737148, + "learning_rate": 1.423271174037582e-06, + "loss": 0.7281, + "step": 10674 + }, + { + "epoch": 1.52, + "grad_norm": 9.670180169241586, + "learning_rate": 1.4224643691980517e-06, + "loss": 0.6837, + "step": 10675 + }, + { + "epoch": 1.52, + "grad_norm": 7.536089120276389, + "learning_rate": 1.4216577551743143e-06, + "loss": 0.7089, + "step": 10676 + }, + { + "epoch": 1.52, + "grad_norm": 8.052816760652023, + "learning_rate": 1.420851332009393e-06, + "loss": 0.6624, + "step": 10677 + }, + { + "epoch": 1.52, + "grad_norm": 13.541881940444247, + "learning_rate": 1.4200450997462995e-06, + "loss": 0.6883, + "step": 10678 + }, + { + "epoch": 1.52, + "grad_norm": 8.129107824129568, + "learning_rate": 1.4192390584280347e-06, + "loss": 0.7875, + "step": 10679 + }, + { + "epoch": 1.52, + "grad_norm": 9.513544420188902, + "learning_rate": 1.4184332080975905e-06, + "loss": 0.6794, + "step": 10680 + }, + { + "epoch": 1.52, + "grad_norm": 13.407305678569307, + "learning_rate": 1.4176275487979497e-06, + "loss": 0.7248, + "step": 10681 + }, + { + "epoch": 1.52, + "grad_norm": 9.225553655764763, + "learning_rate": 1.416822080572085e-06, + "loss": 0.7444, + "step": 10682 + }, + { + "epoch": 1.52, + "grad_norm": 8.030919372520813, + "learning_rate": 1.4160168034629551e-06, + "loss": 0.719, + "step": 10683 + }, + { + "epoch": 1.52, + "grad_norm": 7.333335865205633, + "learning_rate": 1.4152117175135148e-06, + "loss": 0.7942, + "step": 10684 + }, + { + "epoch": 1.52, + "grad_norm": 10.087034426393979, + "learning_rate": 1.414406822766703e-06, + "loss": 0.777, + "step": 10685 + }, + { + "epoch": 1.52, + "grad_norm": 9.647902656873153, + "learning_rate": 1.4136021192654488e-06, + "loss": 0.7826, + 
"step": 10686 + }, + { + "epoch": 1.52, + "grad_norm": 7.736959429269394, + "learning_rate": 1.4127976070526767e-06, + "loss": 0.6842, + "step": 10687 + }, + { + "epoch": 1.52, + "grad_norm": 7.2450398629353465, + "learning_rate": 1.4119932861712938e-06, + "loss": 0.7226, + "step": 10688 + }, + { + "epoch": 1.52, + "grad_norm": 9.09017280558623, + "learning_rate": 1.411189156664201e-06, + "loss": 0.7557, + "step": 10689 + }, + { + "epoch": 1.52, + "grad_norm": 10.632738944176037, + "learning_rate": 1.4103852185742911e-06, + "loss": 0.6748, + "step": 10690 + }, + { + "epoch": 1.52, + "grad_norm": 12.72479796157785, + "learning_rate": 1.4095814719444418e-06, + "loss": 0.6747, + "step": 10691 + }, + { + "epoch": 1.52, + "grad_norm": 11.298279129897463, + "learning_rate": 1.408777916817523e-06, + "loss": 0.6747, + "step": 10692 + }, + { + "epoch": 1.52, + "grad_norm": 5.5768399535989905, + "learning_rate": 1.4079745532363919e-06, + "loss": 0.7174, + "step": 10693 + }, + { + "epoch": 1.53, + "grad_norm": 10.241224304925552, + "learning_rate": 1.4071713812439003e-06, + "loss": 0.7449, + "step": 10694 + }, + { + "epoch": 1.53, + "grad_norm": 9.87838017334033, + "learning_rate": 1.4063684008828876e-06, + "loss": 0.7282, + "step": 10695 + }, + { + "epoch": 1.53, + "grad_norm": 7.08718092642669, + "learning_rate": 1.4055656121961797e-06, + "loss": 0.7545, + "step": 10696 + }, + { + "epoch": 1.53, + "grad_norm": 10.300514954568468, + "learning_rate": 1.4047630152266007e-06, + "loss": 0.7681, + "step": 10697 + }, + { + "epoch": 1.53, + "grad_norm": 12.318793041015926, + "learning_rate": 1.4039606100169511e-06, + "loss": 0.7259, + "step": 10698 + }, + { + "epoch": 1.53, + "grad_norm": 11.751872895294257, + "learning_rate": 1.4031583966100338e-06, + "loss": 0.6921, + "step": 10699 + }, + { + "epoch": 1.53, + "grad_norm": 6.191127295162845, + "learning_rate": 1.4023563750486364e-06, + "loss": 0.713, + "step": 10700 + }, + { + "epoch": 1.53, + "grad_norm": 8.380641410974397, + 
"learning_rate": 1.4015545453755346e-06, + "loss": 0.756, + "step": 10701 + }, + { + "epoch": 1.53, + "grad_norm": 10.192116016962007, + "learning_rate": 1.400752907633499e-06, + "loss": 0.6887, + "step": 10702 + }, + { + "epoch": 1.53, + "grad_norm": 8.077254575393896, + "learning_rate": 1.3999514618652843e-06, + "loss": 0.7012, + "step": 10703 + }, + { + "epoch": 1.53, + "grad_norm": 10.180894929397368, + "learning_rate": 1.3991502081136366e-06, + "loss": 0.6617, + "step": 10704 + }, + { + "epoch": 1.53, + "grad_norm": 9.405812610786478, + "learning_rate": 1.3983491464212951e-06, + "loss": 0.6409, + "step": 10705 + }, + { + "epoch": 1.53, + "grad_norm": 8.771737904404135, + "learning_rate": 1.3975482768309833e-06, + "loss": 0.6849, + "step": 10706 + }, + { + "epoch": 1.53, + "grad_norm": 9.80763249899722, + "learning_rate": 1.3967475993854202e-06, + "loss": 0.743, + "step": 10707 + }, + { + "epoch": 1.53, + "grad_norm": 7.856583908917048, + "learning_rate": 1.3959471141273095e-06, + "loss": 0.7528, + "step": 10708 + }, + { + "epoch": 1.53, + "grad_norm": 9.075746776698283, + "learning_rate": 1.395146821099349e-06, + "loss": 0.7668, + "step": 10709 + }, + { + "epoch": 1.53, + "grad_norm": 8.158029821669977, + "learning_rate": 1.394346720344223e-06, + "loss": 0.7261, + "step": 10710 + }, + { + "epoch": 1.53, + "grad_norm": 11.231452048600223, + "learning_rate": 1.3935468119046047e-06, + "loss": 0.7736, + "step": 10711 + }, + { + "epoch": 1.53, + "grad_norm": 10.54233950623208, + "learning_rate": 1.3927470958231626e-06, + "loss": 0.7086, + "step": 10712 + }, + { + "epoch": 1.53, + "grad_norm": 11.388144996703643, + "learning_rate": 1.391947572142548e-06, + "loss": 0.7211, + "step": 10713 + }, + { + "epoch": 1.53, + "grad_norm": 11.832493065046206, + "learning_rate": 1.3911482409054066e-06, + "loss": 0.6801, + "step": 10714 + }, + { + "epoch": 1.53, + "grad_norm": 8.573080383770664, + "learning_rate": 1.3903491021543746e-06, + "loss": 0.7037, + "step": 10715 + }, + { 
+ "epoch": 1.53, + "grad_norm": 7.246514386084382, + "learning_rate": 1.3895501559320735e-06, + "loss": 0.6548, + "step": 10716 + }, + { + "epoch": 1.53, + "grad_norm": 11.548389794809104, + "learning_rate": 1.3887514022811178e-06, + "loss": 0.7078, + "step": 10717 + }, + { + "epoch": 1.53, + "grad_norm": 9.338669012743233, + "learning_rate": 1.3879528412441084e-06, + "loss": 0.6842, + "step": 10718 + }, + { + "epoch": 1.53, + "grad_norm": 11.730213839660397, + "learning_rate": 1.3871544728636406e-06, + "loss": 0.7184, + "step": 10719 + }, + { + "epoch": 1.53, + "grad_norm": 11.045186319940898, + "learning_rate": 1.386356297182298e-06, + "loss": 0.8043, + "step": 10720 + }, + { + "epoch": 1.53, + "grad_norm": 8.378871412318727, + "learning_rate": 1.385558314242651e-06, + "loss": 0.7525, + "step": 10721 + }, + { + "epoch": 1.53, + "grad_norm": 8.617490235643439, + "learning_rate": 1.3847605240872637e-06, + "loss": 0.7434, + "step": 10722 + }, + { + "epoch": 1.53, + "grad_norm": 7.765992967197683, + "learning_rate": 1.3839629267586868e-06, + "loss": 0.713, + "step": 10723 + }, + { + "epoch": 1.53, + "grad_norm": 12.172358283826659, + "learning_rate": 1.3831655222994605e-06, + "loss": 0.8019, + "step": 10724 + }, + { + "epoch": 1.53, + "grad_norm": 10.81840211267801, + "learning_rate": 1.3823683107521196e-06, + "loss": 0.6859, + "step": 10725 + }, + { + "epoch": 1.53, + "grad_norm": 8.141358960989148, + "learning_rate": 1.381571292159181e-06, + "loss": 0.7454, + "step": 10726 + }, + { + "epoch": 1.53, + "grad_norm": 9.584476317642777, + "learning_rate": 1.3807744665631601e-06, + "loss": 0.8161, + "step": 10727 + }, + { + "epoch": 1.53, + "grad_norm": 8.28070629014609, + "learning_rate": 1.3799778340065535e-06, + "loss": 0.6417, + "step": 10728 + }, + { + "epoch": 1.53, + "grad_norm": 9.703223766313364, + "learning_rate": 1.3791813945318538e-06, + "loss": 0.6526, + "step": 10729 + }, + { + "epoch": 1.53, + "grad_norm": 8.7642620549733, + "learning_rate": 
1.3783851481815403e-06, + "loss": 0.7595, + "step": 10730 + }, + { + "epoch": 1.53, + "grad_norm": 9.758026796405922, + "learning_rate": 1.3775890949980803e-06, + "loss": 0.6808, + "step": 10731 + }, + { + "epoch": 1.53, + "grad_norm": 9.8611008686117, + "learning_rate": 1.3767932350239348e-06, + "loss": 0.7374, + "step": 10732 + }, + { + "epoch": 1.53, + "grad_norm": 11.02042200966625, + "learning_rate": 1.3759975683015546e-06, + "loss": 0.7143, + "step": 10733 + }, + { + "epoch": 1.53, + "grad_norm": 9.993616459400407, + "learning_rate": 1.3752020948733752e-06, + "loss": 0.7258, + "step": 10734 + }, + { + "epoch": 1.53, + "grad_norm": 8.993524550302457, + "learning_rate": 1.374406814781828e-06, + "loss": 0.7539, + "step": 10735 + }, + { + "epoch": 1.53, + "grad_norm": 12.426471619064666, + "learning_rate": 1.3736117280693295e-06, + "loss": 0.7649, + "step": 10736 + }, + { + "epoch": 1.53, + "grad_norm": 8.30918178539381, + "learning_rate": 1.3728168347782855e-06, + "loss": 0.7081, + "step": 10737 + }, + { + "epoch": 1.53, + "grad_norm": 6.479477947975141, + "learning_rate": 1.3720221349510964e-06, + "loss": 0.6758, + "step": 10738 + }, + { + "epoch": 1.53, + "grad_norm": 8.359241365485376, + "learning_rate": 1.3712276286301474e-06, + "loss": 0.7596, + "step": 10739 + }, + { + "epoch": 1.53, + "grad_norm": 10.315906210894143, + "learning_rate": 1.370433315857817e-06, + "loss": 0.6888, + "step": 10740 + }, + { + "epoch": 1.53, + "grad_norm": 9.073067180442111, + "learning_rate": 1.3696391966764704e-06, + "loss": 0.6812, + "step": 10741 + }, + { + "epoch": 1.53, + "grad_norm": 7.824924987347646, + "learning_rate": 1.3688452711284622e-06, + "loss": 0.7093, + "step": 10742 + }, + { + "epoch": 1.53, + "grad_norm": 8.93999642954416, + "learning_rate": 1.3680515392561416e-06, + "loss": 0.7783, + "step": 10743 + }, + { + "epoch": 1.53, + "grad_norm": 12.775560506298032, + "learning_rate": 1.3672580011018411e-06, + "loss": 0.693, + "step": 10744 + }, + { + "epoch": 1.53, + 
"grad_norm": 7.433751952363226, + "learning_rate": 1.3664646567078882e-06, + "loss": 0.7542, + "step": 10745 + }, + { + "epoch": 1.53, + "grad_norm": 10.2394488465662, + "learning_rate": 1.3656715061165948e-06, + "loss": 0.7675, + "step": 10746 + }, + { + "epoch": 1.53, + "grad_norm": 10.129110702832811, + "learning_rate": 1.3648785493702693e-06, + "loss": 0.6268, + "step": 10747 + }, + { + "epoch": 1.53, + "grad_norm": 9.248213097856944, + "learning_rate": 1.364085786511203e-06, + "loss": 0.722, + "step": 10748 + }, + { + "epoch": 1.53, + "grad_norm": 8.733898838294937, + "learning_rate": 1.3632932175816787e-06, + "loss": 0.7232, + "step": 10749 + }, + { + "epoch": 1.53, + "grad_norm": 8.218217977939796, + "learning_rate": 1.3625008426239727e-06, + "loss": 0.7884, + "step": 10750 + }, + { + "epoch": 1.53, + "grad_norm": 7.172956337481647, + "learning_rate": 1.3617086616803453e-06, + "loss": 0.6972, + "step": 10751 + }, + { + "epoch": 1.53, + "grad_norm": 9.903905321792204, + "learning_rate": 1.3609166747930508e-06, + "loss": 0.6236, + "step": 10752 + }, + { + "epoch": 1.53, + "grad_norm": 9.407367755302694, + "learning_rate": 1.3601248820043327e-06, + "loss": 0.7611, + "step": 10753 + }, + { + "epoch": 1.53, + "grad_norm": 10.711089530235368, + "learning_rate": 1.3593332833564221e-06, + "loss": 0.7107, + "step": 10754 + }, + { + "epoch": 1.53, + "grad_norm": 8.638002287489329, + "learning_rate": 1.35854187889154e-06, + "loss": 0.7182, + "step": 10755 + }, + { + "epoch": 1.53, + "grad_norm": 9.667281618264361, + "learning_rate": 1.3577506686518966e-06, + "loss": 0.7845, + "step": 10756 + }, + { + "epoch": 1.53, + "grad_norm": 13.751395863713977, + "learning_rate": 1.356959652679694e-06, + "loss": 0.6556, + "step": 10757 + }, + { + "epoch": 1.53, + "grad_norm": 13.12085124222475, + "learning_rate": 1.356168831017125e-06, + "loss": 0.7856, + "step": 10758 + }, + { + "epoch": 1.53, + "grad_norm": 8.030109930255556, + "learning_rate": 1.3553782037063662e-06, + "loss": 
0.7002, + "step": 10759 + }, + { + "epoch": 1.53, + "grad_norm": 11.373484390763767, + "learning_rate": 1.3545877707895904e-06, + "loss": 0.7882, + "step": 10760 + }, + { + "epoch": 1.53, + "grad_norm": 9.532676039798577, + "learning_rate": 1.3537975323089563e-06, + "loss": 0.733, + "step": 10761 + }, + { + "epoch": 1.53, + "grad_norm": 6.3867865801539105, + "learning_rate": 1.3530074883066107e-06, + "loss": 0.7016, + "step": 10762 + }, + { + "epoch": 1.53, + "grad_norm": 9.261225753813047, + "learning_rate": 1.352217638824696e-06, + "loss": 0.7256, + "step": 10763 + }, + { + "epoch": 1.54, + "grad_norm": 10.62276180100257, + "learning_rate": 1.3514279839053368e-06, + "loss": 0.7988, + "step": 10764 + }, + { + "epoch": 1.54, + "grad_norm": 11.444441390208826, + "learning_rate": 1.350638523590655e-06, + "loss": 0.6585, + "step": 10765 + }, + { + "epoch": 1.54, + "grad_norm": 8.796910260794537, + "learning_rate": 1.3498492579227546e-06, + "loss": 0.7395, + "step": 10766 + }, + { + "epoch": 1.54, + "grad_norm": 8.093791492615681, + "learning_rate": 1.3490601869437359e-06, + "loss": 0.6972, + "step": 10767 + }, + { + "epoch": 1.54, + "grad_norm": 11.176760092688603, + "learning_rate": 1.3482713106956847e-06, + "loss": 0.751, + "step": 10768 + }, + { + "epoch": 1.54, + "grad_norm": 10.731555550265977, + "learning_rate": 1.3474826292206756e-06, + "loss": 0.686, + "step": 10769 + }, + { + "epoch": 1.54, + "grad_norm": 9.377969308534782, + "learning_rate": 1.3466941425607772e-06, + "loss": 0.6971, + "step": 10770 + }, + { + "epoch": 1.54, + "grad_norm": 11.26214115674302, + "learning_rate": 1.345905850758043e-06, + "loss": 0.6584, + "step": 10771 + }, + { + "epoch": 1.54, + "grad_norm": 8.485694689137246, + "learning_rate": 1.3451177538545196e-06, + "loss": 0.68, + "step": 10772 + }, + { + "epoch": 1.54, + "grad_norm": 7.847658410935096, + "learning_rate": 1.3443298518922433e-06, + "loss": 0.7228, + "step": 10773 + }, + { + "epoch": 1.54, + "grad_norm": 11.28691133986792, 
+ "learning_rate": 1.343542144913237e-06, + "loss": 0.744, + "step": 10774 + }, + { + "epoch": 1.54, + "grad_norm": 7.564697198535389, + "learning_rate": 1.342754632959513e-06, + "loss": 0.653, + "step": 10775 + }, + { + "epoch": 1.54, + "grad_norm": 6.885036620883787, + "learning_rate": 1.3419673160730789e-06, + "loss": 0.7148, + "step": 10776 + }, + { + "epoch": 1.54, + "grad_norm": 6.692695143909258, + "learning_rate": 1.3411801942959246e-06, + "loss": 0.7439, + "step": 10777 + }, + { + "epoch": 1.54, + "grad_norm": 10.301949674889437, + "learning_rate": 1.3403932676700354e-06, + "loss": 0.7466, + "step": 10778 + }, + { + "epoch": 1.54, + "grad_norm": 7.234011115684038, + "learning_rate": 1.339606536237381e-06, + "loss": 0.6774, + "step": 10779 + }, + { + "epoch": 1.54, + "grad_norm": 10.535395016208186, + "learning_rate": 1.3388200000399271e-06, + "loss": 0.6959, + "step": 10780 + }, + { + "epoch": 1.54, + "grad_norm": 9.81677345581523, + "learning_rate": 1.3380336591196237e-06, + "loss": 0.6607, + "step": 10781 + }, + { + "epoch": 1.54, + "grad_norm": 8.243604741739714, + "learning_rate": 1.33724751351841e-06, + "loss": 0.7673, + "step": 10782 + }, + { + "epoch": 1.54, + "grad_norm": 9.0401994692746, + "learning_rate": 1.3364615632782197e-06, + "loss": 0.7388, + "step": 10783 + }, + { + "epoch": 1.54, + "grad_norm": 10.892951551162357, + "learning_rate": 1.335675808440971e-06, + "loss": 0.7389, + "step": 10784 + }, + { + "epoch": 1.54, + "grad_norm": 11.537487207388805, + "learning_rate": 1.3348902490485744e-06, + "loss": 0.6709, + "step": 10785 + }, + { + "epoch": 1.54, + "grad_norm": 9.37683750246846, + "learning_rate": 1.334104885142934e-06, + "loss": 0.6788, + "step": 10786 + }, + { + "epoch": 1.54, + "grad_norm": 10.143809325078655, + "learning_rate": 1.3333197167659312e-06, + "loss": 0.7108, + "step": 10787 + }, + { + "epoch": 1.54, + "grad_norm": 9.114362524927142, + "learning_rate": 1.33253474395945e-06, + "loss": 0.6918, + "step": 10788 + }, + { + 
"epoch": 1.54, + "grad_norm": 10.798189360198345, + "learning_rate": 1.3317499667653556e-06, + "loss": 0.663, + "step": 10789 + }, + { + "epoch": 1.54, + "grad_norm": 8.58463984086014, + "learning_rate": 1.3309653852255079e-06, + "loss": 0.7478, + "step": 10790 + }, + { + "epoch": 1.54, + "grad_norm": 6.902260062758302, + "learning_rate": 1.3301809993817543e-06, + "loss": 0.7458, + "step": 10791 + }, + { + "epoch": 1.54, + "grad_norm": 9.256696873813869, + "learning_rate": 1.3293968092759319e-06, + "loss": 0.7719, + "step": 10792 + }, + { + "epoch": 1.54, + "grad_norm": 8.337828051358136, + "learning_rate": 1.3286128149498661e-06, + "loss": 0.711, + "step": 10793 + }, + { + "epoch": 1.54, + "grad_norm": 6.550831479364869, + "learning_rate": 1.3278290164453723e-06, + "loss": 0.7164, + "step": 10794 + }, + { + "epoch": 1.54, + "grad_norm": 11.637564338476206, + "learning_rate": 1.3270454138042566e-06, + "loss": 0.7254, + "step": 10795 + }, + { + "epoch": 1.54, + "grad_norm": 11.378939039300635, + "learning_rate": 1.3262620070683169e-06, + "loss": 0.6136, + "step": 10796 + }, + { + "epoch": 1.54, + "grad_norm": 11.026980787913372, + "learning_rate": 1.3254787962793337e-06, + "loss": 0.7299, + "step": 10797 + }, + { + "epoch": 1.54, + "grad_norm": 8.69246401239014, + "learning_rate": 1.3246957814790857e-06, + "loss": 0.8354, + "step": 10798 + }, + { + "epoch": 1.54, + "grad_norm": 8.368259249484266, + "learning_rate": 1.323912962709334e-06, + "loss": 0.7492, + "step": 10799 + }, + { + "epoch": 1.54, + "grad_norm": 11.971435557812546, + "learning_rate": 1.3231303400118305e-06, + "loss": 0.7565, + "step": 10800 + }, + { + "epoch": 1.54, + "grad_norm": 8.133059482889157, + "learning_rate": 1.3223479134283218e-06, + "loss": 0.6812, + "step": 10801 + }, + { + "epoch": 1.54, + "grad_norm": 10.286485517422276, + "learning_rate": 1.3215656830005373e-06, + "loss": 0.7721, + "step": 10802 + }, + { + "epoch": 1.54, + "grad_norm": 7.408744321385693, + "learning_rate": 
1.3207836487702019e-06, + "loss": 0.6762, + "step": 10803 + }, + { + "epoch": 1.54, + "grad_norm": 7.363179321857886, + "learning_rate": 1.3200018107790235e-06, + "loss": 0.7662, + "step": 10804 + }, + { + "epoch": 1.54, + "grad_norm": 8.052309204032595, + "learning_rate": 1.3192201690687073e-06, + "loss": 0.7111, + "step": 10805 + }, + { + "epoch": 1.54, + "grad_norm": 9.654188151216905, + "learning_rate": 1.3184387236809416e-06, + "loss": 0.6809, + "step": 10806 + }, + { + "epoch": 1.54, + "grad_norm": 11.156579702003397, + "learning_rate": 1.317657474657405e-06, + "loss": 0.8006, + "step": 10807 + }, + { + "epoch": 1.54, + "grad_norm": 8.825236303169344, + "learning_rate": 1.31687642203977e-06, + "loss": 0.7845, + "step": 10808 + }, + { + "epoch": 1.54, + "grad_norm": 8.747727722508722, + "learning_rate": 1.3160955658696933e-06, + "loss": 0.7617, + "step": 10809 + }, + { + "epoch": 1.54, + "grad_norm": 8.199831202277904, + "learning_rate": 1.3153149061888255e-06, + "loss": 0.7395, + "step": 10810 + }, + { + "epoch": 1.54, + "grad_norm": 11.495275213685247, + "learning_rate": 1.314534443038805e-06, + "loss": 0.667, + "step": 10811 + }, + { + "epoch": 1.54, + "grad_norm": 10.101172889439932, + "learning_rate": 1.3137541764612588e-06, + "loss": 0.6269, + "step": 10812 + }, + { + "epoch": 1.54, + "grad_norm": 7.835296798500004, + "learning_rate": 1.3129741064978047e-06, + "loss": 0.6883, + "step": 10813 + }, + { + "epoch": 1.54, + "grad_norm": 12.059210918446094, + "learning_rate": 1.3121942331900467e-06, + "loss": 0.6833, + "step": 10814 + }, + { + "epoch": 1.54, + "grad_norm": 8.933284682327768, + "learning_rate": 1.3114145565795833e-06, + "loss": 0.7122, + "step": 10815 + }, + { + "epoch": 1.54, + "grad_norm": 13.661459815875704, + "learning_rate": 1.3106350767080022e-06, + "loss": 0.7087, + "step": 10816 + }, + { + "epoch": 1.54, + "grad_norm": 8.298091976277505, + "learning_rate": 1.3098557936168748e-06, + "loss": 0.7181, + "step": 10817 + }, + { + "epoch": 
1.54, + "grad_norm": 10.154039213600281, + "learning_rate": 1.3090767073477695e-06, + "loss": 0.681, + "step": 10818 + }, + { + "epoch": 1.54, + "grad_norm": 8.990721701410278, + "learning_rate": 1.3082978179422384e-06, + "loss": 0.6857, + "step": 10819 + }, + { + "epoch": 1.54, + "grad_norm": 9.47855789172248, + "learning_rate": 1.3075191254418247e-06, + "loss": 0.6961, + "step": 10820 + }, + { + "epoch": 1.54, + "grad_norm": 11.554957128262851, + "learning_rate": 1.306740629888064e-06, + "loss": 0.7817, + "step": 10821 + }, + { + "epoch": 1.54, + "grad_norm": 13.514329558484473, + "learning_rate": 1.3059623313224768e-06, + "loss": 0.7558, + "step": 10822 + }, + { + "epoch": 1.54, + "grad_norm": 10.319842542964151, + "learning_rate": 1.3051842297865758e-06, + "loss": 0.7586, + "step": 10823 + }, + { + "epoch": 1.54, + "grad_norm": 9.287287664381806, + "learning_rate": 1.3044063253218652e-06, + "loss": 0.7214, + "step": 10824 + }, + { + "epoch": 1.54, + "grad_norm": 7.672089290105611, + "learning_rate": 1.3036286179698348e-06, + "loss": 0.799, + "step": 10825 + }, + { + "epoch": 1.54, + "grad_norm": 8.333810514972598, + "learning_rate": 1.3028511077719647e-06, + "loss": 0.7538, + "step": 10826 + }, + { + "epoch": 1.54, + "grad_norm": 11.499869618397696, + "learning_rate": 1.3020737947697238e-06, + "loss": 0.7007, + "step": 10827 + }, + { + "epoch": 1.54, + "grad_norm": 5.941097590880134, + "learning_rate": 1.3012966790045734e-06, + "loss": 0.7512, + "step": 10828 + }, + { + "epoch": 1.54, + "grad_norm": 9.380288572827338, + "learning_rate": 1.3005197605179642e-06, + "loss": 0.6925, + "step": 10829 + }, + { + "epoch": 1.54, + "grad_norm": 7.032177273048055, + "learning_rate": 1.2997430393513338e-06, + "loss": 0.7147, + "step": 10830 + }, + { + "epoch": 1.54, + "grad_norm": 9.425090143000535, + "learning_rate": 1.2989665155461096e-06, + "loss": 0.7892, + "step": 10831 + }, + { + "epoch": 1.54, + "grad_norm": 8.443110555804637, + "learning_rate": 
1.2981901891437082e-06, + "loss": 0.636, + "step": 10832 + }, + { + "epoch": 1.54, + "grad_norm": 8.413603881152756, + "learning_rate": 1.297414060185538e-06, + "loss": 0.7964, + "step": 10833 + }, + { + "epoch": 1.55, + "grad_norm": 10.34573570270731, + "learning_rate": 1.296638128712997e-06, + "loss": 0.7394, + "step": 10834 + }, + { + "epoch": 1.55, + "grad_norm": 8.552327762878635, + "learning_rate": 1.2958623947674687e-06, + "loss": 0.7191, + "step": 10835 + }, + { + "epoch": 1.55, + "grad_norm": 9.574810608661977, + "learning_rate": 1.2950868583903309e-06, + "loss": 0.8198, + "step": 10836 + }, + { + "epoch": 1.55, + "grad_norm": 7.405119349220218, + "learning_rate": 1.2943115196229472e-06, + "loss": 0.6752, + "step": 10837 + }, + { + "epoch": 1.55, + "grad_norm": 8.905085208480676, + "learning_rate": 1.2935363785066712e-06, + "loss": 0.7118, + "step": 10838 + }, + { + "epoch": 1.55, + "grad_norm": 10.994600610416663, + "learning_rate": 1.2927614350828488e-06, + "loss": 0.7712, + "step": 10839 + }, + { + "epoch": 1.55, + "grad_norm": 9.949343683458734, + "learning_rate": 1.2919866893928113e-06, + "loss": 0.678, + "step": 10840 + }, + { + "epoch": 1.55, + "grad_norm": 6.993406526845344, + "learning_rate": 1.2912121414778834e-06, + "loss": 0.7128, + "step": 10841 + }, + { + "epoch": 1.55, + "grad_norm": 8.653764378848997, + "learning_rate": 1.2904377913793754e-06, + "loss": 0.6907, + "step": 10842 + }, + { + "epoch": 1.55, + "grad_norm": 8.21651632404747, + "learning_rate": 1.2896636391385914e-06, + "loss": 0.6691, + "step": 10843 + }, + { + "epoch": 1.55, + "grad_norm": 11.052189533403016, + "learning_rate": 1.2888896847968207e-06, + "loss": 0.6596, + "step": 10844 + }, + { + "epoch": 1.55, + "grad_norm": 8.614611016647048, + "learning_rate": 1.288115928395343e-06, + "loss": 0.695, + "step": 10845 + }, + { + "epoch": 1.55, + "grad_norm": 10.525233477403095, + "learning_rate": 1.287342369975431e-06, + "loss": 0.7443, + "step": 10846 + }, + { + "epoch": 1.55, + 
"grad_norm": 13.098809042454203, + "learning_rate": 1.2865690095783412e-06, + "loss": 0.7575, + "step": 10847 + }, + { + "epoch": 1.55, + "grad_norm": 9.981564732633458, + "learning_rate": 1.2857958472453241e-06, + "loss": 0.7537, + "step": 10848 + }, + { + "epoch": 1.55, + "grad_norm": 10.927078261953724, + "learning_rate": 1.2850228830176199e-06, + "loss": 0.7173, + "step": 10849 + }, + { + "epoch": 1.55, + "grad_norm": 6.551040478621706, + "learning_rate": 1.2842501169364536e-06, + "loss": 0.7632, + "step": 10850 + }, + { + "epoch": 1.55, + "grad_norm": 9.227009835010813, + "learning_rate": 1.2834775490430435e-06, + "loss": 0.5887, + "step": 10851 + }, + { + "epoch": 1.55, + "grad_norm": 7.8921724024370326, + "learning_rate": 1.2827051793785944e-06, + "loss": 0.6953, + "step": 10852 + }, + { + "epoch": 1.55, + "grad_norm": 11.189221161267877, + "learning_rate": 1.281933007984304e-06, + "loss": 0.6926, + "step": 10853 + }, + { + "epoch": 1.55, + "grad_norm": 8.057989377699181, + "learning_rate": 1.2811610349013593e-06, + "loss": 0.6952, + "step": 10854 + }, + { + "epoch": 1.55, + "grad_norm": 9.43804611517377, + "learning_rate": 1.2803892601709317e-06, + "loss": 0.7097, + "step": 10855 + }, + { + "epoch": 1.55, + "grad_norm": 7.708277702114856, + "learning_rate": 1.2796176838341888e-06, + "loss": 0.621, + "step": 10856 + }, + { + "epoch": 1.55, + "grad_norm": 11.242709876893082, + "learning_rate": 1.2788463059322837e-06, + "loss": 0.7902, + "step": 10857 + }, + { + "epoch": 1.55, + "grad_norm": 6.4883830711500625, + "learning_rate": 1.2780751265063562e-06, + "loss": 0.7649, + "step": 10858 + }, + { + "epoch": 1.55, + "grad_norm": 10.68939302084352, + "learning_rate": 1.2773041455975438e-06, + "loss": 0.6713, + "step": 10859 + }, + { + "epoch": 1.55, + "grad_norm": 12.915322368922668, + "learning_rate": 1.2765333632469639e-06, + "loss": 0.6784, + "step": 10860 + }, + { + "epoch": 1.55, + "grad_norm": 7.239362676605021, + "learning_rate": 1.2757627794957306e-06, + 
"loss": 0.6484, + "step": 10861 + }, + { + "epoch": 1.55, + "grad_norm": 10.82548674952306, + "learning_rate": 1.2749923943849451e-06, + "loss": 0.7532, + "step": 10862 + }, + { + "epoch": 1.55, + "grad_norm": 11.379543094753735, + "learning_rate": 1.2742222079556971e-06, + "loss": 0.7333, + "step": 10863 + }, + { + "epoch": 1.55, + "grad_norm": 7.362016408533704, + "learning_rate": 1.273452220249065e-06, + "loss": 0.7053, + "step": 10864 + }, + { + "epoch": 1.55, + "grad_norm": 8.543220629833153, + "learning_rate": 1.2726824313061181e-06, + "loss": 0.7167, + "step": 10865 + }, + { + "epoch": 1.55, + "grad_norm": 8.09168570340209, + "learning_rate": 1.2719128411679143e-06, + "loss": 0.729, + "step": 10866 + }, + { + "epoch": 1.55, + "grad_norm": 11.504959663682548, + "learning_rate": 1.2711434498755049e-06, + "loss": 0.6736, + "step": 10867 + }, + { + "epoch": 1.55, + "grad_norm": 10.13441486901725, + "learning_rate": 1.2703742574699223e-06, + "loss": 0.6981, + "step": 10868 + }, + { + "epoch": 1.55, + "grad_norm": 12.269498104843633, + "learning_rate": 1.2696052639921985e-06, + "loss": 0.7351, + "step": 10869 + }, + { + "epoch": 1.55, + "grad_norm": 9.46509961764595, + "learning_rate": 1.2688364694833433e-06, + "loss": 0.6883, + "step": 10870 + }, + { + "epoch": 1.55, + "grad_norm": 7.961774832201691, + "learning_rate": 1.268067873984365e-06, + "loss": 0.7033, + "step": 10871 + }, + { + "epoch": 1.55, + "grad_norm": 10.207814122235895, + "learning_rate": 1.2672994775362596e-06, + "loss": 0.6918, + "step": 10872 + }, + { + "epoch": 1.55, + "grad_norm": 11.16824575341285, + "learning_rate": 1.266531280180009e-06, + "loss": 0.7027, + "step": 10873 + }, + { + "epoch": 1.55, + "grad_norm": 16.07016922113954, + "learning_rate": 1.265763281956589e-06, + "loss": 0.6969, + "step": 10874 + }, + { + "epoch": 1.55, + "grad_norm": 7.782254701825112, + "learning_rate": 1.2649954829069617e-06, + "loss": 0.7364, + "step": 10875 + }, + { + "epoch": 1.55, + "grad_norm": 
9.39281631278099, + "learning_rate": 1.2642278830720767e-06, + "loss": 0.7229, + "step": 10876 + }, + { + "epoch": 1.55, + "grad_norm": 8.548199917305068, + "learning_rate": 1.2634604824928799e-06, + "loss": 0.6829, + "step": 10877 + }, + { + "epoch": 1.55, + "grad_norm": 12.28257988241144, + "learning_rate": 1.2626932812102982e-06, + "loss": 0.7134, + "step": 10878 + }, + { + "epoch": 1.55, + "grad_norm": 14.768848819974233, + "learning_rate": 1.2619262792652559e-06, + "loss": 0.8037, + "step": 10879 + }, + { + "epoch": 1.55, + "grad_norm": 11.982672841240797, + "learning_rate": 1.261159476698659e-06, + "loss": 0.6807, + "step": 10880 + }, + { + "epoch": 1.55, + "grad_norm": 12.904141045523408, + "learning_rate": 1.2603928735514103e-06, + "loss": 0.6852, + "step": 10881 + }, + { + "epoch": 1.55, + "grad_norm": 10.234203359177183, + "learning_rate": 1.2596264698643962e-06, + "loss": 0.6611, + "step": 10882 + }, + { + "epoch": 1.55, + "grad_norm": 7.775684753843163, + "learning_rate": 1.2588602656784938e-06, + "loss": 0.7678, + "step": 10883 + }, + { + "epoch": 1.55, + "grad_norm": 9.089715395402806, + "learning_rate": 1.258094261034572e-06, + "loss": 0.7498, + "step": 10884 + }, + { + "epoch": 1.55, + "grad_norm": 10.919440540755362, + "learning_rate": 1.257328455973486e-06, + "loss": 0.6797, + "step": 10885 + }, + { + "epoch": 1.55, + "grad_norm": 7.8518155965027185, + "learning_rate": 1.2565628505360815e-06, + "loss": 0.7022, + "step": 10886 + }, + { + "epoch": 1.55, + "grad_norm": 8.535701978204944, + "learning_rate": 1.2557974447631972e-06, + "loss": 0.712, + "step": 10887 + }, + { + "epoch": 1.55, + "grad_norm": 11.18781921029854, + "learning_rate": 1.2550322386956543e-06, + "loss": 0.5708, + "step": 10888 + }, + { + "epoch": 1.55, + "grad_norm": 9.84967042527302, + "learning_rate": 1.2542672323742678e-06, + "loss": 0.6323, + "step": 10889 + }, + { + "epoch": 1.55, + "grad_norm": 9.82286540589909, + "learning_rate": 1.2535024258398393e-06, + "loss": 0.7526, + 
"step": 10890 + }, + { + "epoch": 1.55, + "grad_norm": 8.191475817065356, + "learning_rate": 1.2527378191331629e-06, + "loss": 0.7282, + "step": 10891 + }, + { + "epoch": 1.55, + "grad_norm": 10.59174098003908, + "learning_rate": 1.2519734122950222e-06, + "loss": 0.7114, + "step": 10892 + }, + { + "epoch": 1.55, + "grad_norm": 8.474249554437696, + "learning_rate": 1.2512092053661857e-06, + "loss": 0.6796, + "step": 10893 + }, + { + "epoch": 1.55, + "grad_norm": 8.086140765034436, + "learning_rate": 1.2504451983874165e-06, + "loss": 0.7003, + "step": 10894 + }, + { + "epoch": 1.55, + "grad_norm": 10.739225550690941, + "learning_rate": 1.2496813913994637e-06, + "loss": 0.7044, + "step": 10895 + }, + { + "epoch": 1.55, + "grad_norm": 11.634661363061603, + "learning_rate": 1.248917784443065e-06, + "loss": 0.6101, + "step": 10896 + }, + { + "epoch": 1.55, + "grad_norm": 8.607595168045922, + "learning_rate": 1.2481543775589517e-06, + "loss": 0.7194, + "step": 10897 + }, + { + "epoch": 1.55, + "grad_norm": 12.0009946959663, + "learning_rate": 1.2473911707878394e-06, + "loss": 0.7381, + "step": 10898 + }, + { + "epoch": 1.55, + "grad_norm": 7.934037111009808, + "learning_rate": 1.2466281641704376e-06, + "loss": 0.6695, + "step": 10899 + }, + { + "epoch": 1.55, + "grad_norm": 8.179497002874133, + "learning_rate": 1.2458653577474412e-06, + "loss": 0.738, + "step": 10900 + }, + { + "epoch": 1.55, + "grad_norm": 12.825759979179171, + "learning_rate": 1.2451027515595375e-06, + "loss": 0.7227, + "step": 10901 + }, + { + "epoch": 1.55, + "grad_norm": 11.07210000296654, + "learning_rate": 1.2443403456474017e-06, + "loss": 0.6553, + "step": 10902 + }, + { + "epoch": 1.55, + "grad_norm": 6.6797456999959985, + "learning_rate": 1.2435781400516967e-06, + "loss": 0.7064, + "step": 10903 + }, + { + "epoch": 1.56, + "grad_norm": 7.769142575688148, + "learning_rate": 1.2428161348130767e-06, + "loss": 0.7459, + "step": 10904 + }, + { + "epoch": 1.56, + "grad_norm": 8.125306391226092, + 
"learning_rate": 1.2420543299721882e-06, + "loss": 0.6733, + "step": 10905 + }, + { + "epoch": 1.56, + "grad_norm": 11.472435749868627, + "learning_rate": 1.24129272556966e-06, + "loss": 0.8489, + "step": 10906 + }, + { + "epoch": 1.56, + "grad_norm": 8.437290761915952, + "learning_rate": 1.2405313216461161e-06, + "loss": 0.7861, + "step": 10907 + }, + { + "epoch": 1.56, + "grad_norm": 8.286663510583939, + "learning_rate": 1.2397701182421678e-06, + "loss": 0.6564, + "step": 10908 + }, + { + "epoch": 1.56, + "grad_norm": 7.100951501386227, + "learning_rate": 1.2390091153984124e-06, + "loss": 0.7499, + "step": 10909 + }, + { + "epoch": 1.56, + "grad_norm": 9.105623689945823, + "learning_rate": 1.238248313155444e-06, + "loss": 0.7202, + "step": 10910 + }, + { + "epoch": 1.56, + "grad_norm": 7.651831748727064, + "learning_rate": 1.237487711553838e-06, + "loss": 0.76, + "step": 10911 + }, + { + "epoch": 1.56, + "grad_norm": 10.270836775869995, + "learning_rate": 1.2367273106341665e-06, + "loss": 0.7153, + "step": 10912 + }, + { + "epoch": 1.56, + "grad_norm": 7.422422760816465, + "learning_rate": 1.2359671104369847e-06, + "loss": 0.6924, + "step": 10913 + }, + { + "epoch": 1.56, + "grad_norm": 10.676515223893643, + "learning_rate": 1.2352071110028385e-06, + "loss": 0.6795, + "step": 10914 + }, + { + "epoch": 1.56, + "grad_norm": 11.647062693471945, + "learning_rate": 1.2344473123722666e-06, + "loss": 0.738, + "step": 10915 + }, + { + "epoch": 1.56, + "grad_norm": 9.911293419998687, + "learning_rate": 1.2336877145857928e-06, + "loss": 0.7588, + "step": 10916 + }, + { + "epoch": 1.56, + "grad_norm": 8.830233130514952, + "learning_rate": 1.232928317683934e-06, + "loss": 0.768, + "step": 10917 + }, + { + "epoch": 1.56, + "grad_norm": 8.350874613328232, + "learning_rate": 1.2321691217071913e-06, + "loss": 0.7663, + "step": 10918 + }, + { + "epoch": 1.56, + "grad_norm": 10.832399229118254, + "learning_rate": 1.2314101266960615e-06, + "loss": 0.7153, + "step": 10919 + }, + { + 
"epoch": 1.56, + "grad_norm": 8.509694076772037, + "learning_rate": 1.2306513326910258e-06, + "loss": 0.7059, + "step": 10920 + }, + { + "epoch": 1.56, + "grad_norm": 9.804239318707253, + "learning_rate": 1.2298927397325538e-06, + "loss": 0.68, + "step": 10921 + }, + { + "epoch": 1.56, + "grad_norm": 7.464114666101336, + "learning_rate": 1.2291343478611113e-06, + "loss": 0.7116, + "step": 10922 + }, + { + "epoch": 1.56, + "grad_norm": 7.61515758238245, + "learning_rate": 1.2283761571171442e-06, + "loss": 0.6541, + "step": 10923 + }, + { + "epoch": 1.56, + "grad_norm": 9.29490618001543, + "learning_rate": 1.2276181675410947e-06, + "loss": 0.777, + "step": 10924 + }, + { + "epoch": 1.56, + "grad_norm": 8.33902142116472, + "learning_rate": 1.226860379173393e-06, + "loss": 0.7677, + "step": 10925 + }, + { + "epoch": 1.56, + "grad_norm": 7.22917457550285, + "learning_rate": 1.2261027920544566e-06, + "loss": 0.7604, + "step": 10926 + }, + { + "epoch": 1.56, + "grad_norm": 5.184853426383597, + "learning_rate": 1.2253454062246921e-06, + "loss": 0.7282, + "step": 10927 + }, + { + "epoch": 1.56, + "grad_norm": 12.871525651499802, + "learning_rate": 1.2245882217244953e-06, + "loss": 0.7231, + "step": 10928 + }, + { + "epoch": 1.56, + "grad_norm": 10.277039631756793, + "learning_rate": 1.223831238594254e-06, + "loss": 0.6856, + "step": 10929 + }, + { + "epoch": 1.56, + "grad_norm": 8.27307669524854, + "learning_rate": 1.2230744568743448e-06, + "loss": 0.7686, + "step": 10930 + }, + { + "epoch": 1.56, + "grad_norm": 7.247510913828765, + "learning_rate": 1.2223178766051296e-06, + "loss": 0.6322, + "step": 10931 + }, + { + "epoch": 1.56, + "grad_norm": 9.975274401954552, + "learning_rate": 1.2215614978269647e-06, + "loss": 0.6411, + "step": 10932 + }, + { + "epoch": 1.56, + "grad_norm": 8.165082848905639, + "learning_rate": 1.2208053205801922e-06, + "loss": 0.6633, + "step": 10933 + }, + { + "epoch": 1.56, + "grad_norm": 9.905497219503237, + "learning_rate": 
1.2200493449051437e-06, + "loss": 0.7787, + "step": 10934 + }, + { + "epoch": 1.56, + "grad_norm": 10.922869209702467, + "learning_rate": 1.2192935708421423e-06, + "loss": 0.7391, + "step": 10935 + }, + { + "epoch": 1.56, + "grad_norm": 8.972226209587713, + "learning_rate": 1.218537998431497e-06, + "loss": 0.6926, + "step": 10936 + }, + { + "epoch": 1.56, + "grad_norm": 8.82313019877893, + "learning_rate": 1.2177826277135103e-06, + "loss": 0.7219, + "step": 10937 + }, + { + "epoch": 1.56, + "grad_norm": 9.50658061852772, + "learning_rate": 1.2170274587284691e-06, + "loss": 0.6621, + "step": 10938 + }, + { + "epoch": 1.56, + "grad_norm": 8.799906536233259, + "learning_rate": 1.2162724915166547e-06, + "loss": 0.703, + "step": 10939 + }, + { + "epoch": 1.56, + "grad_norm": 9.482021379398406, + "learning_rate": 1.2155177261183337e-06, + "loss": 0.7951, + "step": 10940 + }, + { + "epoch": 1.56, + "grad_norm": 10.247409339025648, + "learning_rate": 1.214763162573761e-06, + "loss": 0.7035, + "step": 10941 + }, + { + "epoch": 1.56, + "grad_norm": 10.360807060896317, + "learning_rate": 1.214008800923187e-06, + "loss": 0.6962, + "step": 10942 + }, + { + "epoch": 1.56, + "grad_norm": 11.5387075412991, + "learning_rate": 1.2132546412068435e-06, + "loss": 0.8263, + "step": 10943 + }, + { + "epoch": 1.56, + "grad_norm": 10.193096215497455, + "learning_rate": 1.2125006834649572e-06, + "loss": 0.6903, + "step": 10944 + }, + { + "epoch": 1.56, + "grad_norm": 11.279306585882598, + "learning_rate": 1.211746927737743e-06, + "loss": 0.7581, + "step": 10945 + }, + { + "epoch": 1.56, + "grad_norm": 7.178895610698592, + "learning_rate": 1.2109933740654028e-06, + "loss": 0.7468, + "step": 10946 + }, + { + "epoch": 1.56, + "grad_norm": 9.12932196054522, + "learning_rate": 1.2102400224881283e-06, + "loss": 0.6575, + "step": 10947 + }, + { + "epoch": 1.56, + "grad_norm": 12.186646787720877, + "learning_rate": 1.2094868730461035e-06, + "loss": 0.6673, + "step": 10948 + }, + { + "epoch": 1.56, 
+ "grad_norm": 8.0700248418407, + "learning_rate": 1.2087339257794961e-06, + "loss": 0.7261, + "step": 10949 + }, + { + "epoch": 1.56, + "grad_norm": 8.865787859480111, + "learning_rate": 1.2079811807284698e-06, + "loss": 0.6792, + "step": 10950 + }, + { + "epoch": 1.56, + "grad_norm": 8.105434297802967, + "learning_rate": 1.2072286379331706e-06, + "loss": 0.7447, + "step": 10951 + }, + { + "epoch": 1.56, + "grad_norm": 7.095875168399144, + "learning_rate": 1.2064762974337403e-06, + "loss": 0.7446, + "step": 10952 + }, + { + "epoch": 1.56, + "grad_norm": 9.480534941746148, + "learning_rate": 1.2057241592703045e-06, + "loss": 0.5969, + "step": 10953 + }, + { + "epoch": 1.56, + "grad_norm": 8.382022272360986, + "learning_rate": 1.2049722234829797e-06, + "loss": 0.6926, + "step": 10954 + }, + { + "epoch": 1.56, + "grad_norm": 9.836480394486605, + "learning_rate": 1.2042204901118736e-06, + "loss": 0.65, + "step": 10955 + }, + { + "epoch": 1.56, + "grad_norm": 6.819629048969976, + "learning_rate": 1.2034689591970806e-06, + "loss": 0.7779, + "step": 10956 + }, + { + "epoch": 1.56, + "grad_norm": 10.306774560544419, + "learning_rate": 1.2027176307786864e-06, + "loss": 0.7063, + "step": 10957 + }, + { + "epoch": 1.56, + "grad_norm": 8.445473394623791, + "learning_rate": 1.2019665048967639e-06, + "loss": 0.7348, + "step": 10958 + }, + { + "epoch": 1.56, + "grad_norm": 7.007610488715854, + "learning_rate": 1.2012155815913746e-06, + "loss": 0.7268, + "step": 10959 + }, + { + "epoch": 1.56, + "grad_norm": 13.303569941092753, + "learning_rate": 1.2004648609025737e-06, + "loss": 0.6887, + "step": 10960 + }, + { + "epoch": 1.56, + "grad_norm": 11.92009482971398, + "learning_rate": 1.1997143428704e-06, + "loss": 0.684, + "step": 10961 + }, + { + "epoch": 1.56, + "grad_norm": 8.861200593809604, + "learning_rate": 1.1989640275348847e-06, + "loss": 0.62, + "step": 10962 + }, + { + "epoch": 1.56, + "grad_norm": 7.809185537396959, + "learning_rate": 1.1982139149360494e-06, + "loss": 
0.6873, + "step": 10963 + }, + { + "epoch": 1.56, + "grad_norm": 11.393041310482795, + "learning_rate": 1.197464005113902e-06, + "loss": 0.7176, + "step": 10964 + }, + { + "epoch": 1.56, + "grad_norm": 8.706915093022465, + "learning_rate": 1.1967142981084396e-06, + "loss": 0.7359, + "step": 10965 + }, + { + "epoch": 1.56, + "grad_norm": 9.664212027966332, + "learning_rate": 1.1959647939596487e-06, + "loss": 0.696, + "step": 10966 + }, + { + "epoch": 1.56, + "grad_norm": 9.862198955139757, + "learning_rate": 1.1952154927075072e-06, + "loss": 0.6639, + "step": 10967 + }, + { + "epoch": 1.56, + "grad_norm": 8.204149316628985, + "learning_rate": 1.194466394391982e-06, + "loss": 0.6683, + "step": 10968 + }, + { + "epoch": 1.56, + "grad_norm": 6.842343064850154, + "learning_rate": 1.1937174990530248e-06, + "loss": 0.6865, + "step": 10969 + }, + { + "epoch": 1.56, + "grad_norm": 8.85983680981045, + "learning_rate": 1.1929688067305833e-06, + "loss": 0.7519, + "step": 10970 + }, + { + "epoch": 1.56, + "grad_norm": 10.714741843005498, + "learning_rate": 1.1922203174645886e-06, + "loss": 0.652, + "step": 10971 + }, + { + "epoch": 1.56, + "grad_norm": 8.580376751203419, + "learning_rate": 1.1914720312949612e-06, + "loss": 0.6393, + "step": 10972 + }, + { + "epoch": 1.56, + "grad_norm": 9.693976727485135, + "learning_rate": 1.1907239482616162e-06, + "loss": 0.7454, + "step": 10973 + }, + { + "epoch": 1.57, + "grad_norm": 8.82216121951672, + "learning_rate": 1.1899760684044515e-06, + "loss": 0.7272, + "step": 10974 + }, + { + "epoch": 1.57, + "grad_norm": 11.369428749976638, + "learning_rate": 1.1892283917633596e-06, + "loss": 0.6799, + "step": 10975 + }, + { + "epoch": 1.57, + "grad_norm": 10.611258567961764, + "learning_rate": 1.1884809183782158e-06, + "loss": 0.6584, + "step": 10976 + }, + { + "epoch": 1.57, + "grad_norm": 8.785359252534166, + "learning_rate": 1.1877336482888924e-06, + "loss": 0.7184, + "step": 10977 + }, + { + "epoch": 1.57, + "grad_norm": 11.85777112574673, 
+ "learning_rate": 1.1869865815352443e-06, + "loss": 0.7529, + "step": 10978 + }, + { + "epoch": 1.57, + "grad_norm": 10.149814125458647, + "learning_rate": 1.1862397181571172e-06, + "loss": 0.6948, + "step": 10979 + }, + { + "epoch": 1.57, + "grad_norm": 7.724567687956575, + "learning_rate": 1.185493058194349e-06, + "loss": 0.745, + "step": 10980 + }, + { + "epoch": 1.57, + "grad_norm": 7.560929508020795, + "learning_rate": 1.1847466016867627e-06, + "loss": 0.7543, + "step": 10981 + }, + { + "epoch": 1.57, + "grad_norm": 8.74709843988465, + "learning_rate": 1.184000348674172e-06, + "loss": 0.7751, + "step": 10982 + }, + { + "epoch": 1.57, + "grad_norm": 8.777383878688942, + "learning_rate": 1.1832542991963826e-06, + "loss": 0.662, + "step": 10983 + }, + { + "epoch": 1.57, + "grad_norm": 10.98685986089312, + "learning_rate": 1.1825084532931845e-06, + "loss": 0.6729, + "step": 10984 + }, + { + "epoch": 1.57, + "grad_norm": 10.616291374943652, + "learning_rate": 1.1817628110043594e-06, + "loss": 0.803, + "step": 10985 + }, + { + "epoch": 1.57, + "grad_norm": 8.162953209647934, + "learning_rate": 1.1810173723696766e-06, + "loss": 0.6935, + "step": 10986 + }, + { + "epoch": 1.57, + "grad_norm": 10.317904929810817, + "learning_rate": 1.1802721374288972e-06, + "loss": 0.7775, + "step": 10987 + }, + { + "epoch": 1.57, + "grad_norm": 9.325866460729952, + "learning_rate": 1.1795271062217712e-06, + "loss": 0.6888, + "step": 10988 + }, + { + "epoch": 1.57, + "grad_norm": 10.61547303512722, + "learning_rate": 1.1787822787880332e-06, + "loss": 0.7068, + "step": 10989 + }, + { + "epoch": 1.57, + "grad_norm": 10.95122677419511, + "learning_rate": 1.1780376551674134e-06, + "loss": 0.7309, + "step": 10990 + }, + { + "epoch": 1.57, + "grad_norm": 10.14081932485279, + "learning_rate": 1.1772932353996268e-06, + "loss": 0.7321, + "step": 10991 + }, + { + "epoch": 1.57, + "grad_norm": 8.850255791494394, + "learning_rate": 1.1765490195243773e-06, + "loss": 0.6337, + "step": 10992 + }, + 
{ + "epoch": 1.57, + "grad_norm": 9.464651083084858, + "learning_rate": 1.1758050075813616e-06, + "loss": 0.7184, + "step": 10993 + }, + { + "epoch": 1.57, + "grad_norm": 11.788877765193709, + "learning_rate": 1.17506119961026e-06, + "loss": 0.7909, + "step": 10994 + }, + { + "epoch": 1.57, + "grad_norm": 8.678437285746421, + "learning_rate": 1.174317595650748e-06, + "loss": 0.6472, + "step": 10995 + }, + { + "epoch": 1.57, + "grad_norm": 10.250796227654387, + "learning_rate": 1.1735741957424896e-06, + "loss": 0.6783, + "step": 10996 + }, + { + "epoch": 1.57, + "grad_norm": 8.292018861620186, + "learning_rate": 1.172830999925129e-06, + "loss": 0.7317, + "step": 10997 + }, + { + "epoch": 1.57, + "grad_norm": 9.466214459214823, + "learning_rate": 1.1720880082383117e-06, + "loss": 0.6878, + "step": 10998 + }, + { + "epoch": 1.57, + "grad_norm": 10.856607804569022, + "learning_rate": 1.1713452207216636e-06, + "loss": 0.7188, + "step": 10999 + }, + { + "epoch": 1.57, + "grad_norm": 9.060210401714663, + "learning_rate": 1.1706026374148043e-06, + "loss": 0.7633, + "step": 11000 + }, + { + "epoch": 1.57, + "grad_norm": 10.533576121626862, + "learning_rate": 1.1698602583573427e-06, + "loss": 0.7274, + "step": 11001 + }, + { + "epoch": 1.57, + "grad_norm": 8.74054760538142, + "learning_rate": 1.1691180835888738e-06, + "loss": 0.7104, + "step": 11002 + }, + { + "epoch": 1.57, + "grad_norm": 13.829649301750187, + "learning_rate": 1.1683761131489823e-06, + "loss": 0.7173, + "step": 11003 + }, + { + "epoch": 1.57, + "grad_norm": 7.6078150347752365, + "learning_rate": 1.1676343470772429e-06, + "loss": 0.6731, + "step": 11004 + }, + { + "epoch": 1.57, + "grad_norm": 7.908055182444193, + "learning_rate": 1.1668927854132195e-06, + "loss": 0.6963, + "step": 11005 + }, + { + "epoch": 1.57, + "grad_norm": 7.040352896696859, + "learning_rate": 1.1661514281964665e-06, + "loss": 0.7012, + "step": 11006 + }, + { + "epoch": 1.57, + "grad_norm": 7.631935390836044, + "learning_rate": 
1.1654102754665237e-06, + "loss": 0.7712, + "step": 11007 + }, + { + "epoch": 1.57, + "grad_norm": 8.15052110520404, + "learning_rate": 1.1646693272629244e-06, + "loss": 0.6825, + "step": 11008 + }, + { + "epoch": 1.57, + "grad_norm": 7.420548952350795, + "learning_rate": 1.1639285836251878e-06, + "loss": 0.7336, + "step": 11009 + }, + { + "epoch": 1.57, + "grad_norm": 11.890584086265601, + "learning_rate": 1.1631880445928212e-06, + "loss": 0.7264, + "step": 11010 + }, + { + "epoch": 1.57, + "grad_norm": 12.230882219441908, + "learning_rate": 1.162447710205326e-06, + "loss": 0.7055, + "step": 11011 + }, + { + "epoch": 1.57, + "grad_norm": 9.98358150506572, + "learning_rate": 1.1617075805021866e-06, + "loss": 0.7204, + "step": 11012 + }, + { + "epoch": 1.57, + "grad_norm": 9.204585863293351, + "learning_rate": 1.1609676555228822e-06, + "loss": 0.737, + "step": 11013 + }, + { + "epoch": 1.57, + "grad_norm": 5.912938999751077, + "learning_rate": 1.160227935306875e-06, + "loss": 0.6838, + "step": 11014 + }, + { + "epoch": 1.57, + "grad_norm": 8.329728404755992, + "learning_rate": 1.159488419893624e-06, + "loss": 0.6756, + "step": 11015 + }, + { + "epoch": 1.57, + "grad_norm": 9.122074325217039, + "learning_rate": 1.15874910932257e-06, + "loss": 0.7086, + "step": 11016 + }, + { + "epoch": 1.57, + "grad_norm": 7.549110303559161, + "learning_rate": 1.1580100036331454e-06, + "loss": 0.7873, + "step": 11017 + }, + { + "epoch": 1.57, + "grad_norm": 5.852161838892489, + "learning_rate": 1.1572711028647744e-06, + "loss": 0.6885, + "step": 11018 + }, + { + "epoch": 1.57, + "grad_norm": 8.023689390570784, + "learning_rate": 1.1565324070568645e-06, + "loss": 0.6936, + "step": 11019 + }, + { + "epoch": 1.57, + "grad_norm": 10.844704107026637, + "learning_rate": 1.155793916248818e-06, + "loss": 0.7131, + "step": 11020 + }, + { + "epoch": 1.57, + "grad_norm": 8.176900002689072, + "learning_rate": 1.1550556304800247e-06, + "loss": 0.6773, + "step": 11021 + }, + { + "epoch": 1.57, + 
"grad_norm": 11.434882066473318, + "learning_rate": 1.1543175497898617e-06, + "loss": 0.7573, + "step": 11022 + }, + { + "epoch": 1.57, + "grad_norm": 11.935807421078673, + "learning_rate": 1.1535796742176964e-06, + "loss": 0.7126, + "step": 11023 + }, + { + "epoch": 1.57, + "grad_norm": 7.5413091233734075, + "learning_rate": 1.1528420038028826e-06, + "loss": 0.6646, + "step": 11024 + }, + { + "epoch": 1.57, + "grad_norm": 8.899755744989099, + "learning_rate": 1.1521045385847684e-06, + "loss": 0.6809, + "step": 11025 + }, + { + "epoch": 1.57, + "grad_norm": 8.121453112436093, + "learning_rate": 1.1513672786026885e-06, + "loss": 0.7703, + "step": 11026 + }, + { + "epoch": 1.57, + "grad_norm": 11.425244265866462, + "learning_rate": 1.1506302238959638e-06, + "loss": 0.7455, + "step": 11027 + }, + { + "epoch": 1.57, + "grad_norm": 13.11817210944849, + "learning_rate": 1.1498933745039097e-06, + "loss": 0.6691, + "step": 11028 + }, + { + "epoch": 1.57, + "grad_norm": 11.043641110073798, + "learning_rate": 1.1491567304658257e-06, + "loss": 0.6606, + "step": 11029 + }, + { + "epoch": 1.57, + "grad_norm": 11.100540097998925, + "learning_rate": 1.1484202918210018e-06, + "loss": 0.7135, + "step": 11030 + }, + { + "epoch": 1.57, + "grad_norm": 15.219510169241753, + "learning_rate": 1.14768405860872e-06, + "loss": 0.72, + "step": 11031 + }, + { + "epoch": 1.57, + "grad_norm": 8.477253651657884, + "learning_rate": 1.1469480308682462e-06, + "loss": 0.6938, + "step": 11032 + }, + { + "epoch": 1.57, + "grad_norm": 11.454481719296112, + "learning_rate": 1.1462122086388394e-06, + "loss": 0.7457, + "step": 11033 + }, + { + "epoch": 1.57, + "grad_norm": 11.098485284380509, + "learning_rate": 1.1454765919597476e-06, + "loss": 0.7544, + "step": 11034 + }, + { + "epoch": 1.57, + "grad_norm": 8.937391811926163, + "learning_rate": 1.1447411808702053e-06, + "loss": 0.7673, + "step": 11035 + }, + { + "epoch": 1.57, + "grad_norm": 6.497516296074464, + "learning_rate": 1.1440059754094368e-06, + 
"loss": 0.6587, + "step": 11036 + }, + { + "epoch": 1.57, + "grad_norm": 11.662298230268735, + "learning_rate": 1.1432709756166554e-06, + "loss": 0.6848, + "step": 11037 + }, + { + "epoch": 1.57, + "grad_norm": 7.289167174116232, + "learning_rate": 1.1425361815310648e-06, + "loss": 0.6787, + "step": 11038 + }, + { + "epoch": 1.57, + "grad_norm": 11.699717447458656, + "learning_rate": 1.141801593191858e-06, + "loss": 0.6526, + "step": 11039 + }, + { + "epoch": 1.57, + "grad_norm": 9.387021261009075, + "learning_rate": 1.1410672106382142e-06, + "loss": 0.6761, + "step": 11040 + }, + { + "epoch": 1.57, + "grad_norm": 10.705810566138348, + "learning_rate": 1.140333033909306e-06, + "loss": 0.7037, + "step": 11041 + }, + { + "epoch": 1.57, + "grad_norm": 8.814687734978298, + "learning_rate": 1.1395990630442883e-06, + "loss": 0.7067, + "step": 11042 + }, + { + "epoch": 1.57, + "grad_norm": 10.183091971654369, + "learning_rate": 1.1388652980823101e-06, + "loss": 0.7243, + "step": 11043 + }, + { + "epoch": 1.58, + "grad_norm": 9.609844107596869, + "learning_rate": 1.1381317390625112e-06, + "loss": 0.7654, + "step": 11044 + }, + { + "epoch": 1.58, + "grad_norm": 8.426373999995219, + "learning_rate": 1.1373983860240146e-06, + "loss": 0.706, + "step": 11045 + }, + { + "epoch": 1.58, + "grad_norm": 8.083678466006383, + "learning_rate": 1.136665239005938e-06, + "loss": 0.7706, + "step": 11046 + }, + { + "epoch": 1.58, + "grad_norm": 8.994961175787632, + "learning_rate": 1.1359322980473835e-06, + "loss": 0.744, + "step": 11047 + }, + { + "epoch": 1.58, + "grad_norm": 9.092995360738175, + "learning_rate": 1.135199563187443e-06, + "loss": 0.7223, + "step": 11048 + }, + { + "epoch": 1.58, + "grad_norm": 7.466201294436766, + "learning_rate": 1.134467034465202e-06, + "loss": 0.6585, + "step": 11049 + }, + { + "epoch": 1.58, + "grad_norm": 8.922398894688333, + "learning_rate": 1.133734711919728e-06, + "loss": 0.7069, + "step": 11050 + }, + { + "epoch": 1.58, + "grad_norm": 
7.188866950055314, + "learning_rate": 1.1330025955900836e-06, + "loss": 0.6814, + "step": 11051 + }, + { + "epoch": 1.58, + "grad_norm": 8.5244858440189, + "learning_rate": 1.132270685515316e-06, + "loss": 0.6975, + "step": 11052 + }, + { + "epoch": 1.58, + "grad_norm": 9.71062681391709, + "learning_rate": 1.1315389817344652e-06, + "loss": 0.68, + "step": 11053 + }, + { + "epoch": 1.58, + "grad_norm": 9.804832439222826, + "learning_rate": 1.1308074842865575e-06, + "loss": 0.7105, + "step": 11054 + }, + { + "epoch": 1.58, + "grad_norm": 9.195894383043129, + "learning_rate": 1.1300761932106068e-06, + "loss": 0.7148, + "step": 11055 + }, + { + "epoch": 1.58, + "grad_norm": 8.962949207939593, + "learning_rate": 1.1293451085456214e-06, + "loss": 0.7563, + "step": 11056 + }, + { + "epoch": 1.58, + "grad_norm": 10.175691007401149, + "learning_rate": 1.1286142303305925e-06, + "loss": 0.6436, + "step": 11057 + }, + { + "epoch": 1.58, + "grad_norm": 9.172284610339583, + "learning_rate": 1.127883558604505e-06, + "loss": 0.6492, + "step": 11058 + }, + { + "epoch": 1.58, + "grad_norm": 9.020399421792769, + "learning_rate": 1.1271530934063312e-06, + "loss": 0.7075, + "step": 11059 + }, + { + "epoch": 1.58, + "grad_norm": 10.953283599573487, + "learning_rate": 1.1264228347750317e-06, + "loss": 0.7088, + "step": 11060 + }, + { + "epoch": 1.58, + "grad_norm": 8.691163289787657, + "learning_rate": 1.1256927827495557e-06, + "loss": 0.7135, + "step": 11061 + }, + { + "epoch": 1.58, + "grad_norm": 7.513028685681633, + "learning_rate": 1.1249629373688409e-06, + "loss": 0.5703, + "step": 11062 + }, + { + "epoch": 1.58, + "grad_norm": 10.270247114774696, + "learning_rate": 1.1242332986718168e-06, + "loss": 0.6943, + "step": 11063 + }, + { + "epoch": 1.58, + "grad_norm": 6.0995138558048385, + "learning_rate": 1.1235038666974024e-06, + "loss": 0.7219, + "step": 11064 + }, + { + "epoch": 1.58, + "grad_norm": 9.112298704542082, + "learning_rate": 1.1227746414844991e-06, + "loss": 0.7039, + 
"step": 11065 + }, + { + "epoch": 1.58, + "grad_norm": 8.347866908335028, + "learning_rate": 1.122045623072006e-06, + "loss": 0.7173, + "step": 11066 + }, + { + "epoch": 1.58, + "grad_norm": 9.460299071941671, + "learning_rate": 1.1213168114988048e-06, + "loss": 0.7021, + "step": 11067 + }, + { + "epoch": 1.58, + "grad_norm": 8.626348459145355, + "learning_rate": 1.1205882068037677e-06, + "loss": 0.7381, + "step": 11068 + }, + { + "epoch": 1.58, + "grad_norm": 7.075943497154645, + "learning_rate": 1.119859809025758e-06, + "loss": 0.7017, + "step": 11069 + }, + { + "epoch": 1.58, + "grad_norm": 8.406682725694722, + "learning_rate": 1.1191316182036243e-06, + "loss": 0.707, + "step": 11070 + }, + { + "epoch": 1.58, + "grad_norm": 8.626634666537852, + "learning_rate": 1.1184036343762088e-06, + "loss": 0.753, + "step": 11071 + }, + { + "epoch": 1.58, + "grad_norm": 11.501027763570486, + "learning_rate": 1.1176758575823382e-06, + "loss": 0.7922, + "step": 11072 + }, + { + "epoch": 1.58, + "grad_norm": 8.35172764301616, + "learning_rate": 1.1169482878608317e-06, + "loss": 0.7055, + "step": 11073 + }, + { + "epoch": 1.58, + "grad_norm": 12.413223688359187, + "learning_rate": 1.1162209252504952e-06, + "loss": 0.6952, + "step": 11074 + }, + { + "epoch": 1.58, + "grad_norm": 10.388174132680115, + "learning_rate": 1.1154937697901224e-06, + "loss": 0.7499, + "step": 11075 + }, + { + "epoch": 1.58, + "grad_norm": 15.615493095800907, + "learning_rate": 1.1147668215184997e-06, + "loss": 0.7285, + "step": 11076 + }, + { + "epoch": 1.58, + "grad_norm": 10.985358578498024, + "learning_rate": 1.114040080474401e-06, + "loss": 0.6548, + "step": 11077 + }, + { + "epoch": 1.58, + "grad_norm": 9.43328007115216, + "learning_rate": 1.1133135466965872e-06, + "loss": 0.7406, + "step": 11078 + }, + { + "epoch": 1.58, + "grad_norm": 8.926495419359885, + "learning_rate": 1.1125872202238108e-06, + "loss": 0.6942, + "step": 11079 + }, + { + "epoch": 1.58, + "grad_norm": 8.794526592640505, + 
"learning_rate": 1.1118611010948115e-06, + "loss": 0.7242, + "step": 11080 + }, + { + "epoch": 1.58, + "grad_norm": 13.091274387117444, + "learning_rate": 1.1111351893483168e-06, + "loss": 0.674, + "step": 11081 + }, + { + "epoch": 1.58, + "grad_norm": 9.41009195911938, + "learning_rate": 1.1104094850230484e-06, + "loss": 0.6971, + "step": 11082 + }, + { + "epoch": 1.58, + "grad_norm": 11.555127928119115, + "learning_rate": 1.1096839881577094e-06, + "loss": 0.7211, + "step": 11083 + }, + { + "epoch": 1.58, + "grad_norm": 11.401116282341233, + "learning_rate": 1.1089586987909996e-06, + "loss": 0.6894, + "step": 11084 + }, + { + "epoch": 1.58, + "grad_norm": 12.236274239323647, + "learning_rate": 1.1082336169616016e-06, + "loss": 0.6661, + "step": 11085 + }, + { + "epoch": 1.58, + "grad_norm": 9.773292444695546, + "learning_rate": 1.1075087427081883e-06, + "loss": 0.6945, + "step": 11086 + }, + { + "epoch": 1.58, + "grad_norm": 7.22410147150365, + "learning_rate": 1.106784076069425e-06, + "loss": 0.7274, + "step": 11087 + }, + { + "epoch": 1.58, + "grad_norm": 8.226554336020492, + "learning_rate": 1.1060596170839611e-06, + "loss": 0.6827, + "step": 11088 + }, + { + "epoch": 1.58, + "grad_norm": 9.356346552115276, + "learning_rate": 1.1053353657904398e-06, + "loss": 0.6711, + "step": 11089 + }, + { + "epoch": 1.58, + "grad_norm": 11.266833072404316, + "learning_rate": 1.1046113222274878e-06, + "loss": 0.7214, + "step": 11090 + }, + { + "epoch": 1.58, + "grad_norm": 10.83766134459724, + "learning_rate": 1.1038874864337262e-06, + "loss": 0.7405, + "step": 11091 + }, + { + "epoch": 1.58, + "grad_norm": 12.602356777093124, + "learning_rate": 1.1031638584477616e-06, + "loss": 0.7013, + "step": 11092 + }, + { + "epoch": 1.58, + "grad_norm": 6.931795098515442, + "learning_rate": 1.1024404383081882e-06, + "loss": 0.7573, + "step": 11093 + }, + { + "epoch": 1.58, + "grad_norm": 9.020954651601887, + "learning_rate": 1.101717226053594e-06, + "loss": 0.6973, + "step": 11094 + }, 
+ { + "epoch": 1.58, + "grad_norm": 7.225389456056052, + "learning_rate": 1.100994221722551e-06, + "loss": 0.6867, + "step": 11095 + }, + { + "epoch": 1.58, + "grad_norm": 9.041784502872389, + "learning_rate": 1.1002714253536234e-06, + "loss": 0.7562, + "step": 11096 + }, + { + "epoch": 1.58, + "grad_norm": 10.695590840452624, + "learning_rate": 1.0995488369853642e-06, + "loss": 0.7023, + "step": 11097 + }, + { + "epoch": 1.58, + "grad_norm": 9.645824384711865, + "learning_rate": 1.098826456656313e-06, + "loss": 0.7949, + "step": 11098 + }, + { + "epoch": 1.58, + "grad_norm": 10.812786178003925, + "learning_rate": 1.0981042844049995e-06, + "loss": 0.7395, + "step": 11099 + }, + { + "epoch": 1.58, + "grad_norm": 12.097209649668581, + "learning_rate": 1.0973823202699412e-06, + "loss": 0.7636, + "step": 11100 + }, + { + "epoch": 1.58, + "grad_norm": 10.137816949795502, + "learning_rate": 1.0966605642896473e-06, + "loss": 0.7039, + "step": 11101 + }, + { + "epoch": 1.58, + "grad_norm": 8.065637718761218, + "learning_rate": 1.0959390165026146e-06, + "loss": 0.7121, + "step": 11102 + }, + { + "epoch": 1.58, + "grad_norm": 9.76534026164748, + "learning_rate": 1.0952176769473266e-06, + "loss": 0.7343, + "step": 11103 + }, + { + "epoch": 1.58, + "grad_norm": 8.080325627947937, + "learning_rate": 1.0944965456622603e-06, + "loss": 0.7351, + "step": 11104 + }, + { + "epoch": 1.58, + "grad_norm": 7.816455103680369, + "learning_rate": 1.093775622685877e-06, + "loss": 0.7815, + "step": 11105 + }, + { + "epoch": 1.58, + "grad_norm": 11.109083211024005, + "learning_rate": 1.0930549080566278e-06, + "loss": 0.7649, + "step": 11106 + }, + { + "epoch": 1.58, + "grad_norm": 10.878149249762048, + "learning_rate": 1.0923344018129556e-06, + "loss": 0.6851, + "step": 11107 + }, + { + "epoch": 1.58, + "grad_norm": 10.28090292573865, + "learning_rate": 1.091614103993288e-06, + "loss": 0.7343, + "step": 11108 + }, + { + "epoch": 1.58, + "grad_norm": 8.523325720582884, + "learning_rate": 
1.0908940146360469e-06, + "loss": 0.7183, + "step": 11109 + }, + { + "epoch": 1.58, + "grad_norm": 6.982727976271659, + "learning_rate": 1.0901741337796363e-06, + "loss": 0.7245, + "step": 11110 + }, + { + "epoch": 1.58, + "grad_norm": 7.033329976162635, + "learning_rate": 1.0894544614624558e-06, + "loss": 0.7129, + "step": 11111 + }, + { + "epoch": 1.58, + "grad_norm": 8.326494091090405, + "learning_rate": 1.0887349977228895e-06, + "loss": 0.7045, + "step": 11112 + }, + { + "epoch": 1.58, + "grad_norm": 10.566682581391886, + "learning_rate": 1.0880157425993098e-06, + "loss": 0.7358, + "step": 11113 + }, + { + "epoch": 1.58, + "grad_norm": 12.657259137461066, + "learning_rate": 1.0872966961300823e-06, + "loss": 0.6685, + "step": 11114 + }, + { + "epoch": 1.59, + "grad_norm": 8.640796446654104, + "learning_rate": 1.0865778583535569e-06, + "loss": 0.7089, + "step": 11115 + }, + { + "epoch": 1.59, + "grad_norm": 8.007543013380598, + "learning_rate": 1.0858592293080755e-06, + "loss": 0.7657, + "step": 11116 + }, + { + "epoch": 1.59, + "grad_norm": 6.9365442877154955, + "learning_rate": 1.0851408090319693e-06, + "loss": 0.7455, + "step": 11117 + }, + { + "epoch": 1.59, + "grad_norm": 9.957274468482574, + "learning_rate": 1.0844225975635548e-06, + "loss": 0.666, + "step": 11118 + }, + { + "epoch": 1.59, + "grad_norm": 7.13246827446641, + "learning_rate": 1.0837045949411383e-06, + "loss": 0.6782, + "step": 11119 + }, + { + "epoch": 1.59, + "grad_norm": 13.355060676006826, + "learning_rate": 1.0829868012030197e-06, + "loss": 0.7717, + "step": 11120 + }, + { + "epoch": 1.59, + "grad_norm": 9.188812154144335, + "learning_rate": 1.0822692163874804e-06, + "loss": 0.7141, + "step": 11121 + }, + { + "epoch": 1.59, + "grad_norm": 7.897822314588599, + "learning_rate": 1.0815518405327974e-06, + "loss": 0.7009, + "step": 11122 + }, + { + "epoch": 1.59, + "grad_norm": 10.857931546852859, + "learning_rate": 1.0808346736772308e-06, + "loss": 0.7096, + "step": 11123 + }, + { + "epoch": 
1.59, + "grad_norm": 10.408621539814742, + "learning_rate": 1.0801177158590348e-06, + "loss": 0.7328, + "step": 11124 + }, + { + "epoch": 1.59, + "grad_norm": 8.762558306894707, + "learning_rate": 1.0794009671164484e-06, + "loss": 0.7684, + "step": 11125 + }, + { + "epoch": 1.59, + "grad_norm": 9.186063670671137, + "learning_rate": 1.0786844274877007e-06, + "loss": 0.7044, + "step": 11126 + }, + { + "epoch": 1.59, + "grad_norm": 9.801403600459425, + "learning_rate": 1.0779680970110117e-06, + "loss": 0.681, + "step": 11127 + }, + { + "epoch": 1.59, + "grad_norm": 8.069295753092486, + "learning_rate": 1.0772519757245859e-06, + "loss": 0.7115, + "step": 11128 + }, + { + "epoch": 1.59, + "grad_norm": 9.302234800616827, + "learning_rate": 1.0765360636666222e-06, + "loss": 0.7502, + "step": 11129 + }, + { + "epoch": 1.59, + "grad_norm": 8.390771993672798, + "learning_rate": 1.0758203608753038e-06, + "loss": 0.7464, + "step": 11130 + }, + { + "epoch": 1.59, + "grad_norm": 10.098455719040123, + "learning_rate": 1.0751048673888032e-06, + "loss": 0.6963, + "step": 11131 + }, + { + "epoch": 1.59, + "grad_norm": 9.7705706678532, + "learning_rate": 1.074389583245285e-06, + "loss": 0.6675, + "step": 11132 + }, + { + "epoch": 1.59, + "grad_norm": 8.47083347737612, + "learning_rate": 1.0736745084828982e-06, + "loss": 0.7353, + "step": 11133 + }, + { + "epoch": 1.59, + "grad_norm": 7.658359302253755, + "learning_rate": 1.0729596431397837e-06, + "loss": 0.8382, + "step": 11134 + }, + { + "epoch": 1.59, + "grad_norm": 9.34456031801109, + "learning_rate": 1.0722449872540725e-06, + "loss": 0.6797, + "step": 11135 + }, + { + "epoch": 1.59, + "grad_norm": 9.433517276918451, + "learning_rate": 1.071530540863881e-06, + "loss": 0.7009, + "step": 11136 + }, + { + "epoch": 1.59, + "grad_norm": 10.280480354985967, + "learning_rate": 1.070816304007315e-06, + "loss": 0.6806, + "step": 11137 + }, + { + "epoch": 1.59, + "grad_norm": 9.560490471822813, + "learning_rate": 1.0701022767224695e-06, + 
"loss": 0.6613, + "step": 11138 + }, + { + "epoch": 1.59, + "grad_norm": 7.409856498816324, + "learning_rate": 1.069388459047429e-06, + "loss": 0.6935, + "step": 11139 + }, + { + "epoch": 1.59, + "grad_norm": 12.585374701091556, + "learning_rate": 1.0686748510202688e-06, + "loss": 0.7415, + "step": 11140 + }, + { + "epoch": 1.59, + "grad_norm": 12.108498496955045, + "learning_rate": 1.0679614526790478e-06, + "loss": 0.6405, + "step": 11141 + }, + { + "epoch": 1.59, + "grad_norm": 7.0254256206441, + "learning_rate": 1.0672482640618193e-06, + "loss": 0.7117, + "step": 11142 + }, + { + "epoch": 1.59, + "grad_norm": 8.4982615279648, + "learning_rate": 1.066535285206622e-06, + "loss": 0.6912, + "step": 11143 + }, + { + "epoch": 1.59, + "grad_norm": 6.973363547037305, + "learning_rate": 1.0658225161514819e-06, + "loss": 0.6829, + "step": 11144 + }, + { + "epoch": 1.59, + "grad_norm": 8.106541863026932, + "learning_rate": 1.0651099569344193e-06, + "loss": 0.7724, + "step": 11145 + }, + { + "epoch": 1.59, + "grad_norm": 9.533601228480434, + "learning_rate": 1.0643976075934377e-06, + "loss": 0.719, + "step": 11146 + }, + { + "epoch": 1.59, + "grad_norm": 10.569228787213003, + "learning_rate": 1.0636854681665338e-06, + "loss": 0.7516, + "step": 11147 + }, + { + "epoch": 1.59, + "grad_norm": 11.674372239835728, + "learning_rate": 1.062973538691689e-06, + "loss": 0.7, + "step": 11148 + }, + { + "epoch": 1.59, + "grad_norm": 8.764629638885484, + "learning_rate": 1.0622618192068783e-06, + "loss": 0.6717, + "step": 11149 + }, + { + "epoch": 1.59, + "grad_norm": 7.80531710151323, + "learning_rate": 1.0615503097500612e-06, + "loss": 0.7332, + "step": 11150 + }, + { + "epoch": 1.59, + "grad_norm": 7.151663279497707, + "learning_rate": 1.0608390103591864e-06, + "loss": 0.7044, + "step": 11151 + }, + { + "epoch": 1.59, + "grad_norm": 8.728443145177609, + "learning_rate": 1.060127921072196e-06, + "loss": 0.7371, + "step": 11152 + }, + { + "epoch": 1.59, + "grad_norm": 
8.563877864278005, + "learning_rate": 1.0594170419270139e-06, + "loss": 0.6583, + "step": 11153 + }, + { + "epoch": 1.59, + "grad_norm": 8.864652427302198, + "learning_rate": 1.058706372961558e-06, + "loss": 0.7163, + "step": 11154 + }, + { + "epoch": 1.59, + "grad_norm": 9.511314964713472, + "learning_rate": 1.0579959142137347e-06, + "loss": 0.7448, + "step": 11155 + }, + { + "epoch": 1.59, + "grad_norm": 9.158765920440757, + "learning_rate": 1.057285665721437e-06, + "loss": 0.7303, + "step": 11156 + }, + { + "epoch": 1.59, + "grad_norm": 8.27843795411159, + "learning_rate": 1.0565756275225475e-06, + "loss": 0.7375, + "step": 11157 + }, + { + "epoch": 1.59, + "grad_norm": 12.081921527795535, + "learning_rate": 1.0558657996549355e-06, + "loss": 0.7222, + "step": 11158 + }, + { + "epoch": 1.59, + "grad_norm": 10.722838349926203, + "learning_rate": 1.0551561821564637e-06, + "loss": 0.7269, + "step": 11159 + }, + { + "epoch": 1.59, + "grad_norm": 10.621714536659049, + "learning_rate": 1.0544467750649818e-06, + "loss": 0.6456, + "step": 11160 + }, + { + "epoch": 1.59, + "grad_norm": 7.703580440157888, + "learning_rate": 1.0537375784183245e-06, + "loss": 0.7527, + "step": 11161 + }, + { + "epoch": 1.59, + "grad_norm": 16.139842624221597, + "learning_rate": 1.0530285922543221e-06, + "loss": 0.7073, + "step": 11162 + }, + { + "epoch": 1.59, + "grad_norm": 8.804966979370375, + "learning_rate": 1.0523198166107884e-06, + "loss": 0.7407, + "step": 11163 + }, + { + "epoch": 1.59, + "grad_norm": 7.185248902569849, + "learning_rate": 1.0516112515255256e-06, + "loss": 0.7102, + "step": 11164 + }, + { + "epoch": 1.59, + "grad_norm": 10.182086534270873, + "learning_rate": 1.0509028970363294e-06, + "loss": 0.6993, + "step": 11165 + }, + { + "epoch": 1.59, + "grad_norm": 8.038130824777427, + "learning_rate": 1.0501947531809792e-06, + "loss": 0.7851, + "step": 11166 + }, + { + "epoch": 1.59, + "grad_norm": 8.030514234190115, + "learning_rate": 1.0494868199972464e-06, + "loss": 0.762, 
+ "step": 11167 + }, + { + "epoch": 1.59, + "grad_norm": 10.440355675663533, + "learning_rate": 1.0487790975228928e-06, + "loss": 0.7343, + "step": 11168 + }, + { + "epoch": 1.59, + "grad_norm": 13.53302312741022, + "learning_rate": 1.0480715857956614e-06, + "loss": 0.7412, + "step": 11169 + }, + { + "epoch": 1.59, + "grad_norm": 8.971067528048984, + "learning_rate": 1.0473642848532923e-06, + "loss": 0.7114, + "step": 11170 + }, + { + "epoch": 1.59, + "grad_norm": 7.042705128731034, + "learning_rate": 1.0466571947335092e-06, + "loss": 0.7077, + "step": 11171 + }, + { + "epoch": 1.59, + "grad_norm": 8.361877071181437, + "learning_rate": 1.0459503154740268e-06, + "loss": 0.7152, + "step": 11172 + }, + { + "epoch": 1.59, + "grad_norm": 8.999200885236506, + "learning_rate": 1.04524364711255e-06, + "loss": 0.7192, + "step": 11173 + }, + { + "epoch": 1.59, + "grad_norm": 8.07039864296559, + "learning_rate": 1.0445371896867684e-06, + "loss": 0.6385, + "step": 11174 + }, + { + "epoch": 1.59, + "grad_norm": 9.55067657358977, + "learning_rate": 1.0438309432343636e-06, + "loss": 0.7005, + "step": 11175 + }, + { + "epoch": 1.59, + "grad_norm": 8.714076854592573, + "learning_rate": 1.0431249077930029e-06, + "loss": 0.7136, + "step": 11176 + }, + { + "epoch": 1.59, + "grad_norm": 9.82415295640716, + "learning_rate": 1.042419083400345e-06, + "loss": 0.7171, + "step": 11177 + }, + { + "epoch": 1.59, + "grad_norm": 9.081573709754256, + "learning_rate": 1.041713470094039e-06, + "loss": 0.7267, + "step": 11178 + }, + { + "epoch": 1.59, + "grad_norm": 10.29165634059102, + "learning_rate": 1.0410080679117168e-06, + "loss": 0.6362, + "step": 11179 + }, + { + "epoch": 1.59, + "grad_norm": 12.904915639983967, + "learning_rate": 1.040302876891006e-06, + "loss": 0.7073, + "step": 11180 + }, + { + "epoch": 1.59, + "grad_norm": 8.550505473220761, + "learning_rate": 1.0395978970695175e-06, + "loss": 0.774, + "step": 11181 + }, + { + "epoch": 1.59, + "grad_norm": 7.274447563385892, + 
"learning_rate": 1.0388931284848524e-06, + "loss": 0.6355, + "step": 11182 + }, + { + "epoch": 1.59, + "grad_norm": 7.977020261059579, + "learning_rate": 1.0381885711746031e-06, + "loss": 0.738, + "step": 11183 + }, + { + "epoch": 1.59, + "grad_norm": 7.6207736699715465, + "learning_rate": 1.0374842251763468e-06, + "loss": 0.7476, + "step": 11184 + }, + { + "epoch": 1.6, + "grad_norm": 14.49760672974162, + "learning_rate": 1.0367800905276532e-06, + "loss": 0.7289, + "step": 11185 + }, + { + "epoch": 1.6, + "grad_norm": 9.682835528899204, + "learning_rate": 1.0360761672660763e-06, + "loss": 0.7646, + "step": 11186 + }, + { + "epoch": 1.6, + "grad_norm": 8.356899459972123, + "learning_rate": 1.0353724554291645e-06, + "loss": 0.6734, + "step": 11187 + }, + { + "epoch": 1.6, + "grad_norm": 11.832608070973203, + "learning_rate": 1.0346689550544504e-06, + "loss": 0.6686, + "step": 11188 + }, + { + "epoch": 1.6, + "grad_norm": 10.170276558879864, + "learning_rate": 1.033965666179455e-06, + "loss": 0.7402, + "step": 11189 + }, + { + "epoch": 1.6, + "grad_norm": 7.2279306367472715, + "learning_rate": 1.033262588841693e-06, + "loss": 0.7272, + "step": 11190 + }, + { + "epoch": 1.6, + "grad_norm": 9.821430441441755, + "learning_rate": 1.0325597230786616e-06, + "loss": 0.7475, + "step": 11191 + }, + { + "epoch": 1.6, + "grad_norm": 9.033708187243821, + "learning_rate": 1.0318570689278513e-06, + "loss": 0.6642, + "step": 11192 + }, + { + "epoch": 1.6, + "grad_norm": 9.730938750710497, + "learning_rate": 1.0311546264267408e-06, + "loss": 0.6652, + "step": 11193 + }, + { + "epoch": 1.6, + "grad_norm": 9.087281807953318, + "learning_rate": 1.0304523956127948e-06, + "loss": 0.6911, + "step": 11194 + }, + { + "epoch": 1.6, + "grad_norm": 6.619476126707359, + "learning_rate": 1.0297503765234696e-06, + "loss": 0.7, + "step": 11195 + }, + { + "epoch": 1.6, + "grad_norm": 8.473973943059978, + "learning_rate": 1.029048569196206e-06, + "loss": 0.61, + "step": 11196 + }, + { + "epoch": 
1.6, + "grad_norm": 9.42254608654228, + "learning_rate": 1.0283469736684388e-06, + "loss": 0.6794, + "step": 11197 + }, + { + "epoch": 1.6, + "grad_norm": 7.324113376479315, + "learning_rate": 1.02764558997759e-06, + "loss": 0.6649, + "step": 11198 + }, + { + "epoch": 1.6, + "grad_norm": 8.772210776317765, + "learning_rate": 1.0269444181610665e-06, + "loss": 0.6945, + "step": 11199 + }, + { + "epoch": 1.6, + "grad_norm": 8.481800870066687, + "learning_rate": 1.0262434582562708e-06, + "loss": 0.7051, + "step": 11200 + }, + { + "epoch": 1.6, + "grad_norm": 10.253461834768148, + "learning_rate": 1.0255427103005877e-06, + "loss": 0.7, + "step": 11201 + }, + { + "epoch": 1.6, + "grad_norm": 9.581226234340852, + "learning_rate": 1.0248421743313923e-06, + "loss": 0.6861, + "step": 11202 + }, + { + "epoch": 1.6, + "grad_norm": 9.782922692431608, + "learning_rate": 1.0241418503860518e-06, + "loss": 0.682, + "step": 11203 + }, + { + "epoch": 1.6, + "grad_norm": 10.929068504372514, + "learning_rate": 1.0234417385019164e-06, + "loss": 0.6962, + "step": 11204 + }, + { + "epoch": 1.6, + "grad_norm": 14.04147903526189, + "learning_rate": 1.0227418387163295e-06, + "loss": 0.7355, + "step": 11205 + }, + { + "epoch": 1.6, + "grad_norm": 9.968469908473288, + "learning_rate": 1.0220421510666245e-06, + "loss": 0.7079, + "step": 11206 + }, + { + "epoch": 1.6, + "grad_norm": 9.142494132126071, + "learning_rate": 1.0213426755901179e-06, + "loss": 0.7252, + "step": 11207 + }, + { + "epoch": 1.6, + "grad_norm": 11.517425019046462, + "learning_rate": 1.0206434123241182e-06, + "loss": 0.6874, + "step": 11208 + }, + { + "epoch": 1.6, + "grad_norm": 9.43895700201895, + "learning_rate": 1.0199443613059213e-06, + "loss": 0.6983, + "step": 11209 + }, + { + "epoch": 1.6, + "grad_norm": 9.443848544436518, + "learning_rate": 1.019245522572813e-06, + "loss": 0.7141, + "step": 11210 + }, + { + "epoch": 1.6, + "grad_norm": 7.870564393458404, + "learning_rate": 1.0185468961620697e-06, + "loss": 0.6881, + 
"step": 11211 + }, + { + "epoch": 1.6, + "grad_norm": 9.01841375176804, + "learning_rate": 1.017848482110953e-06, + "loss": 0.7139, + "step": 11212 + }, + { + "epoch": 1.6, + "grad_norm": 7.788536403126377, + "learning_rate": 1.0171502804567134e-06, + "loss": 0.7716, + "step": 11213 + }, + { + "epoch": 1.6, + "grad_norm": 12.002823697340794, + "learning_rate": 1.0164522912365898e-06, + "loss": 0.6659, + "step": 11214 + }, + { + "epoch": 1.6, + "grad_norm": 6.756058600983969, + "learning_rate": 1.0157545144878133e-06, + "loss": 0.6493, + "step": 11215 + }, + { + "epoch": 1.6, + "grad_norm": 7.676913548743156, + "learning_rate": 1.015056950247602e-06, + "loss": 0.7241, + "step": 11216 + }, + { + "epoch": 1.6, + "grad_norm": 12.124707138599138, + "learning_rate": 1.0143595985531595e-06, + "loss": 0.8141, + "step": 11217 + }, + { + "epoch": 1.6, + "grad_norm": 11.986810681080344, + "learning_rate": 1.0136624594416828e-06, + "loss": 0.732, + "step": 11218 + }, + { + "epoch": 1.6, + "grad_norm": 8.498391275960493, + "learning_rate": 1.012965532950355e-06, + "loss": 0.7324, + "step": 11219 + }, + { + "epoch": 1.6, + "grad_norm": 10.816515676137167, + "learning_rate": 1.012268819116346e-06, + "loss": 0.7187, + "step": 11220 + }, + { + "epoch": 1.6, + "grad_norm": 7.2893281806741, + "learning_rate": 1.01157231797682e-06, + "loss": 0.6946, + "step": 11221 + }, + { + "epoch": 1.6, + "grad_norm": 10.835803629987916, + "learning_rate": 1.010876029568923e-06, + "loss": 0.664, + "step": 11222 + }, + { + "epoch": 1.6, + "grad_norm": 11.215674860498565, + "learning_rate": 1.0101799539297962e-06, + "loss": 0.6958, + "step": 11223 + }, + { + "epoch": 1.6, + "grad_norm": 10.573937510336402, + "learning_rate": 1.0094840910965637e-06, + "loss": 0.7468, + "step": 11224 + }, + { + "epoch": 1.6, + "grad_norm": 8.96738691941448, + "learning_rate": 1.0087884411063432e-06, + "loss": 0.6629, + "step": 11225 + }, + { + "epoch": 1.6, + "grad_norm": 14.24483456102522, + "learning_rate": 
1.0080930039962379e-06, + "loss": 0.7216, + "step": 11226 + }, + { + "epoch": 1.6, + "grad_norm": 11.324878646316673, + "learning_rate": 1.0073977798033385e-06, + "loss": 0.689, + "step": 11227 + }, + { + "epoch": 1.6, + "grad_norm": 8.555505733069442, + "learning_rate": 1.0067027685647297e-06, + "loss": 0.7058, + "step": 11228 + }, + { + "epoch": 1.6, + "grad_norm": 7.103271999545947, + "learning_rate": 1.0060079703174786e-06, + "loss": 0.7182, + "step": 11229 + }, + { + "epoch": 1.6, + "grad_norm": 7.531960860716394, + "learning_rate": 1.0053133850986447e-06, + "loss": 0.6192, + "step": 11230 + }, + { + "epoch": 1.6, + "grad_norm": 9.82626747572318, + "learning_rate": 1.0046190129452771e-06, + "loss": 0.6741, + "step": 11231 + }, + { + "epoch": 1.6, + "grad_norm": 11.816850159418513, + "learning_rate": 1.0039248538944101e-06, + "loss": 0.7387, + "step": 11232 + }, + { + "epoch": 1.6, + "grad_norm": 12.172730206187993, + "learning_rate": 1.0032309079830683e-06, + "loss": 0.7049, + "step": 11233 + }, + { + "epoch": 1.6, + "grad_norm": 8.534135386046925, + "learning_rate": 1.0025371752482632e-06, + "loss": 0.7556, + "step": 11234 + }, + { + "epoch": 1.6, + "grad_norm": 12.97351212964945, + "learning_rate": 1.0018436557269984e-06, + "loss": 0.6605, + "step": 11235 + }, + { + "epoch": 1.6, + "grad_norm": 9.955016542680967, + "learning_rate": 1.0011503494562651e-06, + "loss": 0.746, + "step": 11236 + }, + { + "epoch": 1.6, + "grad_norm": 8.661578619629344, + "learning_rate": 1.0004572564730403e-06, + "loss": 0.7074, + "step": 11237 + }, + { + "epoch": 1.6, + "grad_norm": 8.591845976269079, + "learning_rate": 9.997643768142933e-07, + "loss": 0.7648, + "step": 11238 + }, + { + "epoch": 1.6, + "grad_norm": 11.111322209627707, + "learning_rate": 9.990717105169806e-07, + "loss": 0.7609, + "step": 11239 + }, + { + "epoch": 1.6, + "grad_norm": 8.854564176746743, + "learning_rate": 9.983792576180444e-07, + "loss": 0.6784, + "step": 11240 + }, + { + "epoch": 1.6, + "grad_norm": 
9.91834079585425, + "learning_rate": 9.976870181544212e-07, + "loss": 0.7125, + "step": 11241 + }, + { + "epoch": 1.6, + "grad_norm": 10.714135501655454, + "learning_rate": 9.969949921630312e-07, + "loss": 0.8264, + "step": 11242 + }, + { + "epoch": 1.6, + "grad_norm": 9.648053793936732, + "learning_rate": 9.963031796807865e-07, + "loss": 0.6754, + "step": 11243 + }, + { + "epoch": 1.6, + "grad_norm": 12.04672210044696, + "learning_rate": 9.956115807445849e-07, + "loss": 0.6307, + "step": 11244 + }, + { + "epoch": 1.6, + "grad_norm": 10.059927994297148, + "learning_rate": 9.949201953913168e-07, + "loss": 0.7319, + "step": 11245 + }, + { + "epoch": 1.6, + "grad_norm": 5.9317530745927405, + "learning_rate": 9.942290236578567e-07, + "loss": 0.7032, + "step": 11246 + }, + { + "epoch": 1.6, + "grad_norm": 10.058957463402537, + "learning_rate": 9.935380655810695e-07, + "loss": 0.6591, + "step": 11247 + }, + { + "epoch": 1.6, + "grad_norm": 7.220104487069397, + "learning_rate": 9.928473211978096e-07, + "loss": 0.771, + "step": 11248 + }, + { + "epoch": 1.6, + "grad_norm": 7.808536897248862, + "learning_rate": 9.921567905449208e-07, + "loss": 0.7535, + "step": 11249 + }, + { + "epoch": 1.6, + "grad_norm": 8.653930641045605, + "learning_rate": 9.914664736592318e-07, + "loss": 0.7374, + "step": 11250 + }, + { + "epoch": 1.6, + "grad_norm": 11.666668127617884, + "learning_rate": 9.907763705775659e-07, + "loss": 0.7477, + "step": 11251 + }, + { + "epoch": 1.6, + "grad_norm": 10.056275396530138, + "learning_rate": 9.900864813367261e-07, + "loss": 0.7728, + "step": 11252 + }, + { + "epoch": 1.6, + "grad_norm": 7.856436741606718, + "learning_rate": 9.893968059735114e-07, + "loss": 0.7406, + "step": 11253 + }, + { + "epoch": 1.6, + "grad_norm": 10.652356294633796, + "learning_rate": 9.88707344524709e-07, + "loss": 0.7539, + "step": 11254 + }, + { + "epoch": 1.61, + "grad_norm": 7.068683772505508, + "learning_rate": 9.880180970270902e-07, + "loss": 0.6972, + "step": 11255 + }, + { 
+ "epoch": 1.61, + "grad_norm": 10.363495141801437, + "learning_rate": 9.873290635174193e-07, + "loss": 0.7069, + "step": 11256 + }, + { + "epoch": 1.61, + "grad_norm": 7.507130465069668, + "learning_rate": 9.866402440324474e-07, + "loss": 0.6975, + "step": 11257 + }, + { + "epoch": 1.61, + "grad_norm": 11.128624107978716, + "learning_rate": 9.859516386089124e-07, + "loss": 0.7214, + "step": 11258 + }, + { + "epoch": 1.61, + "grad_norm": 9.242984845402134, + "learning_rate": 9.85263247283545e-07, + "loss": 0.6897, + "step": 11259 + }, + { + "epoch": 1.61, + "grad_norm": 9.038594425349205, + "learning_rate": 9.845750700930595e-07, + "loss": 0.6937, + "step": 11260 + }, + { + "epoch": 1.61, + "grad_norm": 12.246372090132756, + "learning_rate": 9.83887107074164e-07, + "loss": 0.732, + "step": 11261 + }, + { + "epoch": 1.61, + "grad_norm": 9.83026821787045, + "learning_rate": 9.831993582635502e-07, + "loss": 0.6613, + "step": 11262 + }, + { + "epoch": 1.61, + "grad_norm": 9.639809059665506, + "learning_rate": 9.825118236979025e-07, + "loss": 0.6518, + "step": 11263 + }, + { + "epoch": 1.61, + "grad_norm": 8.725481668520654, + "learning_rate": 9.818245034138918e-07, + "loss": 0.6605, + "step": 11264 + }, + { + "epoch": 1.61, + "grad_norm": 8.774167750772353, + "learning_rate": 9.811373974481764e-07, + "loss": 0.6873, + "step": 11265 + }, + { + "epoch": 1.61, + "grad_norm": 8.440887546652743, + "learning_rate": 9.804505058374065e-07, + "loss": 0.6823, + "step": 11266 + }, + { + "epoch": 1.61, + "grad_norm": 10.127858370162118, + "learning_rate": 9.797638286182172e-07, + "loss": 0.718, + "step": 11267 + }, + { + "epoch": 1.61, + "grad_norm": 10.62474498315093, + "learning_rate": 9.790773658272346e-07, + "loss": 0.7268, + "step": 11268 + }, + { + "epoch": 1.61, + "grad_norm": 7.793288530163588, + "learning_rate": 9.78391117501074e-07, + "loss": 0.7232, + "step": 11269 + }, + { + "epoch": 1.61, + "grad_norm": 8.862045767524911, + "learning_rate": 9.777050836763375e-07, + 
"loss": 0.6541, + "step": 11270 + }, + { + "epoch": 1.61, + "grad_norm": 11.632558264627804, + "learning_rate": 9.770192643896154e-07, + "loss": 0.7297, + "step": 11271 + }, + { + "epoch": 1.61, + "grad_norm": 12.100896301966667, + "learning_rate": 9.763336596774865e-07, + "loss": 0.6881, + "step": 11272 + }, + { + "epoch": 1.61, + "grad_norm": 10.53957903627139, + "learning_rate": 9.756482695765202e-07, + "loss": 0.7203, + "step": 11273 + }, + { + "epoch": 1.61, + "grad_norm": 8.78518053792495, + "learning_rate": 9.749630941232746e-07, + "loss": 0.7109, + "step": 11274 + }, + { + "epoch": 1.61, + "grad_norm": 9.038290382403803, + "learning_rate": 9.742781333542928e-07, + "loss": 0.674, + "step": 11275 + }, + { + "epoch": 1.61, + "grad_norm": 8.130441434479604, + "learning_rate": 9.73593387306111e-07, + "loss": 0.7231, + "step": 11276 + }, + { + "epoch": 1.61, + "grad_norm": 7.880161705985519, + "learning_rate": 9.7290885601525e-07, + "loss": 0.7717, + "step": 11277 + }, + { + "epoch": 1.61, + "grad_norm": 11.966765687969612, + "learning_rate": 9.7222453951822e-07, + "loss": 0.6709, + "step": 11278 + }, + { + "epoch": 1.61, + "grad_norm": 11.397982746386148, + "learning_rate": 9.715404378515226e-07, + "loss": 0.6707, + "step": 11279 + }, + { + "epoch": 1.61, + "grad_norm": 12.091876085096564, + "learning_rate": 9.708565510516444e-07, + "loss": 0.6795, + "step": 11280 + }, + { + "epoch": 1.61, + "grad_norm": 11.317872057895396, + "learning_rate": 9.70172879155063e-07, + "loss": 0.7815, + "step": 11281 + }, + { + "epoch": 1.61, + "grad_norm": 11.405526373285362, + "learning_rate": 9.694894221982425e-07, + "loss": 0.6175, + "step": 11282 + }, + { + "epoch": 1.61, + "grad_norm": 9.470691395575509, + "learning_rate": 9.688061802176385e-07, + "loss": 0.6693, + "step": 11283 + }, + { + "epoch": 1.61, + "grad_norm": 8.409444263953658, + "learning_rate": 9.681231532496915e-07, + "loss": 0.7035, + "step": 11284 + }, + { + "epoch": 1.61, + "grad_norm": 12.25589041539741, + 
"learning_rate": 9.674403413308313e-07, + "loss": 0.6696, + "step": 11285 + }, + { + "epoch": 1.61, + "grad_norm": 8.93649878679306, + "learning_rate": 9.667577444974807e-07, + "loss": 0.7097, + "step": 11286 + }, + { + "epoch": 1.61, + "grad_norm": 7.466549538857498, + "learning_rate": 9.660753627860435e-07, + "loss": 0.6982, + "step": 11287 + }, + { + "epoch": 1.61, + "grad_norm": 11.171182377126778, + "learning_rate": 9.653931962329182e-07, + "loss": 0.6974, + "step": 11288 + }, + { + "epoch": 1.61, + "grad_norm": 9.731219135217025, + "learning_rate": 9.647112448744906e-07, + "loss": 0.6586, + "step": 11289 + }, + { + "epoch": 1.61, + "grad_norm": 8.257398478500486, + "learning_rate": 9.640295087471325e-07, + "loss": 0.7108, + "step": 11290 + }, + { + "epoch": 1.61, + "grad_norm": 8.425954369387231, + "learning_rate": 9.633479878872055e-07, + "loss": 0.6988, + "step": 11291 + }, + { + "epoch": 1.61, + "grad_norm": 13.15075061838997, + "learning_rate": 9.626666823310615e-07, + "loss": 0.7887, + "step": 11292 + }, + { + "epoch": 1.61, + "grad_norm": 7.979666641407246, + "learning_rate": 9.619855921150378e-07, + "loss": 0.7234, + "step": 11293 + }, + { + "epoch": 1.61, + "grad_norm": 12.027332032643558, + "learning_rate": 9.61304717275464e-07, + "loss": 0.74, + "step": 11294 + }, + { + "epoch": 1.61, + "grad_norm": 10.453782659852093, + "learning_rate": 9.606240578486536e-07, + "loss": 0.7409, + "step": 11295 + }, + { + "epoch": 1.61, + "grad_norm": 14.289265119102891, + "learning_rate": 9.599436138709134e-07, + "loss": 0.8179, + "step": 11296 + }, + { + "epoch": 1.61, + "grad_norm": 11.061310024049561, + "learning_rate": 9.59263385378535e-07, + "loss": 0.7504, + "step": 11297 + }, + { + "epoch": 1.61, + "grad_norm": 10.023138831994608, + "learning_rate": 9.585833724077992e-07, + "loss": 0.687, + "step": 11298 + }, + { + "epoch": 1.61, + "grad_norm": 11.866959120016196, + "learning_rate": 9.579035749949779e-07, + "loss": 0.6817, + "step": 11299 + }, + { + "epoch": 
1.61, + "grad_norm": 9.061867063613285, + "learning_rate": 9.572239931763277e-07, + "loss": 0.7187, + "step": 11300 + }, + { + "epoch": 1.61, + "grad_norm": 8.688733566564546, + "learning_rate": 9.56544626988098e-07, + "loss": 0.6842, + "step": 11301 + }, + { + "epoch": 1.61, + "grad_norm": 9.1106312201917, + "learning_rate": 9.558654764665226e-07, + "loss": 0.7701, + "step": 11302 + }, + { + "epoch": 1.61, + "grad_norm": 7.195967269693654, + "learning_rate": 9.551865416478239e-07, + "loss": 0.7165, + "step": 11303 + }, + { + "epoch": 1.61, + "grad_norm": 8.912098168725388, + "learning_rate": 9.545078225682186e-07, + "loss": 0.6692, + "step": 11304 + }, + { + "epoch": 1.61, + "grad_norm": 11.146025704385336, + "learning_rate": 9.538293192639037e-07, + "loss": 0.6744, + "step": 11305 + }, + { + "epoch": 1.61, + "grad_norm": 11.150010135343427, + "learning_rate": 9.531510317710701e-07, + "loss": 0.7052, + "step": 11306 + }, + { + "epoch": 1.61, + "grad_norm": 9.377560558334281, + "learning_rate": 9.52472960125897e-07, + "loss": 0.6732, + "step": 11307 + }, + { + "epoch": 1.61, + "grad_norm": 8.46432425377264, + "learning_rate": 9.517951043645507e-07, + "loss": 0.6625, + "step": 11308 + }, + { + "epoch": 1.61, + "grad_norm": 13.39039076142565, + "learning_rate": 9.511174645231846e-07, + "loss": 0.6956, + "step": 11309 + }, + { + "epoch": 1.61, + "grad_norm": 10.403760722776596, + "learning_rate": 9.504400406379416e-07, + "loss": 0.7117, + "step": 11310 + }, + { + "epoch": 1.61, + "grad_norm": 8.561281947429421, + "learning_rate": 9.497628327449548e-07, + "loss": 0.6094, + "step": 11311 + }, + { + "epoch": 1.61, + "grad_norm": 13.134734712027225, + "learning_rate": 9.49085840880346e-07, + "loss": 0.6591, + "step": 11312 + }, + { + "epoch": 1.61, + "grad_norm": 7.863353946203953, + "learning_rate": 9.484090650802214e-07, + "loss": 0.7574, + "step": 11313 + }, + { + "epoch": 1.61, + "grad_norm": 8.293330706249673, + "learning_rate": 9.477325053806808e-07, + "loss": 
0.6591, + "step": 11314 + }, + { + "epoch": 1.61, + "grad_norm": 11.328976116712003, + "learning_rate": 9.470561618178087e-07, + "loss": 0.6996, + "step": 11315 + }, + { + "epoch": 1.61, + "grad_norm": 7.585284336467879, + "learning_rate": 9.463800344276785e-07, + "loss": 0.6702, + "step": 11316 + }, + { + "epoch": 1.61, + "grad_norm": 8.329049115957053, + "learning_rate": 9.457041232463549e-07, + "loss": 0.6379, + "step": 11317 + }, + { + "epoch": 1.61, + "grad_norm": 7.466142883444238, + "learning_rate": 9.450284283098871e-07, + "loss": 0.7247, + "step": 11318 + }, + { + "epoch": 1.61, + "grad_norm": 9.512456915739884, + "learning_rate": 9.443529496543169e-07, + "loss": 0.669, + "step": 11319 + }, + { + "epoch": 1.61, + "grad_norm": 10.202252504031694, + "learning_rate": 9.436776873156706e-07, + "loss": 0.7023, + "step": 11320 + }, + { + "epoch": 1.61, + "grad_norm": 10.073016495667993, + "learning_rate": 9.430026413299664e-07, + "loss": 0.7232, + "step": 11321 + }, + { + "epoch": 1.61, + "grad_norm": 8.548048851617866, + "learning_rate": 9.423278117332085e-07, + "loss": 0.8045, + "step": 11322 + }, + { + "epoch": 1.61, + "grad_norm": 7.779308143494608, + "learning_rate": 9.416531985613897e-07, + "loss": 0.6824, + "step": 11323 + }, + { + "epoch": 1.61, + "grad_norm": 10.966795857904808, + "learning_rate": 9.40978801850494e-07, + "loss": 0.7525, + "step": 11324 + }, + { + "epoch": 1.62, + "grad_norm": 8.459300329282055, + "learning_rate": 9.403046216364892e-07, + "loss": 0.721, + "step": 11325 + }, + { + "epoch": 1.62, + "grad_norm": 8.12857742177377, + "learning_rate": 9.396306579553354e-07, + "loss": 0.6662, + "step": 11326 + }, + { + "epoch": 1.62, + "grad_norm": 9.389312292585966, + "learning_rate": 9.389569108429819e-07, + "loss": 0.7234, + "step": 11327 + }, + { + "epoch": 1.62, + "grad_norm": 11.24903964450817, + "learning_rate": 9.382833803353625e-07, + "loss": 0.6646, + "step": 11328 + }, + { + "epoch": 1.62, + "grad_norm": 7.594527424488198, + 
"learning_rate": 9.376100664684018e-07, + "loss": 0.7227, + "step": 11329 + }, + { + "epoch": 1.62, + "grad_norm": 8.926220925295393, + "learning_rate": 9.369369692780112e-07, + "loss": 0.6952, + "step": 11330 + }, + { + "epoch": 1.62, + "grad_norm": 9.477806571427204, + "learning_rate": 9.362640888000929e-07, + "loss": 0.6424, + "step": 11331 + }, + { + "epoch": 1.62, + "grad_norm": 8.048197165057495, + "learning_rate": 9.355914250705378e-07, + "loss": 0.7447, + "step": 11332 + }, + { + "epoch": 1.62, + "grad_norm": 8.890497492284831, + "learning_rate": 9.349189781252216e-07, + "loss": 0.6794, + "step": 11333 + }, + { + "epoch": 1.62, + "grad_norm": 11.724647700743809, + "learning_rate": 9.342467480000134e-07, + "loss": 0.7366, + "step": 11334 + }, + { + "epoch": 1.62, + "grad_norm": 8.952147437407218, + "learning_rate": 9.335747347307666e-07, + "loss": 0.6838, + "step": 11335 + }, + { + "epoch": 1.62, + "grad_norm": 7.090016195946126, + "learning_rate": 9.329029383533228e-07, + "loss": 0.6836, + "step": 11336 + }, + { + "epoch": 1.62, + "grad_norm": 8.499008849793436, + "learning_rate": 9.32231358903517e-07, + "loss": 0.7222, + "step": 11337 + }, + { + "epoch": 1.62, + "grad_norm": 7.639341245396352, + "learning_rate": 9.315599964171662e-07, + "loss": 0.7007, + "step": 11338 + }, + { + "epoch": 1.62, + "grad_norm": 7.145213891483478, + "learning_rate": 9.308888509300807e-07, + "loss": 0.6799, + "step": 11339 + }, + { + "epoch": 1.62, + "grad_norm": 8.727113488399228, + "learning_rate": 9.302179224780605e-07, + "loss": 0.6433, + "step": 11340 + }, + { + "epoch": 1.62, + "grad_norm": 8.13358810203085, + "learning_rate": 9.29547211096885e-07, + "loss": 0.7426, + "step": 11341 + }, + { + "epoch": 1.62, + "grad_norm": 10.74839116702949, + "learning_rate": 9.288767168223322e-07, + "loss": 0.6555, + "step": 11342 + }, + { + "epoch": 1.62, + "grad_norm": 10.296749534844478, + "learning_rate": 9.282064396901625e-07, + "loss": 0.7491, + "step": 11343 + }, + { + "epoch": 
1.62, + "grad_norm": 9.312968882260169, + "learning_rate": 9.27536379736127e-07, + "loss": 0.7165, + "step": 11344 + }, + { + "epoch": 1.62, + "grad_norm": 8.620064164032152, + "learning_rate": 9.268665369959661e-07, + "loss": 0.724, + "step": 11345 + }, + { + "epoch": 1.62, + "grad_norm": 8.934406351719202, + "learning_rate": 9.26196911505407e-07, + "loss": 0.7387, + "step": 11346 + }, + { + "epoch": 1.62, + "grad_norm": 7.391013490013043, + "learning_rate": 9.255275033001654e-07, + "loss": 0.6618, + "step": 11347 + }, + { + "epoch": 1.62, + "grad_norm": 9.320567853810555, + "learning_rate": 9.248583124159438e-07, + "loss": 0.6739, + "step": 11348 + }, + { + "epoch": 1.62, + "grad_norm": 7.633740502425439, + "learning_rate": 9.241893388884365e-07, + "loss": 0.7589, + "step": 11349 + }, + { + "epoch": 1.62, + "grad_norm": 8.964700287912974, + "learning_rate": 9.235205827533262e-07, + "loss": 0.6671, + "step": 11350 + }, + { + "epoch": 1.62, + "grad_norm": 8.11518644575943, + "learning_rate": 9.228520440462796e-07, + "loss": 0.7135, + "step": 11351 + }, + { + "epoch": 1.62, + "grad_norm": 8.525834451446407, + "learning_rate": 9.221837228029579e-07, + "loss": 0.6569, + "step": 11352 + }, + { + "epoch": 1.62, + "grad_norm": 10.24783069498861, + "learning_rate": 9.215156190590053e-07, + "loss": 0.6771, + "step": 11353 + }, + { + "epoch": 1.62, + "grad_norm": 8.284348774181222, + "learning_rate": 9.208477328500559e-07, + "loss": 0.659, + "step": 11354 + }, + { + "epoch": 1.62, + "grad_norm": 15.75314592425119, + "learning_rate": 9.201800642117354e-07, + "loss": 0.7437, + "step": 11355 + }, + { + "epoch": 1.62, + "grad_norm": 10.755663876666508, + "learning_rate": 9.195126131796523e-07, + "loss": 0.702, + "step": 11356 + }, + { + "epoch": 1.62, + "grad_norm": 8.384311687167155, + "learning_rate": 9.188453797894098e-07, + "loss": 0.7045, + "step": 11357 + }, + { + "epoch": 1.62, + "grad_norm": 9.536733446686943, + "learning_rate": 9.18178364076594e-07, + "loss": 0.692, + 
"step": 11358 + }, + { + "epoch": 1.62, + "grad_norm": 8.884383415837458, + "learning_rate": 9.17511566076783e-07, + "loss": 0.639, + "step": 11359 + }, + { + "epoch": 1.62, + "grad_norm": 9.161759426861657, + "learning_rate": 9.168449858255418e-07, + "loss": 0.663, + "step": 11360 + }, + { + "epoch": 1.62, + "grad_norm": 8.326752379087958, + "learning_rate": 9.161786233584219e-07, + "loss": 0.6559, + "step": 11361 + }, + { + "epoch": 1.62, + "grad_norm": 13.044773890708722, + "learning_rate": 9.155124787109687e-07, + "loss": 0.792, + "step": 11362 + }, + { + "epoch": 1.62, + "grad_norm": 7.8488869297151425, + "learning_rate": 9.148465519187089e-07, + "loss": 0.6441, + "step": 11363 + }, + { + "epoch": 1.62, + "grad_norm": 9.31278306765664, + "learning_rate": 9.141808430171628e-07, + "loss": 0.6905, + "step": 11364 + }, + { + "epoch": 1.62, + "grad_norm": 9.310398462902986, + "learning_rate": 9.135153520418394e-07, + "loss": 0.6882, + "step": 11365 + }, + { + "epoch": 1.62, + "grad_norm": 9.329862056708912, + "learning_rate": 9.128500790282319e-07, + "loss": 0.6731, + "step": 11366 + }, + { + "epoch": 1.62, + "grad_norm": 9.70384240089305, + "learning_rate": 9.121850240118246e-07, + "loss": 0.7228, + "step": 11367 + }, + { + "epoch": 1.62, + "grad_norm": 7.919621636700522, + "learning_rate": 9.115201870280882e-07, + "loss": 0.7307, + "step": 11368 + }, + { + "epoch": 1.62, + "grad_norm": 9.80444957266486, + "learning_rate": 9.108555681124848e-07, + "loss": 0.6925, + "step": 11369 + }, + { + "epoch": 1.62, + "grad_norm": 9.756924928391859, + "learning_rate": 9.101911673004643e-07, + "loss": 0.7264, + "step": 11370 + }, + { + "epoch": 1.62, + "grad_norm": 13.272698098565629, + "learning_rate": 9.095269846274618e-07, + "loss": 0.6946, + "step": 11371 + }, + { + "epoch": 1.62, + "grad_norm": 8.214707652308945, + "learning_rate": 9.088630201289056e-07, + "loss": 0.7695, + "step": 11372 + }, + { + "epoch": 1.62, + "grad_norm": 7.116969816410877, + "learning_rate": 
9.081992738402079e-07, + "loss": 0.6766, + "step": 11373 + }, + { + "epoch": 1.62, + "grad_norm": 9.068679145155384, + "learning_rate": 9.075357457967699e-07, + "loss": 0.6288, + "step": 11374 + }, + { + "epoch": 1.62, + "grad_norm": 10.927733924497538, + "learning_rate": 9.068724360339848e-07, + "loss": 0.7231, + "step": 11375 + }, + { + "epoch": 1.62, + "grad_norm": 12.831371820597578, + "learning_rate": 9.0620934458723e-07, + "loss": 0.7135, + "step": 11376 + }, + { + "epoch": 1.62, + "grad_norm": 8.587951730198325, + "learning_rate": 9.055464714918733e-07, + "loss": 0.6936, + "step": 11377 + }, + { + "epoch": 1.62, + "grad_norm": 7.693555228701561, + "learning_rate": 9.048838167832719e-07, + "loss": 0.7557, + "step": 11378 + }, + { + "epoch": 1.62, + "grad_norm": 10.596103501210147, + "learning_rate": 9.042213804967693e-07, + "loss": 0.6734, + "step": 11379 + }, + { + "epoch": 1.62, + "grad_norm": 8.55050066389662, + "learning_rate": 9.03559162667697e-07, + "loss": 0.696, + "step": 11380 + }, + { + "epoch": 1.62, + "grad_norm": 6.879978874981294, + "learning_rate": 9.02897163331376e-07, + "loss": 0.7269, + "step": 11381 + }, + { + "epoch": 1.62, + "grad_norm": 8.153312289046212, + "learning_rate": 9.022353825231151e-07, + "loss": 0.7899, + "step": 11382 + }, + { + "epoch": 1.62, + "grad_norm": 12.536256826711828, + "learning_rate": 9.015738202782143e-07, + "loss": 0.7449, + "step": 11383 + }, + { + "epoch": 1.62, + "grad_norm": 8.909916904438765, + "learning_rate": 9.009124766319582e-07, + "loss": 0.6808, + "step": 11384 + }, + { + "epoch": 1.62, + "grad_norm": 7.2273477850799805, + "learning_rate": 9.002513516196204e-07, + "loss": 0.6405, + "step": 11385 + }, + { + "epoch": 1.62, + "grad_norm": 8.512578556436306, + "learning_rate": 8.995904452764626e-07, + "loss": 0.7502, + "step": 11386 + }, + { + "epoch": 1.62, + "grad_norm": 11.601127160057215, + "learning_rate": 8.989297576377371e-07, + "loss": 0.7481, + "step": 11387 + }, + { + "epoch": 1.62, + 
"grad_norm": 12.514566942048035, + "learning_rate": 8.982692887386835e-07, + "loss": 0.8148, + "step": 11388 + }, + { + "epoch": 1.62, + "grad_norm": 8.582108721689444, + "learning_rate": 8.976090386145281e-07, + "loss": 0.6854, + "step": 11389 + }, + { + "epoch": 1.62, + "grad_norm": 9.241851003120136, + "learning_rate": 8.969490073004883e-07, + "loss": 0.7688, + "step": 11390 + }, + { + "epoch": 1.62, + "grad_norm": 10.65631694484045, + "learning_rate": 8.962891948317681e-07, + "loss": 0.7475, + "step": 11391 + }, + { + "epoch": 1.62, + "grad_norm": 9.19090135481819, + "learning_rate": 8.956296012435578e-07, + "loss": 0.7302, + "step": 11392 + }, + { + "epoch": 1.62, + "grad_norm": 8.654884792825056, + "learning_rate": 8.949702265710413e-07, + "loss": 0.71, + "step": 11393 + }, + { + "epoch": 1.62, + "grad_norm": 8.412503060022917, + "learning_rate": 8.943110708493858e-07, + "loss": 0.7701, + "step": 11394 + }, + { + "epoch": 1.63, + "grad_norm": 13.040188199226542, + "learning_rate": 8.9365213411375e-07, + "loss": 0.7537, + "step": 11395 + }, + { + "epoch": 1.63, + "grad_norm": 9.079697518403888, + "learning_rate": 8.929934163992782e-07, + "loss": 0.6654, + "step": 11396 + }, + { + "epoch": 1.63, + "grad_norm": 8.880137008342421, + "learning_rate": 8.923349177411073e-07, + "loss": 0.6732, + "step": 11397 + }, + { + "epoch": 1.63, + "grad_norm": 11.09132427880024, + "learning_rate": 8.916766381743581e-07, + "loss": 0.7021, + "step": 11398 + }, + { + "epoch": 1.63, + "grad_norm": 9.096560813075387, + "learning_rate": 8.910185777341396e-07, + "loss": 0.7075, + "step": 11399 + }, + { + "epoch": 1.63, + "grad_norm": 9.831498629987717, + "learning_rate": 8.903607364555544e-07, + "loss": 0.7307, + "step": 11400 + }, + { + "epoch": 1.63, + "grad_norm": 11.105299660587596, + "learning_rate": 8.897031143736867e-07, + "loss": 0.7138, + "step": 11401 + }, + { + "epoch": 1.63, + "grad_norm": 9.734409639834237, + "learning_rate": 8.890457115236139e-07, + "loss": 0.7593, + 
"step": 11402 + }, + { + "epoch": 1.63, + "grad_norm": 7.102538558224855, + "learning_rate": 8.883885279404014e-07, + "loss": 0.6738, + "step": 11403 + }, + { + "epoch": 1.63, + "grad_norm": 10.615072147809665, + "learning_rate": 8.877315636591e-07, + "loss": 0.7108, + "step": 11404 + }, + { + "epoch": 1.63, + "grad_norm": 12.694376298469543, + "learning_rate": 8.870748187147504e-07, + "loss": 0.7332, + "step": 11405 + }, + { + "epoch": 1.63, + "grad_norm": 9.607706552895735, + "learning_rate": 8.864182931423804e-07, + "loss": 0.7333, + "step": 11406 + }, + { + "epoch": 1.63, + "grad_norm": 10.474701872609177, + "learning_rate": 8.857619869770078e-07, + "loss": 0.674, + "step": 11407 + }, + { + "epoch": 1.63, + "grad_norm": 11.067564421713282, + "learning_rate": 8.851059002536405e-07, + "loss": 0.7141, + "step": 11408 + }, + { + "epoch": 1.63, + "grad_norm": 8.274725795180865, + "learning_rate": 8.844500330072697e-07, + "loss": 0.7257, + "step": 11409 + }, + { + "epoch": 1.63, + "grad_norm": 9.17355818326701, + "learning_rate": 8.837943852728792e-07, + "loss": 0.7445, + "step": 11410 + }, + { + "epoch": 1.63, + "grad_norm": 8.346114051902038, + "learning_rate": 8.831389570854388e-07, + "loss": 0.7423, + "step": 11411 + }, + { + "epoch": 1.63, + "grad_norm": 10.13712073511089, + "learning_rate": 8.824837484799059e-07, + "loss": 0.7241, + "step": 11412 + }, + { + "epoch": 1.63, + "grad_norm": 9.79626900151635, + "learning_rate": 8.818287594912295e-07, + "loss": 0.6357, + "step": 11413 + }, + { + "epoch": 1.63, + "grad_norm": 11.232438009300369, + "learning_rate": 8.811739901543431e-07, + "loss": 0.7346, + "step": 11414 + }, + { + "epoch": 1.63, + "grad_norm": 10.77331309660794, + "learning_rate": 8.805194405041728e-07, + "loss": 0.6922, + "step": 11415 + }, + { + "epoch": 1.63, + "grad_norm": 10.191840436788716, + "learning_rate": 8.798651105756273e-07, + "loss": 0.8162, + "step": 11416 + }, + { + "epoch": 1.63, + "grad_norm": 8.861573235319952, + "learning_rate": 
8.792110004036092e-07, + "loss": 0.7765, + "step": 11417 + }, + { + "epoch": 1.63, + "grad_norm": 8.189516355752664, + "learning_rate": 8.785571100230067e-07, + "loss": 0.6653, + "step": 11418 + }, + { + "epoch": 1.63, + "grad_norm": 9.417874874565205, + "learning_rate": 8.779034394686941e-07, + "loss": 0.6892, + "step": 11419 + }, + { + "epoch": 1.63, + "grad_norm": 8.411598031492135, + "learning_rate": 8.772499887755387e-07, + "loss": 0.67, + "step": 11420 + }, + { + "epoch": 1.63, + "grad_norm": 8.133326466127276, + "learning_rate": 8.765967579783941e-07, + "loss": 0.7333, + "step": 11421 + }, + { + "epoch": 1.63, + "grad_norm": 8.820279606575626, + "learning_rate": 8.759437471120997e-07, + "loss": 0.7956, + "step": 11422 + }, + { + "epoch": 1.63, + "grad_norm": 9.635111363162983, + "learning_rate": 8.752909562114886e-07, + "loss": 0.6491, + "step": 11423 + }, + { + "epoch": 1.63, + "grad_norm": 7.652898799174416, + "learning_rate": 8.746383853113744e-07, + "loss": 0.7055, + "step": 11424 + }, + { + "epoch": 1.63, + "grad_norm": 8.331454799847943, + "learning_rate": 8.739860344465656e-07, + "loss": 0.7326, + "step": 11425 + }, + { + "epoch": 1.63, + "grad_norm": 8.70668294095172, + "learning_rate": 8.733339036518585e-07, + "loss": 0.7636, + "step": 11426 + }, + { + "epoch": 1.63, + "grad_norm": 9.397401303467603, + "learning_rate": 8.726819929620323e-07, + "loss": 0.7217, + "step": 11427 + }, + { + "epoch": 1.63, + "grad_norm": 9.270085239095897, + "learning_rate": 8.72030302411862e-07, + "loss": 0.7149, + "step": 11428 + }, + { + "epoch": 1.63, + "grad_norm": 9.230528795398985, + "learning_rate": 8.71378832036105e-07, + "loss": 0.754, + "step": 11429 + }, + { + "epoch": 1.63, + "grad_norm": 9.478399636052721, + "learning_rate": 8.707275818695077e-07, + "loss": 0.6593, + "step": 11430 + }, + { + "epoch": 1.63, + "grad_norm": 9.689106361096709, + "learning_rate": 8.700765519468086e-07, + "loss": 0.7152, + "step": 11431 + }, + { + "epoch": 1.63, + "grad_norm": 
8.477401020781443, + "learning_rate": 8.694257423027291e-07, + "loss": 0.7337, + "step": 11432 + }, + { + "epoch": 1.63, + "grad_norm": 8.724794066750155, + "learning_rate": 8.687751529719845e-07, + "loss": 0.7575, + "step": 11433 + }, + { + "epoch": 1.63, + "grad_norm": 11.31339308302382, + "learning_rate": 8.681247839892725e-07, + "loss": 0.7137, + "step": 11434 + }, + { + "epoch": 1.63, + "grad_norm": 9.691689059540506, + "learning_rate": 8.674746353892843e-07, + "loss": 0.7284, + "step": 11435 + }, + { + "epoch": 1.63, + "grad_norm": 15.442945394561292, + "learning_rate": 8.668247072066965e-07, + "loss": 0.8202, + "step": 11436 + }, + { + "epoch": 1.63, + "grad_norm": 10.361583552259225, + "learning_rate": 8.661749994761726e-07, + "loss": 0.682, + "step": 11437 + }, + { + "epoch": 1.63, + "grad_norm": 11.101738326730954, + "learning_rate": 8.655255122323691e-07, + "loss": 0.7565, + "step": 11438 + }, + { + "epoch": 1.63, + "grad_norm": 9.647472537749112, + "learning_rate": 8.648762455099246e-07, + "loss": 0.7431, + "step": 11439 + }, + { + "epoch": 1.63, + "grad_norm": 9.905330480784862, + "learning_rate": 8.642271993434714e-07, + "loss": 0.7337, + "step": 11440 + }, + { + "epoch": 1.63, + "grad_norm": 10.856915314744557, + "learning_rate": 8.63578373767629e-07, + "loss": 0.6837, + "step": 11441 + }, + { + "epoch": 1.63, + "grad_norm": 6.607373314020652, + "learning_rate": 8.629297688170018e-07, + "loss": 0.6827, + "step": 11442 + }, + { + "epoch": 1.63, + "grad_norm": 10.742394345974258, + "learning_rate": 8.622813845261851e-07, + "loss": 0.7163, + "step": 11443 + }, + { + "epoch": 1.63, + "grad_norm": 9.626904259654378, + "learning_rate": 8.616332209297612e-07, + "loss": 0.7074, + "step": 11444 + }, + { + "epoch": 1.63, + "grad_norm": 7.922966087592792, + "learning_rate": 8.609852780623012e-07, + "loss": 0.716, + "step": 11445 + }, + { + "epoch": 1.63, + "grad_norm": 6.876442846837149, + "learning_rate": 8.603375559583676e-07, + "loss": 0.6871, + "step": 
11446 + }, + { + "epoch": 1.63, + "grad_norm": 13.876866507958285, + "learning_rate": 8.596900546525044e-07, + "loss": 0.7413, + "step": 11447 + }, + { + "epoch": 1.63, + "grad_norm": 9.335122837999638, + "learning_rate": 8.590427741792501e-07, + "loss": 0.6924, + "step": 11448 + }, + { + "epoch": 1.63, + "grad_norm": 9.348585252716576, + "learning_rate": 8.583957145731275e-07, + "loss": 0.6814, + "step": 11449 + }, + { + "epoch": 1.63, + "grad_norm": 8.508419309437672, + "learning_rate": 8.577488758686486e-07, + "loss": 0.7081, + "step": 11450 + }, + { + "epoch": 1.63, + "grad_norm": 9.735253159977743, + "learning_rate": 8.571022581003158e-07, + "loss": 0.7038, + "step": 11451 + }, + { + "epoch": 1.63, + "grad_norm": 10.6779150356406, + "learning_rate": 8.564558613026153e-07, + "loss": 0.7622, + "step": 11452 + }, + { + "epoch": 1.63, + "grad_norm": 9.686523411865014, + "learning_rate": 8.558096855100267e-07, + "loss": 0.6737, + "step": 11453 + }, + { + "epoch": 1.63, + "grad_norm": 9.531749650784173, + "learning_rate": 8.551637307570126e-07, + "loss": 0.75, + "step": 11454 + }, + { + "epoch": 1.63, + "grad_norm": 8.825097150352718, + "learning_rate": 8.545179970780298e-07, + "loss": 0.7876, + "step": 11455 + }, + { + "epoch": 1.63, + "grad_norm": 7.850354782494738, + "learning_rate": 8.538724845075175e-07, + "loss": 0.6795, + "step": 11456 + }, + { + "epoch": 1.63, + "grad_norm": 9.719845929778725, + "learning_rate": 8.532271930799046e-07, + "loss": 0.7403, + "step": 11457 + }, + { + "epoch": 1.63, + "grad_norm": 10.178216342747927, + "learning_rate": 8.525821228296127e-07, + "loss": 0.7072, + "step": 11458 + }, + { + "epoch": 1.63, + "grad_norm": 6.985193011437252, + "learning_rate": 8.519372737910442e-07, + "loss": 0.7375, + "step": 11459 + }, + { + "epoch": 1.63, + "grad_norm": 11.328409943002882, + "learning_rate": 8.512926459985949e-07, + "loss": 0.7127, + "step": 11460 + }, + { + "epoch": 1.63, + "grad_norm": 8.424150559642346, + "learning_rate": 
8.506482394866495e-07, + "loss": 0.7187, + "step": 11461 + }, + { + "epoch": 1.63, + "grad_norm": 7.743590229973749, + "learning_rate": 8.500040542895771e-07, + "loss": 0.8263, + "step": 11462 + }, + { + "epoch": 1.63, + "grad_norm": 11.40667922909339, + "learning_rate": 8.49360090441736e-07, + "loss": 0.7359, + "step": 11463 + }, + { + "epoch": 1.63, + "grad_norm": 12.63206020512985, + "learning_rate": 8.487163479774752e-07, + "loss": 0.7291, + "step": 11464 + }, + { + "epoch": 1.64, + "grad_norm": 8.409845166768134, + "learning_rate": 8.480728269311278e-07, + "loss": 0.6527, + "step": 11465 + }, + { + "epoch": 1.64, + "grad_norm": 13.166009316247829, + "learning_rate": 8.474295273370203e-07, + "loss": 0.7528, + "step": 11466 + }, + { + "epoch": 1.64, + "grad_norm": 10.293833773776537, + "learning_rate": 8.467864492294631e-07, + "loss": 0.732, + "step": 11467 + }, + { + "epoch": 1.64, + "grad_norm": 11.530432180048725, + "learning_rate": 8.461435926427547e-07, + "loss": 0.6892, + "step": 11468 + }, + { + "epoch": 1.64, + "grad_norm": 8.523285997188676, + "learning_rate": 8.455009576111861e-07, + "loss": 0.7223, + "step": 11469 + }, + { + "epoch": 1.64, + "grad_norm": 9.48978069618434, + "learning_rate": 8.448585441690316e-07, + "loss": 0.7492, + "step": 11470 + }, + { + "epoch": 1.64, + "grad_norm": 9.756023159170132, + "learning_rate": 8.44216352350557e-07, + "loss": 0.6241, + "step": 11471 + }, + { + "epoch": 1.64, + "grad_norm": 9.163626159589976, + "learning_rate": 8.435743821900133e-07, + "loss": 0.6865, + "step": 11472 + }, + { + "epoch": 1.64, + "grad_norm": 9.348791078925924, + "learning_rate": 8.429326337216437e-07, + "loss": 0.75, + "step": 11473 + }, + { + "epoch": 1.64, + "grad_norm": 8.442704136259216, + "learning_rate": 8.422911069796763e-07, + "loss": 0.7136, + "step": 11474 + }, + { + "epoch": 1.64, + "grad_norm": 10.486528163251714, + "learning_rate": 8.416498019983272e-07, + "loss": 0.6906, + "step": 11475 + }, + { + "epoch": 1.64, + "grad_norm": 
6.121521241351451, + "learning_rate": 8.410087188118038e-07, + "loss": 0.732, + "step": 11476 + }, + { + "epoch": 1.64, + "grad_norm": 11.832166212171092, + "learning_rate": 8.403678574542972e-07, + "loss": 0.7394, + "step": 11477 + }, + { + "epoch": 1.64, + "grad_norm": 11.745835979590774, + "learning_rate": 8.397272179599914e-07, + "loss": 0.6177, + "step": 11478 + }, + { + "epoch": 1.64, + "grad_norm": 10.583074524858523, + "learning_rate": 8.390868003630564e-07, + "loss": 0.6535, + "step": 11479 + }, + { + "epoch": 1.64, + "grad_norm": 12.477381416184807, + "learning_rate": 8.384466046976491e-07, + "loss": 0.6874, + "step": 11480 + }, + { + "epoch": 1.64, + "grad_norm": 11.020078109870493, + "learning_rate": 8.378066309979166e-07, + "loss": 0.6868, + "step": 11481 + }, + { + "epoch": 1.64, + "grad_norm": 13.253122302122355, + "learning_rate": 8.37166879297992e-07, + "loss": 0.7674, + "step": 11482 + }, + { + "epoch": 1.64, + "grad_norm": 8.45591145059835, + "learning_rate": 8.365273496319981e-07, + "loss": 0.7165, + "step": 11483 + }, + { + "epoch": 1.64, + "grad_norm": 10.053818016190672, + "learning_rate": 8.358880420340476e-07, + "loss": 0.7141, + "step": 11484 + }, + { + "epoch": 1.64, + "grad_norm": 8.831273425110545, + "learning_rate": 8.352489565382371e-07, + "loss": 0.6995, + "step": 11485 + }, + { + "epoch": 1.64, + "grad_norm": 7.288329340211753, + "learning_rate": 8.346100931786554e-07, + "loss": 0.6566, + "step": 11486 + }, + { + "epoch": 1.64, + "grad_norm": 10.446073451836707, + "learning_rate": 8.339714519893771e-07, + "loss": 0.6719, + "step": 11487 + }, + { + "epoch": 1.64, + "grad_norm": 11.093266195821638, + "learning_rate": 8.333330330044642e-07, + "loss": 0.6517, + "step": 11488 + }, + { + "epoch": 1.64, + "grad_norm": 9.915338519045655, + "learning_rate": 8.326948362579707e-07, + "loss": 0.7835, + "step": 11489 + }, + { + "epoch": 1.64, + "grad_norm": 11.146937308496039, + "learning_rate": 8.320568617839331e-07, + "loss": 0.6931, + "step": 
11490 + }, + { + "epoch": 1.64, + "grad_norm": 8.945728020557961, + "learning_rate": 8.314191096163827e-07, + "loss": 0.735, + "step": 11491 + }, + { + "epoch": 1.64, + "grad_norm": 8.48593376737134, + "learning_rate": 8.30781579789332e-07, + "loss": 0.7719, + "step": 11492 + }, + { + "epoch": 1.64, + "grad_norm": 8.687273234271547, + "learning_rate": 8.301442723367881e-07, + "loss": 0.7131, + "step": 11493 + }, + { + "epoch": 1.64, + "grad_norm": 8.011891509453957, + "learning_rate": 8.295071872927419e-07, + "loss": 0.6627, + "step": 11494 + }, + { + "epoch": 1.64, + "grad_norm": 11.530269614634038, + "learning_rate": 8.288703246911728e-07, + "loss": 0.7615, + "step": 11495 + }, + { + "epoch": 1.64, + "grad_norm": 11.058498261015076, + "learning_rate": 8.28233684566051e-07, + "loss": 0.6773, + "step": 11496 + }, + { + "epoch": 1.64, + "grad_norm": 9.707206402467017, + "learning_rate": 8.27597266951331e-07, + "loss": 0.7102, + "step": 11497 + }, + { + "epoch": 1.64, + "grad_norm": 8.31768275887598, + "learning_rate": 8.26961071880959e-07, + "loss": 0.7322, + "step": 11498 + }, + { + "epoch": 1.64, + "grad_norm": 7.948326508137688, + "learning_rate": 8.263250993888688e-07, + "loss": 0.7043, + "step": 11499 + }, + { + "epoch": 1.64, + "grad_norm": 6.295303855089282, + "learning_rate": 8.256893495089802e-07, + "loss": 0.6489, + "step": 11500 + }, + { + "epoch": 1.64, + "grad_norm": 8.948905418845563, + "learning_rate": 8.250538222752031e-07, + "loss": 0.696, + "step": 11501 + }, + { + "epoch": 1.64, + "grad_norm": 8.264172476762127, + "learning_rate": 8.24418517721432e-07, + "loss": 0.7442, + "step": 11502 + }, + { + "epoch": 1.64, + "grad_norm": 8.600059205350847, + "learning_rate": 8.237834358815544e-07, + "loss": 0.6489, + "step": 11503 + }, + { + "epoch": 1.64, + "grad_norm": 8.783488422995536, + "learning_rate": 8.231485767894453e-07, + "loss": 0.6455, + "step": 11504 + }, + { + "epoch": 1.64, + "grad_norm": 9.047007100917849, + "learning_rate": 
8.225139404789639e-07, + "loss": 0.7684, + "step": 11505 + }, + { + "epoch": 1.64, + "grad_norm": 13.392551077824372, + "learning_rate": 8.218795269839613e-07, + "loss": 0.7118, + "step": 11506 + }, + { + "epoch": 1.64, + "grad_norm": 9.581568894375273, + "learning_rate": 8.212453363382755e-07, + "loss": 0.6556, + "step": 11507 + }, + { + "epoch": 1.64, + "grad_norm": 7.756105066212409, + "learning_rate": 8.2061136857573e-07, + "loss": 0.711, + "step": 11508 + }, + { + "epoch": 1.64, + "grad_norm": 9.20579141122074, + "learning_rate": 8.199776237301421e-07, + "loss": 0.7655, + "step": 11509 + }, + { + "epoch": 1.64, + "grad_norm": 10.639126516275237, + "learning_rate": 8.193441018353115e-07, + "loss": 0.7125, + "step": 11510 + }, + { + "epoch": 1.64, + "grad_norm": 9.021417240505771, + "learning_rate": 8.187108029250307e-07, + "loss": 0.7205, + "step": 11511 + }, + { + "epoch": 1.64, + "grad_norm": 12.383823731085824, + "learning_rate": 8.18077727033077e-07, + "loss": 0.7143, + "step": 11512 + }, + { + "epoch": 1.64, + "grad_norm": 10.495910520023692, + "learning_rate": 8.174448741932156e-07, + "loss": 0.6568, + "step": 11513 + }, + { + "epoch": 1.64, + "grad_norm": 8.741937561440997, + "learning_rate": 8.168122444392041e-07, + "loss": 0.7911, + "step": 11514 + }, + { + "epoch": 1.64, + "grad_norm": 9.400984718204972, + "learning_rate": 8.161798378047819e-07, + "loss": 0.6876, + "step": 11515 + }, + { + "epoch": 1.64, + "grad_norm": 8.168042378343737, + "learning_rate": 8.155476543236812e-07, + "loss": 0.7042, + "step": 11516 + }, + { + "epoch": 1.64, + "grad_norm": 12.543955876071992, + "learning_rate": 8.149156940296226e-07, + "loss": 0.7013, + "step": 11517 + }, + { + "epoch": 1.64, + "grad_norm": 8.479329466761794, + "learning_rate": 8.142839569563115e-07, + "loss": 0.7077, + "step": 11518 + }, + { + "epoch": 1.64, + "grad_norm": 9.61792333228589, + "learning_rate": 8.136524431374432e-07, + "loss": 0.7657, + "step": 11519 + }, + { + "epoch": 1.64, + 
"grad_norm": 9.02679658499815, + "learning_rate": 8.130211526066995e-07, + "loss": 0.6977, + "step": 11520 + }, + { + "epoch": 1.64, + "grad_norm": 9.167658551298565, + "learning_rate": 8.123900853977529e-07, + "loss": 0.7041, + "step": 11521 + }, + { + "epoch": 1.64, + "grad_norm": 11.143257756160716, + "learning_rate": 8.117592415442644e-07, + "loss": 0.7214, + "step": 11522 + }, + { + "epoch": 1.64, + "grad_norm": 8.14378935385247, + "learning_rate": 8.11128621079878e-07, + "loss": 0.7056, + "step": 11523 + }, + { + "epoch": 1.64, + "grad_norm": 12.41493171066679, + "learning_rate": 8.104982240382331e-07, + "loss": 0.7515, + "step": 11524 + }, + { + "epoch": 1.64, + "grad_norm": 8.424367625115577, + "learning_rate": 8.09868050452951e-07, + "loss": 0.6562, + "step": 11525 + }, + { + "epoch": 1.64, + "grad_norm": 9.384480177382954, + "learning_rate": 8.092381003576427e-07, + "loss": 0.6938, + "step": 11526 + }, + { + "epoch": 1.64, + "grad_norm": 8.066630541173296, + "learning_rate": 8.086083737859102e-07, + "loss": 0.6408, + "step": 11527 + }, + { + "epoch": 1.64, + "grad_norm": 8.369515808181278, + "learning_rate": 8.079788707713393e-07, + "loss": 0.6543, + "step": 11528 + }, + { + "epoch": 1.64, + "grad_norm": 9.04441801048947, + "learning_rate": 8.073495913475076e-07, + "loss": 0.77, + "step": 11529 + }, + { + "epoch": 1.64, + "grad_norm": 9.640459621865912, + "learning_rate": 8.067205355479774e-07, + "loss": 0.7452, + "step": 11530 + }, + { + "epoch": 1.64, + "grad_norm": 8.377284606532704, + "learning_rate": 8.060917034063031e-07, + "loss": 0.6531, + "step": 11531 + }, + { + "epoch": 1.64, + "grad_norm": 9.810791827713594, + "learning_rate": 8.054630949560232e-07, + "loss": 0.7017, + "step": 11532 + }, + { + "epoch": 1.64, + "grad_norm": 8.517470408188014, + "learning_rate": 8.048347102306653e-07, + "loss": 0.7838, + "step": 11533 + }, + { + "epoch": 1.64, + "grad_norm": 12.20617018015417, + "learning_rate": 8.04206549263748e-07, + "loss": 0.6902, + "step": 
11534 + }, + { + "epoch": 1.65, + "grad_norm": 8.096867152620566, + "learning_rate": 8.035786120887728e-07, + "loss": 0.7201, + "step": 11535 + }, + { + "epoch": 1.65, + "grad_norm": 8.53327700421446, + "learning_rate": 8.029508987392337e-07, + "loss": 0.6624, + "step": 11536 + }, + { + "epoch": 1.65, + "grad_norm": 7.3049179359941245, + "learning_rate": 8.02323409248612e-07, + "loss": 0.6506, + "step": 11537 + }, + { + "epoch": 1.65, + "grad_norm": 7.657997977354245, + "learning_rate": 8.016961436503756e-07, + "loss": 0.6966, + "step": 11538 + }, + { + "epoch": 1.65, + "grad_norm": 12.041524562044163, + "learning_rate": 8.010691019779809e-07, + "loss": 0.7452, + "step": 11539 + }, + { + "epoch": 1.65, + "grad_norm": 11.386205304187653, + "learning_rate": 8.004422842648713e-07, + "loss": 0.6977, + "step": 11540 + }, + { + "epoch": 1.65, + "grad_norm": 6.516455814115324, + "learning_rate": 7.998156905444804e-07, + "loss": 0.7234, + "step": 11541 + }, + { + "epoch": 1.65, + "grad_norm": 10.412310106884242, + "learning_rate": 7.99189320850231e-07, + "loss": 0.6966, + "step": 11542 + }, + { + "epoch": 1.65, + "grad_norm": 11.595477950445764, + "learning_rate": 7.985631752155287e-07, + "loss": 0.7312, + "step": 11543 + }, + { + "epoch": 1.65, + "grad_norm": 8.808382589563204, + "learning_rate": 7.979372536737728e-07, + "loss": 0.7033, + "step": 11544 + }, + { + "epoch": 1.65, + "grad_norm": 9.689718270489903, + "learning_rate": 7.973115562583478e-07, + "loss": 0.6775, + "step": 11545 + }, + { + "epoch": 1.65, + "grad_norm": 8.06319790244795, + "learning_rate": 7.966860830026246e-07, + "loss": 0.7518, + "step": 11546 + }, + { + "epoch": 1.65, + "grad_norm": 14.32117343888266, + "learning_rate": 7.960608339399666e-07, + "loss": 0.7239, + "step": 11547 + }, + { + "epoch": 1.65, + "grad_norm": 10.269901032644489, + "learning_rate": 7.954358091037212e-07, + "loss": 0.6805, + "step": 11548 + }, + { + "epoch": 1.65, + "grad_norm": 11.232522574673395, + "learning_rate": 
7.948110085272265e-07, + "loss": 0.6878, + "step": 11549 + }, + { + "epoch": 1.65, + "grad_norm": 8.493333318383165, + "learning_rate": 7.941864322438081e-07, + "loss": 0.6788, + "step": 11550 + }, + { + "epoch": 1.65, + "grad_norm": 10.369779853402038, + "learning_rate": 7.935620802867789e-07, + "loss": 0.6362, + "step": 11551 + }, + { + "epoch": 1.65, + "grad_norm": 9.050522807355321, + "learning_rate": 7.929379526894393e-07, + "loss": 0.6526, + "step": 11552 + }, + { + "epoch": 1.65, + "grad_norm": 8.513556314134735, + "learning_rate": 7.923140494850778e-07, + "loss": 0.6884, + "step": 11553 + }, + { + "epoch": 1.65, + "grad_norm": 11.085022665134359, + "learning_rate": 7.916903707069723e-07, + "loss": 0.729, + "step": 11554 + }, + { + "epoch": 1.65, + "grad_norm": 9.851101644122108, + "learning_rate": 7.910669163883899e-07, + "loss": 0.6731, + "step": 11555 + }, + { + "epoch": 1.65, + "grad_norm": 11.107003110979658, + "learning_rate": 7.90443686562583e-07, + "loss": 0.7229, + "step": 11556 + }, + { + "epoch": 1.65, + "grad_norm": 11.161406752184678, + "learning_rate": 7.898206812627923e-07, + "loss": 0.7008, + "step": 11557 + }, + { + "epoch": 1.65, + "grad_norm": 8.840178014870855, + "learning_rate": 7.891979005222461e-07, + "loss": 0.6999, + "step": 11558 + }, + { + "epoch": 1.65, + "grad_norm": 8.590191221218776, + "learning_rate": 7.885753443741628e-07, + "loss": 0.6867, + "step": 11559 + }, + { + "epoch": 1.65, + "grad_norm": 8.726248789592873, + "learning_rate": 7.879530128517493e-07, + "loss": 0.7093, + "step": 11560 + }, + { + "epoch": 1.65, + "grad_norm": 9.796542354810391, + "learning_rate": 7.873309059881968e-07, + "loss": 0.6796, + "step": 11561 + }, + { + "epoch": 1.65, + "grad_norm": 7.2907800854022, + "learning_rate": 7.867090238166891e-07, + "loss": 0.6795, + "step": 11562 + }, + { + "epoch": 1.65, + "grad_norm": 8.600807343700108, + "learning_rate": 7.860873663703938e-07, + "loss": 0.754, + "step": 11563 + }, + { + "epoch": 1.65, + 
"grad_norm": 10.208172716492792, + "learning_rate": 7.854659336824677e-07, + "loss": 0.6798, + "step": 11564 + }, + { + "epoch": 1.65, + "grad_norm": 14.49114638641364, + "learning_rate": 7.848447257860592e-07, + "loss": 0.7257, + "step": 11565 + }, + { + "epoch": 1.65, + "grad_norm": 11.00838738238411, + "learning_rate": 7.842237427142984e-07, + "loss": 0.7836, + "step": 11566 + }, + { + "epoch": 1.65, + "grad_norm": 7.497753546778482, + "learning_rate": 7.836029845003096e-07, + "loss": 0.737, + "step": 11567 + }, + { + "epoch": 1.65, + "grad_norm": 10.18925635369778, + "learning_rate": 7.829824511772005e-07, + "loss": 0.7259, + "step": 11568 + }, + { + "epoch": 1.65, + "grad_norm": 9.5859969295814, + "learning_rate": 7.823621427780703e-07, + "loss": 0.6855, + "step": 11569 + }, + { + "epoch": 1.65, + "grad_norm": 5.797976600522449, + "learning_rate": 7.817420593360031e-07, + "loss": 0.7278, + "step": 11570 + }, + { + "epoch": 1.65, + "grad_norm": 9.402871626206851, + "learning_rate": 7.811222008840719e-07, + "loss": 0.6174, + "step": 11571 + }, + { + "epoch": 1.65, + "grad_norm": 6.84508140318118, + "learning_rate": 7.805025674553407e-07, + "loss": 0.7467, + "step": 11572 + }, + { + "epoch": 1.65, + "grad_norm": 6.882724511801803, + "learning_rate": 7.79883159082856e-07, + "loss": 0.747, + "step": 11573 + }, + { + "epoch": 1.65, + "grad_norm": 12.035031386757566, + "learning_rate": 7.792639757996568e-07, + "loss": 0.677, + "step": 11574 + }, + { + "epoch": 1.65, + "grad_norm": 9.969381085955442, + "learning_rate": 7.7864501763877e-07, + "loss": 0.6772, + "step": 11575 + }, + { + "epoch": 1.65, + "grad_norm": 11.865002036223705, + "learning_rate": 7.780262846332076e-07, + "loss": 0.7198, + "step": 11576 + }, + { + "epoch": 1.65, + "grad_norm": 12.603285997796869, + "learning_rate": 7.77407776815971e-07, + "loss": 0.7233, + "step": 11577 + }, + { + "epoch": 1.65, + "grad_norm": 10.0320044548932, + "learning_rate": 7.767894942200493e-07, + "loss": 0.8027, + "step": 
11578 + }, + { + "epoch": 1.65, + "grad_norm": 9.043182872227186, + "learning_rate": 7.761714368784201e-07, + "loss": 0.6737, + "step": 11579 + }, + { + "epoch": 1.65, + "grad_norm": 8.434934158221628, + "learning_rate": 7.755536048240508e-07, + "loss": 0.6593, + "step": 11580 + }, + { + "epoch": 1.65, + "grad_norm": 11.39364865969733, + "learning_rate": 7.749359980898923e-07, + "loss": 0.7684, + "step": 11581 + }, + { + "epoch": 1.65, + "grad_norm": 10.582051166013013, + "learning_rate": 7.743186167088878e-07, + "loss": 0.5981, + "step": 11582 + }, + { + "epoch": 1.65, + "grad_norm": 7.989642828769587, + "learning_rate": 7.737014607139665e-07, + "loss": 0.7127, + "step": 11583 + }, + { + "epoch": 1.65, + "grad_norm": 10.721932003506197, + "learning_rate": 7.730845301380441e-07, + "loss": 0.6873, + "step": 11584 + }, + { + "epoch": 1.65, + "grad_norm": 12.220619941286266, + "learning_rate": 7.724678250140283e-07, + "loss": 0.6616, + "step": 11585 + }, + { + "epoch": 1.65, + "grad_norm": 10.574531600981597, + "learning_rate": 7.7185134537481e-07, + "loss": 0.7097, + "step": 11586 + }, + { + "epoch": 1.65, + "grad_norm": 10.153296648780634, + "learning_rate": 7.712350912532729e-07, + "loss": 0.6863, + "step": 11587 + }, + { + "epoch": 1.65, + "grad_norm": 11.429284162445478, + "learning_rate": 7.706190626822841e-07, + "loss": 0.7349, + "step": 11588 + }, + { + "epoch": 1.65, + "grad_norm": 9.780460186924882, + "learning_rate": 7.700032596947033e-07, + "loss": 0.7046, + "step": 11589 + }, + { + "epoch": 1.65, + "grad_norm": 10.651545711156006, + "learning_rate": 7.693876823233742e-07, + "loss": 0.6872, + "step": 11590 + }, + { + "epoch": 1.65, + "grad_norm": 8.458240209310635, + "learning_rate": 7.687723306011285e-07, + "loss": 0.6704, + "step": 11591 + }, + { + "epoch": 1.65, + "grad_norm": 9.536529026398371, + "learning_rate": 7.681572045607899e-07, + "loss": 0.6864, + "step": 11592 + }, + { + "epoch": 1.65, + "grad_norm": 10.038172823502965, + "learning_rate": 
7.67542304235167e-07, + "loss": 0.7067, + "step": 11593 + }, + { + "epoch": 1.65, + "grad_norm": 8.59533272939606, + "learning_rate": 7.669276296570555e-07, + "loss": 0.6964, + "step": 11594 + }, + { + "epoch": 1.65, + "grad_norm": 9.008238360041222, + "learning_rate": 7.663131808592439e-07, + "loss": 0.6765, + "step": 11595 + }, + { + "epoch": 1.65, + "grad_norm": 8.133975203078004, + "learning_rate": 7.656989578745006e-07, + "loss": 0.6687, + "step": 11596 + }, + { + "epoch": 1.65, + "grad_norm": 9.688809092590574, + "learning_rate": 7.650849607355881e-07, + "loss": 0.7391, + "step": 11597 + }, + { + "epoch": 1.65, + "grad_norm": 11.448693398317486, + "learning_rate": 7.644711894752571e-07, + "loss": 0.6703, + "step": 11598 + }, + { + "epoch": 1.65, + "grad_norm": 8.907132818551668, + "learning_rate": 7.638576441262424e-07, + "loss": 0.7164, + "step": 11599 + }, + { + "epoch": 1.65, + "grad_norm": 10.75807841150619, + "learning_rate": 7.632443247212701e-07, + "loss": 0.7128, + "step": 11600 + }, + { + "epoch": 1.65, + "grad_norm": 8.243642983531252, + "learning_rate": 7.626312312930529e-07, + "loss": 0.7035, + "step": 11601 + }, + { + "epoch": 1.65, + "grad_norm": 13.006341271437908, + "learning_rate": 7.6201836387429e-07, + "loss": 0.7207, + "step": 11602 + }, + { + "epoch": 1.65, + "grad_norm": 7.648776332677952, + "learning_rate": 7.614057224976717e-07, + "loss": 0.654, + "step": 11603 + }, + { + "epoch": 1.65, + "grad_norm": 11.542224642040777, + "learning_rate": 7.607933071958735e-07, + "loss": 0.6873, + "step": 11604 + }, + { + "epoch": 1.66, + "grad_norm": 6.655594723785309, + "learning_rate": 7.601811180015606e-07, + "loss": 0.7215, + "step": 11605 + }, + { + "epoch": 1.66, + "grad_norm": 10.486670002793689, + "learning_rate": 7.595691549473849e-07, + "loss": 0.7381, + "step": 11606 + }, + { + "epoch": 1.66, + "grad_norm": 10.216337244824748, + "learning_rate": 7.589574180659881e-07, + "loss": 0.7132, + "step": 11607 + }, + { + "epoch": 1.66, + 
"grad_norm": 10.79224656952356, + "learning_rate": 7.583459073899974e-07, + "loss": 0.6463, + "step": 11608 + }, + { + "epoch": 1.66, + "grad_norm": 13.37065563476161, + "learning_rate": 7.577346229520283e-07, + "loss": 0.6874, + "step": 11609 + }, + { + "epoch": 1.66, + "grad_norm": 10.504008912037513, + "learning_rate": 7.571235647846869e-07, + "loss": 0.7279, + "step": 11610 + }, + { + "epoch": 1.66, + "grad_norm": 10.466832340504364, + "learning_rate": 7.565127329205635e-07, + "loss": 0.7945, + "step": 11611 + }, + { + "epoch": 1.66, + "grad_norm": 8.88748385660304, + "learning_rate": 7.55902127392239e-07, + "loss": 0.6793, + "step": 11612 + }, + { + "epoch": 1.66, + "grad_norm": 7.758373039396663, + "learning_rate": 7.552917482322825e-07, + "loss": 0.6571, + "step": 11613 + }, + { + "epoch": 1.66, + "grad_norm": 6.278894357142268, + "learning_rate": 7.546815954732495e-07, + "loss": 0.661, + "step": 11614 + }, + { + "epoch": 1.66, + "grad_norm": 11.996133945560237, + "learning_rate": 7.540716691476829e-07, + "loss": 0.662, + "step": 11615 + }, + { + "epoch": 1.66, + "grad_norm": 9.504961608699292, + "learning_rate": 7.534619692881135e-07, + "loss": 0.7227, + "step": 11616 + }, + { + "epoch": 1.66, + "grad_norm": 6.000825454927459, + "learning_rate": 7.528524959270628e-07, + "loss": 0.7265, + "step": 11617 + }, + { + "epoch": 1.66, + "grad_norm": 10.853185243701564, + "learning_rate": 7.522432490970388e-07, + "loss": 0.7223, + "step": 11618 + }, + { + "epoch": 1.66, + "grad_norm": 8.850896506409702, + "learning_rate": 7.516342288305351e-07, + "loss": 0.7319, + "step": 11619 + }, + { + "epoch": 1.66, + "grad_norm": 9.656143520422884, + "learning_rate": 7.510254351600372e-07, + "loss": 0.6702, + "step": 11620 + }, + { + "epoch": 1.66, + "grad_norm": 8.442513287424335, + "learning_rate": 7.504168681180158e-07, + "loss": 0.6979, + "step": 11621 + }, + { + "epoch": 1.66, + "grad_norm": 7.227187369763489, + "learning_rate": 7.498085277369282e-07, + "loss": 0.6819, + 
"step": 11622 + }, + { + "epoch": 1.66, + "grad_norm": 11.630918051776243, + "learning_rate": 7.492004140492249e-07, + "loss": 0.7176, + "step": 11623 + }, + { + "epoch": 1.66, + "grad_norm": 11.352833249525744, + "learning_rate": 7.485925270873379e-07, + "loss": 0.7378, + "step": 11624 + }, + { + "epoch": 1.66, + "grad_norm": 12.297649194385336, + "learning_rate": 7.479848668836931e-07, + "loss": 0.7182, + "step": 11625 + }, + { + "epoch": 1.66, + "grad_norm": 8.676235626234318, + "learning_rate": 7.473774334706985e-07, + "loss": 0.7796, + "step": 11626 + }, + { + "epoch": 1.66, + "grad_norm": 8.340414059895647, + "learning_rate": 7.467702268807553e-07, + "loss": 0.6841, + "step": 11627 + }, + { + "epoch": 1.66, + "grad_norm": 9.026358517196433, + "learning_rate": 7.461632471462499e-07, + "loss": 0.7174, + "step": 11628 + }, + { + "epoch": 1.66, + "grad_norm": 10.997458379823925, + "learning_rate": 7.455564942995546e-07, + "loss": 0.6206, + "step": 11629 + }, + { + "epoch": 1.66, + "grad_norm": 10.850684954471744, + "learning_rate": 7.44949968373035e-07, + "loss": 0.7374, + "step": 11630 + }, + { + "epoch": 1.66, + "grad_norm": 9.277805029130278, + "learning_rate": 7.443436693990385e-07, + "loss": 0.7511, + "step": 11631 + }, + { + "epoch": 1.66, + "grad_norm": 8.749886411269923, + "learning_rate": 7.437375974099054e-07, + "loss": 0.7594, + "step": 11632 + }, + { + "epoch": 1.66, + "grad_norm": 9.551926401113734, + "learning_rate": 7.431317524379622e-07, + "loss": 0.6719, + "step": 11633 + }, + { + "epoch": 1.66, + "grad_norm": 16.540635823714034, + "learning_rate": 7.425261345155227e-07, + "loss": 0.6935, + "step": 11634 + }, + { + "epoch": 1.66, + "grad_norm": 8.998694130501672, + "learning_rate": 7.419207436748871e-07, + "loss": 0.6763, + "step": 11635 + }, + { + "epoch": 1.66, + "grad_norm": 10.224282892211404, + "learning_rate": 7.413155799483474e-07, + "loss": 0.6939, + "step": 11636 + }, + { + "epoch": 1.66, + "grad_norm": 10.04340483948616, + 
"learning_rate": 7.407106433681798e-07, + "loss": 0.7432, + "step": 11637 + }, + { + "epoch": 1.66, + "grad_norm": 7.933280356814104, + "learning_rate": 7.401059339666516e-07, + "loss": 0.7026, + "step": 11638 + }, + { + "epoch": 1.66, + "grad_norm": 10.715457909284243, + "learning_rate": 7.395014517760157e-07, + "loss": 0.6816, + "step": 11639 + }, + { + "epoch": 1.66, + "grad_norm": 10.759877621155159, + "learning_rate": 7.38897196828512e-07, + "loss": 0.6544, + "step": 11640 + }, + { + "epoch": 1.66, + "grad_norm": 8.08259291651342, + "learning_rate": 7.382931691563722e-07, + "loss": 0.6882, + "step": 11641 + }, + { + "epoch": 1.66, + "grad_norm": 8.961697218029514, + "learning_rate": 7.376893687918108e-07, + "loss": 0.756, + "step": 11642 + }, + { + "epoch": 1.66, + "grad_norm": 9.557251496184836, + "learning_rate": 7.370857957670352e-07, + "loss": 0.68, + "step": 11643 + }, + { + "epoch": 1.66, + "grad_norm": 9.159470694951242, + "learning_rate": 7.364824501142365e-07, + "loss": 0.6828, + "step": 11644 + }, + { + "epoch": 1.66, + "grad_norm": 8.066368548603023, + "learning_rate": 7.358793318655972e-07, + "loss": 0.6913, + "step": 11645 + }, + { + "epoch": 1.66, + "grad_norm": 9.976484847757273, + "learning_rate": 7.352764410532853e-07, + "loss": 0.6384, + "step": 11646 + }, + { + "epoch": 1.66, + "grad_norm": 10.287324059122131, + "learning_rate": 7.346737777094559e-07, + "loss": 0.6871, + "step": 11647 + }, + { + "epoch": 1.66, + "grad_norm": 7.531472022560681, + "learning_rate": 7.340713418662559e-07, + "loss": 0.7039, + "step": 11648 + }, + { + "epoch": 1.66, + "grad_norm": 10.026151991000832, + "learning_rate": 7.334691335558148e-07, + "loss": 0.7888, + "step": 11649 + }, + { + "epoch": 1.66, + "grad_norm": 10.938023328033172, + "learning_rate": 7.328671528102543e-07, + "loss": 0.7519, + "step": 11650 + }, + { + "epoch": 1.66, + "grad_norm": 12.550866572942725, + "learning_rate": 7.322653996616835e-07, + "loss": 0.7402, + "step": 11651 + }, + { + "epoch": 
1.66, + "grad_norm": 10.683082258048435, + "learning_rate": 7.316638741421967e-07, + "loss": 0.7636, + "step": 11652 + }, + { + "epoch": 1.66, + "grad_norm": 10.653054501694072, + "learning_rate": 7.31062576283878e-07, + "loss": 0.731, + "step": 11653 + }, + { + "epoch": 1.66, + "grad_norm": 11.589307829817423, + "learning_rate": 7.304615061187975e-07, + "loss": 0.6469, + "step": 11654 + }, + { + "epoch": 1.66, + "grad_norm": 9.469309122640546, + "learning_rate": 7.298606636790161e-07, + "loss": 0.654, + "step": 11655 + }, + { + "epoch": 1.66, + "grad_norm": 10.465299309058125, + "learning_rate": 7.292600489965823e-07, + "loss": 0.7159, + "step": 11656 + }, + { + "epoch": 1.66, + "grad_norm": 11.614966883543179, + "learning_rate": 7.28659662103528e-07, + "loss": 0.775, + "step": 11657 + }, + { + "epoch": 1.66, + "grad_norm": 9.863788189418798, + "learning_rate": 7.280595030318799e-07, + "loss": 0.6147, + "step": 11658 + }, + { + "epoch": 1.66, + "grad_norm": 7.841541148844676, + "learning_rate": 7.274595718136462e-07, + "loss": 0.6484, + "step": 11659 + }, + { + "epoch": 1.66, + "grad_norm": 11.10698731441621, + "learning_rate": 7.268598684808253e-07, + "loss": 0.7201, + "step": 11660 + }, + { + "epoch": 1.66, + "grad_norm": 11.408286997656878, + "learning_rate": 7.262603930654061e-07, + "loss": 0.654, + "step": 11661 + }, + { + "epoch": 1.66, + "grad_norm": 12.453071983785186, + "learning_rate": 7.256611455993601e-07, + "loss": 0.7439, + "step": 11662 + }, + { + "epoch": 1.66, + "grad_norm": 13.924160109549485, + "learning_rate": 7.250621261146517e-07, + "loss": 0.7281, + "step": 11663 + }, + { + "epoch": 1.66, + "grad_norm": 9.441817561338082, + "learning_rate": 7.244633346432295e-07, + "loss": 0.6483, + "step": 11664 + }, + { + "epoch": 1.66, + "grad_norm": 8.909395757022361, + "learning_rate": 7.238647712170327e-07, + "loss": 0.6568, + "step": 11665 + }, + { + "epoch": 1.66, + "grad_norm": 10.81223230183655, + "learning_rate": 7.23266435867987e-07, + "loss": 
0.7466, + "step": 11666 + }, + { + "epoch": 1.66, + "grad_norm": 7.820062343239245, + "learning_rate": 7.226683286280034e-07, + "loss": 0.7464, + "step": 11667 + }, + { + "epoch": 1.66, + "grad_norm": 16.03519658775925, + "learning_rate": 7.220704495289866e-07, + "loss": 0.6263, + "step": 11668 + }, + { + "epoch": 1.66, + "grad_norm": 10.026085433900555, + "learning_rate": 7.214727986028231e-07, + "loss": 0.7282, + "step": 11669 + }, + { + "epoch": 1.66, + "grad_norm": 8.536945063847257, + "learning_rate": 7.208753758813908e-07, + "loss": 0.7394, + "step": 11670 + }, + { + "epoch": 1.66, + "grad_norm": 6.981938241485136, + "learning_rate": 7.202781813965564e-07, + "loss": 0.7069, + "step": 11671 + }, + { + "epoch": 1.66, + "grad_norm": 7.843364263820777, + "learning_rate": 7.196812151801713e-07, + "loss": 0.7392, + "step": 11672 + }, + { + "epoch": 1.66, + "grad_norm": 10.006574414268362, + "learning_rate": 7.190844772640759e-07, + "loss": 0.6976, + "step": 11673 + }, + { + "epoch": 1.66, + "grad_norm": 8.271295002809728, + "learning_rate": 7.184879676800971e-07, + "loss": 0.6807, + "step": 11674 + }, + { + "epoch": 1.67, + "grad_norm": 8.490905348776355, + "learning_rate": 7.178916864600521e-07, + "loss": 0.6895, + "step": 11675 + }, + { + "epoch": 1.67, + "grad_norm": 7.268643085607709, + "learning_rate": 7.172956336357468e-07, + "loss": 0.6461, + "step": 11676 + }, + { + "epoch": 1.67, + "grad_norm": 14.0925890113608, + "learning_rate": 7.166998092389699e-07, + "loss": 0.6654, + "step": 11677 + }, + { + "epoch": 1.67, + "grad_norm": 10.596939637506098, + "learning_rate": 7.16104213301504e-07, + "loss": 0.6436, + "step": 11678 + }, + { + "epoch": 1.67, + "grad_norm": 11.294665941443707, + "learning_rate": 7.155088458551152e-07, + "loss": 0.7219, + "step": 11679 + }, + { + "epoch": 1.67, + "grad_norm": 11.820297017225412, + "learning_rate": 7.149137069315571e-07, + "loss": 0.6636, + "step": 11680 + }, + { + "epoch": 1.67, + "grad_norm": 9.123303328846259, + 
"learning_rate": 7.143187965625753e-07, + "loss": 0.7669, + "step": 11681 + }, + { + "epoch": 1.67, + "grad_norm": 10.113240466297157, + "learning_rate": 7.13724114779899e-07, + "loss": 0.7201, + "step": 11682 + }, + { + "epoch": 1.67, + "grad_norm": 7.813368245426392, + "learning_rate": 7.131296616152483e-07, + "loss": 0.7417, + "step": 11683 + }, + { + "epoch": 1.67, + "grad_norm": 10.961540010650552, + "learning_rate": 7.12535437100329e-07, + "loss": 0.7259, + "step": 11684 + }, + { + "epoch": 1.67, + "grad_norm": 8.395602338736198, + "learning_rate": 7.119414412668341e-07, + "loss": 0.677, + "step": 11685 + }, + { + "epoch": 1.67, + "grad_norm": 9.903802711281433, + "learning_rate": 7.113476741464476e-07, + "loss": 0.7229, + "step": 11686 + }, + { + "epoch": 1.67, + "grad_norm": 5.905107783509119, + "learning_rate": 7.107541357708376e-07, + "loss": 0.6225, + "step": 11687 + }, + { + "epoch": 1.67, + "grad_norm": 9.640651083167382, + "learning_rate": 7.101608261716631e-07, + "loss": 0.815, + "step": 11688 + }, + { + "epoch": 1.67, + "grad_norm": 10.036584264420433, + "learning_rate": 7.095677453805705e-07, + "loss": 0.7112, + "step": 11689 + }, + { + "epoch": 1.67, + "grad_norm": 9.72391895549917, + "learning_rate": 7.089748934291913e-07, + "loss": 0.6952, + "step": 11690 + }, + { + "epoch": 1.67, + "grad_norm": 11.043342213986342, + "learning_rate": 7.083822703491477e-07, + "loss": 0.7543, + "step": 11691 + }, + { + "epoch": 1.67, + "grad_norm": 9.73731487929712, + "learning_rate": 7.077898761720464e-07, + "loss": 0.7772, + "step": 11692 + }, + { + "epoch": 1.67, + "grad_norm": 8.979550170011148, + "learning_rate": 7.071977109294858e-07, + "loss": 0.7499, + "step": 11693 + }, + { + "epoch": 1.67, + "grad_norm": 6.358466476767154, + "learning_rate": 7.066057746530519e-07, + "loss": 0.6717, + "step": 11694 + }, + { + "epoch": 1.67, + "grad_norm": 8.899896252138177, + "learning_rate": 7.060140673743132e-07, + "loss": 0.6886, + "step": 11695 + }, + { + "epoch": 
1.67, + "grad_norm": 7.433268855654482, + "learning_rate": 7.054225891248334e-07, + "loss": 0.7466, + "step": 11696 + }, + { + "epoch": 1.67, + "grad_norm": 11.090335521400172, + "learning_rate": 7.048313399361584e-07, + "loss": 0.6866, + "step": 11697 + }, + { + "epoch": 1.67, + "grad_norm": 10.37246232254543, + "learning_rate": 7.042403198398234e-07, + "loss": 0.7629, + "step": 11698 + }, + { + "epoch": 1.67, + "grad_norm": 10.15387071518034, + "learning_rate": 7.036495288673534e-07, + "loss": 0.6932, + "step": 11699 + }, + { + "epoch": 1.67, + "grad_norm": 13.689308340070573, + "learning_rate": 7.030589670502569e-07, + "loss": 0.7229, + "step": 11700 + }, + { + "epoch": 1.67, + "grad_norm": 9.621385045607433, + "learning_rate": 7.02468634420036e-07, + "loss": 0.6821, + "step": 11701 + }, + { + "epoch": 1.67, + "grad_norm": 8.48187324142198, + "learning_rate": 7.01878531008175e-07, + "loss": 0.7449, + "step": 11702 + }, + { + "epoch": 1.67, + "grad_norm": 9.642301745173096, + "learning_rate": 7.012886568461497e-07, + "loss": 0.6622, + "step": 11703 + }, + { + "epoch": 1.67, + "grad_norm": 9.772678443691616, + "learning_rate": 7.006990119654222e-07, + "loss": 0.7672, + "step": 11704 + }, + { + "epoch": 1.67, + "grad_norm": 12.08364096396137, + "learning_rate": 7.001095963974414e-07, + "loss": 0.6607, + "step": 11705 + }, + { + "epoch": 1.67, + "grad_norm": 7.9608622157751725, + "learning_rate": 6.995204101736469e-07, + "loss": 0.7851, + "step": 11706 + }, + { + "epoch": 1.67, + "grad_norm": 9.162054612265287, + "learning_rate": 6.989314533254621e-07, + "loss": 0.699, + "step": 11707 + }, + { + "epoch": 1.67, + "grad_norm": 8.423946770319342, + "learning_rate": 6.983427258843012e-07, + "loss": 0.7455, + "step": 11708 + }, + { + "epoch": 1.67, + "grad_norm": 11.159209854028742, + "learning_rate": 6.977542278815669e-07, + "loss": 0.6373, + "step": 11709 + }, + { + "epoch": 1.67, + "grad_norm": 8.748835754079302, + "learning_rate": 6.971659593486469e-07, + "loss": 
0.6834, + "step": 11710 + }, + { + "epoch": 1.67, + "grad_norm": 10.17875715298268, + "learning_rate": 6.96577920316917e-07, + "loss": 0.6712, + "step": 11711 + }, + { + "epoch": 1.67, + "grad_norm": 8.972061529319669, + "learning_rate": 6.959901108177419e-07, + "loss": 0.6912, + "step": 11712 + }, + { + "epoch": 1.67, + "grad_norm": 6.943296370123185, + "learning_rate": 6.954025308824736e-07, + "loss": 0.6778, + "step": 11713 + }, + { + "epoch": 1.67, + "grad_norm": 16.287479615094306, + "learning_rate": 6.94815180542453e-07, + "loss": 0.6632, + "step": 11714 + }, + { + "epoch": 1.67, + "grad_norm": 8.64006705464204, + "learning_rate": 6.942280598290069e-07, + "loss": 0.6622, + "step": 11715 + }, + { + "epoch": 1.67, + "grad_norm": 6.942346122444929, + "learning_rate": 6.936411687734513e-07, + "loss": 0.7628, + "step": 11716 + }, + { + "epoch": 1.67, + "grad_norm": 8.169253484426214, + "learning_rate": 6.930545074070893e-07, + "loss": 0.7125, + "step": 11717 + }, + { + "epoch": 1.67, + "grad_norm": 10.29796220921541, + "learning_rate": 6.924680757612102e-07, + "loss": 0.7095, + "step": 11718 + }, + { + "epoch": 1.67, + "grad_norm": 13.677687790507536, + "learning_rate": 6.918818738670946e-07, + "loss": 0.7615, + "step": 11719 + }, + { + "epoch": 1.67, + "grad_norm": 5.783417765434926, + "learning_rate": 6.912959017560072e-07, + "loss": 0.7205, + "step": 11720 + }, + { + "epoch": 1.67, + "grad_norm": 10.357488530947506, + "learning_rate": 6.907101594592036e-07, + "loss": 0.6636, + "step": 11721 + }, + { + "epoch": 1.67, + "grad_norm": 9.967873165499112, + "learning_rate": 6.901246470079264e-07, + "loss": 0.7011, + "step": 11722 + }, + { + "epoch": 1.67, + "grad_norm": 9.202151262796416, + "learning_rate": 6.895393644334025e-07, + "loss": 0.6532, + "step": 11723 + }, + { + "epoch": 1.67, + "grad_norm": 11.168564066529107, + "learning_rate": 6.889543117668513e-07, + "loss": 0.6981, + "step": 11724 + }, + { + "epoch": 1.67, + "grad_norm": 8.292842542409058, + 
"learning_rate": 6.883694890394765e-07, + "loss": 0.7365, + "step": 11725 + }, + { + "epoch": 1.67, + "grad_norm": 8.530665437593388, + "learning_rate": 6.877848962824713e-07, + "loss": 0.7546, + "step": 11726 + }, + { + "epoch": 1.67, + "grad_norm": 12.891483439723723, + "learning_rate": 6.872005335270176e-07, + "loss": 0.7208, + "step": 11727 + }, + { + "epoch": 1.67, + "grad_norm": 9.064736715276803, + "learning_rate": 6.866164008042831e-07, + "loss": 0.7466, + "step": 11728 + }, + { + "epoch": 1.67, + "grad_norm": 6.841998173437481, + "learning_rate": 6.860324981454231e-07, + "loss": 0.6693, + "step": 11729 + }, + { + "epoch": 1.67, + "grad_norm": 10.583106340647705, + "learning_rate": 6.854488255815806e-07, + "loss": 0.6353, + "step": 11730 + }, + { + "epoch": 1.67, + "grad_norm": 7.374148742425196, + "learning_rate": 6.848653831438883e-07, + "loss": 0.6808, + "step": 11731 + }, + { + "epoch": 1.67, + "grad_norm": 10.126783229099145, + "learning_rate": 6.842821708634662e-07, + "loss": 0.734, + "step": 11732 + }, + { + "epoch": 1.67, + "grad_norm": 9.197754127037475, + "learning_rate": 6.83699188771419e-07, + "loss": 0.6699, + "step": 11733 + }, + { + "epoch": 1.67, + "grad_norm": 9.339176767871534, + "learning_rate": 6.831164368988435e-07, + "loss": 0.7111, + "step": 11734 + }, + { + "epoch": 1.67, + "grad_norm": 8.566025670829001, + "learning_rate": 6.825339152768218e-07, + "loss": 0.7239, + "step": 11735 + }, + { + "epoch": 1.67, + "grad_norm": 9.4985242918365, + "learning_rate": 6.819516239364216e-07, + "loss": 0.6732, + "step": 11736 + }, + { + "epoch": 1.67, + "grad_norm": 11.507993299603624, + "learning_rate": 6.813695629087036e-07, + "loss": 0.6443, + "step": 11737 + }, + { + "epoch": 1.67, + "grad_norm": 11.120822479211771, + "learning_rate": 6.807877322247108e-07, + "loss": 0.6945, + "step": 11738 + }, + { + "epoch": 1.67, + "grad_norm": 9.51731182713326, + "learning_rate": 6.802061319154795e-07, + "loss": 0.6023, + "step": 11739 + }, + { + "epoch": 
1.67, + "grad_norm": 9.207310797957037, + "learning_rate": 6.796247620120272e-07, + "loss": 0.7041, + "step": 11740 + }, + { + "epoch": 1.67, + "grad_norm": 7.9566402756583665, + "learning_rate": 6.790436225453656e-07, + "loss": 0.7147, + "step": 11741 + }, + { + "epoch": 1.67, + "grad_norm": 13.182102197389787, + "learning_rate": 6.7846271354649e-07, + "loss": 0.678, + "step": 11742 + }, + { + "epoch": 1.67, + "grad_norm": 6.905708272904554, + "learning_rate": 6.778820350463827e-07, + "loss": 0.6169, + "step": 11743 + }, + { + "epoch": 1.67, + "grad_norm": 8.501985185912377, + "learning_rate": 6.77301587076018e-07, + "loss": 0.7495, + "step": 11744 + }, + { + "epoch": 1.67, + "grad_norm": 12.750400119099512, + "learning_rate": 6.767213696663532e-07, + "loss": 0.7287, + "step": 11745 + }, + { + "epoch": 1.68, + "grad_norm": 8.24079802975578, + "learning_rate": 6.761413828483371e-07, + "loss": 0.7483, + "step": 11746 + }, + { + "epoch": 1.68, + "grad_norm": 7.9535020509232774, + "learning_rate": 6.755616266529047e-07, + "loss": 0.7447, + "step": 11747 + }, + { + "epoch": 1.68, + "grad_norm": 11.19031787443273, + "learning_rate": 6.749821011109787e-07, + "loss": 0.6452, + "step": 11748 + }, + { + "epoch": 1.68, + "grad_norm": 9.234733500632931, + "learning_rate": 6.74402806253468e-07, + "loss": 0.7208, + "step": 11749 + }, + { + "epoch": 1.68, + "grad_norm": 9.563743950475429, + "learning_rate": 6.738237421112698e-07, + "loss": 0.6879, + "step": 11750 + }, + { + "epoch": 1.68, + "grad_norm": 13.119028494030427, + "learning_rate": 6.73244908715272e-07, + "loss": 0.675, + "step": 11751 + }, + { + "epoch": 1.68, + "grad_norm": 8.620570531755122, + "learning_rate": 6.726663060963479e-07, + "loss": 0.669, + "step": 11752 + }, + { + "epoch": 1.68, + "grad_norm": 10.690861027316155, + "learning_rate": 6.720879342853565e-07, + "loss": 0.7456, + "step": 11753 + }, + { + "epoch": 1.68, + "grad_norm": 9.158197089522442, + "learning_rate": 6.715097933131492e-07, + "loss": 
0.7317, + "step": 11754 + }, + { + "epoch": 1.68, + "grad_norm": 14.150656938276338, + "learning_rate": 6.70931883210561e-07, + "loss": 0.6876, + "step": 11755 + }, + { + "epoch": 1.68, + "grad_norm": 8.322105630928812, + "learning_rate": 6.703542040084155e-07, + "loss": 0.7234, + "step": 11756 + }, + { + "epoch": 1.68, + "grad_norm": 7.541548300569522, + "learning_rate": 6.697767557375257e-07, + "loss": 0.707, + "step": 11757 + }, + { + "epoch": 1.68, + "grad_norm": 8.621240810614632, + "learning_rate": 6.691995384286892e-07, + "loss": 0.7325, + "step": 11758 + }, + { + "epoch": 1.68, + "grad_norm": 13.046321796430382, + "learning_rate": 6.68622552112696e-07, + "loss": 0.707, + "step": 11759 + }, + { + "epoch": 1.68, + "grad_norm": 8.947055820806776, + "learning_rate": 6.680457968203181e-07, + "loss": 0.6895, + "step": 11760 + }, + { + "epoch": 1.68, + "grad_norm": 7.35881280592994, + "learning_rate": 6.674692725823206e-07, + "loss": 0.776, + "step": 11761 + }, + { + "epoch": 1.68, + "grad_norm": 10.170613833855404, + "learning_rate": 6.668929794294521e-07, + "loss": 0.6166, + "step": 11762 + }, + { + "epoch": 1.68, + "grad_norm": 8.523909668709981, + "learning_rate": 6.6631691739245e-07, + "loss": 0.793, + "step": 11763 + }, + { + "epoch": 1.68, + "grad_norm": 8.764562738465814, + "learning_rate": 6.657410865020414e-07, + "loss": 0.7013, + "step": 11764 + }, + { + "epoch": 1.68, + "grad_norm": 9.766862794304268, + "learning_rate": 6.651654867889379e-07, + "loss": 0.6572, + "step": 11765 + }, + { + "epoch": 1.68, + "grad_norm": 8.56332152152389, + "learning_rate": 6.645901182838426e-07, + "loss": 0.6535, + "step": 11766 + }, + { + "epoch": 1.68, + "grad_norm": 12.15980014442737, + "learning_rate": 6.640149810174424e-07, + "loss": 0.6902, + "step": 11767 + }, + { + "epoch": 1.68, + "grad_norm": 5.474588724609712, + "learning_rate": 6.634400750204123e-07, + "loss": 0.7, + "step": 11768 + }, + { + "epoch": 1.68, + "grad_norm": 7.711986362874467, + "learning_rate": 
6.628654003234187e-07, + "loss": 0.7163, + "step": 11769 + }, + { + "epoch": 1.68, + "grad_norm": 7.247817876802034, + "learning_rate": 6.622909569571123e-07, + "loss": 0.725, + "step": 11770 + }, + { + "epoch": 1.68, + "grad_norm": 12.650592778293719, + "learning_rate": 6.617167449521317e-07, + "loss": 0.7226, + "step": 11771 + }, + { + "epoch": 1.68, + "grad_norm": 9.412406600476125, + "learning_rate": 6.61142764339105e-07, + "loss": 0.713, + "step": 11772 + }, + { + "epoch": 1.68, + "grad_norm": 10.857868213251333, + "learning_rate": 6.60569015148646e-07, + "loss": 0.7093, + "step": 11773 + }, + { + "epoch": 1.68, + "grad_norm": 10.997580649388901, + "learning_rate": 6.599954974113554e-07, + "loss": 0.6822, + "step": 11774 + }, + { + "epoch": 1.68, + "grad_norm": 8.157555357678715, + "learning_rate": 6.594222111578264e-07, + "loss": 0.7452, + "step": 11775 + }, + { + "epoch": 1.68, + "grad_norm": 6.454790223663807, + "learning_rate": 6.588491564186328e-07, + "loss": 0.6911, + "step": 11776 + }, + { + "epoch": 1.68, + "grad_norm": 13.13064715264066, + "learning_rate": 6.582763332243425e-07, + "loss": 0.7952, + "step": 11777 + }, + { + "epoch": 1.68, + "grad_norm": 10.373745024345428, + "learning_rate": 6.577037416055066e-07, + "loss": 0.7692, + "step": 11778 + }, + { + "epoch": 1.68, + "grad_norm": 8.928139327941874, + "learning_rate": 6.571313815926677e-07, + "loss": 0.7571, + "step": 11779 + }, + { + "epoch": 1.68, + "grad_norm": 8.769716469550659, + "learning_rate": 6.565592532163517e-07, + "loss": 0.7206, + "step": 11780 + }, + { + "epoch": 1.68, + "grad_norm": 11.62783671339857, + "learning_rate": 6.559873565070746e-07, + "loss": 0.7414, + "step": 11781 + }, + { + "epoch": 1.68, + "grad_norm": 8.386418685494531, + "learning_rate": 6.554156914953408e-07, + "loss": 0.6807, + "step": 11782 + }, + { + "epoch": 1.68, + "grad_norm": 10.202973624672348, + "learning_rate": 6.548442582116404e-07, + "loss": 0.6706, + "step": 11783 + }, + { + "epoch": 1.68, + 
"grad_norm": 8.930963103023256, + "learning_rate": 6.542730566864519e-07, + "loss": 0.7279, + "step": 11784 + }, + { + "epoch": 1.68, + "grad_norm": 11.762121698947565, + "learning_rate": 6.537020869502431e-07, + "loss": 0.6591, + "step": 11785 + }, + { + "epoch": 1.68, + "grad_norm": 9.57680676720676, + "learning_rate": 6.531313490334673e-07, + "loss": 0.7354, + "step": 11786 + }, + { + "epoch": 1.68, + "grad_norm": 12.23794380964189, + "learning_rate": 6.525608429665659e-07, + "loss": 0.7313, + "step": 11787 + }, + { + "epoch": 1.68, + "grad_norm": 9.292947952490564, + "learning_rate": 6.519905687799666e-07, + "loss": 0.7513, + "step": 11788 + }, + { + "epoch": 1.68, + "grad_norm": 9.178957002265525, + "learning_rate": 6.514205265040874e-07, + "loss": 0.7403, + "step": 11789 + }, + { + "epoch": 1.68, + "grad_norm": 8.88223733446501, + "learning_rate": 6.508507161693345e-07, + "loss": 0.7045, + "step": 11790 + }, + { + "epoch": 1.68, + "grad_norm": 10.8855347974004, + "learning_rate": 6.502811378060975e-07, + "loss": 0.7327, + "step": 11791 + }, + { + "epoch": 1.68, + "grad_norm": 10.369940026472783, + "learning_rate": 6.497117914447576e-07, + "loss": 0.6634, + "step": 11792 + }, + { + "epoch": 1.68, + "grad_norm": 9.383034197644962, + "learning_rate": 6.491426771156817e-07, + "loss": 0.8136, + "step": 11793 + }, + { + "epoch": 1.68, + "grad_norm": 7.3766122386338395, + "learning_rate": 6.485737948492237e-07, + "loss": 0.6482, + "step": 11794 + }, + { + "epoch": 1.68, + "grad_norm": 9.067906129742495, + "learning_rate": 6.480051446757285e-07, + "loss": 0.6544, + "step": 11795 + }, + { + "epoch": 1.68, + "grad_norm": 10.84220872399839, + "learning_rate": 6.474367266255238e-07, + "loss": 0.7609, + "step": 11796 + }, + { + "epoch": 1.68, + "grad_norm": 11.180173624460783, + "learning_rate": 6.468685407289299e-07, + "loss": 0.6898, + "step": 11797 + }, + { + "epoch": 1.68, + "grad_norm": 10.050928831933883, + "learning_rate": 6.463005870162498e-07, + "loss": 0.7069, + 
"step": 11798 + }, + { + "epoch": 1.68, + "grad_norm": 7.413011386762907, + "learning_rate": 6.457328655177792e-07, + "loss": 0.7426, + "step": 11799 + }, + { + "epoch": 1.68, + "grad_norm": 11.243302944166226, + "learning_rate": 6.451653762637966e-07, + "loss": 0.7314, + "step": 11800 + }, + { + "epoch": 1.68, + "grad_norm": 10.975075499836441, + "learning_rate": 6.445981192845708e-07, + "loss": 0.7253, + "step": 11801 + }, + { + "epoch": 1.68, + "grad_norm": 6.243909441616601, + "learning_rate": 6.440310946103584e-07, + "loss": 0.7192, + "step": 11802 + }, + { + "epoch": 1.68, + "grad_norm": 11.142353431940917, + "learning_rate": 6.434643022714021e-07, + "loss": 0.6701, + "step": 11803 + }, + { + "epoch": 1.68, + "grad_norm": 7.602095861568955, + "learning_rate": 6.428977422979327e-07, + "loss": 0.7312, + "step": 11804 + }, + { + "epoch": 1.68, + "grad_norm": 9.12799825799578, + "learning_rate": 6.42331414720172e-07, + "loss": 0.7076, + "step": 11805 + }, + { + "epoch": 1.68, + "grad_norm": 8.92429544610546, + "learning_rate": 6.417653195683221e-07, + "loss": 0.6564, + "step": 11806 + }, + { + "epoch": 1.68, + "grad_norm": 12.241799648367726, + "learning_rate": 6.411994568725799e-07, + "loss": 0.7113, + "step": 11807 + }, + { + "epoch": 1.68, + "grad_norm": 7.009747686220994, + "learning_rate": 6.406338266631246e-07, + "loss": 0.6591, + "step": 11808 + }, + { + "epoch": 1.68, + "grad_norm": 7.686907995783688, + "learning_rate": 6.400684289701265e-07, + "loss": 0.6935, + "step": 11809 + }, + { + "epoch": 1.68, + "grad_norm": 11.961211350429796, + "learning_rate": 6.395032638237436e-07, + "loss": 0.6106, + "step": 11810 + }, + { + "epoch": 1.68, + "grad_norm": 10.079115795179366, + "learning_rate": 6.389383312541192e-07, + "loss": 0.6852, + "step": 11811 + }, + { + "epoch": 1.68, + "grad_norm": 7.3069672138080035, + "learning_rate": 6.383736312913841e-07, + "loss": 0.6601, + "step": 11812 + }, + { + "epoch": 1.68, + "grad_norm": 14.024193402351989, + 
"learning_rate": 6.378091639656598e-07, + "loss": 0.7345, + "step": 11813 + }, + { + "epoch": 1.68, + "grad_norm": 9.499383008505784, + "learning_rate": 6.372449293070515e-07, + "loss": 0.7613, + "step": 11814 + }, + { + "epoch": 1.68, + "grad_norm": 8.812525975189022, + "learning_rate": 6.366809273456559e-07, + "loss": 0.7417, + "step": 11815 + }, + { + "epoch": 1.69, + "grad_norm": 8.753622202874695, + "learning_rate": 6.361171581115527e-07, + "loss": 0.6289, + "step": 11816 + }, + { + "epoch": 1.69, + "grad_norm": 12.321272373764508, + "learning_rate": 6.355536216348152e-07, + "loss": 0.6468, + "step": 11817 + }, + { + "epoch": 1.69, + "grad_norm": 9.473989794574852, + "learning_rate": 6.349903179454986e-07, + "loss": 0.6915, + "step": 11818 + }, + { + "epoch": 1.69, + "grad_norm": 8.604583031953887, + "learning_rate": 6.344272470736468e-07, + "loss": 0.6588, + "step": 11819 + }, + { + "epoch": 1.69, + "grad_norm": 10.15010313020211, + "learning_rate": 6.338644090492957e-07, + "loss": 0.7958, + "step": 11820 + }, + { + "epoch": 1.69, + "grad_norm": 7.322942443135157, + "learning_rate": 6.333018039024625e-07, + "loss": 0.6914, + "step": 11821 + }, + { + "epoch": 1.69, + "grad_norm": 9.102474473565739, + "learning_rate": 6.327394316631564e-07, + "loss": 0.7751, + "step": 11822 + }, + { + "epoch": 1.69, + "grad_norm": 11.711795138781525, + "learning_rate": 6.32177292361374e-07, + "loss": 0.6874, + "step": 11823 + }, + { + "epoch": 1.69, + "grad_norm": 11.302593225818843, + "learning_rate": 6.316153860270963e-07, + "loss": 0.7083, + "step": 11824 + }, + { + "epoch": 1.69, + "grad_norm": 10.276828588138683, + "learning_rate": 6.310537126902949e-07, + "loss": 0.7212, + "step": 11825 + }, + { + "epoch": 1.69, + "grad_norm": 8.956483670111012, + "learning_rate": 6.304922723809259e-07, + "loss": 0.6867, + "step": 11826 + }, + { + "epoch": 1.69, + "grad_norm": 8.465232244441593, + "learning_rate": 6.29931065128937e-07, + "loss": 0.75, + "step": 11827 + }, + { + "epoch": 
1.69, + "grad_norm": 8.816953194740428, + "learning_rate": 6.29370090964262e-07, + "loss": 0.8082, + "step": 11828 + }, + { + "epoch": 1.69, + "grad_norm": 12.756197422831251, + "learning_rate": 6.288093499168191e-07, + "loss": 0.6716, + "step": 11829 + }, + { + "epoch": 1.69, + "grad_norm": 10.832044317826854, + "learning_rate": 6.282488420165195e-07, + "loss": 0.7021, + "step": 11830 + }, + { + "epoch": 1.69, + "grad_norm": 6.793453654369496, + "learning_rate": 6.276885672932576e-07, + "loss": 0.6903, + "step": 11831 + }, + { + "epoch": 1.69, + "grad_norm": 9.110897786608463, + "learning_rate": 6.271285257769161e-07, + "loss": 0.677, + "step": 11832 + }, + { + "epoch": 1.69, + "grad_norm": 7.9895322656602845, + "learning_rate": 6.265687174973684e-07, + "loss": 0.7284, + "step": 11833 + }, + { + "epoch": 1.69, + "grad_norm": 9.479539459784762, + "learning_rate": 6.260091424844705e-07, + "loss": 0.6232, + "step": 11834 + }, + { + "epoch": 1.69, + "grad_norm": 11.569495244394261, + "learning_rate": 6.254498007680709e-07, + "loss": 0.7873, + "step": 11835 + }, + { + "epoch": 1.69, + "grad_norm": 8.633385744308386, + "learning_rate": 6.248906923780012e-07, + "loss": 0.6835, + "step": 11836 + }, + { + "epoch": 1.69, + "grad_norm": 12.269335629440603, + "learning_rate": 6.243318173440849e-07, + "loss": 0.665, + "step": 11837 + }, + { + "epoch": 1.69, + "grad_norm": 8.20833991498339, + "learning_rate": 6.237731756961302e-07, + "loss": 0.7016, + "step": 11838 + }, + { + "epoch": 1.69, + "grad_norm": 10.065605716363383, + "learning_rate": 6.232147674639311e-07, + "loss": 0.6703, + "step": 11839 + }, + { + "epoch": 1.69, + "grad_norm": 11.654412608999849, + "learning_rate": 6.226565926772748e-07, + "loss": 0.6668, + "step": 11840 + }, + { + "epoch": 1.69, + "grad_norm": 11.806299287934547, + "learning_rate": 6.220986513659305e-07, + "loss": 0.7222, + "step": 11841 + }, + { + "epoch": 1.69, + "grad_norm": 8.952392284141084, + "learning_rate": 6.215409435596581e-07, + "loss": 
0.7286, + "step": 11842 + }, + { + "epoch": 1.69, + "grad_norm": 10.528555562112343, + "learning_rate": 6.209834692882055e-07, + "loss": 0.6784, + "step": 11843 + }, + { + "epoch": 1.69, + "grad_norm": 9.556456923290636, + "learning_rate": 6.20426228581305e-07, + "loss": 0.6578, + "step": 11844 + }, + { + "epoch": 1.69, + "grad_norm": 10.074072105594254, + "learning_rate": 6.198692214686797e-07, + "loss": 0.7551, + "step": 11845 + }, + { + "epoch": 1.69, + "grad_norm": 7.690625930984351, + "learning_rate": 6.193124479800361e-07, + "loss": 0.7301, + "step": 11846 + }, + { + "epoch": 1.69, + "grad_norm": 12.517573096158179, + "learning_rate": 6.187559081450734e-07, + "loss": 0.7222, + "step": 11847 + }, + { + "epoch": 1.69, + "grad_norm": 9.33869852179174, + "learning_rate": 6.181996019934755e-07, + "loss": 0.7051, + "step": 11848 + }, + { + "epoch": 1.69, + "grad_norm": 10.278076601191536, + "learning_rate": 6.176435295549138e-07, + "loss": 0.6977, + "step": 11849 + }, + { + "epoch": 1.69, + "grad_norm": 9.615141115719695, + "learning_rate": 6.170876908590495e-07, + "loss": 0.6624, + "step": 11850 + }, + { + "epoch": 1.69, + "grad_norm": 9.341687102737595, + "learning_rate": 6.165320859355262e-07, + "loss": 0.6965, + "step": 11851 + }, + { + "epoch": 1.69, + "grad_norm": 9.317834465941322, + "learning_rate": 6.159767148139794e-07, + "loss": 0.6773, + "step": 11852 + }, + { + "epoch": 1.69, + "grad_norm": 9.047721281152002, + "learning_rate": 6.15421577524033e-07, + "loss": 0.7215, + "step": 11853 + }, + { + "epoch": 1.69, + "grad_norm": 8.072807682431957, + "learning_rate": 6.148666740952941e-07, + "loss": 0.653, + "step": 11854 + }, + { + "epoch": 1.69, + "grad_norm": 6.65600604073158, + "learning_rate": 6.143120045573614e-07, + "loss": 0.7435, + "step": 11855 + }, + { + "epoch": 1.69, + "grad_norm": 10.44811149471618, + "learning_rate": 6.137575689398189e-07, + "loss": 0.6552, + "step": 11856 + }, + { + "epoch": 1.69, + "grad_norm": 9.568859890672647, + 
"learning_rate": 6.132033672722376e-07, + "loss": 0.7474, + "step": 11857 + }, + { + "epoch": 1.69, + "grad_norm": 8.205995865995417, + "learning_rate": 6.126493995841793e-07, + "loss": 0.6908, + "step": 11858 + }, + { + "epoch": 1.69, + "grad_norm": 7.302740831814958, + "learning_rate": 6.12095665905188e-07, + "loss": 0.7297, + "step": 11859 + }, + { + "epoch": 1.69, + "grad_norm": 9.529771098815365, + "learning_rate": 6.115421662648002e-07, + "loss": 0.644, + "step": 11860 + }, + { + "epoch": 1.69, + "grad_norm": 8.254217443968413, + "learning_rate": 6.109889006925396e-07, + "loss": 0.7382, + "step": 11861 + }, + { + "epoch": 1.69, + "grad_norm": 8.700165293176692, + "learning_rate": 6.104358692179135e-07, + "loss": 0.6582, + "step": 11862 + }, + { + "epoch": 1.69, + "grad_norm": 11.203753639269067, + "learning_rate": 6.098830718704196e-07, + "loss": 0.6908, + "step": 11863 + }, + { + "epoch": 1.69, + "grad_norm": 11.645346376606668, + "learning_rate": 6.09330508679542e-07, + "loss": 0.7361, + "step": 11864 + }, + { + "epoch": 1.69, + "grad_norm": 8.056203295679637, + "learning_rate": 6.087781796747538e-07, + "loss": 0.6574, + "step": 11865 + }, + { + "epoch": 1.69, + "grad_norm": 6.402848143147485, + "learning_rate": 6.082260848855148e-07, + "loss": 0.7174, + "step": 11866 + }, + { + "epoch": 1.69, + "grad_norm": 12.353050953527992, + "learning_rate": 6.076742243412714e-07, + "loss": 0.6995, + "step": 11867 + }, + { + "epoch": 1.69, + "grad_norm": 9.699117480159629, + "learning_rate": 6.071225980714596e-07, + "loss": 0.6768, + "step": 11868 + }, + { + "epoch": 1.69, + "grad_norm": 10.069259638074556, + "learning_rate": 6.065712061055007e-07, + "loss": 0.7197, + "step": 11869 + }, + { + "epoch": 1.69, + "grad_norm": 9.435330290579794, + "learning_rate": 6.06020048472803e-07, + "loss": 0.7446, + "step": 11870 + }, + { + "epoch": 1.69, + "grad_norm": 8.886969269028716, + "learning_rate": 6.054691252027666e-07, + "loss": 0.6973, + "step": 11871 + }, + { + "epoch": 
1.69, + "grad_norm": 8.559044188803378, + "learning_rate": 6.04918436324774e-07, + "loss": 0.6933, + "step": 11872 + }, + { + "epoch": 1.69, + "grad_norm": 12.923198628042304, + "learning_rate": 6.043679818681991e-07, + "loss": 0.6977, + "step": 11873 + }, + { + "epoch": 1.69, + "grad_norm": 9.078936776991945, + "learning_rate": 6.038177618623997e-07, + "loss": 0.6885, + "step": 11874 + }, + { + "epoch": 1.69, + "grad_norm": 12.690699932320468, + "learning_rate": 6.03267776336725e-07, + "loss": 0.7404, + "step": 11875 + }, + { + "epoch": 1.69, + "grad_norm": 9.158627937248948, + "learning_rate": 6.027180253205089e-07, + "loss": 0.6256, + "step": 11876 + }, + { + "epoch": 1.69, + "grad_norm": 8.25608444519056, + "learning_rate": 6.021685088430724e-07, + "loss": 0.8044, + "step": 11877 + }, + { + "epoch": 1.69, + "grad_norm": 9.406634334531454, + "learning_rate": 6.016192269337267e-07, + "loss": 0.6876, + "step": 11878 + }, + { + "epoch": 1.69, + "grad_norm": 10.270684361623626, + "learning_rate": 6.010701796217677e-07, + "loss": 0.7237, + "step": 11879 + }, + { + "epoch": 1.69, + "grad_norm": 7.774968221258423, + "learning_rate": 6.005213669364812e-07, + "loss": 0.7677, + "step": 11880 + }, + { + "epoch": 1.69, + "grad_norm": 9.094119662151307, + "learning_rate": 5.999727889071394e-07, + "loss": 0.82, + "step": 11881 + }, + { + "epoch": 1.69, + "grad_norm": 12.320224002446478, + "learning_rate": 5.994244455630016e-07, + "loss": 0.7979, + "step": 11882 + }, + { + "epoch": 1.69, + "grad_norm": 7.908072964769965, + "learning_rate": 5.98876336933315e-07, + "loss": 0.6792, + "step": 11883 + }, + { + "epoch": 1.69, + "grad_norm": 13.32374304204102, + "learning_rate": 5.98328463047313e-07, + "loss": 0.711, + "step": 11884 + }, + { + "epoch": 1.69, + "grad_norm": 9.01528882630562, + "learning_rate": 5.977808239342187e-07, + "loss": 0.7468, + "step": 11885 + }, + { + "epoch": 1.7, + "grad_norm": 11.499992788994813, + "learning_rate": 5.972334196232422e-07, + "loss": 0.6974, 
+ "step": 11886 + }, + { + "epoch": 1.7, + "grad_norm": 6.962632444759409, + "learning_rate": 5.966862501435794e-07, + "loss": 0.6871, + "step": 11887 + }, + { + "epoch": 1.7, + "grad_norm": 8.910328821466086, + "learning_rate": 5.961393155244161e-07, + "loss": 0.6826, + "step": 11888 + }, + { + "epoch": 1.7, + "grad_norm": 7.473922502219836, + "learning_rate": 5.955926157949232e-07, + "loss": 0.6409, + "step": 11889 + }, + { + "epoch": 1.7, + "grad_norm": 9.851357746192065, + "learning_rate": 5.950461509842597e-07, + "loss": 0.7629, + "step": 11890 + }, + { + "epoch": 1.7, + "grad_norm": 8.741768147582695, + "learning_rate": 5.94499921121574e-07, + "loss": 0.6798, + "step": 11891 + }, + { + "epoch": 1.7, + "grad_norm": 7.934256084624343, + "learning_rate": 5.939539262359989e-07, + "loss": 0.697, + "step": 11892 + }, + { + "epoch": 1.7, + "grad_norm": 12.272861290037724, + "learning_rate": 5.934081663566582e-07, + "loss": 0.6909, + "step": 11893 + }, + { + "epoch": 1.7, + "grad_norm": 10.529546039794102, + "learning_rate": 5.928626415126598e-07, + "loss": 0.6719, + "step": 11894 + }, + { + "epoch": 1.7, + "grad_norm": 9.316284054091245, + "learning_rate": 5.923173517330999e-07, + "loss": 0.6962, + "step": 11895 + }, + { + "epoch": 1.7, + "grad_norm": 8.535305517861504, + "learning_rate": 5.917722970470646e-07, + "loss": 0.7671, + "step": 11896 + }, + { + "epoch": 1.7, + "grad_norm": 11.640491539799923, + "learning_rate": 5.912274774836235e-07, + "loss": 0.673, + "step": 11897 + }, + { + "epoch": 1.7, + "grad_norm": 11.095286172897248, + "learning_rate": 5.906828930718372e-07, + "loss": 0.6897, + "step": 11898 + }, + { + "epoch": 1.7, + "grad_norm": 9.67982159929911, + "learning_rate": 5.901385438407525e-07, + "loss": 0.735, + "step": 11899 + }, + { + "epoch": 1.7, + "grad_norm": 7.652872004128022, + "learning_rate": 5.895944298194034e-07, + "loss": 0.6811, + "step": 11900 + }, + { + "epoch": 1.7, + "grad_norm": 9.683682476901826, + "learning_rate": 
5.890505510368106e-07, + "loss": 0.6839, + "step": 11901 + }, + { + "epoch": 1.7, + "grad_norm": 6.671308057264822, + "learning_rate": 5.885069075219824e-07, + "loss": 0.7143, + "step": 11902 + }, + { + "epoch": 1.7, + "grad_norm": 8.841185039420225, + "learning_rate": 5.879634993039162e-07, + "loss": 0.7322, + "step": 11903 + }, + { + "epoch": 1.7, + "grad_norm": 8.825628662512216, + "learning_rate": 5.874203264115974e-07, + "loss": 0.6882, + "step": 11904 + }, + { + "epoch": 1.7, + "grad_norm": 11.252559143973032, + "learning_rate": 5.868773888739948e-07, + "loss": 0.6947, + "step": 11905 + }, + { + "epoch": 1.7, + "grad_norm": 7.754622208995714, + "learning_rate": 5.863346867200692e-07, + "loss": 0.6781, + "step": 11906 + }, + { + "epoch": 1.7, + "grad_norm": 6.997239650391242, + "learning_rate": 5.857922199787657e-07, + "loss": 0.6953, + "step": 11907 + }, + { + "epoch": 1.7, + "grad_norm": 9.807162791727293, + "learning_rate": 5.852499886790175e-07, + "loss": 0.7594, + "step": 11908 + }, + { + "epoch": 1.7, + "grad_norm": 10.525548294595588, + "learning_rate": 5.847079928497468e-07, + "loss": 0.7156, + "step": 11909 + }, + { + "epoch": 1.7, + "grad_norm": 9.936741533252551, + "learning_rate": 5.84166232519861e-07, + "loss": 0.7082, + "step": 11910 + }, + { + "epoch": 1.7, + "grad_norm": 6.728258082683671, + "learning_rate": 5.836247077182583e-07, + "loss": 0.7562, + "step": 11911 + }, + { + "epoch": 1.7, + "grad_norm": 9.987868361402999, + "learning_rate": 5.830834184738188e-07, + "loss": 0.6461, + "step": 11912 + }, + { + "epoch": 1.7, + "grad_norm": 9.893846514738122, + "learning_rate": 5.825423648154166e-07, + "loss": 0.739, + "step": 11913 + }, + { + "epoch": 1.7, + "grad_norm": 11.27617597395123, + "learning_rate": 5.820015467719086e-07, + "loss": 0.6687, + "step": 11914 + }, + { + "epoch": 1.7, + "grad_norm": 8.859288627862707, + "learning_rate": 5.814609643721397e-07, + "loss": 0.6585, + "step": 11915 + }, + { + "epoch": 1.7, + "grad_norm": 
8.58723069192974, + "learning_rate": 5.809206176449445e-07, + "loss": 0.6946, + "step": 11916 + }, + { + "epoch": 1.7, + "grad_norm": 9.537116288471031, + "learning_rate": 5.80380506619142e-07, + "loss": 0.6912, + "step": 11917 + }, + { + "epoch": 1.7, + "grad_norm": 10.693349803888706, + "learning_rate": 5.798406313235411e-07, + "loss": 0.6859, + "step": 11918 + }, + { + "epoch": 1.7, + "grad_norm": 9.971932824203499, + "learning_rate": 5.793009917869385e-07, + "loss": 0.7156, + "step": 11919 + }, + { + "epoch": 1.7, + "grad_norm": 12.898383964664882, + "learning_rate": 5.787615880381159e-07, + "loss": 0.7342, + "step": 11920 + }, + { + "epoch": 1.7, + "grad_norm": 11.252142840023943, + "learning_rate": 5.782224201058439e-07, + "loss": 0.6721, + "step": 11921 + }, + { + "epoch": 1.7, + "grad_norm": 13.33962959918175, + "learning_rate": 5.776834880188786e-07, + "loss": 0.6561, + "step": 11922 + }, + { + "epoch": 1.7, + "grad_norm": 13.019012188417225, + "learning_rate": 5.771447918059664e-07, + "loss": 0.7454, + "step": 11923 + }, + { + "epoch": 1.7, + "grad_norm": 9.269281678312844, + "learning_rate": 5.766063314958409e-07, + "loss": 0.752, + "step": 11924 + }, + { + "epoch": 1.7, + "grad_norm": 8.009278869923822, + "learning_rate": 5.760681071172203e-07, + "loss": 0.7048, + "step": 11925 + }, + { + "epoch": 1.7, + "grad_norm": 9.260036640682152, + "learning_rate": 5.755301186988132e-07, + "loss": 0.7069, + "step": 11926 + }, + { + "epoch": 1.7, + "grad_norm": 10.35489068069619, + "learning_rate": 5.749923662693146e-07, + "loss": 0.6562, + "step": 11927 + }, + { + "epoch": 1.7, + "grad_norm": 6.377075438564161, + "learning_rate": 5.744548498574048e-07, + "loss": 0.7613, + "step": 11928 + }, + { + "epoch": 1.7, + "grad_norm": 7.706172028996503, + "learning_rate": 5.739175694917559e-07, + "loss": 0.6869, + "step": 11929 + }, + { + "epoch": 1.7, + "grad_norm": 10.106436133576358, + "learning_rate": 5.733805252010227e-07, + "loss": 0.65, + "step": 11930 + }, + { + 
"epoch": 1.7, + "grad_norm": 9.595573467158124, + "learning_rate": 5.728437170138518e-07, + "loss": 0.7153, + "step": 11931 + }, + { + "epoch": 1.7, + "grad_norm": 9.141813565332834, + "learning_rate": 5.723071449588724e-07, + "loss": 0.7283, + "step": 11932 + }, + { + "epoch": 1.7, + "grad_norm": 10.68236054968558, + "learning_rate": 5.717708090647067e-07, + "loss": 0.7856, + "step": 11933 + }, + { + "epoch": 1.7, + "grad_norm": 8.777699095811478, + "learning_rate": 5.712347093599601e-07, + "loss": 0.6781, + "step": 11934 + }, + { + "epoch": 1.7, + "grad_norm": 9.107055240057393, + "learning_rate": 5.706988458732255e-07, + "loss": 0.7846, + "step": 11935 + }, + { + "epoch": 1.7, + "grad_norm": 10.552885672357776, + "learning_rate": 5.701632186330863e-07, + "loss": 0.7298, + "step": 11936 + }, + { + "epoch": 1.7, + "grad_norm": 10.250781976472297, + "learning_rate": 5.6962782766811e-07, + "loss": 0.7241, + "step": 11937 + }, + { + "epoch": 1.7, + "grad_norm": 10.178995525765446, + "learning_rate": 5.690926730068535e-07, + "loss": 0.6997, + "step": 11938 + }, + { + "epoch": 1.7, + "grad_norm": 9.742489152429565, + "learning_rate": 5.685577546778609e-07, + "loss": 0.6647, + "step": 11939 + }, + { + "epoch": 1.7, + "grad_norm": 7.922058642699018, + "learning_rate": 5.680230727096614e-07, + "loss": 0.6511, + "step": 11940 + }, + { + "epoch": 1.7, + "grad_norm": 9.02036992640883, + "learning_rate": 5.67488627130775e-07, + "loss": 0.7282, + "step": 11941 + }, + { + "epoch": 1.7, + "grad_norm": 10.38349410715621, + "learning_rate": 5.669544179697078e-07, + "loss": 0.7109, + "step": 11942 + }, + { + "epoch": 1.7, + "grad_norm": 10.185503649677036, + "learning_rate": 5.664204452549516e-07, + "loss": 0.7356, + "step": 11943 + }, + { + "epoch": 1.7, + "grad_norm": 11.75392792512292, + "learning_rate": 5.658867090149894e-07, + "loss": 0.7362, + "step": 11944 + }, + { + "epoch": 1.7, + "grad_norm": 9.736092558280934, + "learning_rate": 5.653532092782871e-07, + "loss": 0.6578, + 
"step": 11945 + }, + { + "epoch": 1.7, + "grad_norm": 6.3752157345397045, + "learning_rate": 5.648199460733e-07, + "loss": 0.7149, + "step": 11946 + }, + { + "epoch": 1.7, + "grad_norm": 8.5356750414496, + "learning_rate": 5.642869194284723e-07, + "loss": 0.7639, + "step": 11947 + }, + { + "epoch": 1.7, + "grad_norm": 7.160334678947204, + "learning_rate": 5.637541293722326e-07, + "loss": 0.7229, + "step": 11948 + }, + { + "epoch": 1.7, + "grad_norm": 10.522516320800849, + "learning_rate": 5.632215759330006e-07, + "loss": 0.6719, + "step": 11949 + }, + { + "epoch": 1.7, + "grad_norm": 7.461178557924639, + "learning_rate": 5.626892591391786e-07, + "loss": 0.727, + "step": 11950 + }, + { + "epoch": 1.7, + "grad_norm": 10.96867636641305, + "learning_rate": 5.62157179019161e-07, + "loss": 0.777, + "step": 11951 + }, + { + "epoch": 1.7, + "grad_norm": 12.562687221399786, + "learning_rate": 5.616253356013268e-07, + "loss": 0.7101, + "step": 11952 + }, + { + "epoch": 1.7, + "grad_norm": 9.08907473550967, + "learning_rate": 5.610937289140416e-07, + "loss": 0.724, + "step": 11953 + }, + { + "epoch": 1.7, + "grad_norm": 9.031040024671489, + "learning_rate": 5.605623589856624e-07, + "loss": 0.6982, + "step": 11954 + }, + { + "epoch": 1.7, + "grad_norm": 8.630837096207715, + "learning_rate": 5.600312258445289e-07, + "loss": 0.6475, + "step": 11955 + }, + { + "epoch": 1.71, + "grad_norm": 8.653347046019713, + "learning_rate": 5.595003295189711e-07, + "loss": 0.7341, + "step": 11956 + }, + { + "epoch": 1.71, + "grad_norm": 9.621464628361416, + "learning_rate": 5.589696700373059e-07, + "loss": 0.6593, + "step": 11957 + }, + { + "epoch": 1.71, + "grad_norm": 10.163026889927268, + "learning_rate": 5.584392474278372e-07, + "loss": 0.7273, + "step": 11958 + }, + { + "epoch": 1.71, + "grad_norm": 8.417804309412677, + "learning_rate": 5.579090617188559e-07, + "loss": 0.6866, + "step": 11959 + }, + { + "epoch": 1.71, + "grad_norm": 10.720367180726422, + "learning_rate": 
5.573791129386397e-07, + "loss": 0.6457, + "step": 11960 + }, + { + "epoch": 1.71, + "grad_norm": 11.537040774259463, + "learning_rate": 5.568494011154552e-07, + "loss": 0.6875, + "step": 11961 + }, + { + "epoch": 1.71, + "grad_norm": 11.762151447843811, + "learning_rate": 5.563199262775576e-07, + "loss": 0.6855, + "step": 11962 + }, + { + "epoch": 1.71, + "grad_norm": 10.34390098988271, + "learning_rate": 5.557906884531844e-07, + "loss": 0.6782, + "step": 11963 + }, + { + "epoch": 1.71, + "grad_norm": 8.520529811360726, + "learning_rate": 5.552616876705669e-07, + "loss": 0.6664, + "step": 11964 + }, + { + "epoch": 1.71, + "grad_norm": 8.463243796569108, + "learning_rate": 5.547329239579185e-07, + "loss": 0.6564, + "step": 11965 + }, + { + "epoch": 1.71, + "grad_norm": 11.847673004680862, + "learning_rate": 5.542043973434419e-07, + "loss": 0.6601, + "step": 11966 + }, + { + "epoch": 1.71, + "grad_norm": 7.892944800540109, + "learning_rate": 5.536761078553282e-07, + "loss": 0.6899, + "step": 11967 + }, + { + "epoch": 1.71, + "grad_norm": 11.495232418099466, + "learning_rate": 5.531480555217539e-07, + "loss": 0.7588, + "step": 11968 + }, + { + "epoch": 1.71, + "grad_norm": 7.168876423051372, + "learning_rate": 5.526202403708853e-07, + "loss": 0.6218, + "step": 11969 + }, + { + "epoch": 1.71, + "grad_norm": 12.65513083278859, + "learning_rate": 5.520926624308726e-07, + "loss": 0.6964, + "step": 11970 + }, + { + "epoch": 1.71, + "grad_norm": 8.662128977977833, + "learning_rate": 5.515653217298577e-07, + "loss": 0.7337, + "step": 11971 + }, + { + "epoch": 1.71, + "grad_norm": 11.289729666795084, + "learning_rate": 5.510382182959662e-07, + "loss": 0.6955, + "step": 11972 + }, + { + "epoch": 1.71, + "grad_norm": 6.595876361868805, + "learning_rate": 5.505113521573113e-07, + "loss": 0.6595, + "step": 11973 + }, + { + "epoch": 1.71, + "grad_norm": 11.517686981180129, + "learning_rate": 5.49984723341997e-07, + "loss": 0.703, + "step": 11974 + }, + { + "epoch": 1.71, + 
"grad_norm": 8.35273172582859, + "learning_rate": 5.494583318781093e-07, + "loss": 0.7043, + "step": 11975 + }, + { + "epoch": 1.71, + "grad_norm": 8.526172199720785, + "learning_rate": 5.489321777937262e-07, + "loss": 0.6474, + "step": 11976 + }, + { + "epoch": 1.71, + "grad_norm": 9.17357003870175, + "learning_rate": 5.484062611169133e-07, + "loss": 0.6731, + "step": 11977 + }, + { + "epoch": 1.71, + "grad_norm": 9.729758812239908, + "learning_rate": 5.478805818757171e-07, + "loss": 0.668, + "step": 11978 + }, + { + "epoch": 1.71, + "grad_norm": 7.605914202538501, + "learning_rate": 5.473551400981791e-07, + "loss": 0.7883, + "step": 11979 + }, + { + "epoch": 1.71, + "grad_norm": 8.668010574087509, + "learning_rate": 5.46829935812323e-07, + "loss": 0.6629, + "step": 11980 + }, + { + "epoch": 1.71, + "grad_norm": 11.884375698103904, + "learning_rate": 5.463049690461624e-07, + "loss": 0.663, + "step": 11981 + }, + { + "epoch": 1.71, + "grad_norm": 7.35600340144032, + "learning_rate": 5.45780239827699e-07, + "loss": 0.7837, + "step": 11982 + }, + { + "epoch": 1.71, + "grad_norm": 9.604484194735017, + "learning_rate": 5.452557481849191e-07, + "loss": 0.7206, + "step": 11983 + }, + { + "epoch": 1.71, + "grad_norm": 8.606835191312358, + "learning_rate": 5.447314941457971e-07, + "loss": 0.7047, + "step": 11984 + }, + { + "epoch": 1.71, + "grad_norm": 6.108871294471997, + "learning_rate": 5.442074777382966e-07, + "loss": 0.6346, + "step": 11985 + }, + { + "epoch": 1.71, + "grad_norm": 8.37111230602068, + "learning_rate": 5.436836989903655e-07, + "loss": 0.6756, + "step": 11986 + }, + { + "epoch": 1.71, + "grad_norm": 6.97419121036031, + "learning_rate": 5.431601579299423e-07, + "loss": 0.7166, + "step": 11987 + }, + { + "epoch": 1.71, + "grad_norm": 8.548844916777616, + "learning_rate": 5.426368545849497e-07, + "loss": 0.684, + "step": 11988 + }, + { + "epoch": 1.71, + "grad_norm": 10.821390365877836, + "learning_rate": 5.421137889833017e-07, + "loss": 0.6064, + "step": 
11989 + }, + { + "epoch": 1.71, + "grad_norm": 12.844376963739732, + "learning_rate": 5.415909611528952e-07, + "loss": 0.7023, + "step": 11990 + }, + { + "epoch": 1.71, + "grad_norm": 13.878412410185339, + "learning_rate": 5.410683711216159e-07, + "loss": 0.7234, + "step": 11991 + }, + { + "epoch": 1.71, + "grad_norm": 11.470077703464286, + "learning_rate": 5.405460189173389e-07, + "loss": 0.6453, + "step": 11992 + }, + { + "epoch": 1.71, + "grad_norm": 10.131655955968094, + "learning_rate": 5.400239045679234e-07, + "loss": 0.704, + "step": 11993 + }, + { + "epoch": 1.71, + "grad_norm": 9.63726100538929, + "learning_rate": 5.395020281012181e-07, + "loss": 0.7597, + "step": 11994 + }, + { + "epoch": 1.71, + "grad_norm": 9.75272099324983, + "learning_rate": 5.389803895450602e-07, + "loss": 0.6402, + "step": 11995 + }, + { + "epoch": 1.71, + "grad_norm": 9.253360538428746, + "learning_rate": 5.384589889272707e-07, + "loss": 0.7996, + "step": 11996 + }, + { + "epoch": 1.71, + "grad_norm": 9.046398984146688, + "learning_rate": 5.379378262756602e-07, + "loss": 0.6508, + "step": 11997 + }, + { + "epoch": 1.71, + "grad_norm": 11.085396373801812, + "learning_rate": 5.374169016180247e-07, + "loss": 0.693, + "step": 11998 + }, + { + "epoch": 1.71, + "grad_norm": 10.570176535649887, + "learning_rate": 5.3689621498215e-07, + "loss": 0.6697, + "step": 11999 + }, + { + "epoch": 1.71, + "grad_norm": 7.555232108103474, + "learning_rate": 5.363757663958091e-07, + "loss": 0.7455, + "step": 12000 + }, + { + "epoch": 1.71, + "grad_norm": 10.982602744891711, + "learning_rate": 5.35855555886759e-07, + "loss": 0.6035, + "step": 12001 + }, + { + "epoch": 1.71, + "grad_norm": 6.337409499784366, + "learning_rate": 5.35335583482749e-07, + "loss": 0.7556, + "step": 12002 + }, + { + "epoch": 1.71, + "grad_norm": 10.26153715929779, + "learning_rate": 5.348158492115113e-07, + "loss": 0.7352, + "step": 12003 + }, + { + "epoch": 1.71, + "grad_norm": 9.572089274350494, + "learning_rate": 
5.342963531007661e-07, + "loss": 0.6477, + "step": 12004 + }, + { + "epoch": 1.71, + "grad_norm": 9.543117558166779, + "learning_rate": 5.33777095178224e-07, + "loss": 0.5812, + "step": 12005 + }, + { + "epoch": 1.71, + "grad_norm": 8.336081283883875, + "learning_rate": 5.332580754715788e-07, + "loss": 0.7548, + "step": 12006 + }, + { + "epoch": 1.71, + "grad_norm": 7.190493790653775, + "learning_rate": 5.327392940085158e-07, + "loss": 0.7089, + "step": 12007 + }, + { + "epoch": 1.71, + "grad_norm": 7.59359409131355, + "learning_rate": 5.322207508167032e-07, + "loss": 0.6805, + "step": 12008 + }, + { + "epoch": 1.71, + "grad_norm": 6.321015261303909, + "learning_rate": 5.317024459238007e-07, + "loss": 0.663, + "step": 12009 + }, + { + "epoch": 1.71, + "grad_norm": 13.027891355110093, + "learning_rate": 5.311843793574517e-07, + "loss": 0.6842, + "step": 12010 + }, + { + "epoch": 1.71, + "grad_norm": 9.804917383995729, + "learning_rate": 5.30666551145288e-07, + "loss": 0.6817, + "step": 12011 + }, + { + "epoch": 1.71, + "grad_norm": 9.429724133285495, + "learning_rate": 5.301489613149314e-07, + "loss": 0.6285, + "step": 12012 + }, + { + "epoch": 1.71, + "grad_norm": 11.262653088114247, + "learning_rate": 5.296316098939858e-07, + "loss": 0.7684, + "step": 12013 + }, + { + "epoch": 1.71, + "grad_norm": 9.956779812336205, + "learning_rate": 5.29114496910047e-07, + "loss": 0.7427, + "step": 12014 + }, + { + "epoch": 1.71, + "grad_norm": 10.555852898769734, + "learning_rate": 5.285976223906969e-07, + "loss": 0.738, + "step": 12015 + }, + { + "epoch": 1.71, + "grad_norm": 9.29591426489601, + "learning_rate": 5.280809863635039e-07, + "loss": 0.654, + "step": 12016 + }, + { + "epoch": 1.71, + "grad_norm": 10.038622866273831, + "learning_rate": 5.275645888560233e-07, + "loss": 0.6932, + "step": 12017 + }, + { + "epoch": 1.71, + "grad_norm": 10.222395425162782, + "learning_rate": 5.270484298957968e-07, + "loss": 0.7412, + "step": 12018 + }, + { + "epoch": 1.71, + "grad_norm": 
10.361479699823155, + "learning_rate": 5.265325095103569e-07, + "loss": 0.692, + "step": 12019 + }, + { + "epoch": 1.71, + "grad_norm": 10.24025294891253, + "learning_rate": 5.26016827727222e-07, + "loss": 0.6734, + "step": 12020 + }, + { + "epoch": 1.71, + "grad_norm": 7.446987624326375, + "learning_rate": 5.25501384573896e-07, + "loss": 0.7126, + "step": 12021 + }, + { + "epoch": 1.71, + "grad_norm": 10.045681274723503, + "learning_rate": 5.249861800778717e-07, + "loss": 0.7777, + "step": 12022 + }, + { + "epoch": 1.71, + "grad_norm": 11.325049333004909, + "learning_rate": 5.244712142666269e-07, + "loss": 0.69, + "step": 12023 + }, + { + "epoch": 1.71, + "grad_norm": 9.45146556855472, + "learning_rate": 5.2395648716763e-07, + "loss": 0.7685, + "step": 12024 + }, + { + "epoch": 1.71, + "grad_norm": 10.326823032304889, + "learning_rate": 5.234419988083355e-07, + "loss": 0.7239, + "step": 12025 + }, + { + "epoch": 1.72, + "grad_norm": 9.457135890055842, + "learning_rate": 5.229277492161839e-07, + "loss": 0.7571, + "step": 12026 + }, + { + "epoch": 1.72, + "grad_norm": 8.390653166831045, + "learning_rate": 5.224137384186046e-07, + "loss": 0.7612, + "step": 12027 + }, + { + "epoch": 1.72, + "grad_norm": 12.032889818509604, + "learning_rate": 5.21899966443013e-07, + "loss": 0.6451, + "step": 12028 + }, + { + "epoch": 1.72, + "grad_norm": 10.488275805730312, + "learning_rate": 5.213864333168117e-07, + "loss": 0.7464, + "step": 12029 + }, + { + "epoch": 1.72, + "grad_norm": 8.731379933846313, + "learning_rate": 5.208731390673921e-07, + "loss": 0.7352, + "step": 12030 + }, + { + "epoch": 1.72, + "grad_norm": 8.28988398674159, + "learning_rate": 5.203600837221312e-07, + "loss": 0.7022, + "step": 12031 + }, + { + "epoch": 1.72, + "grad_norm": 10.735944339490278, + "learning_rate": 5.198472673083937e-07, + "loss": 0.66, + "step": 12032 + }, + { + "epoch": 1.72, + "grad_norm": 10.369329986362905, + "learning_rate": 5.193346898535339e-07, + "loss": 0.7105, + "step": 12033 + }, 
+ { + "epoch": 1.72, + "grad_norm": 8.311696466329964, + "learning_rate": 5.188223513848894e-07, + "loss": 0.7225, + "step": 12034 + }, + { + "epoch": 1.72, + "grad_norm": 11.525368459368837, + "learning_rate": 5.183102519297872e-07, + "loss": 0.6641, + "step": 12035 + }, + { + "epoch": 1.72, + "grad_norm": 7.910046561637409, + "learning_rate": 5.17798391515541e-07, + "loss": 0.616, + "step": 12036 + }, + { + "epoch": 1.72, + "grad_norm": 10.647990579602833, + "learning_rate": 5.172867701694517e-07, + "loss": 0.7086, + "step": 12037 + }, + { + "epoch": 1.72, + "grad_norm": 7.491191998715965, + "learning_rate": 5.167753879188092e-07, + "loss": 0.7293, + "step": 12038 + }, + { + "epoch": 1.72, + "grad_norm": 13.80705125954974, + "learning_rate": 5.162642447908877e-07, + "loss": 0.6379, + "step": 12039 + }, + { + "epoch": 1.72, + "grad_norm": 8.13066988933281, + "learning_rate": 5.157533408129517e-07, + "loss": 0.641, + "step": 12040 + }, + { + "epoch": 1.72, + "grad_norm": 9.186182605355588, + "learning_rate": 5.152426760122503e-07, + "loss": 0.6568, + "step": 12041 + }, + { + "epoch": 1.72, + "grad_norm": 9.66762950598841, + "learning_rate": 5.147322504160207e-07, + "loss": 0.7773, + "step": 12042 + }, + { + "epoch": 1.72, + "grad_norm": 10.62151353377579, + "learning_rate": 5.142220640514884e-07, + "loss": 0.71, + "step": 12043 + }, + { + "epoch": 1.72, + "grad_norm": 9.640125791726756, + "learning_rate": 5.137121169458642e-07, + "loss": 0.7073, + "step": 12044 + }, + { + "epoch": 1.72, + "grad_norm": 9.710899397811922, + "learning_rate": 5.132024091263493e-07, + "loss": 0.6847, + "step": 12045 + }, + { + "epoch": 1.72, + "grad_norm": 7.507204380048094, + "learning_rate": 5.126929406201275e-07, + "loss": 0.6804, + "step": 12046 + }, + { + "epoch": 1.72, + "grad_norm": 10.34349886486809, + "learning_rate": 5.121837114543749e-07, + "loss": 0.621, + "step": 12047 + }, + { + "epoch": 1.72, + "grad_norm": 9.652329467663636, + "learning_rate": 5.116747216562512e-07, + 
"loss": 0.7121, + "step": 12048 + }, + { + "epoch": 1.72, + "grad_norm": 10.201599625212888, + "learning_rate": 5.111659712529033e-07, + "loss": 0.7347, + "step": 12049 + }, + { + "epoch": 1.72, + "grad_norm": 10.615686986479776, + "learning_rate": 5.106574602714687e-07, + "loss": 0.6811, + "step": 12050 + }, + { + "epoch": 1.72, + "grad_norm": 10.1905466892152, + "learning_rate": 5.10149188739068e-07, + "loss": 0.6628, + "step": 12051 + }, + { + "epoch": 1.72, + "grad_norm": 11.176886884120918, + "learning_rate": 5.096411566828125e-07, + "loss": 0.691, + "step": 12052 + }, + { + "epoch": 1.72, + "grad_norm": 10.620203343721576, + "learning_rate": 5.09133364129799e-07, + "loss": 0.6323, + "step": 12053 + }, + { + "epoch": 1.72, + "grad_norm": 8.762530561826553, + "learning_rate": 5.086258111071124e-07, + "loss": 0.7409, + "step": 12054 + }, + { + "epoch": 1.72, + "grad_norm": 12.995270661687416, + "learning_rate": 5.081184976418224e-07, + "loss": 0.7572, + "step": 12055 + }, + { + "epoch": 1.72, + "grad_norm": 9.004274665282047, + "learning_rate": 5.076114237609875e-07, + "loss": 0.7414, + "step": 12056 + }, + { + "epoch": 1.72, + "grad_norm": 8.184247755811452, + "learning_rate": 5.071045894916549e-07, + "loss": 0.7322, + "step": 12057 + }, + { + "epoch": 1.72, + "grad_norm": 9.777217700967444, + "learning_rate": 5.065979948608585e-07, + "loss": 0.7052, + "step": 12058 + }, + { + "epoch": 1.72, + "grad_norm": 8.831138228178753, + "learning_rate": 5.060916398956167e-07, + "loss": 0.7687, + "step": 12059 + }, + { + "epoch": 1.72, + "grad_norm": 10.725697710203454, + "learning_rate": 5.055855246229397e-07, + "loss": 0.7126, + "step": 12060 + }, + { + "epoch": 1.72, + "grad_norm": 12.370677857735828, + "learning_rate": 5.050796490698184e-07, + "loss": 0.7109, + "step": 12061 + }, + { + "epoch": 1.72, + "grad_norm": 7.942050833231152, + "learning_rate": 5.045740132632371e-07, + "loss": 0.7698, + "step": 12062 + }, + { + "epoch": 1.72, + "grad_norm": 13.10578025053802, 
+ "learning_rate": 5.040686172301657e-07, + "loss": 0.711, + "step": 12063 + }, + { + "epoch": 1.72, + "grad_norm": 11.43409020564345, + "learning_rate": 5.035634609975587e-07, + "loss": 0.7284, + "step": 12064 + }, + { + "epoch": 1.72, + "grad_norm": 9.317882104826271, + "learning_rate": 5.030585445923619e-07, + "loss": 0.6472, + "step": 12065 + }, + { + "epoch": 1.72, + "grad_norm": 11.160054563965813, + "learning_rate": 5.025538680415048e-07, + "loss": 0.6542, + "step": 12066 + }, + { + "epoch": 1.72, + "grad_norm": 8.288374858307007, + "learning_rate": 5.020494313719048e-07, + "loss": 0.6586, + "step": 12067 + }, + { + "epoch": 1.72, + "grad_norm": 10.522059144942027, + "learning_rate": 5.015452346104693e-07, + "loss": 0.6603, + "step": 12068 + }, + { + "epoch": 1.72, + "grad_norm": 9.70762336130836, + "learning_rate": 5.010412777840884e-07, + "loss": 0.7106, + "step": 12069 + }, + { + "epoch": 1.72, + "grad_norm": 11.270206348180007, + "learning_rate": 5.005375609196422e-07, + "loss": 0.6749, + "step": 12070 + }, + { + "epoch": 1.72, + "grad_norm": 6.636445473574831, + "learning_rate": 5.000340840439999e-07, + "loss": 0.7607, + "step": 12071 + }, + { + "epoch": 1.72, + "grad_norm": 10.145155906104888, + "learning_rate": 4.995308471840133e-07, + "loss": 0.6923, + "step": 12072 + }, + { + "epoch": 1.72, + "grad_norm": 9.103574011520193, + "learning_rate": 4.990278503665246e-07, + "loss": 0.6537, + "step": 12073 + }, + { + "epoch": 1.72, + "grad_norm": 9.28713158150178, + "learning_rate": 4.985250936183606e-07, + "loss": 0.6735, + "step": 12074 + }, + { + "epoch": 1.72, + "grad_norm": 10.209559812920174, + "learning_rate": 4.980225769663383e-07, + "loss": 0.6883, + "step": 12075 + }, + { + "epoch": 1.72, + "grad_norm": 7.935873228577909, + "learning_rate": 4.975203004372608e-07, + "loss": 0.7337, + "step": 12076 + }, + { + "epoch": 1.72, + "grad_norm": 10.477591539000057, + "learning_rate": 4.970182640579174e-07, + "loss": 0.668, + "step": 12077 + }, + { + 
"epoch": 1.72, + "grad_norm": 7.318643073850475, + "learning_rate": 4.965164678550866e-07, + "loss": 0.6697, + "step": 12078 + }, + { + "epoch": 1.72, + "grad_norm": 9.174800979043537, + "learning_rate": 4.960149118555313e-07, + "loss": 0.7094, + "step": 12079 + }, + { + "epoch": 1.72, + "grad_norm": 8.89972578457122, + "learning_rate": 4.955135960860035e-07, + "loss": 0.7208, + "step": 12080 + }, + { + "epoch": 1.72, + "grad_norm": 8.564081913438898, + "learning_rate": 4.950125205732426e-07, + "loss": 0.6319, + "step": 12081 + }, + { + "epoch": 1.72, + "grad_norm": 11.843597348390363, + "learning_rate": 4.945116853439735e-07, + "loss": 0.6698, + "step": 12082 + }, + { + "epoch": 1.72, + "grad_norm": 11.509716991214072, + "learning_rate": 4.940110904249107e-07, + "loss": 0.6342, + "step": 12083 + }, + { + "epoch": 1.72, + "grad_norm": 7.849368537633278, + "learning_rate": 4.935107358427527e-07, + "loss": 0.7354, + "step": 12084 + }, + { + "epoch": 1.72, + "grad_norm": 8.419814746922242, + "learning_rate": 4.930106216241892e-07, + "loss": 0.7649, + "step": 12085 + }, + { + "epoch": 1.72, + "grad_norm": 7.52249552750549, + "learning_rate": 4.92510747795894e-07, + "loss": 0.6746, + "step": 12086 + }, + { + "epoch": 1.72, + "grad_norm": 10.518234556700719, + "learning_rate": 4.92011114384528e-07, + "loss": 0.7729, + "step": 12087 + }, + { + "epoch": 1.72, + "grad_norm": 8.943682173475286, + "learning_rate": 4.915117214167419e-07, + "loss": 0.6594, + "step": 12088 + }, + { + "epoch": 1.72, + "grad_norm": 6.808496449124956, + "learning_rate": 4.910125689191703e-07, + "loss": 0.6798, + "step": 12089 + }, + { + "epoch": 1.72, + "grad_norm": 10.401151692836507, + "learning_rate": 4.905136569184371e-07, + "loss": 0.6538, + "step": 12090 + }, + { + "epoch": 1.72, + "grad_norm": 12.522695155435708, + "learning_rate": 4.900149854411546e-07, + "loss": 0.7158, + "step": 12091 + }, + { + "epoch": 1.72, + "grad_norm": 8.781671226315453, + "learning_rate": 4.895165545139191e-07, + 
"loss": 0.7209, + "step": 12092 + }, + { + "epoch": 1.72, + "grad_norm": 7.031031317787145, + "learning_rate": 4.890183641633151e-07, + "loss": 0.7123, + "step": 12093 + }, + { + "epoch": 1.72, + "grad_norm": 9.51601486363376, + "learning_rate": 4.885204144159145e-07, + "loss": 0.758, + "step": 12094 + }, + { + "epoch": 1.72, + "grad_norm": 8.312332676419036, + "learning_rate": 4.880227052982773e-07, + "loss": 0.685, + "step": 12095 + }, + { + "epoch": 1.73, + "grad_norm": 6.892734617008663, + "learning_rate": 4.875252368369505e-07, + "loss": 0.7122, + "step": 12096 + }, + { + "epoch": 1.73, + "grad_norm": 10.706845171951137, + "learning_rate": 4.870280090584661e-07, + "loss": 0.6836, + "step": 12097 + }, + { + "epoch": 1.73, + "grad_norm": 9.3220283729916, + "learning_rate": 4.865310219893471e-07, + "loss": 0.6549, + "step": 12098 + }, + { + "epoch": 1.73, + "grad_norm": 11.311120102798276, + "learning_rate": 4.860342756560999e-07, + "loss": 0.6316, + "step": 12099 + }, + { + "epoch": 1.73, + "grad_norm": 7.8010250445289335, + "learning_rate": 4.855377700852181e-07, + "loss": 0.6594, + "step": 12100 + }, + { + "epoch": 1.73, + "grad_norm": 12.416536988215942, + "learning_rate": 4.850415053031871e-07, + "loss": 0.6953, + "step": 12101 + }, + { + "epoch": 1.73, + "grad_norm": 7.433117592934884, + "learning_rate": 4.845454813364742e-07, + "loss": 0.6879, + "step": 12102 + }, + { + "epoch": 1.73, + "grad_norm": 9.963788977942476, + "learning_rate": 4.840496982115366e-07, + "loss": 0.7347, + "step": 12103 + }, + { + "epoch": 1.73, + "grad_norm": 8.418643233368124, + "learning_rate": 4.835541559548174e-07, + "loss": 0.6437, + "step": 12104 + }, + { + "epoch": 1.73, + "grad_norm": 12.14969561308639, + "learning_rate": 4.830588545927484e-07, + "loss": 0.6614, + "step": 12105 + }, + { + "epoch": 1.73, + "grad_norm": 6.986009267871214, + "learning_rate": 4.825637941517474e-07, + "loss": 0.7488, + "step": 12106 + }, + { + "epoch": 1.73, + "grad_norm": 11.527110869038479, + 
"learning_rate": 4.820689746582185e-07, + "loss": 0.659, + "step": 12107 + }, + { + "epoch": 1.73, + "grad_norm": 8.6747913410952, + "learning_rate": 4.815743961385555e-07, + "loss": 0.7139, + "step": 12108 + }, + { + "epoch": 1.73, + "grad_norm": 9.77142444281648, + "learning_rate": 4.810800586191361e-07, + "loss": 0.6626, + "step": 12109 + }, + { + "epoch": 1.73, + "grad_norm": 10.131828692071691, + "learning_rate": 4.805859621263293e-07, + "loss": 0.7021, + "step": 12110 + }, + { + "epoch": 1.73, + "grad_norm": 11.358188984447342, + "learning_rate": 4.800921066864867e-07, + "loss": 0.6546, + "step": 12111 + }, + { + "epoch": 1.73, + "grad_norm": 7.068777991091431, + "learning_rate": 4.795984923259495e-07, + "loss": 0.7528, + "step": 12112 + }, + { + "epoch": 1.73, + "grad_norm": 9.20493114104892, + "learning_rate": 4.791051190710456e-07, + "loss": 0.7321, + "step": 12113 + }, + { + "epoch": 1.73, + "grad_norm": 7.373321866779335, + "learning_rate": 4.786119869480915e-07, + "loss": 0.7153, + "step": 12114 + }, + { + "epoch": 1.73, + "grad_norm": 9.499922640049345, + "learning_rate": 4.781190959833881e-07, + "loss": 0.6761, + "step": 12115 + }, + { + "epoch": 1.73, + "grad_norm": 14.425050325398441, + "learning_rate": 4.776264462032265e-07, + "loss": 0.6953, + "step": 12116 + }, + { + "epoch": 1.73, + "grad_norm": 9.618097495882248, + "learning_rate": 4.771340376338817e-07, + "loss": 0.7343, + "step": 12117 + }, + { + "epoch": 1.73, + "grad_norm": 9.065654119176806, + "learning_rate": 4.7664187030161733e-07, + "loss": 0.699, + "step": 12118 + }, + { + "epoch": 1.73, + "grad_norm": 7.043297702773932, + "learning_rate": 4.7614994423268556e-07, + "loss": 0.6312, + "step": 12119 + }, + { + "epoch": 1.73, + "grad_norm": 11.11209397205663, + "learning_rate": 4.7565825945332213e-07, + "loss": 0.7248, + "step": 12120 + }, + { + "epoch": 1.73, + "grad_norm": 11.989529398121396, + "learning_rate": 4.751668159897549e-07, + "loss": 0.7456, + "step": 12121 + }, + { + "epoch": 
1.73, + "grad_norm": 10.21381299057207, + "learning_rate": 4.7467561386819403e-07, + "loss": 0.6832, + "step": 12122 + }, + { + "epoch": 1.73, + "grad_norm": 10.161778231773422, + "learning_rate": 4.741846531148403e-07, + "loss": 0.7099, + "step": 12123 + }, + { + "epoch": 1.73, + "grad_norm": 9.781409708800272, + "learning_rate": 4.7369393375587937e-07, + "loss": 0.6542, + "step": 12124 + }, + { + "epoch": 1.73, + "grad_norm": 7.9235194956371195, + "learning_rate": 4.7320345581748416e-07, + "loss": 0.7397, + "step": 12125 + }, + { + "epoch": 1.73, + "grad_norm": 9.408401114432055, + "learning_rate": 4.727132193258166e-07, + "loss": 0.7277, + "step": 12126 + }, + { + "epoch": 1.73, + "grad_norm": 10.242826089746321, + "learning_rate": 4.7222322430702406e-07, + "loss": 0.6341, + "step": 12127 + }, + { + "epoch": 1.73, + "grad_norm": 6.632354777094127, + "learning_rate": 4.717334707872412e-07, + "loss": 0.6582, + "step": 12128 + }, + { + "epoch": 1.73, + "grad_norm": 10.733810155192275, + "learning_rate": 4.71243958792591e-07, + "loss": 0.7447, + "step": 12129 + }, + { + "epoch": 1.73, + "grad_norm": 8.118244290916223, + "learning_rate": 4.7075468834918257e-07, + "loss": 0.6319, + "step": 12130 + }, + { + "epoch": 1.73, + "grad_norm": 11.174326783355236, + "learning_rate": 4.7026565948311177e-07, + "loss": 0.682, + "step": 12131 + }, + { + "epoch": 1.73, + "grad_norm": 7.617009989667641, + "learning_rate": 4.6977687222046155e-07, + "loss": 0.6806, + "step": 12132 + }, + { + "epoch": 1.73, + "grad_norm": 11.044976864590597, + "learning_rate": 4.6928832658730274e-07, + "loss": 0.727, + "step": 12133 + }, + { + "epoch": 1.73, + "grad_norm": 10.175995256190294, + "learning_rate": 4.6880002260969393e-07, + "loss": 0.7407, + "step": 12134 + }, + { + "epoch": 1.73, + "grad_norm": 9.623851665360307, + "learning_rate": 4.683119603136782e-07, + "loss": 0.7466, + "step": 12135 + }, + { + "epoch": 1.73, + "grad_norm": 10.132300313143132, + "learning_rate": 
4.6782413972528974e-07, + "loss": 0.6506, + "step": 12136 + }, + { + "epoch": 1.73, + "grad_norm": 8.675726862791793, + "learning_rate": 4.673365608705466e-07, + "loss": 0.6684, + "step": 12137 + }, + { + "epoch": 1.73, + "grad_norm": 9.545476856099391, + "learning_rate": 4.6684922377545296e-07, + "loss": 0.644, + "step": 12138 + }, + { + "epoch": 1.73, + "grad_norm": 11.853457897903375, + "learning_rate": 4.663621284660047e-07, + "loss": 0.639, + "step": 12139 + }, + { + "epoch": 1.73, + "grad_norm": 10.889341911401232, + "learning_rate": 4.658752749681805e-07, + "loss": 0.7072, + "step": 12140 + }, + { + "epoch": 1.73, + "grad_norm": 9.799136314149033, + "learning_rate": 4.653886633079485e-07, + "loss": 0.656, + "step": 12141 + }, + { + "epoch": 1.73, + "grad_norm": 9.067546290916288, + "learning_rate": 4.6490229351126294e-07, + "loss": 0.7914, + "step": 12142 + }, + { + "epoch": 1.73, + "grad_norm": 9.871903283465567, + "learning_rate": 4.6441616560406577e-07, + "loss": 0.6591, + "step": 12143 + }, + { + "epoch": 1.73, + "grad_norm": 8.339342588455164, + "learning_rate": 4.6393027961228577e-07, + "loss": 0.577, + "step": 12144 + }, + { + "epoch": 1.73, + "grad_norm": 9.650159158087202, + "learning_rate": 4.634446355618377e-07, + "loss": 0.7387, + "step": 12145 + }, + { + "epoch": 1.73, + "grad_norm": 9.377461652242273, + "learning_rate": 4.629592334786259e-07, + "loss": 0.7281, + "step": 12146 + }, + { + "epoch": 1.73, + "grad_norm": 11.311630008407267, + "learning_rate": 4.6247407338853856e-07, + "loss": 0.6651, + "step": 12147 + }, + { + "epoch": 1.73, + "grad_norm": 8.851781724280091, + "learning_rate": 4.619891553174538e-07, + "loss": 0.6643, + "step": 12148 + }, + { + "epoch": 1.73, + "grad_norm": 8.032073998451686, + "learning_rate": 4.615044792912382e-07, + "loss": 0.7241, + "step": 12149 + }, + { + "epoch": 1.73, + "grad_norm": 11.09600808011059, + "learning_rate": 4.610200453357383e-07, + "loss": 0.7298, + "step": 12150 + }, + { + "epoch": 1.73, + 
"grad_norm": 7.326264863328567, + "learning_rate": 4.605358534767962e-07, + "loss": 0.6788, + "step": 12151 + }, + { + "epoch": 1.73, + "grad_norm": 12.130815677029776, + "learning_rate": 4.6005190374023455e-07, + "loss": 0.7439, + "step": 12152 + }, + { + "epoch": 1.73, + "grad_norm": 11.128621511433629, + "learning_rate": 4.5956819615186777e-07, + "loss": 0.7564, + "step": 12153 + }, + { + "epoch": 1.73, + "grad_norm": 12.969062761799638, + "learning_rate": 4.590847307374957e-07, + "loss": 0.6941, + "step": 12154 + }, + { + "epoch": 1.73, + "grad_norm": 9.328156613876864, + "learning_rate": 4.5860150752290447e-07, + "loss": 0.6936, + "step": 12155 + }, + { + "epoch": 1.73, + "grad_norm": 12.229805412803286, + "learning_rate": 4.581185265338667e-07, + "loss": 0.7159, + "step": 12156 + }, + { + "epoch": 1.73, + "grad_norm": 9.704430290661964, + "learning_rate": 4.5763578779614573e-07, + "loss": 0.6265, + "step": 12157 + }, + { + "epoch": 1.73, + "grad_norm": 8.120648884670098, + "learning_rate": 4.5715329133548647e-07, + "loss": 0.7358, + "step": 12158 + }, + { + "epoch": 1.73, + "grad_norm": 8.899754522735204, + "learning_rate": 4.566710371776273e-07, + "loss": 0.7013, + "step": 12159 + }, + { + "epoch": 1.73, + "grad_norm": 7.584751345127273, + "learning_rate": 4.56189025348287e-07, + "loss": 0.7679, + "step": 12160 + }, + { + "epoch": 1.73, + "grad_norm": 5.300803925653538, + "learning_rate": 4.557072558731773e-07, + "loss": 0.7348, + "step": 12161 + }, + { + "epoch": 1.73, + "grad_norm": 8.716042027164514, + "learning_rate": 4.552257287779932e-07, + "loss": 0.6489, + "step": 12162 + }, + { + "epoch": 1.73, + "grad_norm": 9.056044577236166, + "learning_rate": 4.54744444088418e-07, + "loss": 0.735, + "step": 12163 + }, + { + "epoch": 1.73, + "grad_norm": 12.915218198071141, + "learning_rate": 4.5426340183012283e-07, + "loss": 0.7043, + "step": 12164 + }, + { + "epoch": 1.73, + "grad_norm": 11.973296929257002, + "learning_rate": 4.537826020287639e-07, + "loss": 
0.7396, + "step": 12165 + }, + { + "epoch": 1.74, + "grad_norm": 12.110452396580992, + "learning_rate": 4.5330204470998616e-07, + "loss": 0.6833, + "step": 12166 + }, + { + "epoch": 1.74, + "grad_norm": 11.607846580447486, + "learning_rate": 4.5282172989942307e-07, + "loss": 0.7172, + "step": 12167 + }, + { + "epoch": 1.74, + "grad_norm": 11.732012283132784, + "learning_rate": 4.523416576226913e-07, + "loss": 0.6557, + "step": 12168 + }, + { + "epoch": 1.74, + "grad_norm": 9.933892400882371, + "learning_rate": 4.518618279053971e-07, + "loss": 0.6822, + "step": 12169 + }, + { + "epoch": 1.74, + "grad_norm": 6.877984733984477, + "learning_rate": 4.513822407731322e-07, + "loss": 0.7932, + "step": 12170 + }, + { + "epoch": 1.74, + "grad_norm": 10.368372808451383, + "learning_rate": 4.5090289625147775e-07, + "loss": 0.7149, + "step": 12171 + }, + { + "epoch": 1.74, + "grad_norm": 9.0417663609891, + "learning_rate": 4.504237943660006e-07, + "loss": 0.7213, + "step": 12172 + }, + { + "epoch": 1.74, + "grad_norm": 10.24704953287852, + "learning_rate": 4.4994493514225413e-07, + "loss": 0.7582, + "step": 12173 + }, + { + "epoch": 1.74, + "grad_norm": 9.177816922248006, + "learning_rate": 4.494663186057802e-07, + "loss": 0.6336, + "step": 12174 + }, + { + "epoch": 1.74, + "grad_norm": 10.078815170951046, + "learning_rate": 4.489879447821066e-07, + "loss": 0.6902, + "step": 12175 + }, + { + "epoch": 1.74, + "grad_norm": 6.3807942959287605, + "learning_rate": 4.4850981369674696e-07, + "loss": 0.7512, + "step": 12176 + }, + { + "epoch": 1.74, + "grad_norm": 5.817714668298874, + "learning_rate": 4.480319253752058e-07, + "loss": 0.7099, + "step": 12177 + }, + { + "epoch": 1.74, + "grad_norm": 9.445011560869437, + "learning_rate": 4.475542798429705e-07, + "loss": 0.6865, + "step": 12178 + }, + { + "epoch": 1.74, + "grad_norm": 9.877220929104713, + "learning_rate": 4.4707687712551906e-07, + "loss": 0.6958, + "step": 12179 + }, + { + "epoch": 1.74, + "grad_norm": 10.074548609016588, 
+ "learning_rate": 4.4659971724831276e-07, + "loss": 0.6222, + "step": 12180 + }, + { + "epoch": 1.74, + "grad_norm": 10.310683362590899, + "learning_rate": 4.4612280023680343e-07, + "loss": 0.7005, + "step": 12181 + }, + { + "epoch": 1.74, + "grad_norm": 8.03699294888932, + "learning_rate": 4.456461261164291e-07, + "loss": 0.6729, + "step": 12182 + }, + { + "epoch": 1.74, + "grad_norm": 9.233120997230735, + "learning_rate": 4.451696949126116e-07, + "loss": 0.6284, + "step": 12183 + }, + { + "epoch": 1.74, + "grad_norm": 6.6319303188305225, + "learning_rate": 4.4469350665076515e-07, + "loss": 0.6927, + "step": 12184 + }, + { + "epoch": 1.74, + "grad_norm": 9.552092883509038, + "learning_rate": 4.4421756135628655e-07, + "loss": 0.6893, + "step": 12185 + }, + { + "epoch": 1.74, + "grad_norm": 11.607157432867046, + "learning_rate": 4.4374185905456226e-07, + "loss": 0.6636, + "step": 12186 + }, + { + "epoch": 1.74, + "grad_norm": 8.605304676356267, + "learning_rate": 4.432663997709658e-07, + "loss": 0.7771, + "step": 12187 + }, + { + "epoch": 1.74, + "grad_norm": 8.10212992833844, + "learning_rate": 4.4279118353085515e-07, + "loss": 0.6984, + "step": 12188 + }, + { + "epoch": 1.74, + "grad_norm": 11.536595531000707, + "learning_rate": 4.423162103595785e-07, + "loss": 0.6757, + "step": 12189 + }, + { + "epoch": 1.74, + "grad_norm": 8.17092005569209, + "learning_rate": 4.418414802824672e-07, + "loss": 0.6973, + "step": 12190 + }, + { + "epoch": 1.74, + "grad_norm": 8.412965753558103, + "learning_rate": 4.413669933248438e-07, + "loss": 0.7527, + "step": 12191 + }, + { + "epoch": 1.74, + "grad_norm": 9.156579679821402, + "learning_rate": 4.4089274951201743e-07, + "loss": 0.6751, + "step": 12192 + }, + { + "epoch": 1.74, + "grad_norm": 6.459081801418537, + "learning_rate": 4.404187488692807e-07, + "loss": 0.719, + "step": 12193 + }, + { + "epoch": 1.74, + "grad_norm": 8.636020124972562, + "learning_rate": 4.399449914219167e-07, + "loss": 0.7416, + "step": 12194 + }, + { + 
"epoch": 1.74, + "grad_norm": 9.298846227442565, + "learning_rate": 4.3947147719519246e-07, + "loss": 0.774, + "step": 12195 + }, + { + "epoch": 1.74, + "grad_norm": 10.041949360674398, + "learning_rate": 4.389982062143655e-07, + "loss": 0.7169, + "step": 12196 + }, + { + "epoch": 1.74, + "grad_norm": 11.999559596796653, + "learning_rate": 4.3852517850467957e-07, + "loss": 0.76, + "step": 12197 + }, + { + "epoch": 1.74, + "grad_norm": 12.045667228516088, + "learning_rate": 4.380523940913628e-07, + "loss": 0.7067, + "step": 12198 + }, + { + "epoch": 1.74, + "grad_norm": 10.088315957542392, + "learning_rate": 4.375798529996339e-07, + "loss": 0.7531, + "step": 12199 + }, + { + "epoch": 1.74, + "grad_norm": 10.087163777883838, + "learning_rate": 4.371075552546961e-07, + "loss": 0.7485, + "step": 12200 + }, + { + "epoch": 1.74, + "grad_norm": 8.58257262120774, + "learning_rate": 4.3663550088173965e-07, + "loss": 0.6056, + "step": 12201 + }, + { + "epoch": 1.74, + "grad_norm": 10.263522616176985, + "learning_rate": 4.3616368990594403e-07, + "loss": 0.6998, + "step": 12202 + }, + { + "epoch": 1.74, + "grad_norm": 11.416668330830086, + "learning_rate": 4.3569212235247294e-07, + "loss": 0.6483, + "step": 12203 + }, + { + "epoch": 1.74, + "grad_norm": 9.120733465895578, + "learning_rate": 4.35220798246479e-07, + "loss": 0.7482, + "step": 12204 + }, + { + "epoch": 1.74, + "grad_norm": 6.938659471341527, + "learning_rate": 4.347497176131027e-07, + "loss": 0.6815, + "step": 12205 + }, + { + "epoch": 1.74, + "grad_norm": 8.249727910296787, + "learning_rate": 4.3427888047746893e-07, + "loss": 0.6923, + "step": 12206 + }, + { + "epoch": 1.74, + "grad_norm": 10.347456235450414, + "learning_rate": 4.338082868646903e-07, + "loss": 0.7537, + "step": 12207 + }, + { + "epoch": 1.74, + "grad_norm": 10.596705180015016, + "learning_rate": 4.3333793679986747e-07, + "loss": 0.7611, + "step": 12208 + }, + { + "epoch": 1.74, + "grad_norm": 7.949682913704618, + "learning_rate": 
4.3286783030808743e-07, + "loss": 0.68, + "step": 12209 + }, + { + "epoch": 1.74, + "grad_norm": 7.0428930028001435, + "learning_rate": 4.3239796741442576e-07, + "loss": 0.7336, + "step": 12210 + }, + { + "epoch": 1.74, + "grad_norm": 10.204249016908397, + "learning_rate": 4.3192834814394125e-07, + "loss": 0.71, + "step": 12211 + }, + { + "epoch": 1.74, + "grad_norm": 8.27587261962275, + "learning_rate": 4.314589725216839e-07, + "loss": 0.7036, + "step": 12212 + }, + { + "epoch": 1.74, + "grad_norm": 12.850703926837136, + "learning_rate": 4.309898405726892e-07, + "loss": 0.6723, + "step": 12213 + }, + { + "epoch": 1.74, + "grad_norm": 17.951302529185064, + "learning_rate": 4.305209523219772e-07, + "loss": 0.6988, + "step": 12214 + }, + { + "epoch": 1.74, + "grad_norm": 11.606408678981163, + "learning_rate": 4.3005230779455896e-07, + "loss": 0.7111, + "step": 12215 + }, + { + "epoch": 1.74, + "grad_norm": 8.084166394584201, + "learning_rate": 4.295839070154295e-07, + "loss": 0.7047, + "step": 12216 + }, + { + "epoch": 1.74, + "grad_norm": 11.848470758963488, + "learning_rate": 4.291157500095733e-07, + "loss": 0.779, + "step": 12217 + }, + { + "epoch": 1.74, + "grad_norm": 7.608636977546964, + "learning_rate": 4.286478368019592e-07, + "loss": 0.7403, + "step": 12218 + }, + { + "epoch": 1.74, + "grad_norm": 11.204051287446081, + "learning_rate": 4.2818016741754567e-07, + "loss": 0.6951, + "step": 12219 + }, + { + "epoch": 1.74, + "grad_norm": 8.970131603983265, + "learning_rate": 4.2771274188127653e-07, + "loss": 0.7372, + "step": 12220 + }, + { + "epoch": 1.74, + "grad_norm": 12.901477179654139, + "learning_rate": 4.272455602180819e-07, + "loss": 0.6239, + "step": 12221 + }, + { + "epoch": 1.74, + "grad_norm": 10.867483334479113, + "learning_rate": 4.2677862245288135e-07, + "loss": 0.7081, + "step": 12222 + }, + { + "epoch": 1.74, + "grad_norm": 11.860681554625293, + "learning_rate": 4.2631192861057826e-07, + "loss": 0.7652, + "step": 12223 + }, + { + "epoch": 1.74, 
+ "grad_norm": 10.095107705765509, + "learning_rate": 4.2584547871606654e-07, + "loss": 0.7746, + "step": 12224 + }, + { + "epoch": 1.74, + "grad_norm": 8.488679629086507, + "learning_rate": 4.253792727942252e-07, + "loss": 0.6983, + "step": 12225 + }, + { + "epoch": 1.74, + "grad_norm": 9.712981782541197, + "learning_rate": 4.2491331086992047e-07, + "loss": 0.6759, + "step": 12226 + }, + { + "epoch": 1.74, + "grad_norm": 11.706142648071422, + "learning_rate": 4.244475929680042e-07, + "loss": 0.7244, + "step": 12227 + }, + { + "epoch": 1.74, + "grad_norm": 10.230561512385975, + "learning_rate": 4.23982119113317e-07, + "loss": 0.7144, + "step": 12228 + }, + { + "epoch": 1.74, + "grad_norm": 7.847198157467232, + "learning_rate": 4.235168893306857e-07, + "loss": 0.719, + "step": 12229 + }, + { + "epoch": 1.74, + "grad_norm": 8.519951655680565, + "learning_rate": 4.230519036449254e-07, + "loss": 0.6606, + "step": 12230 + }, + { + "epoch": 1.74, + "grad_norm": 10.24938596174158, + "learning_rate": 4.2258716208083585e-07, + "loss": 0.7281, + "step": 12231 + }, + { + "epoch": 1.74, + "grad_norm": 8.32835526500928, + "learning_rate": 4.2212266466320816e-07, + "loss": 0.6788, + "step": 12232 + }, + { + "epoch": 1.74, + "grad_norm": 9.168031557074565, + "learning_rate": 4.216584114168126e-07, + "loss": 0.7353, + "step": 12233 + }, + { + "epoch": 1.74, + "grad_norm": 7.529766941920507, + "learning_rate": 4.211944023664133e-07, + "loss": 0.6609, + "step": 12234 + }, + { + "epoch": 1.74, + "grad_norm": 10.296641423819775, + "learning_rate": 4.2073063753676035e-07, + "loss": 0.6702, + "step": 12235 + }, + { + "epoch": 1.75, + "grad_norm": 12.437055619661498, + "learning_rate": 4.2026711695258793e-07, + "loss": 0.6679, + "step": 12236 + }, + { + "epoch": 1.75, + "grad_norm": 7.846845419457776, + "learning_rate": 4.198038406386207e-07, + "loss": 0.6724, + "step": 12237 + }, + { + "epoch": 1.75, + "grad_norm": 10.109061510244935, + "learning_rate": 4.193408086195677e-07, + "loss": 
0.7959, + "step": 12238 + }, + { + "epoch": 1.75, + "grad_norm": 7.479508977663662, + "learning_rate": 4.188780209201243e-07, + "loss": 0.7199, + "step": 12239 + }, + { + "epoch": 1.75, + "grad_norm": 9.86325207404977, + "learning_rate": 4.184154775649768e-07, + "loss": 0.6248, + "step": 12240 + }, + { + "epoch": 1.75, + "grad_norm": 8.81527465000524, + "learning_rate": 4.179531785787938e-07, + "loss": 0.7584, + "step": 12241 + }, + { + "epoch": 1.75, + "grad_norm": 12.199335930608576, + "learning_rate": 4.1749112398623446e-07, + "loss": 0.7154, + "step": 12242 + }, + { + "epoch": 1.75, + "grad_norm": 8.106610073271929, + "learning_rate": 4.1702931381194357e-07, + "loss": 0.7174, + "step": 12243 + }, + { + "epoch": 1.75, + "grad_norm": 6.562317424601609, + "learning_rate": 4.165677480805524e-07, + "loss": 0.6267, + "step": 12244 + }, + { + "epoch": 1.75, + "grad_norm": 12.533319804250517, + "learning_rate": 4.1610642681667925e-07, + "loss": 0.7304, + "step": 12245 + }, + { + "epoch": 1.75, + "grad_norm": 11.892287818889331, + "learning_rate": 4.1564535004492923e-07, + "loss": 0.7095, + "step": 12246 + }, + { + "epoch": 1.75, + "grad_norm": 8.759037955140856, + "learning_rate": 4.1518451778989556e-07, + "loss": 0.6756, + "step": 12247 + }, + { + "epoch": 1.75, + "grad_norm": 8.131046185765978, + "learning_rate": 4.1472393007615854e-07, + "loss": 0.6105, + "step": 12248 + }, + { + "epoch": 1.75, + "grad_norm": 9.732285248562519, + "learning_rate": 4.1426358692828293e-07, + "loss": 0.7046, + "step": 12249 + }, + { + "epoch": 1.75, + "grad_norm": 12.751371761140588, + "learning_rate": 4.1380348837082416e-07, + "loss": 0.7595, + "step": 12250 + }, + { + "epoch": 1.75, + "grad_norm": 9.261420363486941, + "learning_rate": 4.1334363442832203e-07, + "loss": 0.6983, + "step": 12251 + }, + { + "epoch": 1.75, + "grad_norm": 8.815308367495343, + "learning_rate": 4.128840251253019e-07, + "loss": 0.7252, + "step": 12252 + }, + { + "epoch": 1.75, + "grad_norm": 7.042509950882118, 
+ "learning_rate": 4.124246604862808e-07, + "loss": 0.6995, + "step": 12253 + }, + { + "epoch": 1.75, + "grad_norm": 6.393453491094924, + "learning_rate": 4.119655405357575e-07, + "loss": 0.7067, + "step": 12254 + }, + { + "epoch": 1.75, + "grad_norm": 8.806859041734302, + "learning_rate": 4.115066652982225e-07, + "loss": 0.8554, + "step": 12255 + }, + { + "epoch": 1.75, + "grad_norm": 11.657257006292792, + "learning_rate": 4.110480347981488e-07, + "loss": 0.7053, + "step": 12256 + }, + { + "epoch": 1.75, + "grad_norm": 9.07817604615654, + "learning_rate": 4.1058964906000034e-07, + "loss": 0.6489, + "step": 12257 + }, + { + "epoch": 1.75, + "grad_norm": 10.243432110922578, + "learning_rate": 4.1013150810822524e-07, + "loss": 0.6461, + "step": 12258 + }, + { + "epoch": 1.75, + "grad_norm": 10.539557009048513, + "learning_rate": 4.0967361196725896e-07, + "loss": 0.7211, + "step": 12259 + }, + { + "epoch": 1.75, + "grad_norm": 11.008540014911711, + "learning_rate": 4.0921596066152534e-07, + "loss": 0.728, + "step": 12260 + }, + { + "epoch": 1.75, + "grad_norm": 9.269502789692908, + "learning_rate": 4.087585542154332e-07, + "loss": 0.7365, + "step": 12261 + }, + { + "epoch": 1.75, + "grad_norm": 10.584094250461193, + "learning_rate": 4.083013926533802e-07, + "loss": 0.7012, + "step": 12262 + }, + { + "epoch": 1.75, + "grad_norm": 8.884840251300394, + "learning_rate": 4.078444759997502e-07, + "loss": 0.6952, + "step": 12263 + }, + { + "epoch": 1.75, + "grad_norm": 13.49732994963374, + "learning_rate": 4.073878042789142e-07, + "loss": 0.6605, + "step": 12264 + }, + { + "epoch": 1.75, + "grad_norm": 8.637369146713416, + "learning_rate": 4.0693137751522835e-07, + "loss": 0.5949, + "step": 12265 + }, + { + "epoch": 1.75, + "grad_norm": 9.039999358546485, + "learning_rate": 4.064751957330376e-07, + "loss": 0.6589, + "step": 12266 + }, + { + "epoch": 1.75, + "grad_norm": 8.223193390986898, + "learning_rate": 4.060192589566736e-07, + "loss": 0.6325, + "step": 12267 + }, + { + 
"epoch": 1.75, + "grad_norm": 9.189138430767546, + "learning_rate": 4.055635672104558e-07, + "loss": 0.7268, + "step": 12268 + }, + { + "epoch": 1.75, + "grad_norm": 9.514600554961154, + "learning_rate": 4.051081205186874e-07, + "loss": 0.7252, + "step": 12269 + }, + { + "epoch": 1.75, + "grad_norm": 9.811127587830946, + "learning_rate": 4.0465291890566304e-07, + "loss": 0.6649, + "step": 12270 + }, + { + "epoch": 1.75, + "grad_norm": 8.890271561458967, + "learning_rate": 4.04197962395661e-07, + "loss": 0.6661, + "step": 12271 + }, + { + "epoch": 1.75, + "grad_norm": 9.603170230401266, + "learning_rate": 4.0374325101294576e-07, + "loss": 0.6422, + "step": 12272 + }, + { + "epoch": 1.75, + "grad_norm": 10.22435389674235, + "learning_rate": 4.032887847817729e-07, + "loss": 0.7853, + "step": 12273 + }, + { + "epoch": 1.75, + "grad_norm": 11.53646608235221, + "learning_rate": 4.028345637263803e-07, + "loss": 0.706, + "step": 12274 + }, + { + "epoch": 1.75, + "grad_norm": 7.895621420708505, + "learning_rate": 4.023805878709969e-07, + "loss": 0.6466, + "step": 12275 + }, + { + "epoch": 1.75, + "grad_norm": 9.722461780495882, + "learning_rate": 4.019268572398349e-07, + "loss": 0.7816, + "step": 12276 + }, + { + "epoch": 1.75, + "grad_norm": 11.494320984024336, + "learning_rate": 4.014733718570951e-07, + "loss": 0.672, + "step": 12277 + }, + { + "epoch": 1.75, + "grad_norm": 9.526625935240078, + "learning_rate": 4.010201317469664e-07, + "loss": 0.7109, + "step": 12278 + }, + { + "epoch": 1.75, + "grad_norm": 8.621978027523298, + "learning_rate": 4.0056713693362113e-07, + "loss": 0.7332, + "step": 12279 + }, + { + "epoch": 1.75, + "grad_norm": 11.318066003225352, + "learning_rate": 4.001143874412239e-07, + "loss": 0.718, + "step": 12280 + }, + { + "epoch": 1.75, + "grad_norm": 10.423813354462693, + "learning_rate": 3.9966188329391977e-07, + "loss": 0.7296, + "step": 12281 + }, + { + "epoch": 1.75, + "grad_norm": 8.410855610175343, + "learning_rate": 3.9920962451584666e-07, 
+ "loss": 0.676, + "step": 12282 + }, + { + "epoch": 1.75, + "grad_norm": 7.189346282095004, + "learning_rate": 3.9875761113112586e-07, + "loss": 0.7134, + "step": 12283 + }, + { + "epoch": 1.75, + "grad_norm": 7.810034498359106, + "learning_rate": 3.983058431638659e-07, + "loss": 0.6291, + "step": 12284 + }, + { + "epoch": 1.75, + "grad_norm": 7.474054470389598, + "learning_rate": 3.9785432063816297e-07, + "loss": 0.6285, + "step": 12285 + }, + { + "epoch": 1.75, + "grad_norm": 13.518833125226816, + "learning_rate": 3.9740304357810123e-07, + "loss": 0.7016, + "step": 12286 + }, + { + "epoch": 1.75, + "grad_norm": 8.587769417719944, + "learning_rate": 3.9695201200774914e-07, + "loss": 0.6826, + "step": 12287 + }, + { + "epoch": 1.75, + "grad_norm": 12.169273797440887, + "learning_rate": 3.9650122595116467e-07, + "loss": 0.6435, + "step": 12288 + }, + { + "epoch": 1.75, + "grad_norm": 9.756595946113881, + "learning_rate": 3.960506854323909e-07, + "loss": 0.6207, + "step": 12289 + }, + { + "epoch": 1.75, + "grad_norm": 11.580629156584383, + "learning_rate": 3.9560039047545797e-07, + "loss": 0.7249, + "step": 12290 + }, + { + "epoch": 1.75, + "grad_norm": 10.738367878697623, + "learning_rate": 3.951503411043839e-07, + "loss": 0.7108, + "step": 12291 + }, + { + "epoch": 1.75, + "grad_norm": 9.848838194778669, + "learning_rate": 3.947005373431728e-07, + "loss": 0.7265, + "step": 12292 + }, + { + "epoch": 1.75, + "grad_norm": 9.25565888958741, + "learning_rate": 3.9425097921581667e-07, + "loss": 0.6954, + "step": 12293 + }, + { + "epoch": 1.75, + "grad_norm": 9.196276390215923, + "learning_rate": 3.938016667462918e-07, + "loss": 0.6908, + "step": 12294 + }, + { + "epoch": 1.75, + "grad_norm": 11.824565413044388, + "learning_rate": 3.9335259995856634e-07, + "loss": 0.6733, + "step": 12295 + }, + { + "epoch": 1.75, + "grad_norm": 7.920580464768334, + "learning_rate": 3.929037788765899e-07, + "loss": 0.6649, + "step": 12296 + }, + { + "epoch": 1.75, + "grad_norm": 
9.344399070691919, + "learning_rate": 3.9245520352430124e-07, + "loss": 0.6662, + "step": 12297 + }, + { + "epoch": 1.75, + "grad_norm": 8.488546976225345, + "learning_rate": 3.9200687392562777e-07, + "loss": 0.6787, + "step": 12298 + }, + { + "epoch": 1.75, + "grad_norm": 10.798366247338253, + "learning_rate": 3.915587901044798e-07, + "loss": 0.7101, + "step": 12299 + }, + { + "epoch": 1.75, + "grad_norm": 13.308529529376433, + "learning_rate": 3.911109520847589e-07, + "loss": 0.716, + "step": 12300 + }, + { + "epoch": 1.75, + "grad_norm": 10.414733393631643, + "learning_rate": 3.906633598903514e-07, + "loss": 0.7186, + "step": 12301 + }, + { + "epoch": 1.75, + "grad_norm": 8.227405578658171, + "learning_rate": 3.902160135451305e-07, + "loss": 0.669, + "step": 12302 + }, + { + "epoch": 1.75, + "grad_norm": 12.586436170258759, + "learning_rate": 3.8976891307295596e-07, + "loss": 0.6876, + "step": 12303 + }, + { + "epoch": 1.75, + "grad_norm": 9.98919387128885, + "learning_rate": 3.893220584976737e-07, + "loss": 0.7498, + "step": 12304 + }, + { + "epoch": 1.75, + "grad_norm": 8.595303245391161, + "learning_rate": 3.888754498431191e-07, + "loss": 0.6772, + "step": 12305 + }, + { + "epoch": 1.75, + "grad_norm": 7.83614176414153, + "learning_rate": 3.884290871331142e-07, + "loss": 0.7031, + "step": 12306 + }, + { + "epoch": 1.76, + "grad_norm": 7.666043722084515, + "learning_rate": 3.8798297039146394e-07, + "loss": 0.7024, + "step": 12307 + }, + { + "epoch": 1.76, + "grad_norm": 11.446914352862128, + "learning_rate": 3.8753709964196526e-07, + "loss": 0.6558, + "step": 12308 + }, + { + "epoch": 1.76, + "grad_norm": 9.8544599186821, + "learning_rate": 3.870914749083993e-07, + "loss": 0.7028, + "step": 12309 + }, + { + "epoch": 1.76, + "grad_norm": 10.813282585181641, + "learning_rate": 3.8664609621453244e-07, + "loss": 0.7248, + "step": 12310 + }, + { + "epoch": 1.76, + "grad_norm": 6.718253736730705, + "learning_rate": 3.86200963584123e-07, + "loss": 0.7559, + "step": 
12311 + }, + { + "epoch": 1.76, + "grad_norm": 9.302807579452288, + "learning_rate": 3.857560770409102e-07, + "loss": 0.6466, + "step": 12312 + }, + { + "epoch": 1.76, + "grad_norm": 11.387886810988656, + "learning_rate": 3.853114366086252e-07, + "loss": 0.7079, + "step": 12313 + }, + { + "epoch": 1.76, + "grad_norm": 7.184579202879289, + "learning_rate": 3.8486704231098237e-07, + "loss": 0.6903, + "step": 12314 + }, + { + "epoch": 1.76, + "grad_norm": 7.422829755503691, + "learning_rate": 3.8442289417168655e-07, + "loss": 0.6624, + "step": 12315 + }, + { + "epoch": 1.76, + "grad_norm": 9.18573205534177, + "learning_rate": 3.8397899221442546e-07, + "loss": 0.6367, + "step": 12316 + }, + { + "epoch": 1.76, + "grad_norm": 10.624418554208086, + "learning_rate": 3.835353364628752e-07, + "loss": 0.7601, + "step": 12317 + }, + { + "epoch": 1.76, + "grad_norm": 9.501490155029781, + "learning_rate": 3.830919269407013e-07, + "loss": 0.6896, + "step": 12318 + }, + { + "epoch": 1.76, + "grad_norm": 8.87472633543726, + "learning_rate": 3.8264876367155203e-07, + "loss": 0.7408, + "step": 12319 + }, + { + "epoch": 1.76, + "grad_norm": 8.565356451841273, + "learning_rate": 3.8220584667906457e-07, + "loss": 0.646, + "step": 12320 + }, + { + "epoch": 1.76, + "grad_norm": 8.877387579358482, + "learning_rate": 3.817631759868656e-07, + "loss": 0.7547, + "step": 12321 + }, + { + "epoch": 1.76, + "grad_norm": 8.247964868283512, + "learning_rate": 3.8132075161856243e-07, + "loss": 0.7058, + "step": 12322 + }, + { + "epoch": 1.76, + "grad_norm": 14.073709243686046, + "learning_rate": 3.8087857359775493e-07, + "loss": 0.6412, + "step": 12323 + }, + { + "epoch": 1.76, + "grad_norm": 10.926231136319343, + "learning_rate": 3.8043664194802546e-07, + "loss": 0.735, + "step": 12324 + }, + { + "epoch": 1.76, + "grad_norm": 9.190687372608316, + "learning_rate": 3.7999495669294726e-07, + "loss": 0.6815, + "step": 12325 + }, + { + "epoch": 1.76, + "grad_norm": 9.807163586732067, + "learning_rate": 
3.795535178560794e-07, + "loss": 0.6763, + "step": 12326 + }, + { + "epoch": 1.76, + "grad_norm": 7.695085998275182, + "learning_rate": 3.791123254609658e-07, + "loss": 0.7216, + "step": 12327 + }, + { + "epoch": 1.76, + "grad_norm": 11.85733248697341, + "learning_rate": 3.7867137953113695e-07, + "loss": 0.6515, + "step": 12328 + }, + { + "epoch": 1.76, + "grad_norm": 7.629016092027907, + "learning_rate": 3.782306800901142e-07, + "loss": 0.6705, + "step": 12329 + }, + { + "epoch": 1.76, + "grad_norm": 11.809745381996532, + "learning_rate": 3.7779022716140147e-07, + "loss": 0.6367, + "step": 12330 + }, + { + "epoch": 1.76, + "grad_norm": 12.430514363542889, + "learning_rate": 3.7735002076849324e-07, + "loss": 0.6505, + "step": 12331 + }, + { + "epoch": 1.76, + "grad_norm": 7.605258557721551, + "learning_rate": 3.7691006093486636e-07, + "loss": 0.7128, + "step": 12332 + }, + { + "epoch": 1.76, + "grad_norm": 9.085807639500993, + "learning_rate": 3.764703476839893e-07, + "loss": 0.7113, + "step": 12333 + }, + { + "epoch": 1.76, + "grad_norm": 12.574445293136643, + "learning_rate": 3.7603088103931427e-07, + "loss": 0.6899, + "step": 12334 + }, + { + "epoch": 1.76, + "grad_norm": 7.899618423345074, + "learning_rate": 3.7559166102428044e-07, + "loss": 0.7365, + "step": 12335 + }, + { + "epoch": 1.76, + "grad_norm": 9.480913759935024, + "learning_rate": 3.7515268766231573e-07, + "loss": 0.6545, + "step": 12336 + }, + { + "epoch": 1.76, + "grad_norm": 11.243091321223183, + "learning_rate": 3.7471396097683245e-07, + "loss": 0.6952, + "step": 12337 + }, + { + "epoch": 1.76, + "grad_norm": 11.69236839380246, + "learning_rate": 3.742754809912325e-07, + "loss": 0.7254, + "step": 12338 + }, + { + "epoch": 1.76, + "grad_norm": 6.7923526311674784, + "learning_rate": 3.738372477289032e-07, + "loss": 0.7837, + "step": 12339 + }, + { + "epoch": 1.76, + "grad_norm": 8.880505788069692, + "learning_rate": 3.733992612132176e-07, + "loss": 0.689, + "step": 12340 + }, + { + "epoch": 1.76, 
+ "grad_norm": 8.311557156621642, + "learning_rate": 3.729615214675375e-07, + "loss": 0.6857, + "step": 12341 + }, + { + "epoch": 1.76, + "grad_norm": 7.717399241902393, + "learning_rate": 3.7252402851520986e-07, + "loss": 0.6924, + "step": 12342 + }, + { + "epoch": 1.76, + "grad_norm": 8.15452772744557, + "learning_rate": 3.720867823795699e-07, + "loss": 0.6863, + "step": 12343 + }, + { + "epoch": 1.76, + "grad_norm": 7.635186591371262, + "learning_rate": 3.716497830839394e-07, + "loss": 0.6657, + "step": 12344 + }, + { + "epoch": 1.76, + "grad_norm": 7.997898146055179, + "learning_rate": 3.7121303065162597e-07, + "loss": 0.664, + "step": 12345 + }, + { + "epoch": 1.76, + "grad_norm": 11.777275650260872, + "learning_rate": 3.707765251059253e-07, + "loss": 0.6999, + "step": 12346 + }, + { + "epoch": 1.76, + "grad_norm": 9.30113584895823, + "learning_rate": 3.7034026647012e-07, + "loss": 0.7086, + "step": 12347 + }, + { + "epoch": 1.76, + "grad_norm": 8.458240562930362, + "learning_rate": 3.6990425476747694e-07, + "loss": 0.6964, + "step": 12348 + }, + { + "epoch": 1.76, + "grad_norm": 12.110007945076616, + "learning_rate": 3.694684900212542e-07, + "loss": 0.7088, + "step": 12349 + }, + { + "epoch": 1.76, + "grad_norm": 9.116628903111478, + "learning_rate": 3.6903297225469206e-07, + "loss": 0.6842, + "step": 12350 + }, + { + "epoch": 1.76, + "grad_norm": 9.047237775560436, + "learning_rate": 3.685977014910214e-07, + "loss": 0.7432, + "step": 12351 + }, + { + "epoch": 1.76, + "grad_norm": 11.00147900667854, + "learning_rate": 3.6816267775345693e-07, + "loss": 0.6599, + "step": 12352 + }, + { + "epoch": 1.76, + "grad_norm": 9.487063321816771, + "learning_rate": 3.677279010652035e-07, + "loss": 0.6641, + "step": 12353 + }, + { + "epoch": 1.76, + "grad_norm": 12.11474388954308, + "learning_rate": 3.6729337144944976e-07, + "loss": 0.7224, + "step": 12354 + }, + { + "epoch": 1.76, + "grad_norm": 10.732932994661477, + "learning_rate": 3.668590889293711e-07, + "loss": 
0.7037, + "step": 12355 + }, + { + "epoch": 1.76, + "grad_norm": 10.207487490348692, + "learning_rate": 3.664250535281333e-07, + "loss": 0.7477, + "step": 12356 + }, + { + "epoch": 1.76, + "grad_norm": 9.197358268707426, + "learning_rate": 3.659912652688846e-07, + "loss": 0.6683, + "step": 12357 + }, + { + "epoch": 1.76, + "grad_norm": 10.269735693982385, + "learning_rate": 3.6555772417476323e-07, + "loss": 0.7413, + "step": 12358 + }, + { + "epoch": 1.76, + "grad_norm": 12.994237517185095, + "learning_rate": 3.651244302688933e-07, + "loss": 0.6445, + "step": 12359 + }, + { + "epoch": 1.76, + "grad_norm": 9.220470327460037, + "learning_rate": 3.646913835743854e-07, + "loss": 0.7476, + "step": 12360 + }, + { + "epoch": 1.76, + "grad_norm": 10.92292587439804, + "learning_rate": 3.6425858411433643e-07, + "loss": 0.7169, + "step": 12361 + }, + { + "epoch": 1.76, + "grad_norm": 10.908619542682576, + "learning_rate": 3.638260319118303e-07, + "loss": 0.7235, + "step": 12362 + }, + { + "epoch": 1.76, + "grad_norm": 9.736224309004056, + "learning_rate": 3.6339372698993846e-07, + "loss": 0.7371, + "step": 12363 + }, + { + "epoch": 1.76, + "grad_norm": 11.319572568891148, + "learning_rate": 3.629616693717197e-07, + "loss": 0.7325, + "step": 12364 + }, + { + "epoch": 1.76, + "grad_norm": 8.128841666442417, + "learning_rate": 3.6252985908021844e-07, + "loss": 0.6814, + "step": 12365 + }, + { + "epoch": 1.76, + "grad_norm": 7.855839737383964, + "learning_rate": 3.620982961384661e-07, + "loss": 0.6914, + "step": 12366 + }, + { + "epoch": 1.76, + "grad_norm": 10.89075072652842, + "learning_rate": 3.6166698056948004e-07, + "loss": 0.6473, + "step": 12367 + }, + { + "epoch": 1.76, + "grad_norm": 8.056895922069387, + "learning_rate": 3.6123591239626665e-07, + "loss": 0.6991, + "step": 12368 + }, + { + "epoch": 1.76, + "grad_norm": 8.97479796373606, + "learning_rate": 3.608050916418182e-07, + "loss": 0.7218, + "step": 12369 + }, + { + "epoch": 1.76, + "grad_norm": 10.196302089717319, 
+ "learning_rate": 3.6037451832911187e-07, + "loss": 0.7058, + "step": 12370 + }, + { + "epoch": 1.76, + "grad_norm": 10.820258723474575, + "learning_rate": 3.5994419248111534e-07, + "loss": 0.6553, + "step": 12371 + }, + { + "epoch": 1.76, + "grad_norm": 9.348584408615086, + "learning_rate": 3.5951411412077977e-07, + "loss": 0.6785, + "step": 12372 + }, + { + "epoch": 1.76, + "grad_norm": 11.063927472829365, + "learning_rate": 3.5908428327104396e-07, + "loss": 0.6475, + "step": 12373 + }, + { + "epoch": 1.76, + "grad_norm": 7.108564922877316, + "learning_rate": 3.5865469995483515e-07, + "loss": 0.7079, + "step": 12374 + }, + { + "epoch": 1.76, + "grad_norm": 7.353328970290171, + "learning_rate": 3.582253641950639e-07, + "loss": 0.7132, + "step": 12375 + }, + { + "epoch": 1.76, + "grad_norm": 9.949474239232908, + "learning_rate": 3.5779627601463194e-07, + "loss": 0.7134, + "step": 12376 + }, + { + "epoch": 1.77, + "grad_norm": 9.24671300106227, + "learning_rate": 3.573674354364254e-07, + "loss": 0.6792, + "step": 12377 + }, + { + "epoch": 1.77, + "grad_norm": 8.32418509090095, + "learning_rate": 3.56938842483317e-07, + "loss": 0.7277, + "step": 12378 + }, + { + "epoch": 1.77, + "grad_norm": 8.933661981205269, + "learning_rate": 3.565104971781669e-07, + "loss": 0.6999, + "step": 12379 + }, + { + "epoch": 1.77, + "grad_norm": 11.198391035599059, + "learning_rate": 3.5608239954382116e-07, + "loss": 0.7026, + "step": 12380 + }, + { + "epoch": 1.77, + "grad_norm": 9.558940337893363, + "learning_rate": 3.556545496031133e-07, + "loss": 0.718, + "step": 12381 + }, + { + "epoch": 1.77, + "grad_norm": 8.566816487110382, + "learning_rate": 3.55226947378865e-07, + "loss": 0.701, + "step": 12382 + }, + { + "epoch": 1.77, + "grad_norm": 9.552564226901733, + "learning_rate": 3.547995928938819e-07, + "loss": 0.6872, + "step": 12383 + }, + { + "epoch": 1.77, + "grad_norm": 9.194289344073429, + "learning_rate": 3.543724861709591e-07, + "loss": 0.675, + "step": 12384 + }, + { + 
"epoch": 1.77, + "grad_norm": 10.889606107889172, + "learning_rate": 3.539456272328767e-07, + "loss": 0.6816, + "step": 12385 + }, + { + "epoch": 1.77, + "grad_norm": 10.162130125820953, + "learning_rate": 3.5351901610240155e-07, + "loss": 0.756, + "step": 12386 + }, + { + "epoch": 1.77, + "grad_norm": 9.01004785573082, + "learning_rate": 3.530926528022893e-07, + "loss": 0.7397, + "step": 12387 + }, + { + "epoch": 1.77, + "grad_norm": 6.9956758233109335, + "learning_rate": 3.52666537355279e-07, + "loss": 0.7216, + "step": 12388 + }, + { + "epoch": 1.77, + "grad_norm": 8.341344734497794, + "learning_rate": 3.5224066978410076e-07, + "loss": 0.7367, + "step": 12389 + }, + { + "epoch": 1.77, + "grad_norm": 10.107109167535667, + "learning_rate": 3.5181505011146755e-07, + "loss": 0.6709, + "step": 12390 + }, + { + "epoch": 1.77, + "grad_norm": 10.669581707004518, + "learning_rate": 3.513896783600818e-07, + "loss": 0.7263, + "step": 12391 + }, + { + "epoch": 1.77, + "grad_norm": 7.714309563801411, + "learning_rate": 3.509645545526308e-07, + "loss": 0.6663, + "step": 12392 + }, + { + "epoch": 1.77, + "grad_norm": 11.360205704857893, + "learning_rate": 3.5053967871178927e-07, + "loss": 0.7328, + "step": 12393 + }, + { + "epoch": 1.77, + "grad_norm": 8.443022072830187, + "learning_rate": 3.5011505086022025e-07, + "loss": 0.727, + "step": 12394 + }, + { + "epoch": 1.77, + "grad_norm": 9.067503650421932, + "learning_rate": 3.496906710205705e-07, + "loss": 0.6666, + "step": 12395 + }, + { + "epoch": 1.77, + "grad_norm": 7.489476522002467, + "learning_rate": 3.492665392154759e-07, + "loss": 0.701, + "step": 12396 + }, + { + "epoch": 1.77, + "grad_norm": 9.679890426468244, + "learning_rate": 3.4884265546756e-07, + "loss": 0.6525, + "step": 12397 + }, + { + "epoch": 1.77, + "grad_norm": 12.314588560963676, + "learning_rate": 3.4841901979942973e-07, + "loss": 0.6272, + "step": 12398 + }, + { + "epoch": 1.77, + "grad_norm": 8.98714169466296, + "learning_rate": 3.479956322336819e-07, 
+ "loss": 0.7058, + "step": 12399 + }, + { + "epoch": 1.77, + "grad_norm": 8.918103758491636, + "learning_rate": 3.475724927928964e-07, + "loss": 0.6978, + "step": 12400 + }, + { + "epoch": 1.77, + "grad_norm": 8.941751169899668, + "learning_rate": 3.471496014996445e-07, + "loss": 0.7363, + "step": 12401 + }, + { + "epoch": 1.77, + "grad_norm": 7.139805218063829, + "learning_rate": 3.4672695837648265e-07, + "loss": 0.7075, + "step": 12402 + }, + { + "epoch": 1.77, + "grad_norm": 10.361444956686395, + "learning_rate": 3.463045634459511e-07, + "loss": 0.672, + "step": 12403 + }, + { + "epoch": 1.77, + "grad_norm": 8.390805486119195, + "learning_rate": 3.4588241673058243e-07, + "loss": 0.7751, + "step": 12404 + }, + { + "epoch": 1.77, + "grad_norm": 9.082682476864319, + "learning_rate": 3.454605182528892e-07, + "loss": 0.6854, + "step": 12405 + }, + { + "epoch": 1.77, + "grad_norm": 13.815946723829196, + "learning_rate": 3.450388680353756e-07, + "loss": 0.6845, + "step": 12406 + }, + { + "epoch": 1.77, + "grad_norm": 8.942043500972188, + "learning_rate": 3.446174661005325e-07, + "loss": 0.6667, + "step": 12407 + }, + { + "epoch": 1.77, + "grad_norm": 8.960035753769303, + "learning_rate": 3.4419631247083474e-07, + "loss": 0.6184, + "step": 12408 + }, + { + "epoch": 1.77, + "grad_norm": 8.438822600444631, + "learning_rate": 3.437754071687471e-07, + "loss": 0.7083, + "step": 12409 + }, + { + "epoch": 1.77, + "grad_norm": 8.041625907480212, + "learning_rate": 3.433547502167184e-07, + "loss": 0.682, + "step": 12410 + }, + { + "epoch": 1.77, + "grad_norm": 11.491738873693537, + "learning_rate": 3.4293434163718444e-07, + "loss": 0.7582, + "step": 12411 + }, + { + "epoch": 1.77, + "grad_norm": 7.449794659610502, + "learning_rate": 3.4251418145257064e-07, + "loss": 0.6992, + "step": 12412 + }, + { + "epoch": 1.77, + "grad_norm": 8.784956462486713, + "learning_rate": 3.4209426968528527e-07, + "loss": 0.7887, + "step": 12413 + }, + { + "epoch": 1.77, + "grad_norm": 
13.06100895849719, + "learning_rate": 3.416746063577264e-07, + "loss": 0.7391, + "step": 12414 + }, + { + "epoch": 1.77, + "grad_norm": 9.700579907365567, + "learning_rate": 3.412551914922785e-07, + "loss": 0.6564, + "step": 12415 + }, + { + "epoch": 1.77, + "grad_norm": 7.4713615211348765, + "learning_rate": 3.408360251113102e-07, + "loss": 0.6634, + "step": 12416 + }, + { + "epoch": 1.77, + "grad_norm": 8.295768353093166, + "learning_rate": 3.4041710723718036e-07, + "loss": 0.7166, + "step": 12417 + }, + { + "epoch": 1.77, + "grad_norm": 8.579959192326578, + "learning_rate": 3.3999843789223054e-07, + "loss": 0.7795, + "step": 12418 + }, + { + "epoch": 1.77, + "grad_norm": 7.705277146327307, + "learning_rate": 3.395800170987928e-07, + "loss": 0.7098, + "step": 12419 + }, + { + "epoch": 1.77, + "grad_norm": 12.29103824577064, + "learning_rate": 3.3916184487918615e-07, + "loss": 0.6713, + "step": 12420 + }, + { + "epoch": 1.77, + "grad_norm": 8.40847559445471, + "learning_rate": 3.387439212557114e-07, + "loss": 0.6699, + "step": 12421 + }, + { + "epoch": 1.77, + "grad_norm": 9.278190806938943, + "learning_rate": 3.383262462506626e-07, + "loss": 0.7468, + "step": 12422 + }, + { + "epoch": 1.77, + "grad_norm": 7.323023607893316, + "learning_rate": 3.379088198863162e-07, + "loss": 0.7454, + "step": 12423 + }, + { + "epoch": 1.77, + "grad_norm": 8.005956474478138, + "learning_rate": 3.37491642184935e-07, + "loss": 0.6558, + "step": 12424 + }, + { + "epoch": 1.77, + "grad_norm": 10.148855994475328, + "learning_rate": 3.3707471316877236e-07, + "loss": 0.7184, + "step": 12425 + }, + { + "epoch": 1.77, + "grad_norm": 7.183595050703525, + "learning_rate": 3.366580328600644e-07, + "loss": 0.7285, + "step": 12426 + }, + { + "epoch": 1.77, + "grad_norm": 10.147574936894836, + "learning_rate": 3.362416012810371e-07, + "loss": 0.6346, + "step": 12427 + }, + { + "epoch": 1.77, + "grad_norm": 9.081920406315028, + "learning_rate": 3.358254184539006e-07, + "loss": 0.6808, + "step": 
12428 + }, + { + "epoch": 1.77, + "grad_norm": 10.658313743514205, + "learning_rate": 3.354094844008543e-07, + "loss": 0.7354, + "step": 12429 + }, + { + "epoch": 1.77, + "grad_norm": 7.099998132191995, + "learning_rate": 3.349937991440816e-07, + "loss": 0.6494, + "step": 12430 + }, + { + "epoch": 1.77, + "grad_norm": 8.023762827558537, + "learning_rate": 3.3457836270575416e-07, + "loss": 0.7016, + "step": 12431 + }, + { + "epoch": 1.77, + "grad_norm": 8.268088934655976, + "learning_rate": 3.3416317510803153e-07, + "loss": 0.8021, + "step": 12432 + }, + { + "epoch": 1.77, + "grad_norm": 10.466405937334104, + "learning_rate": 3.3374823637305654e-07, + "loss": 0.8052, + "step": 12433 + }, + { + "epoch": 1.77, + "grad_norm": 10.6563312073648, + "learning_rate": 3.3333354652296204e-07, + "loss": 0.7546, + "step": 12434 + }, + { + "epoch": 1.77, + "grad_norm": 7.795987859581582, + "learning_rate": 3.329191055798675e-07, + "loss": 0.724, + "step": 12435 + }, + { + "epoch": 1.77, + "grad_norm": 8.656813466005241, + "learning_rate": 3.3250491356587643e-07, + "loss": 0.6377, + "step": 12436 + }, + { + "epoch": 1.77, + "grad_norm": 14.22714229513738, + "learning_rate": 3.320909705030817e-07, + "loss": 0.6626, + "step": 12437 + }, + { + "epoch": 1.77, + "grad_norm": 10.840892351521067, + "learning_rate": 3.3167727641356064e-07, + "loss": 0.7155, + "step": 12438 + }, + { + "epoch": 1.77, + "grad_norm": 10.535742106394457, + "learning_rate": 3.312638313193789e-07, + "loss": 0.7648, + "step": 12439 + }, + { + "epoch": 1.77, + "grad_norm": 9.276685479646048, + "learning_rate": 3.3085063524258997e-07, + "loss": 0.7269, + "step": 12440 + }, + { + "epoch": 1.77, + "grad_norm": 11.37232315460902, + "learning_rate": 3.304376882052307e-07, + "loss": 0.7033, + "step": 12441 + }, + { + "epoch": 1.77, + "grad_norm": 9.07735819919218, + "learning_rate": 3.3002499022932785e-07, + "loss": 0.6974, + "step": 12442 + }, + { + "epoch": 1.77, + "grad_norm": 6.074407475870986, + "learning_rate": 
3.2961254133689333e-07, + "loss": 0.6586, + "step": 12443 + }, + { + "epoch": 1.77, + "grad_norm": 9.042569125343531, + "learning_rate": 3.292003415499245e-07, + "loss": 0.736, + "step": 12444 + }, + { + "epoch": 1.77, + "grad_norm": 10.920841111443155, + "learning_rate": 3.2878839089040935e-07, + "loss": 0.6881, + "step": 12445 + }, + { + "epoch": 1.77, + "grad_norm": 8.470786951041756, + "learning_rate": 3.2837668938031863e-07, + "loss": 0.7279, + "step": 12446 + }, + { + "epoch": 1.78, + "grad_norm": 8.7830134834457, + "learning_rate": 3.27965237041612e-07, + "loss": 0.7581, + "step": 12447 + }, + { + "epoch": 1.78, + "grad_norm": 8.946243686418331, + "learning_rate": 3.275540338962352e-07, + "loss": 0.7229, + "step": 12448 + }, + { + "epoch": 1.78, + "grad_norm": 8.370550655224072, + "learning_rate": 3.2714307996611906e-07, + "loss": 0.714, + "step": 12449 + }, + { + "epoch": 1.78, + "grad_norm": 7.8850146075352345, + "learning_rate": 3.267323752731855e-07, + "loss": 0.7089, + "step": 12450 + }, + { + "epoch": 1.78, + "grad_norm": 10.280468228225619, + "learning_rate": 3.26321919839338e-07, + "loss": 0.7426, + "step": 12451 + }, + { + "epoch": 1.78, + "grad_norm": 8.130989468756448, + "learning_rate": 3.2591171368647034e-07, + "loss": 0.6758, + "step": 12452 + }, + { + "epoch": 1.78, + "grad_norm": 10.274854347625807, + "learning_rate": 3.2550175683646104e-07, + "loss": 0.7537, + "step": 12453 + }, + { + "epoch": 1.78, + "grad_norm": 8.289154634554372, + "learning_rate": 3.25092049311177e-07, + "loss": 0.689, + "step": 12454 + }, + { + "epoch": 1.78, + "grad_norm": 11.01595657997614, + "learning_rate": 3.246825911324708e-07, + "loss": 0.6686, + "step": 12455 + }, + { + "epoch": 1.78, + "grad_norm": 9.855554996397824, + "learning_rate": 3.2427338232218054e-07, + "loss": 0.6869, + "step": 12456 + }, + { + "epoch": 1.78, + "grad_norm": 11.553429160045615, + "learning_rate": 3.238644229021326e-07, + "loss": 0.7411, + "step": 12457 + }, + { + "epoch": 1.78, + 
"grad_norm": 8.949453332049174, + "learning_rate": 3.2345571289414124e-07, + "loss": 0.7386, + "step": 12458 + }, + { + "epoch": 1.78, + "grad_norm": 10.689051074106954, + "learning_rate": 3.2304725232000456e-07, + "loss": 0.6586, + "step": 12459 + }, + { + "epoch": 1.78, + "grad_norm": 9.235625467319514, + "learning_rate": 3.2263904120150903e-07, + "loss": 0.7017, + "step": 12460 + }, + { + "epoch": 1.78, + "grad_norm": 9.843404166829957, + "learning_rate": 3.2223107956042833e-07, + "loss": 0.6844, + "step": 12461 + }, + { + "epoch": 1.78, + "grad_norm": 11.856741001154457, + "learning_rate": 3.2182336741852003e-07, + "loss": 0.7014, + "step": 12462 + }, + { + "epoch": 1.78, + "grad_norm": 9.491609979215454, + "learning_rate": 3.214159047975324e-07, + "loss": 0.6911, + "step": 12463 + }, + { + "epoch": 1.78, + "grad_norm": 9.53758672195733, + "learning_rate": 3.2100869171919625e-07, + "loss": 0.7014, + "step": 12464 + }, + { + "epoch": 1.78, + "grad_norm": 9.193158069599951, + "learning_rate": 3.206017282052337e-07, + "loss": 0.7314, + "step": 12465 + }, + { + "epoch": 1.78, + "grad_norm": 8.261316438887157, + "learning_rate": 3.2019501427734854e-07, + "loss": 0.6777, + "step": 12466 + }, + { + "epoch": 1.78, + "grad_norm": 8.57269186353479, + "learning_rate": 3.197885499572362e-07, + "loss": 0.7504, + "step": 12467 + }, + { + "epoch": 1.78, + "grad_norm": 6.835944265626216, + "learning_rate": 3.1938233526657534e-07, + "loss": 0.6623, + "step": 12468 + }, + { + "epoch": 1.78, + "grad_norm": 9.695084751141279, + "learning_rate": 3.18976370227031e-07, + "loss": 0.6264, + "step": 12469 + }, + { + "epoch": 1.78, + "grad_norm": 9.979151728634788, + "learning_rate": 3.1857065486025795e-07, + "loss": 0.7259, + "step": 12470 + }, + { + "epoch": 1.78, + "grad_norm": 10.844818629993012, + "learning_rate": 3.1816518918789506e-07, + "loss": 0.7567, + "step": 12471 + }, + { + "epoch": 1.78, + "grad_norm": 9.855170914254467, + "learning_rate": 3.177599732315684e-07, + "loss": 
0.7911, + "step": 12472 + }, + { + "epoch": 1.78, + "grad_norm": 10.054471381534208, + "learning_rate": 3.1735500701289277e-07, + "loss": 0.7228, + "step": 12473 + }, + { + "epoch": 1.78, + "grad_norm": 7.715670737452695, + "learning_rate": 3.169502905534666e-07, + "loss": 0.7322, + "step": 12474 + }, + { + "epoch": 1.78, + "grad_norm": 14.47066827907629, + "learning_rate": 3.16545823874877e-07, + "loss": 0.7315, + "step": 12475 + }, + { + "epoch": 1.78, + "grad_norm": 11.173654859366906, + "learning_rate": 3.1614160699869555e-07, + "loss": 0.7531, + "step": 12476 + }, + { + "epoch": 1.78, + "grad_norm": 9.761873798654296, + "learning_rate": 3.157376399464834e-07, + "loss": 0.7638, + "step": 12477 + }, + { + "epoch": 1.78, + "grad_norm": 10.373639736604519, + "learning_rate": 3.153339227397878e-07, + "loss": 0.6935, + "step": 12478 + }, + { + "epoch": 1.78, + "grad_norm": 9.49811712839271, + "learning_rate": 3.149304554001398e-07, + "loss": 0.7264, + "step": 12479 + }, + { + "epoch": 1.78, + "grad_norm": 9.025291984436368, + "learning_rate": 3.14527237949061e-07, + "loss": 0.7279, + "step": 12480 + }, + { + "epoch": 1.78, + "grad_norm": 11.86193129789721, + "learning_rate": 3.1412427040805716e-07, + "loss": 0.785, + "step": 12481 + }, + { + "epoch": 1.78, + "grad_norm": 9.747146626044643, + "learning_rate": 3.1372155279862094e-07, + "loss": 0.661, + "step": 12482 + }, + { + "epoch": 1.78, + "grad_norm": 10.771632471270495, + "learning_rate": 3.133190851422335e-07, + "loss": 0.8089, + "step": 12483 + }, + { + "epoch": 1.78, + "grad_norm": 8.009396745967276, + "learning_rate": 3.1291686746035996e-07, + "loss": 0.866, + "step": 12484 + }, + { + "epoch": 1.78, + "grad_norm": 14.390580303092664, + "learning_rate": 3.125148997744548e-07, + "loss": 0.7867, + "step": 12485 + }, + { + "epoch": 1.78, + "grad_norm": 9.575992162501489, + "learning_rate": 3.121131821059564e-07, + "loss": 0.6779, + "step": 12486 + }, + { + "epoch": 1.78, + "grad_norm": 11.300001461751483, + 
"learning_rate": 3.1171171447629266e-07, + "loss": 0.6228, + "step": 12487 + }, + { + "epoch": 1.78, + "grad_norm": 8.849084983674231, + "learning_rate": 3.113104969068759e-07, + "loss": 0.7312, + "step": 12488 + }, + { + "epoch": 1.78, + "grad_norm": 9.497133084392505, + "learning_rate": 3.1090952941910614e-07, + "loss": 0.6987, + "step": 12489 + }, + { + "epoch": 1.78, + "grad_norm": 8.929355165504944, + "learning_rate": 3.1050881203437024e-07, + "loss": 0.7, + "step": 12490 + }, + { + "epoch": 1.78, + "grad_norm": 11.72992096125006, + "learning_rate": 3.101083447740399e-07, + "loss": 0.7045, + "step": 12491 + }, + { + "epoch": 1.78, + "grad_norm": 7.6909889580500295, + "learning_rate": 3.0970812765947764e-07, + "loss": 0.6669, + "step": 12492 + }, + { + "epoch": 1.78, + "grad_norm": 7.204788260981977, + "learning_rate": 3.0930816071202797e-07, + "loss": 0.7001, + "step": 12493 + }, + { + "epoch": 1.78, + "grad_norm": 8.698361164376307, + "learning_rate": 3.0890844395302323e-07, + "loss": 0.6776, + "step": 12494 + }, + { + "epoch": 1.78, + "grad_norm": 9.764400366410253, + "learning_rate": 3.0850897740378584e-07, + "loss": 0.6906, + "step": 12495 + }, + { + "epoch": 1.78, + "grad_norm": 9.349711504460947, + "learning_rate": 3.081097610856193e-07, + "loss": 0.7141, + "step": 12496 + }, + { + "epoch": 1.78, + "grad_norm": 9.003205134866754, + "learning_rate": 3.0771079501981826e-07, + "loss": 0.6946, + "step": 12497 + }, + { + "epoch": 1.78, + "grad_norm": 10.731284021113726, + "learning_rate": 3.073120792276635e-07, + "loss": 0.6546, + "step": 12498 + }, + { + "epoch": 1.78, + "grad_norm": 9.516973920043426, + "learning_rate": 3.069136137304202e-07, + "loss": 0.7046, + "step": 12499 + }, + { + "epoch": 1.78, + "grad_norm": 8.192196516707572, + "learning_rate": 3.065153985493402e-07, + "loss": 0.6793, + "step": 12500 + }, + { + "epoch": 1.78, + "grad_norm": 12.38224905726931, + "learning_rate": 3.061174337056655e-07, + "loss": 0.7222, + "step": 12501 + }, + { + 
"epoch": 1.78, + "grad_norm": 11.647802032771288, + "learning_rate": 3.057197192206207e-07, + "loss": 0.7187, + "step": 12502 + }, + { + "epoch": 1.78, + "grad_norm": 6.018344504099764, + "learning_rate": 3.0532225511542e-07, + "loss": 0.6848, + "step": 12503 + }, + { + "epoch": 1.78, + "grad_norm": 8.53855475513732, + "learning_rate": 3.04925041411262e-07, + "loss": 0.666, + "step": 12504 + }, + { + "epoch": 1.78, + "grad_norm": 7.3403162746879485, + "learning_rate": 3.0452807812933415e-07, + "loss": 0.6698, + "step": 12505 + }, + { + "epoch": 1.78, + "grad_norm": 10.12472497147687, + "learning_rate": 3.041313652908084e-07, + "loss": 0.6912, + "step": 12506 + }, + { + "epoch": 1.78, + "grad_norm": 10.471772109060465, + "learning_rate": 3.0373490291684405e-07, + "loss": 0.7379, + "step": 12507 + }, + { + "epoch": 1.78, + "grad_norm": 8.148793657163937, + "learning_rate": 3.033386910285885e-07, + "loss": 0.6466, + "step": 12508 + }, + { + "epoch": 1.78, + "grad_norm": 11.105784901188406, + "learning_rate": 3.029427296471732e-07, + "loss": 0.6519, + "step": 12509 + }, + { + "epoch": 1.78, + "grad_norm": 8.975792388371268, + "learning_rate": 3.025470187937185e-07, + "loss": 0.6418, + "step": 12510 + }, + { + "epoch": 1.78, + "grad_norm": 9.663006096076835, + "learning_rate": 3.0215155848933086e-07, + "loss": 0.7354, + "step": 12511 + }, + { + "epoch": 1.78, + "grad_norm": 8.290807666237367, + "learning_rate": 3.0175634875510283e-07, + "loss": 0.7279, + "step": 12512 + }, + { + "epoch": 1.78, + "grad_norm": 5.8923378395829, + "learning_rate": 3.0136138961211313e-07, + "loss": 0.7393, + "step": 12513 + }, + { + "epoch": 1.78, + "grad_norm": 10.183083246639427, + "learning_rate": 3.0096668108142766e-07, + "loss": 0.6831, + "step": 12514 + }, + { + "epoch": 1.78, + "grad_norm": 10.766541737305909, + "learning_rate": 3.005722231840996e-07, + "loss": 0.6326, + "step": 12515 + }, + { + "epoch": 1.78, + "grad_norm": 7.773239225728028, + "learning_rate": 3.001780159411688e-07, 
+ "loss": 0.7038, + "step": 12516 + }, + { + "epoch": 1.79, + "grad_norm": 7.296064079504447, + "learning_rate": 2.9978405937366007e-07, + "loss": 0.5992, + "step": 12517 + }, + { + "epoch": 1.79, + "grad_norm": 8.826594813365297, + "learning_rate": 2.9939035350258715e-07, + "loss": 0.6835, + "step": 12518 + }, + { + "epoch": 1.79, + "grad_norm": 8.976373822950809, + "learning_rate": 2.989968983489483e-07, + "loss": 0.7552, + "step": 12519 + }, + { + "epoch": 1.79, + "grad_norm": 8.753512790152735, + "learning_rate": 2.986036939337289e-07, + "loss": 0.6997, + "step": 12520 + }, + { + "epoch": 1.79, + "grad_norm": 9.035894319458016, + "learning_rate": 2.982107402779033e-07, + "loss": 0.7093, + "step": 12521 + }, + { + "epoch": 1.79, + "grad_norm": 11.337231726915997, + "learning_rate": 2.978180374024281e-07, + "loss": 0.7622, + "step": 12522 + }, + { + "epoch": 1.79, + "grad_norm": 8.996394521746769, + "learning_rate": 2.9742558532825095e-07, + "loss": 0.679, + "step": 12523 + }, + { + "epoch": 1.79, + "grad_norm": 10.238655166822054, + "learning_rate": 2.970333840763023e-07, + "loss": 0.6549, + "step": 12524 + }, + { + "epoch": 1.79, + "grad_norm": 10.001366491412195, + "learning_rate": 2.966414336675033e-07, + "loss": 0.721, + "step": 12525 + }, + { + "epoch": 1.79, + "grad_norm": 5.567586503836447, + "learning_rate": 2.9624973412275827e-07, + "loss": 0.6637, + "step": 12526 + }, + { + "epoch": 1.79, + "grad_norm": 9.804326328649989, + "learning_rate": 2.958582854629588e-07, + "loss": 0.6427, + "step": 12527 + }, + { + "epoch": 1.79, + "grad_norm": 8.867099442200987, + "learning_rate": 2.954670877089849e-07, + "loss": 0.7003, + "step": 12528 + }, + { + "epoch": 1.79, + "grad_norm": 8.49406334602729, + "learning_rate": 2.9507614088170046e-07, + "loss": 0.6836, + "step": 12529 + }, + { + "epoch": 1.79, + "grad_norm": 10.555855955833469, + "learning_rate": 2.946854450019587e-07, + "loss": 0.6066, + "step": 12530 + }, + { + "epoch": 1.79, + "grad_norm": 
9.624405740902489, + "learning_rate": 2.9429500009059976e-07, + "loss": 0.6845, + "step": 12531 + }, + { + "epoch": 1.79, + "grad_norm": 6.946131915343168, + "learning_rate": 2.939048061684452e-07, + "loss": 0.647, + "step": 12532 + }, + { + "epoch": 1.79, + "grad_norm": 8.961714466879144, + "learning_rate": 2.935148632563095e-07, + "loss": 0.7397, + "step": 12533 + }, + { + "epoch": 1.79, + "grad_norm": 9.114871681841443, + "learning_rate": 2.931251713749894e-07, + "loss": 0.7684, + "step": 12534 + }, + { + "epoch": 1.79, + "grad_norm": 9.254046574561139, + "learning_rate": 2.927357305452716e-07, + "loss": 0.7063, + "step": 12535 + }, + { + "epoch": 1.79, + "grad_norm": 9.52610114360094, + "learning_rate": 2.9234654078792777e-07, + "loss": 0.6363, + "step": 12536 + }, + { + "epoch": 1.79, + "grad_norm": 11.149202652766077, + "learning_rate": 2.9195760212371526e-07, + "loss": 0.6527, + "step": 12537 + }, + { + "epoch": 1.79, + "grad_norm": 11.063108954966046, + "learning_rate": 2.9156891457337967e-07, + "loss": 0.7616, + "step": 12538 + }, + { + "epoch": 1.79, + "grad_norm": 7.2637669390022594, + "learning_rate": 2.9118047815765107e-07, + "loss": 0.6121, + "step": 12539 + }, + { + "epoch": 1.79, + "grad_norm": 7.425578939767486, + "learning_rate": 2.9079229289724895e-07, + "loss": 0.6791, + "step": 12540 + }, + { + "epoch": 1.79, + "grad_norm": 10.890589647007168, + "learning_rate": 2.9040435881287855e-07, + "loss": 0.7089, + "step": 12541 + }, + { + "epoch": 1.79, + "grad_norm": 7.872682727305478, + "learning_rate": 2.900166759252293e-07, + "loss": 0.7477, + "step": 12542 + }, + { + "epoch": 1.79, + "grad_norm": 6.9870779760936355, + "learning_rate": 2.896292442549814e-07, + "loss": 0.7112, + "step": 12543 + }, + { + "epoch": 1.79, + "grad_norm": 10.101562991703734, + "learning_rate": 2.892420638227983e-07, + "loss": 0.7147, + "step": 12544 + }, + { + "epoch": 1.79, + "grad_norm": 12.927816746569148, + "learning_rate": 2.8885513464933015e-07, + "loss": 0.6901, + 
"step": 12545 + }, + { + "epoch": 1.79, + "grad_norm": 10.563840536563568, + "learning_rate": 2.8846845675521595e-07, + "loss": 0.7653, + "step": 12546 + }, + { + "epoch": 1.79, + "grad_norm": 11.13642211951033, + "learning_rate": 2.880820301610793e-07, + "loss": 0.7406, + "step": 12547 + }, + { + "epoch": 1.79, + "grad_norm": 12.09249164833212, + "learning_rate": 2.876958548875308e-07, + "loss": 0.6989, + "step": 12548 + }, + { + "epoch": 1.79, + "grad_norm": 8.480230586612814, + "learning_rate": 2.873099309551702e-07, + "loss": 0.6592, + "step": 12549 + }, + { + "epoch": 1.79, + "grad_norm": 10.03766275054491, + "learning_rate": 2.869242583845794e-07, + "loss": 0.6694, + "step": 12550 + }, + { + "epoch": 1.79, + "grad_norm": 11.114414065216211, + "learning_rate": 2.8653883719633015e-07, + "loss": 0.6836, + "step": 12551 + }, + { + "epoch": 1.79, + "grad_norm": 12.862723131045067, + "learning_rate": 2.861536674109783e-07, + "loss": 0.6827, + "step": 12552 + }, + { + "epoch": 1.79, + "grad_norm": 9.39365121259628, + "learning_rate": 2.8576874904906905e-07, + "loss": 0.7264, + "step": 12553 + }, + { + "epoch": 1.79, + "grad_norm": 7.633240451091181, + "learning_rate": 2.853840821311327e-07, + "loss": 0.6571, + "step": 12554 + }, + { + "epoch": 1.79, + "grad_norm": 8.854741389816578, + "learning_rate": 2.849996666776861e-07, + "loss": 0.667, + "step": 12555 + }, + { + "epoch": 1.79, + "grad_norm": 10.416171642828385, + "learning_rate": 2.8461550270923345e-07, + "loss": 0.6617, + "step": 12556 + }, + { + "epoch": 1.79, + "grad_norm": 9.8140963930447, + "learning_rate": 2.8423159024626445e-07, + "loss": 0.6487, + "step": 12557 + }, + { + "epoch": 1.79, + "grad_norm": 10.257864585923405, + "learning_rate": 2.8384792930925554e-07, + "loss": 0.6476, + "step": 12558 + }, + { + "epoch": 1.79, + "grad_norm": 8.95046753788039, + "learning_rate": 2.8346451991867087e-07, + "loss": 0.6308, + "step": 12559 + }, + { + "epoch": 1.79, + "grad_norm": 10.264189080292452, + 
"learning_rate": 2.8308136209495963e-07, + "loss": 0.7084, + "step": 12560 + }, + { + "epoch": 1.79, + "grad_norm": 7.920066998772852, + "learning_rate": 2.826984558585599e-07, + "loss": 0.6262, + "step": 12561 + }, + { + "epoch": 1.79, + "grad_norm": 9.558042744842401, + "learning_rate": 2.8231580122989265e-07, + "loss": 0.7251, + "step": 12562 + }, + { + "epoch": 1.79, + "grad_norm": 9.667548458212732, + "learning_rate": 2.8193339822936983e-07, + "loss": 0.6788, + "step": 12563 + }, + { + "epoch": 1.79, + "grad_norm": 8.870110914724126, + "learning_rate": 2.815512468773868e-07, + "loss": 0.7397, + "step": 12564 + }, + { + "epoch": 1.79, + "grad_norm": 9.403967213398408, + "learning_rate": 2.8116934719432554e-07, + "loss": 0.7626, + "step": 12565 + }, + { + "epoch": 1.79, + "grad_norm": 11.12187610648925, + "learning_rate": 2.8078769920055647e-07, + "loss": 0.7275, + "step": 12566 + }, + { + "epoch": 1.79, + "grad_norm": 11.60544667041035, + "learning_rate": 2.8040630291643553e-07, + "loss": 0.7522, + "step": 12567 + }, + { + "epoch": 1.79, + "grad_norm": 8.472969050913067, + "learning_rate": 2.800251583623054e-07, + "loss": 0.7154, + "step": 12568 + }, + { + "epoch": 1.79, + "grad_norm": 8.631123107664507, + "learning_rate": 2.7964426555849524e-07, + "loss": 0.7042, + "step": 12569 + }, + { + "epoch": 1.79, + "grad_norm": 13.623795686701554, + "learning_rate": 2.792636245253211e-07, + "loss": 0.6687, + "step": 12570 + }, + { + "epoch": 1.79, + "grad_norm": 9.527475723776899, + "learning_rate": 2.7888323528308503e-07, + "loss": 0.6884, + "step": 12571 + }, + { + "epoch": 1.79, + "grad_norm": 10.295787408825596, + "learning_rate": 2.785030978520753e-07, + "loss": 0.6751, + "step": 12572 + }, + { + "epoch": 1.79, + "grad_norm": 11.473027923090195, + "learning_rate": 2.7812321225256787e-07, + "loss": 0.7004, + "step": 12573 + }, + { + "epoch": 1.79, + "grad_norm": 10.480592048491483, + "learning_rate": 2.7774357850482545e-07, + "loss": 0.6208, + "step": 12574 + }, + 
{ + "epoch": 1.79, + "grad_norm": 10.491640306742918, + "learning_rate": 2.7736419662909573e-07, + "loss": 0.698, + "step": 12575 + }, + { + "epoch": 1.79, + "grad_norm": 11.244179919052693, + "learning_rate": 2.7698506664561475e-07, + "loss": 0.6615, + "step": 12576 + }, + { + "epoch": 1.79, + "grad_norm": 9.11999149920854, + "learning_rate": 2.76606188574603e-07, + "loss": 0.7307, + "step": 12577 + }, + { + "epoch": 1.79, + "grad_norm": 6.9696717811481115, + "learning_rate": 2.7622756243626927e-07, + "loss": 0.7089, + "step": 12578 + }, + { + "epoch": 1.79, + "grad_norm": 10.256210030565597, + "learning_rate": 2.758491882508096e-07, + "loss": 0.7471, + "step": 12579 + }, + { + "epoch": 1.79, + "grad_norm": 7.643552261643118, + "learning_rate": 2.754710660384036e-07, + "loss": 0.7256, + "step": 12580 + }, + { + "epoch": 1.79, + "grad_norm": 9.523890177148765, + "learning_rate": 2.7509319581922047e-07, + "loss": 0.7318, + "step": 12581 + }, + { + "epoch": 1.79, + "grad_norm": 12.16431810787137, + "learning_rate": 2.747155776134147e-07, + "loss": 0.7532, + "step": 12582 + }, + { + "epoch": 1.79, + "grad_norm": 9.658784533416629, + "learning_rate": 2.7433821144112637e-07, + "loss": 0.7353, + "step": 12583 + }, + { + "epoch": 1.79, + "grad_norm": 11.314910773819513, + "learning_rate": 2.739610973224849e-07, + "loss": 0.72, + "step": 12584 + }, + { + "epoch": 1.79, + "grad_norm": 7.7925017132297105, + "learning_rate": 2.7358423527760245e-07, + "loss": 0.6747, + "step": 12585 + }, + { + "epoch": 1.79, + "grad_norm": 11.195415112752398, + "learning_rate": 2.7320762532658087e-07, + "loss": 0.7514, + "step": 12586 + }, + { + "epoch": 1.8, + "grad_norm": 9.511334429061659, + "learning_rate": 2.7283126748950837e-07, + "loss": 0.6958, + "step": 12587 + }, + { + "epoch": 1.8, + "grad_norm": 8.358057791684967, + "learning_rate": 2.7245516178645735e-07, + "loss": 0.6152, + "step": 12588 + }, + { + "epoch": 1.8, + "grad_norm": 8.698603899301526, + "learning_rate": 
2.720793082374895e-07, + "loss": 0.7001, + "step": 12589 + }, + { + "epoch": 1.8, + "grad_norm": 8.107236216259187, + "learning_rate": 2.7170370686264993e-07, + "loss": 0.672, + "step": 12590 + }, + { + "epoch": 1.8, + "grad_norm": 7.251870917311939, + "learning_rate": 2.7132835768197307e-07, + "loss": 0.7072, + "step": 12591 + }, + { + "epoch": 1.8, + "grad_norm": 10.835200411736107, + "learning_rate": 2.709532607154808e-07, + "loss": 0.7357, + "step": 12592 + }, + { + "epoch": 1.8, + "grad_norm": 10.268438082869404, + "learning_rate": 2.7057841598317703e-07, + "loss": 0.6465, + "step": 12593 + }, + { + "epoch": 1.8, + "grad_norm": 14.99887781630863, + "learning_rate": 2.702038235050564e-07, + "loss": 0.7381, + "step": 12594 + }, + { + "epoch": 1.8, + "grad_norm": 10.962140155650452, + "learning_rate": 2.6982948330109894e-07, + "loss": 0.7266, + "step": 12595 + }, + { + "epoch": 1.8, + "grad_norm": 8.950218964554313, + "learning_rate": 2.694553953912693e-07, + "loss": 0.6888, + "step": 12596 + }, + { + "epoch": 1.8, + "grad_norm": 7.576826604269967, + "learning_rate": 2.690815597955221e-07, + "loss": 0.6701, + "step": 12597 + }, + { + "epoch": 1.8, + "grad_norm": 8.884921210956193, + "learning_rate": 2.6870797653379523e-07, + "loss": 0.7153, + "step": 12598 + }, + { + "epoch": 1.8, + "grad_norm": 8.276968125642547, + "learning_rate": 2.683346456260161e-07, + "loss": 0.7091, + "step": 12599 + }, + { + "epoch": 1.8, + "grad_norm": 9.735186941599368, + "learning_rate": 2.679615670920954e-07, + "loss": 0.6705, + "step": 12600 + }, + { + "epoch": 1.8, + "grad_norm": 7.324592373012664, + "learning_rate": 2.6758874095193397e-07, + "loss": 0.7166, + "step": 12601 + }, + { + "epoch": 1.8, + "grad_norm": 8.823214327598532, + "learning_rate": 2.672161672254159e-07, + "loss": 0.7492, + "step": 12602 + }, + { + "epoch": 1.8, + "grad_norm": 7.740121383118104, + "learning_rate": 2.6684384593241355e-07, + "loss": 0.7033, + "step": 12603 + }, + { + "epoch": 1.8, + "grad_norm": 
8.179620808599175, + "learning_rate": 2.66471777092786e-07, + "loss": 0.6404, + "step": 12604 + }, + { + "epoch": 1.8, + "grad_norm": 11.171400509061138, + "learning_rate": 2.66099960726377e-07, + "loss": 0.7649, + "step": 12605 + }, + { + "epoch": 1.8, + "grad_norm": 11.068987214735644, + "learning_rate": 2.6572839685301945e-07, + "loss": 0.6674, + "step": 12606 + }, + { + "epoch": 1.8, + "grad_norm": 7.909130631886235, + "learning_rate": 2.653570854925319e-07, + "loss": 0.7329, + "step": 12607 + }, + { + "epoch": 1.8, + "grad_norm": 9.416275069187401, + "learning_rate": 2.649860266647186e-07, + "loss": 0.7241, + "step": 12608 + }, + { + "epoch": 1.8, + "grad_norm": 12.996004572716243, + "learning_rate": 2.6461522038937096e-07, + "loss": 0.6742, + "step": 12609 + }, + { + "epoch": 1.8, + "grad_norm": 8.563804916921846, + "learning_rate": 2.642446666862653e-07, + "loss": 0.6211, + "step": 12610 + }, + { + "epoch": 1.8, + "grad_norm": 9.53748365020123, + "learning_rate": 2.6387436557516687e-07, + "loss": 0.7353, + "step": 12611 + }, + { + "epoch": 1.8, + "grad_norm": 10.380662430385884, + "learning_rate": 2.635043170758278e-07, + "loss": 0.7401, + "step": 12612 + }, + { + "epoch": 1.8, + "grad_norm": 8.494167251911389, + "learning_rate": 2.631345212079833e-07, + "loss": 0.647, + "step": 12613 + }, + { + "epoch": 1.8, + "grad_norm": 12.224954449120755, + "learning_rate": 2.627649779913594e-07, + "loss": 0.7132, + "step": 12614 + }, + { + "epoch": 1.8, + "grad_norm": 8.810844458729202, + "learning_rate": 2.623956874456646e-07, + "loss": 0.6548, + "step": 12615 + }, + { + "epoch": 1.8, + "grad_norm": 8.876742720234919, + "learning_rate": 2.620266495905965e-07, + "loss": 0.7534, + "step": 12616 + }, + { + "epoch": 1.8, + "grad_norm": 11.244368270626328, + "learning_rate": 2.616578644458395e-07, + "loss": 0.7221, + "step": 12617 + }, + { + "epoch": 1.8, + "grad_norm": 8.195622529251043, + "learning_rate": 2.6128933203106164e-07, + "loss": 0.7402, + "step": 12618 + }, + { 
+ "epoch": 1.8, + "grad_norm": 9.931275185370325, + "learning_rate": 2.609210523659217e-07, + "loss": 0.7339, + "step": 12619 + }, + { + "epoch": 1.8, + "grad_norm": 8.112797654983126, + "learning_rate": 2.605530254700611e-07, + "loss": 0.726, + "step": 12620 + }, + { + "epoch": 1.8, + "grad_norm": 10.25866970974871, + "learning_rate": 2.6018525136310924e-07, + "loss": 0.6929, + "step": 12621 + }, + { + "epoch": 1.8, + "grad_norm": 10.511409973315269, + "learning_rate": 2.598177300646837e-07, + "loss": 0.7502, + "step": 12622 + }, + { + "epoch": 1.8, + "grad_norm": 8.315638810024097, + "learning_rate": 2.594504615943849e-07, + "loss": 0.7593, + "step": 12623 + }, + { + "epoch": 1.8, + "grad_norm": 11.80576957939794, + "learning_rate": 2.590834459718039e-07, + "loss": 0.6955, + "step": 12624 + }, + { + "epoch": 1.8, + "grad_norm": 9.850094893369125, + "learning_rate": 2.5871668321651445e-07, + "loss": 0.6876, + "step": 12625 + }, + { + "epoch": 1.8, + "grad_norm": 8.467318762994449, + "learning_rate": 2.5835017334808086e-07, + "loss": 0.6213, + "step": 12626 + }, + { + "epoch": 1.8, + "grad_norm": 10.52159952879535, + "learning_rate": 2.5798391638605036e-07, + "loss": 0.6272, + "step": 12627 + }, + { + "epoch": 1.8, + "grad_norm": 10.444450980735933, + "learning_rate": 2.576179123499578e-07, + "loss": 0.7006, + "step": 12628 + }, + { + "epoch": 1.8, + "grad_norm": 12.719837758027616, + "learning_rate": 2.5725216125932486e-07, + "loss": 0.7, + "step": 12629 + }, + { + "epoch": 1.8, + "grad_norm": 6.275560990988641, + "learning_rate": 2.5688666313366083e-07, + "loss": 0.7601, + "step": 12630 + }, + { + "epoch": 1.8, + "grad_norm": 12.656678096718949, + "learning_rate": 2.565214179924591e-07, + "loss": 0.6744, + "step": 12631 + }, + { + "epoch": 1.8, + "grad_norm": 10.931136546284561, + "learning_rate": 2.561564258552024e-07, + "loss": 0.6819, + "step": 12632 + }, + { + "epoch": 1.8, + "grad_norm": 8.627811803266082, + "learning_rate": 2.557916867413568e-07, + "loss": 
0.7206, + "step": 12633 + }, + { + "epoch": 1.8, + "grad_norm": 9.081461190225903, + "learning_rate": 2.554272006703767e-07, + "loss": 0.6299, + "step": 12634 + }, + { + "epoch": 1.8, + "grad_norm": 9.366227410628323, + "learning_rate": 2.550629676617039e-07, + "loss": 0.6896, + "step": 12635 + }, + { + "epoch": 1.8, + "grad_norm": 10.594949660929801, + "learning_rate": 2.5469898773476387e-07, + "loss": 0.6687, + "step": 12636 + }, + { + "epoch": 1.8, + "grad_norm": 7.034509505582349, + "learning_rate": 2.5433526090897165e-07, + "loss": 0.7185, + "step": 12637 + }, + { + "epoch": 1.8, + "grad_norm": 11.026635705456814, + "learning_rate": 2.5397178720372674e-07, + "loss": 0.7408, + "step": 12638 + }, + { + "epoch": 1.8, + "grad_norm": 12.870264261140992, + "learning_rate": 2.536085666384164e-07, + "loss": 0.7478, + "step": 12639 + }, + { + "epoch": 1.8, + "grad_norm": 11.526021238889312, + "learning_rate": 2.53245599232414e-07, + "loss": 0.6645, + "step": 12640 + }, + { + "epoch": 1.8, + "grad_norm": 9.096667296450567, + "learning_rate": 2.528828850050774e-07, + "loss": 0.7478, + "step": 12641 + }, + { + "epoch": 1.8, + "grad_norm": 10.28464043681751, + "learning_rate": 2.5252042397575447e-07, + "loss": 0.6984, + "step": 12642 + }, + { + "epoch": 1.8, + "grad_norm": 11.912294512756358, + "learning_rate": 2.5215821616377757e-07, + "loss": 0.6714, + "step": 12643 + }, + { + "epoch": 1.8, + "grad_norm": 12.576454503224673, + "learning_rate": 2.5179626158846503e-07, + "loss": 0.6362, + "step": 12644 + }, + { + "epoch": 1.8, + "grad_norm": 7.957744780945362, + "learning_rate": 2.514345602691243e-07, + "loss": 0.7167, + "step": 12645 + }, + { + "epoch": 1.8, + "grad_norm": 8.027331613295956, + "learning_rate": 2.5107311222504596e-07, + "loss": 0.6974, + "step": 12646 + }, + { + "epoch": 1.8, + "grad_norm": 11.08720350233674, + "learning_rate": 2.507119174755096e-07, + "loss": 0.6265, + "step": 12647 + }, + { + "epoch": 1.8, + "grad_norm": 11.335585121347185, + 
"learning_rate": 2.5035097603977874e-07, + "loss": 0.686, + "step": 12648 + }, + { + "epoch": 1.8, + "grad_norm": 8.948553293771708, + "learning_rate": 2.4999028793710634e-07, + "loss": 0.7121, + "step": 12649 + }, + { + "epoch": 1.8, + "grad_norm": 9.774921780811589, + "learning_rate": 2.4962985318673084e-07, + "loss": 0.6516, + "step": 12650 + }, + { + "epoch": 1.8, + "grad_norm": 10.719881986796835, + "learning_rate": 2.492696718078752e-07, + "loss": 0.7316, + "step": 12651 + }, + { + "epoch": 1.8, + "grad_norm": 8.588584436180575, + "learning_rate": 2.489097438197524e-07, + "loss": 0.638, + "step": 12652 + }, + { + "epoch": 1.8, + "grad_norm": 8.445914181948796, + "learning_rate": 2.485500692415593e-07, + "loss": 0.6054, + "step": 12653 + }, + { + "epoch": 1.8, + "grad_norm": 9.487792556716895, + "learning_rate": 2.4819064809247897e-07, + "loss": 0.6992, + "step": 12654 + }, + { + "epoch": 1.8, + "grad_norm": 10.77021946244028, + "learning_rate": 2.478314803916837e-07, + "loss": 0.6751, + "step": 12655 + }, + { + "epoch": 1.8, + "grad_norm": 9.677241698409446, + "learning_rate": 2.4747256615832827e-07, + "loss": 0.7424, + "step": 12656 + }, + { + "epoch": 1.81, + "grad_norm": 11.492441186222656, + "learning_rate": 2.4711390541155846e-07, + "loss": 0.7038, + "step": 12657 + }, + { + "epoch": 1.81, + "grad_norm": 10.05050699336315, + "learning_rate": 2.467554981705023e-07, + "loss": 0.6675, + "step": 12658 + }, + { + "epoch": 1.81, + "grad_norm": 8.64748854871695, + "learning_rate": 2.463973444542783e-07, + "loss": 0.7582, + "step": 12659 + }, + { + "epoch": 1.81, + "grad_norm": 10.28563439901565, + "learning_rate": 2.460394442819874e-07, + "loss": 0.7049, + "step": 12660 + }, + { + "epoch": 1.81, + "grad_norm": 9.919340218589923, + "learning_rate": 2.4568179767271994e-07, + "loss": 0.7044, + "step": 12661 + }, + { + "epoch": 1.81, + "grad_norm": 9.097454886684853, + "learning_rate": 2.453244046455516e-07, + "loss": 0.6928, + "step": 12662 + }, + { + "epoch": 
1.81, + "grad_norm": 10.22422118274368, + "learning_rate": 2.449672652195445e-07, + "loss": 0.6525, + "step": 12663 + }, + { + "epoch": 1.81, + "grad_norm": 8.971286399206853, + "learning_rate": 2.4461037941374835e-07, + "loss": 0.7385, + "step": 12664 + }, + { + "epoch": 1.81, + "grad_norm": 9.441176345877643, + "learning_rate": 2.4425374724719785e-07, + "loss": 0.7092, + "step": 12665 + }, + { + "epoch": 1.81, + "grad_norm": 9.069150002264506, + "learning_rate": 2.438973687389146e-07, + "loss": 0.6471, + "step": 12666 + }, + { + "epoch": 1.81, + "grad_norm": 9.925617948584561, + "learning_rate": 2.435412439079071e-07, + "loss": 0.6742, + "step": 12667 + }, + { + "epoch": 1.81, + "grad_norm": 10.97811138954049, + "learning_rate": 2.4318537277316923e-07, + "loss": 0.778, + "step": 12668 + }, + { + "epoch": 1.81, + "grad_norm": 8.061559264793994, + "learning_rate": 2.428297553536829e-07, + "loss": 0.7119, + "step": 12669 + }, + { + "epoch": 1.81, + "grad_norm": 8.608700766490767, + "learning_rate": 2.424743916684164e-07, + "loss": 0.7244, + "step": 12670 + }, + { + "epoch": 1.81, + "grad_norm": 7.517560660152907, + "learning_rate": 2.4211928173632324e-07, + "loss": 0.7683, + "step": 12671 + }, + { + "epoch": 1.81, + "grad_norm": 9.462015727613695, + "learning_rate": 2.417644255763435e-07, + "loss": 0.7591, + "step": 12672 + }, + { + "epoch": 1.81, + "grad_norm": 11.758534184049346, + "learning_rate": 2.4140982320740526e-07, + "loss": 0.6602, + "step": 12673 + }, + { + "epoch": 1.81, + "grad_norm": 10.304577846521878, + "learning_rate": 2.410554746484206e-07, + "loss": 0.6819, + "step": 12674 + }, + { + "epoch": 1.81, + "grad_norm": 12.467605031816728, + "learning_rate": 2.4070137991829113e-07, + "loss": 0.6979, + "step": 12675 + }, + { + "epoch": 1.81, + "grad_norm": 10.453420696387107, + "learning_rate": 2.403475390359017e-07, + "loss": 0.7301, + "step": 12676 + }, + { + "epoch": 1.81, + "grad_norm": 8.77275466887014, + "learning_rate": 2.399939520201267e-07, + 
"loss": 0.7128, + "step": 12677 + }, + { + "epoch": 1.81, + "grad_norm": 8.270445295915508, + "learning_rate": 2.396406188898248e-07, + "loss": 0.7328, + "step": 12678 + }, + { + "epoch": 1.81, + "grad_norm": 7.647324606236558, + "learning_rate": 2.392875396638411e-07, + "loss": 0.7032, + "step": 12679 + }, + { + "epoch": 1.81, + "grad_norm": 9.197311729456905, + "learning_rate": 2.389347143610088e-07, + "loss": 0.6312, + "step": 12680 + }, + { + "epoch": 1.81, + "grad_norm": 7.284396228714986, + "learning_rate": 2.385821430001456e-07, + "loss": 0.6364, + "step": 12681 + }, + { + "epoch": 1.81, + "grad_norm": 10.181285538257692, + "learning_rate": 2.3822982560005813e-07, + "loss": 0.7269, + "step": 12682 + }, + { + "epoch": 1.81, + "grad_norm": 11.239700521834903, + "learning_rate": 2.3787776217953752e-07, + "loss": 0.7103, + "step": 12683 + }, + { + "epoch": 1.81, + "grad_norm": 7.960222825448982, + "learning_rate": 2.3752595275736145e-07, + "loss": 0.6785, + "step": 12684 + }, + { + "epoch": 1.81, + "grad_norm": 10.831012908416698, + "learning_rate": 2.371743973522944e-07, + "loss": 0.6773, + "step": 12685 + }, + { + "epoch": 1.81, + "grad_norm": 10.930454201301709, + "learning_rate": 2.368230959830875e-07, + "loss": 0.6479, + "step": 12686 + }, + { + "epoch": 1.81, + "grad_norm": 7.255895883515182, + "learning_rate": 2.3647204866847794e-07, + "loss": 0.6417, + "step": 12687 + }, + { + "epoch": 1.81, + "grad_norm": 8.523287432157641, + "learning_rate": 2.3612125542719077e-07, + "loss": 0.7073, + "step": 12688 + }, + { + "epoch": 1.81, + "grad_norm": 7.979392021532022, + "learning_rate": 2.357707162779349e-07, + "loss": 0.748, + "step": 12689 + }, + { + "epoch": 1.81, + "grad_norm": 11.184278958089333, + "learning_rate": 2.3542043123940817e-07, + "loss": 0.7011, + "step": 12690 + }, + { + "epoch": 1.81, + "grad_norm": 9.018077561803105, + "learning_rate": 2.3507040033029337e-07, + "loss": 0.7642, + "step": 12691 + }, + { + "epoch": 1.81, + "grad_norm": 
8.510251893733761, + "learning_rate": 2.3472062356925951e-07, + "loss": 0.6931, + "step": 12692 + }, + { + "epoch": 1.81, + "grad_norm": 12.031728681552993, + "learning_rate": 2.3437110097496385e-07, + "loss": 0.6787, + "step": 12693 + }, + { + "epoch": 1.81, + "grad_norm": 10.440137238619661, + "learning_rate": 2.3402183256604817e-07, + "loss": 0.7006, + "step": 12694 + }, + { + "epoch": 1.81, + "grad_norm": 10.997842218573876, + "learning_rate": 2.3367281836114198e-07, + "loss": 0.6999, + "step": 12695 + }, + { + "epoch": 1.81, + "grad_norm": 9.636958390327553, + "learning_rate": 2.3332405837885984e-07, + "loss": 0.7622, + "step": 12696 + }, + { + "epoch": 1.81, + "grad_norm": 12.209003319560255, + "learning_rate": 2.3297555263780525e-07, + "loss": 0.7374, + "step": 12697 + }, + { + "epoch": 1.81, + "grad_norm": 9.33077690383122, + "learning_rate": 2.326273011565655e-07, + "loss": 0.6308, + "step": 12698 + }, + { + "epoch": 1.81, + "grad_norm": 11.035857934733658, + "learning_rate": 2.3227930395371468e-07, + "loss": 0.7662, + "step": 12699 + }, + { + "epoch": 1.81, + "grad_norm": 10.088762640024814, + "learning_rate": 2.3193156104781566e-07, + "loss": 0.6895, + "step": 12700 + }, + { + "epoch": 1.81, + "grad_norm": 10.278375761789773, + "learning_rate": 2.3158407245741422e-07, + "loss": 0.6757, + "step": 12701 + }, + { + "epoch": 1.81, + "grad_norm": 7.7496733458129015, + "learning_rate": 2.3123683820104548e-07, + "loss": 0.7185, + "step": 12702 + }, + { + "epoch": 1.81, + "grad_norm": 9.247221880547313, + "learning_rate": 2.308898582972313e-07, + "loss": 0.7484, + "step": 12703 + }, + { + "epoch": 1.81, + "grad_norm": 8.68495306958467, + "learning_rate": 2.3054313276447527e-07, + "loss": 0.7325, + "step": 12704 + }, + { + "epoch": 1.81, + "grad_norm": 8.507106066066225, + "learning_rate": 2.3019666162127418e-07, + "loss": 0.6719, + "step": 12705 + }, + { + "epoch": 1.81, + "grad_norm": 6.311937120710126, + "learning_rate": 2.2985044488610497e-07, + "loss": 
0.7776, + "step": 12706 + }, + { + "epoch": 1.81, + "grad_norm": 12.143628543647191, + "learning_rate": 2.2950448257743564e-07, + "loss": 0.7194, + "step": 12707 + }, + { + "epoch": 1.81, + "grad_norm": 7.938628666032442, + "learning_rate": 2.291587747137186e-07, + "loss": 0.7027, + "step": 12708 + }, + { + "epoch": 1.81, + "grad_norm": 10.156251462399855, + "learning_rate": 2.288133213133936e-07, + "loss": 0.616, + "step": 12709 + }, + { + "epoch": 1.81, + "grad_norm": 9.777058441864774, + "learning_rate": 2.284681223948848e-07, + "loss": 0.6345, + "step": 12710 + }, + { + "epoch": 1.81, + "grad_norm": 8.110279768843343, + "learning_rate": 2.281231779766041e-07, + "loss": 0.7197, + "step": 12711 + }, + { + "epoch": 1.81, + "grad_norm": 10.024783301788185, + "learning_rate": 2.2777848807695068e-07, + "loss": 0.7399, + "step": 12712 + }, + { + "epoch": 1.81, + "grad_norm": 8.526098262045588, + "learning_rate": 2.274340527143093e-07, + "loss": 0.7083, + "step": 12713 + }, + { + "epoch": 1.81, + "grad_norm": 13.53240395153921, + "learning_rate": 2.2708987190705078e-07, + "loss": 0.7035, + "step": 12714 + }, + { + "epoch": 1.81, + "grad_norm": 8.195497572072442, + "learning_rate": 2.267459456735338e-07, + "loss": 0.7137, + "step": 12715 + }, + { + "epoch": 1.81, + "grad_norm": 12.82646366743109, + "learning_rate": 2.2640227403210147e-07, + "loss": 0.7496, + "step": 12716 + }, + { + "epoch": 1.81, + "grad_norm": 12.05060194612339, + "learning_rate": 2.2605885700108355e-07, + "loss": 0.6201, + "step": 12717 + }, + { + "epoch": 1.81, + "grad_norm": 9.945893483969686, + "learning_rate": 2.2571569459879873e-07, + "loss": 0.6966, + "step": 12718 + }, + { + "epoch": 1.81, + "grad_norm": 11.52763098280026, + "learning_rate": 2.2537278684354847e-07, + "loss": 0.7734, + "step": 12719 + }, + { + "epoch": 1.81, + "grad_norm": 10.658343988587932, + "learning_rate": 2.250301337536237e-07, + "loss": 0.7207, + "step": 12720 + }, + { + "epoch": 1.81, + "grad_norm": 11.968461410719257, 
+ "learning_rate": 2.2468773534730148e-07, + "loss": 0.6907, + "step": 12721 + }, + { + "epoch": 1.81, + "grad_norm": 12.891705832238689, + "learning_rate": 2.2434559164284275e-07, + "loss": 0.6812, + "step": 12722 + }, + { + "epoch": 1.81, + "grad_norm": 11.706087534985908, + "learning_rate": 2.2400370265849737e-07, + "loss": 0.7246, + "step": 12723 + }, + { + "epoch": 1.81, + "grad_norm": 10.509460004242358, + "learning_rate": 2.236620684124996e-07, + "loss": 0.7418, + "step": 12724 + }, + { + "epoch": 1.81, + "grad_norm": 12.449281798486195, + "learning_rate": 2.2332068892307213e-07, + "loss": 0.6628, + "step": 12725 + }, + { + "epoch": 1.81, + "grad_norm": 7.751434502947743, + "learning_rate": 2.2297956420842372e-07, + "loss": 0.7037, + "step": 12726 + }, + { + "epoch": 1.82, + "grad_norm": 11.462147275627819, + "learning_rate": 2.2263869428674756e-07, + "loss": 0.6346, + "step": 12727 + }, + { + "epoch": 1.82, + "grad_norm": 10.922606857981659, + "learning_rate": 2.2229807917622636e-07, + "loss": 0.6823, + "step": 12728 + }, + { + "epoch": 1.82, + "grad_norm": 12.085491290530156, + "learning_rate": 2.219577188950267e-07, + "loss": 0.71, + "step": 12729 + }, + { + "epoch": 1.82, + "grad_norm": 9.663204124902785, + "learning_rate": 2.216176134613024e-07, + "loss": 0.7173, + "step": 12730 + }, + { + "epoch": 1.82, + "grad_norm": 12.542192100043122, + "learning_rate": 2.212777628931939e-07, + "loss": 0.6952, + "step": 12731 + }, + { + "epoch": 1.82, + "grad_norm": 8.160205193155855, + "learning_rate": 2.2093816720882733e-07, + "loss": 0.7414, + "step": 12732 + }, + { + "epoch": 1.82, + "grad_norm": 11.256471594817912, + "learning_rate": 2.205988264263176e-07, + "loss": 0.6728, + "step": 12733 + }, + { + "epoch": 1.82, + "grad_norm": 10.582342614857936, + "learning_rate": 2.202597405637613e-07, + "loss": 0.7142, + "step": 12734 + }, + { + "epoch": 1.82, + "grad_norm": 10.182595678677904, + "learning_rate": 2.199209096392474e-07, + "loss": 0.6861, + "step": 12735 + 
}, + { + "epoch": 1.82, + "grad_norm": 8.877655282403163, + "learning_rate": 2.1958233367084637e-07, + "loss": 0.7995, + "step": 12736 + }, + { + "epoch": 1.82, + "grad_norm": 11.93159573821283, + "learning_rate": 2.1924401267661655e-07, + "loss": 0.7452, + "step": 12737 + }, + { + "epoch": 1.82, + "grad_norm": 8.239885262868174, + "learning_rate": 2.1890594667460462e-07, + "loss": 0.7152, + "step": 12738 + }, + { + "epoch": 1.82, + "grad_norm": 7.820317889514432, + "learning_rate": 2.1856813568284063e-07, + "loss": 0.6549, + "step": 12739 + }, + { + "epoch": 1.82, + "grad_norm": 9.399315902333647, + "learning_rate": 2.1823057971934291e-07, + "loss": 0.637, + "step": 12740 + }, + { + "epoch": 1.82, + "grad_norm": 9.18315071088702, + "learning_rate": 2.1789327880211652e-07, + "loss": 0.7566, + "step": 12741 + }, + { + "epoch": 1.82, + "grad_norm": 11.66439183702759, + "learning_rate": 2.1755623294915152e-07, + "loss": 0.6722, + "step": 12742 + }, + { + "epoch": 1.82, + "grad_norm": 9.289437897343456, + "learning_rate": 2.1721944217842516e-07, + "loss": 0.6929, + "step": 12743 + }, + { + "epoch": 1.82, + "grad_norm": 11.070892341750028, + "learning_rate": 2.1688290650790033e-07, + "loss": 0.6846, + "step": 12744 + }, + { + "epoch": 1.82, + "grad_norm": 13.125129222260354, + "learning_rate": 2.1654662595552712e-07, + "loss": 0.6006, + "step": 12745 + }, + { + "epoch": 1.82, + "grad_norm": 9.328087264267994, + "learning_rate": 2.162106005392428e-07, + "loss": 0.7485, + "step": 12746 + }, + { + "epoch": 1.82, + "grad_norm": 11.274226797214395, + "learning_rate": 2.158748302769692e-07, + "loss": 0.7101, + "step": 12747 + }, + { + "epoch": 1.82, + "grad_norm": 6.765608397126474, + "learning_rate": 2.155393151866153e-07, + "loss": 0.709, + "step": 12748 + }, + { + "epoch": 1.82, + "grad_norm": 8.295226713515849, + "learning_rate": 2.1520405528607623e-07, + "loss": 0.6919, + "step": 12749 + }, + { + "epoch": 1.82, + "grad_norm": 14.141927422644244, + "learning_rate": 
2.1486905059323439e-07, + "loss": 0.6771, + "step": 12750 + }, + { + "epoch": 1.82, + "grad_norm": 13.704761129634814, + "learning_rate": 2.1453430112595874e-07, + "loss": 0.7281, + "step": 12751 + }, + { + "epoch": 1.82, + "grad_norm": 9.747529805753864, + "learning_rate": 2.1419980690210175e-07, + "loss": 0.7263, + "step": 12752 + }, + { + "epoch": 1.82, + "grad_norm": 7.342135317102987, + "learning_rate": 2.1386556793950685e-07, + "loss": 0.7056, + "step": 12753 + }, + { + "epoch": 1.82, + "grad_norm": 6.182259499329982, + "learning_rate": 2.1353158425599984e-07, + "loss": 0.6706, + "step": 12754 + }, + { + "epoch": 1.82, + "grad_norm": 10.748744266047083, + "learning_rate": 2.1319785586939478e-07, + "loss": 0.7653, + "step": 12755 + }, + { + "epoch": 1.82, + "grad_norm": 9.532807415819898, + "learning_rate": 2.1286438279749245e-07, + "loss": 0.6953, + "step": 12756 + }, + { + "epoch": 1.82, + "grad_norm": 9.251020636087851, + "learning_rate": 2.1253116505807803e-07, + "loss": 0.7347, + "step": 12757 + }, + { + "epoch": 1.82, + "grad_norm": 9.430044505619783, + "learning_rate": 2.121982026689251e-07, + "loss": 0.708, + "step": 12758 + }, + { + "epoch": 1.82, + "grad_norm": 9.648692908416447, + "learning_rate": 2.118654956477939e-07, + "loss": 0.7009, + "step": 12759 + }, + { + "epoch": 1.82, + "grad_norm": 6.7531990895523375, + "learning_rate": 2.1153304401242914e-07, + "loss": 0.7129, + "step": 12760 + }, + { + "epoch": 1.82, + "grad_norm": 10.56520205443562, + "learning_rate": 2.1120084778056327e-07, + "loss": 0.7151, + "step": 12761 + }, + { + "epoch": 1.82, + "grad_norm": 8.537723922300309, + "learning_rate": 2.108689069699138e-07, + "loss": 0.6767, + "step": 12762 + }, + { + "epoch": 1.82, + "grad_norm": 6.692331257171292, + "learning_rate": 2.10537221598186e-07, + "loss": 0.6174, + "step": 12763 + }, + { + "epoch": 1.82, + "grad_norm": 10.355300994634264, + "learning_rate": 2.1020579168307241e-07, + "loss": 0.7047, + "step": 12764 + }, + { + "epoch": 1.82, 
+ "grad_norm": 8.337442480628779, + "learning_rate": 2.0987461724224833e-07, + "loss": 0.7174, + "step": 12765 + }, + { + "epoch": 1.82, + "grad_norm": 6.979117145899381, + "learning_rate": 2.0954369829337905e-07, + "loss": 0.6956, + "step": 12766 + }, + { + "epoch": 1.82, + "grad_norm": 8.770593086315689, + "learning_rate": 2.092130348541155e-07, + "loss": 0.6801, + "step": 12767 + }, + { + "epoch": 1.82, + "grad_norm": 7.108248889822035, + "learning_rate": 2.0888262694209183e-07, + "loss": 0.6895, + "step": 12768 + }, + { + "epoch": 1.82, + "grad_norm": 11.773797643929491, + "learning_rate": 2.08552474574934e-07, + "loss": 0.6607, + "step": 12769 + }, + { + "epoch": 1.82, + "grad_norm": 10.519256730885386, + "learning_rate": 2.08222577770249e-07, + "loss": 0.6185, + "step": 12770 + }, + { + "epoch": 1.82, + "grad_norm": 9.17669639368188, + "learning_rate": 2.0789293654563448e-07, + "loss": 0.6387, + "step": 12771 + }, + { + "epoch": 1.82, + "grad_norm": 7.7288405789787396, + "learning_rate": 2.0756355091867132e-07, + "loss": 0.7065, + "step": 12772 + }, + { + "epoch": 1.82, + "grad_norm": 9.6022958358381, + "learning_rate": 2.0723442090692935e-07, + "loss": 0.6866, + "step": 12773 + }, + { + "epoch": 1.82, + "grad_norm": 9.514322528494843, + "learning_rate": 2.0690554652796234e-07, + "loss": 0.7134, + "step": 12774 + }, + { + "epoch": 1.82, + "grad_norm": 10.936939332824593, + "learning_rate": 2.0657692779931072e-07, + "loss": 0.7105, + "step": 12775 + }, + { + "epoch": 1.82, + "grad_norm": 10.018283085432747, + "learning_rate": 2.062485647385043e-07, + "loss": 0.7361, + "step": 12776 + }, + { + "epoch": 1.82, + "grad_norm": 7.939513168747217, + "learning_rate": 2.059204573630552e-07, + "loss": 0.7017, + "step": 12777 + }, + { + "epoch": 1.82, + "grad_norm": 7.664999164767732, + "learning_rate": 2.0559260569046502e-07, + "loss": 0.7555, + "step": 12778 + }, + { + "epoch": 1.82, + "grad_norm": 8.43216892672716, + "learning_rate": 2.0526500973821972e-07, + "loss": 
0.6591, + "step": 12779 + }, + { + "epoch": 1.82, + "grad_norm": 8.21711875842935, + "learning_rate": 2.0493766952379313e-07, + "loss": 0.6648, + "step": 12780 + }, + { + "epoch": 1.82, + "grad_norm": 8.08166524355401, + "learning_rate": 2.0461058506464403e-07, + "loss": 0.7245, + "step": 12781 + }, + { + "epoch": 1.82, + "grad_norm": 10.09521830785316, + "learning_rate": 2.042837563782174e-07, + "loss": 0.672, + "step": 12782 + }, + { + "epoch": 1.82, + "grad_norm": 9.332255744732523, + "learning_rate": 2.0395718348194648e-07, + "loss": 0.7023, + "step": 12783 + }, + { + "epoch": 1.82, + "grad_norm": 15.313974358395269, + "learning_rate": 2.0363086639324957e-07, + "loss": 0.6972, + "step": 12784 + }, + { + "epoch": 1.82, + "grad_norm": 7.666164841164002, + "learning_rate": 2.0330480512953165e-07, + "loss": 0.7155, + "step": 12785 + }, + { + "epoch": 1.82, + "grad_norm": 9.27909261058864, + "learning_rate": 2.0297899970818435e-07, + "loss": 0.6538, + "step": 12786 + }, + { + "epoch": 1.82, + "grad_norm": 7.574685978505186, + "learning_rate": 2.0265345014658376e-07, + "loss": 0.6612, + "step": 12787 + }, + { + "epoch": 1.82, + "grad_norm": 9.354933644139772, + "learning_rate": 2.0232815646209437e-07, + "loss": 0.6383, + "step": 12788 + }, + { + "epoch": 1.82, + "grad_norm": 9.589047790847331, + "learning_rate": 2.020031186720678e-07, + "loss": 0.6784, + "step": 12789 + }, + { + "epoch": 1.82, + "grad_norm": 10.681927726683321, + "learning_rate": 2.0167833679383853e-07, + "loss": 0.6456, + "step": 12790 + }, + { + "epoch": 1.82, + "grad_norm": 9.695669469110113, + "learning_rate": 2.0135381084473105e-07, + "loss": 0.7123, + "step": 12791 + }, + { + "epoch": 1.82, + "grad_norm": 9.603816580846173, + "learning_rate": 2.010295408420543e-07, + "loss": 0.7045, + "step": 12792 + }, + { + "epoch": 1.82, + "grad_norm": 8.613112873639585, + "learning_rate": 2.007055268031033e-07, + "loss": 0.732, + "step": 12793 + }, + { + "epoch": 1.82, + "grad_norm": 11.390519216339227, + 
"learning_rate": 2.0038176874516147e-07, + "loss": 0.6358, + "step": 12794 + }, + { + "epoch": 1.82, + "grad_norm": 8.702147622151765, + "learning_rate": 2.00058266685495e-07, + "loss": 0.5954, + "step": 12795 + }, + { + "epoch": 1.82, + "grad_norm": 9.990156514384328, + "learning_rate": 1.997350206413612e-07, + "loss": 0.7579, + "step": 12796 + }, + { + "epoch": 1.83, + "grad_norm": 7.8804947692597285, + "learning_rate": 1.9941203062999848e-07, + "loss": 0.6979, + "step": 12797 + }, + { + "epoch": 1.83, + "grad_norm": 12.108176441856937, + "learning_rate": 1.990892966686364e-07, + "loss": 0.7343, + "step": 12798 + }, + { + "epoch": 1.83, + "grad_norm": 8.745342133035514, + "learning_rate": 1.9876681877448788e-07, + "loss": 0.7377, + "step": 12799 + }, + { + "epoch": 1.83, + "grad_norm": 10.26066725483905, + "learning_rate": 1.984445969647525e-07, + "loss": 0.6493, + "step": 12800 + }, + { + "epoch": 1.83, + "grad_norm": 9.348518906258972, + "learning_rate": 1.9812263125661646e-07, + "loss": 0.6826, + "step": 12801 + }, + { + "epoch": 1.83, + "grad_norm": 10.684641859609133, + "learning_rate": 1.978009216672544e-07, + "loss": 0.7123, + "step": 12802 + }, + { + "epoch": 1.83, + "grad_norm": 10.81475836738651, + "learning_rate": 1.9747946821382314e-07, + "loss": 0.654, + "step": 12803 + }, + { + "epoch": 1.83, + "grad_norm": 12.002914144998089, + "learning_rate": 1.9715827091347005e-07, + "loss": 0.6881, + "step": 12804 + }, + { + "epoch": 1.83, + "grad_norm": 8.621039673246356, + "learning_rate": 1.9683732978332594e-07, + "loss": 0.663, + "step": 12805 + }, + { + "epoch": 1.83, + "grad_norm": 9.731997171281822, + "learning_rate": 1.9651664484050814e-07, + "loss": 0.6887, + "step": 12806 + }, + { + "epoch": 1.83, + "grad_norm": 6.205997169651654, + "learning_rate": 1.9619621610212248e-07, + "loss": 0.6591, + "step": 12807 + }, + { + "epoch": 1.83, + "grad_norm": 9.420290483426303, + "learning_rate": 1.9587604358525913e-07, + "loss": 0.7146, + "step": 12808 + }, + { + 
"epoch": 1.83, + "grad_norm": 9.659485486389507, + "learning_rate": 1.9555612730699501e-07, + "loss": 0.7178, + "step": 12809 + }, + { + "epoch": 1.83, + "grad_norm": 7.485345265797357, + "learning_rate": 1.952364672843937e-07, + "loss": 0.7226, + "step": 12810 + }, + { + "epoch": 1.83, + "grad_norm": 13.624859064703704, + "learning_rate": 1.9491706353450602e-07, + "loss": 0.7471, + "step": 12811 + }, + { + "epoch": 1.83, + "grad_norm": 9.09862183259106, + "learning_rate": 1.9459791607436663e-07, + "loss": 0.6476, + "step": 12812 + }, + { + "epoch": 1.83, + "grad_norm": 8.240540430895459, + "learning_rate": 1.9427902492099803e-07, + "loss": 0.7057, + "step": 12813 + }, + { + "epoch": 1.83, + "grad_norm": 12.890472159231416, + "learning_rate": 1.9396039009140998e-07, + "loss": 0.6828, + "step": 12814 + }, + { + "epoch": 1.83, + "grad_norm": 10.949349368881915, + "learning_rate": 1.936420116025961e-07, + "loss": 0.6758, + "step": 12815 + }, + { + "epoch": 1.83, + "grad_norm": 13.4156720041827, + "learning_rate": 1.9332388947153947e-07, + "loss": 0.6324, + "step": 12816 + }, + { + "epoch": 1.83, + "grad_norm": 9.26888361958254, + "learning_rate": 1.9300602371520704e-07, + "loss": 0.7732, + "step": 12817 + }, + { + "epoch": 1.83, + "grad_norm": 12.504025323248177, + "learning_rate": 1.9268841435055309e-07, + "loss": 0.7255, + "step": 12818 + }, + { + "epoch": 1.83, + "grad_norm": 8.206180815967025, + "learning_rate": 1.9237106139451788e-07, + "loss": 0.6958, + "step": 12819 + }, + { + "epoch": 1.83, + "grad_norm": 11.368548586312569, + "learning_rate": 1.9205396486402793e-07, + "loss": 0.7505, + "step": 12820 + }, + { + "epoch": 1.83, + "grad_norm": 11.65710596244072, + "learning_rate": 1.9173712477599583e-07, + "loss": 0.6575, + "step": 12821 + }, + { + "epoch": 1.83, + "grad_norm": 10.603256999718662, + "learning_rate": 1.91420541147323e-07, + "loss": 0.6551, + "step": 12822 + }, + { + "epoch": 1.83, + "grad_norm": 8.058087976864392, + "learning_rate": 
1.9110421399489265e-07, + "loss": 0.69, + "step": 12823 + }, + { + "epoch": 1.83, + "grad_norm": 11.838082248468037, + "learning_rate": 1.9078814333557848e-07, + "loss": 0.6601, + "step": 12824 + }, + { + "epoch": 1.83, + "grad_norm": 8.322143458005137, + "learning_rate": 1.9047232918623816e-07, + "loss": 0.688, + "step": 12825 + }, + { + "epoch": 1.83, + "grad_norm": 9.81147727776761, + "learning_rate": 1.9015677156371593e-07, + "loss": 0.7051, + "step": 12826 + }, + { + "epoch": 1.83, + "grad_norm": 7.143051880996038, + "learning_rate": 1.8984147048484335e-07, + "loss": 0.7316, + "step": 12827 + }, + { + "epoch": 1.83, + "grad_norm": 10.901577691591475, + "learning_rate": 1.8952642596643755e-07, + "loss": 0.6796, + "step": 12828 + }, + { + "epoch": 1.83, + "grad_norm": 8.882674771378621, + "learning_rate": 1.892116380253023e-07, + "loss": 0.6738, + "step": 12829 + }, + { + "epoch": 1.83, + "grad_norm": 10.058836931227573, + "learning_rate": 1.8889710667822803e-07, + "loss": 0.7059, + "step": 12830 + }, + { + "epoch": 1.83, + "grad_norm": 8.831828835157467, + "learning_rate": 1.885828319419891e-07, + "loss": 0.6958, + "step": 12831 + }, + { + "epoch": 1.83, + "grad_norm": 11.41049952901335, + "learning_rate": 1.8826881383334994e-07, + "loss": 0.68, + "step": 12832 + }, + { + "epoch": 1.83, + "grad_norm": 11.636964735008773, + "learning_rate": 1.879550523690582e-07, + "loss": 0.6809, + "step": 12833 + }, + { + "epoch": 1.83, + "grad_norm": 12.690962500182373, + "learning_rate": 1.8764154756584996e-07, + "loss": 0.7328, + "step": 12834 + }, + { + "epoch": 1.83, + "grad_norm": 10.36504073890518, + "learning_rate": 1.8732829944044627e-07, + "loss": 0.7073, + "step": 12835 + }, + { + "epoch": 1.83, + "grad_norm": 8.773311063439474, + "learning_rate": 1.8701530800955492e-07, + "loss": 0.6618, + "step": 12836 + }, + { + "epoch": 1.83, + "grad_norm": 12.965341453956826, + "learning_rate": 1.867025732898703e-07, + "loss": 0.7336, + "step": 12837 + }, + { + "epoch": 1.83, + 
"grad_norm": 9.46009456904989, + "learning_rate": 1.8639009529807184e-07, + "loss": 0.7091, + "step": 12838 + }, + { + "epoch": 1.83, + "grad_norm": 9.554910416005292, + "learning_rate": 1.8607787405082732e-07, + "loss": 0.7056, + "step": 12839 + }, + { + "epoch": 1.83, + "grad_norm": 9.48408500182837, + "learning_rate": 1.85765909564789e-07, + "loss": 0.6836, + "step": 12840 + }, + { + "epoch": 1.83, + "grad_norm": 7.818173494715446, + "learning_rate": 1.8545420185659635e-07, + "loss": 0.7166, + "step": 12841 + }, + { + "epoch": 1.83, + "grad_norm": 7.922978914870495, + "learning_rate": 1.8514275094287605e-07, + "loss": 0.6615, + "step": 12842 + }, + { + "epoch": 1.83, + "grad_norm": 10.546959727038585, + "learning_rate": 1.8483155684023924e-07, + "loss": 0.6755, + "step": 12843 + }, + { + "epoch": 1.83, + "grad_norm": 7.623189336199422, + "learning_rate": 1.8452061956528322e-07, + "loss": 0.6295, + "step": 12844 + }, + { + "epoch": 1.83, + "grad_norm": 6.84758085045728, + "learning_rate": 1.842099391345936e-07, + "loss": 0.6651, + "step": 12845 + }, + { + "epoch": 1.83, + "grad_norm": 8.284798306498105, + "learning_rate": 1.8389951556474107e-07, + "loss": 0.7002, + "step": 12846 + }, + { + "epoch": 1.83, + "grad_norm": 9.956129242166305, + "learning_rate": 1.8358934887228285e-07, + "loss": 0.6846, + "step": 12847 + }, + { + "epoch": 1.83, + "grad_norm": 7.662100145838896, + "learning_rate": 1.832794390737619e-07, + "loss": 0.6718, + "step": 12848 + }, + { + "epoch": 1.83, + "grad_norm": 7.3326152964948905, + "learning_rate": 1.8296978618570882e-07, + "loss": 0.6826, + "step": 12849 + }, + { + "epoch": 1.83, + "grad_norm": 8.978190833364197, + "learning_rate": 1.8266039022463878e-07, + "loss": 0.7338, + "step": 12850 + }, + { + "epoch": 1.83, + "grad_norm": 10.512874368338922, + "learning_rate": 1.823512512070541e-07, + "loss": 0.7691, + "step": 12851 + }, + { + "epoch": 1.83, + "grad_norm": 11.619598348916973, + "learning_rate": 1.8204236914944385e-07, + "loss": 
0.717, + "step": 12852 + }, + { + "epoch": 1.83, + "grad_norm": 6.366533662714983, + "learning_rate": 1.8173374406828258e-07, + "loss": 0.6783, + "step": 12853 + }, + { + "epoch": 1.83, + "grad_norm": 6.7236496462677025, + "learning_rate": 1.8142537598003108e-07, + "loss": 0.7463, + "step": 12854 + }, + { + "epoch": 1.83, + "grad_norm": 9.704419415386957, + "learning_rate": 1.811172649011389e-07, + "loss": 0.6645, + "step": 12855 + }, + { + "epoch": 1.83, + "grad_norm": 7.995465264685436, + "learning_rate": 1.808094108480374e-07, + "loss": 0.7397, + "step": 12856 + }, + { + "epoch": 1.83, + "grad_norm": 9.218538016484917, + "learning_rate": 1.8050181383714838e-07, + "loss": 0.7445, + "step": 12857 + }, + { + "epoch": 1.83, + "grad_norm": 10.82233683054405, + "learning_rate": 1.8019447388487655e-07, + "loss": 0.7307, + "step": 12858 + }, + { + "epoch": 1.83, + "grad_norm": 9.837443354766531, + "learning_rate": 1.7988739100761543e-07, + "loss": 0.6541, + "step": 12859 + }, + { + "epoch": 1.83, + "grad_norm": 8.806697503781274, + "learning_rate": 1.795805652217447e-07, + "loss": 0.6704, + "step": 12860 + }, + { + "epoch": 1.83, + "grad_norm": 11.263916559832435, + "learning_rate": 1.7927399654362798e-07, + "loss": 0.5918, + "step": 12861 + }, + { + "epoch": 1.83, + "grad_norm": 11.019441924967891, + "learning_rate": 1.789676849896188e-07, + "loss": 0.6632, + "step": 12862 + }, + { + "epoch": 1.83, + "grad_norm": 9.01916080334578, + "learning_rate": 1.7866163057605358e-07, + "loss": 0.7723, + "step": 12863 + }, + { + "epoch": 1.83, + "grad_norm": 7.888705550320188, + "learning_rate": 1.7835583331925587e-07, + "loss": 0.7089, + "step": 12864 + }, + { + "epoch": 1.83, + "grad_norm": 10.444578581612362, + "learning_rate": 1.780502932355377e-07, + "loss": 0.6628, + "step": 12865 + }, + { + "epoch": 1.83, + "grad_norm": 7.353091285525266, + "learning_rate": 1.7774501034119429e-07, + "loss": 0.7504, + "step": 12866 + }, + { + "epoch": 1.83, + "grad_norm": 8.559199459027827, 
+ "learning_rate": 1.774399846525099e-07, + "loss": 0.7213, + "step": 12867 + }, + { + "epoch": 1.84, + "grad_norm": 6.883565283814722, + "learning_rate": 1.7713521618575257e-07, + "loss": 0.6508, + "step": 12868 + }, + { + "epoch": 1.84, + "grad_norm": 10.890274655483612, + "learning_rate": 1.768307049571788e-07, + "loss": 0.6772, + "step": 12869 + }, + { + "epoch": 1.84, + "grad_norm": 8.680887212904134, + "learning_rate": 1.7652645098302945e-07, + "loss": 0.7686, + "step": 12870 + }, + { + "epoch": 1.84, + "grad_norm": 9.631731095205613, + "learning_rate": 1.7622245427953266e-07, + "loss": 0.7073, + "step": 12871 + }, + { + "epoch": 1.84, + "grad_norm": 7.299505001769216, + "learning_rate": 1.759187148629038e-07, + "loss": 0.6532, + "step": 12872 + }, + { + "epoch": 1.84, + "grad_norm": 10.311946090522325, + "learning_rate": 1.7561523274934212e-07, + "loss": 0.7109, + "step": 12873 + }, + { + "epoch": 1.84, + "grad_norm": 8.775267469424712, + "learning_rate": 1.7531200795503523e-07, + "loss": 0.7386, + "step": 12874 + }, + { + "epoch": 1.84, + "grad_norm": 8.818728301736671, + "learning_rate": 1.7500904049615687e-07, + "loss": 0.7013, + "step": 12875 + }, + { + "epoch": 1.84, + "grad_norm": 9.607318696594461, + "learning_rate": 1.7470633038886519e-07, + "loss": 0.6724, + "step": 12876 + }, + { + "epoch": 1.84, + "grad_norm": 10.914049667415366, + "learning_rate": 1.7440387764930677e-07, + "loss": 0.7114, + "step": 12877 + }, + { + "epoch": 1.84, + "grad_norm": 9.166536210589737, + "learning_rate": 1.7410168229361314e-07, + "loss": 0.7271, + "step": 12878 + }, + { + "epoch": 1.84, + "grad_norm": 8.308408800179214, + "learning_rate": 1.7379974433790302e-07, + "loss": 0.6988, + "step": 12879 + }, + { + "epoch": 1.84, + "grad_norm": 8.76821205694447, + "learning_rate": 1.7349806379828083e-07, + "loss": 0.6535, + "step": 12880 + }, + { + "epoch": 1.84, + "grad_norm": 10.714392771772426, + "learning_rate": 1.7319664069083698e-07, + "loss": 0.689, + "step": 12881 + }, 
+ { + "epoch": 1.84, + "grad_norm": 10.962358438993837, + "learning_rate": 1.7289547503164972e-07, + "loss": 0.6962, + "step": 12882 + }, + { + "epoch": 1.84, + "grad_norm": 11.059750056311405, + "learning_rate": 1.7259456683678012e-07, + "loss": 0.6035, + "step": 12883 + }, + { + "epoch": 1.84, + "grad_norm": 9.225313015101008, + "learning_rate": 1.722939161222792e-07, + "loss": 0.6728, + "step": 12884 + }, + { + "epoch": 1.84, + "grad_norm": 7.486740972986908, + "learning_rate": 1.719935229041836e-07, + "loss": 0.6571, + "step": 12885 + }, + { + "epoch": 1.84, + "grad_norm": 10.841457914751318, + "learning_rate": 1.7169338719851435e-07, + "loss": 0.6773, + "step": 12886 + }, + { + "epoch": 1.84, + "grad_norm": 8.950000963797324, + "learning_rate": 1.7139350902128038e-07, + "loss": 0.6785, + "step": 12887 + }, + { + "epoch": 1.84, + "grad_norm": 7.719456179605086, + "learning_rate": 1.7109388838847607e-07, + "loss": 0.6433, + "step": 12888 + }, + { + "epoch": 1.84, + "grad_norm": 10.346941903501481, + "learning_rate": 1.7079452531608198e-07, + "loss": 0.6842, + "step": 12889 + }, + { + "epoch": 1.84, + "grad_norm": 11.264002061495333, + "learning_rate": 1.7049541982006647e-07, + "loss": 0.6807, + "step": 12890 + }, + { + "epoch": 1.84, + "grad_norm": 11.215277366119032, + "learning_rate": 1.7019657191638118e-07, + "loss": 0.6908, + "step": 12891 + }, + { + "epoch": 1.84, + "grad_norm": 8.980808176086338, + "learning_rate": 1.6989798162096728e-07, + "loss": 0.69, + "step": 12892 + }, + { + "epoch": 1.84, + "grad_norm": 6.491453564562731, + "learning_rate": 1.695996489497509e-07, + "loss": 0.6989, + "step": 12893 + }, + { + "epoch": 1.84, + "grad_norm": 8.458826381545965, + "learning_rate": 1.693015739186432e-07, + "loss": 0.6651, + "step": 12894 + }, + { + "epoch": 1.84, + "grad_norm": 14.854668677873693, + "learning_rate": 1.6900375654354373e-07, + "loss": 0.6961, + "step": 12895 + }, + { + "epoch": 1.84, + "grad_norm": 9.022802986995703, + "learning_rate": 
1.687061968403364e-07, + "loss": 0.6598, + "step": 12896 + }, + { + "epoch": 1.84, + "grad_norm": 7.361685218922727, + "learning_rate": 1.6840889482489187e-07, + "loss": 0.7084, + "step": 12897 + }, + { + "epoch": 1.84, + "grad_norm": 7.684284256971544, + "learning_rate": 1.6811185051306911e-07, + "loss": 0.6952, + "step": 12898 + }, + { + "epoch": 1.84, + "grad_norm": 5.848818731997481, + "learning_rate": 1.678150639207099e-07, + "loss": 0.7336, + "step": 12899 + }, + { + "epoch": 1.84, + "grad_norm": 11.940486318617639, + "learning_rate": 1.6751853506364546e-07, + "loss": 0.7082, + "step": 12900 + }, + { + "epoch": 1.84, + "grad_norm": 8.393755497036564, + "learning_rate": 1.6722226395769038e-07, + "loss": 0.6687, + "step": 12901 + }, + { + "epoch": 1.84, + "grad_norm": 9.152152777008908, + "learning_rate": 1.6692625061864754e-07, + "loss": 0.6948, + "step": 12902 + }, + { + "epoch": 1.84, + "grad_norm": 8.633747658557734, + "learning_rate": 1.66630495062306e-07, + "loss": 0.63, + "step": 12903 + }, + { + "epoch": 1.84, + "grad_norm": 8.625670082246478, + "learning_rate": 1.6633499730443982e-07, + "loss": 0.7454, + "step": 12904 + }, + { + "epoch": 1.84, + "grad_norm": 10.07550320186523, + "learning_rate": 1.660397573608108e-07, + "loss": 0.5879, + "step": 12905 + }, + { + "epoch": 1.84, + "grad_norm": 8.191389480463314, + "learning_rate": 1.657447752471647e-07, + "loss": 0.6762, + "step": 12906 + }, + { + "epoch": 1.84, + "grad_norm": 10.223739822950222, + "learning_rate": 1.6545005097923672e-07, + "loss": 0.7666, + "step": 12907 + }, + { + "epoch": 1.84, + "grad_norm": 10.573348747334395, + "learning_rate": 1.651555845727465e-07, + "loss": 0.7193, + "step": 12908 + }, + { + "epoch": 1.84, + "grad_norm": 11.363790712881626, + "learning_rate": 1.6486137604339813e-07, + "loss": 0.6874, + "step": 12909 + }, + { + "epoch": 1.84, + "grad_norm": 11.384301126712137, + "learning_rate": 1.645674254068863e-07, + "loss": 0.6573, + "step": 12910 + }, + { + "epoch": 1.84, + 
"grad_norm": 11.65956322648177, + "learning_rate": 1.6427373267888792e-07, + "loss": 0.6754, + "step": 12911 + }, + { + "epoch": 1.84, + "grad_norm": 11.991569183334537, + "learning_rate": 1.6398029787506818e-07, + "loss": 0.709, + "step": 12912 + }, + { + "epoch": 1.84, + "grad_norm": 9.147091102241749, + "learning_rate": 1.6368712101107908e-07, + "loss": 0.7157, + "step": 12913 + }, + { + "epoch": 1.84, + "grad_norm": 8.303955272733383, + "learning_rate": 1.633942021025564e-07, + "loss": 0.7514, + "step": 12914 + }, + { + "epoch": 1.84, + "grad_norm": 10.242462770623272, + "learning_rate": 1.631015411651249e-07, + "loss": 0.7477, + "step": 12915 + }, + { + "epoch": 1.84, + "grad_norm": 7.904650412125125, + "learning_rate": 1.628091382143926e-07, + "loss": 0.68, + "step": 12916 + }, + { + "epoch": 1.84, + "grad_norm": 12.9116899481046, + "learning_rate": 1.6251699326595649e-07, + "loss": 0.6544, + "step": 12917 + }, + { + "epoch": 1.84, + "grad_norm": 10.51680668912899, + "learning_rate": 1.622251063353991e-07, + "loss": 0.6574, + "step": 12918 + }, + { + "epoch": 1.84, + "grad_norm": 7.551109286218129, + "learning_rate": 1.6193347743828913e-07, + "loss": 0.6953, + "step": 12919 + }, + { + "epoch": 1.84, + "grad_norm": 8.521225841184256, + "learning_rate": 1.616421065901802e-07, + "loss": 0.7057, + "step": 12920 + }, + { + "epoch": 1.84, + "grad_norm": 11.959638772640231, + "learning_rate": 1.6135099380661267e-07, + "loss": 0.7049, + "step": 12921 + }, + { + "epoch": 1.84, + "grad_norm": 7.820998129690549, + "learning_rate": 1.6106013910311524e-07, + "loss": 0.7569, + "step": 12922 + }, + { + "epoch": 1.84, + "grad_norm": 9.188107081696511, + "learning_rate": 1.6076954249520104e-07, + "loss": 0.6742, + "step": 12923 + }, + { + "epoch": 1.84, + "grad_norm": 8.246780629121059, + "learning_rate": 1.6047920399836882e-07, + "loss": 0.6767, + "step": 12924 + }, + { + "epoch": 1.84, + "grad_norm": 10.027893514124745, + "learning_rate": 1.6018912362810556e-07, + "loss": 
0.7174, + "step": 12925 + }, + { + "epoch": 1.84, + "grad_norm": 11.491376415467611, + "learning_rate": 1.5989930139988285e-07, + "loss": 0.6723, + "step": 12926 + }, + { + "epoch": 1.84, + "grad_norm": 11.064680476169714, + "learning_rate": 1.596097373291583e-07, + "loss": 0.6354, + "step": 12927 + }, + { + "epoch": 1.84, + "grad_norm": 9.416944935281634, + "learning_rate": 1.593204314313773e-07, + "loss": 0.7211, + "step": 12928 + }, + { + "epoch": 1.84, + "grad_norm": 7.727217389565944, + "learning_rate": 1.5903138372197036e-07, + "loss": 0.7403, + "step": 12929 + }, + { + "epoch": 1.84, + "grad_norm": 11.924647569270501, + "learning_rate": 1.5874259421635396e-07, + "loss": 0.6758, + "step": 12930 + }, + { + "epoch": 1.84, + "grad_norm": 10.970533814858948, + "learning_rate": 1.584540629299325e-07, + "loss": 0.7136, + "step": 12931 + }, + { + "epoch": 1.84, + "grad_norm": 12.440689420812888, + "learning_rate": 1.5816578987809472e-07, + "loss": 0.7783, + "step": 12932 + }, + { + "epoch": 1.84, + "grad_norm": 9.396331853423872, + "learning_rate": 1.5787777507621672e-07, + "loss": 0.7062, + "step": 12933 + }, + { + "epoch": 1.84, + "grad_norm": 8.328355667186221, + "learning_rate": 1.575900185396595e-07, + "loss": 0.7026, + "step": 12934 + }, + { + "epoch": 1.84, + "grad_norm": 10.415090914147621, + "learning_rate": 1.5730252028377136e-07, + "loss": 0.6865, + "step": 12935 + }, + { + "epoch": 1.84, + "grad_norm": 8.545064159305424, + "learning_rate": 1.5701528032388779e-07, + "loss": 0.6822, + "step": 12936 + }, + { + "epoch": 1.84, + "grad_norm": 8.035436807671099, + "learning_rate": 1.5672829867532825e-07, + "loss": 0.658, + "step": 12937 + }, + { + "epoch": 1.85, + "grad_norm": 10.259027982822897, + "learning_rate": 1.5644157535340042e-07, + "loss": 0.6942, + "step": 12938 + }, + { + "epoch": 1.85, + "grad_norm": 9.61178655736027, + "learning_rate": 1.561551103733966e-07, + "loss": 0.7247, + "step": 12939 + }, + { + "epoch": 1.85, + "grad_norm": 
9.440088484943589, + "learning_rate": 1.558689037505956e-07, + "loss": 0.5867, + "step": 12940 + }, + { + "epoch": 1.85, + "grad_norm": 11.882941583282292, + "learning_rate": 1.555829555002647e-07, + "loss": 0.7577, + "step": 12941 + }, + { + "epoch": 1.85, + "grad_norm": 10.562775002276533, + "learning_rate": 1.5529726563765334e-07, + "loss": 0.7098, + "step": 12942 + }, + { + "epoch": 1.85, + "grad_norm": 8.772515683687802, + "learning_rate": 1.5501183417800103e-07, + "loss": 0.7211, + "step": 12943 + }, + { + "epoch": 1.85, + "grad_norm": 7.65131846806018, + "learning_rate": 1.5472666113653112e-07, + "loss": 0.7448, + "step": 12944 + }, + { + "epoch": 1.85, + "grad_norm": 9.522486395898913, + "learning_rate": 1.5444174652845478e-07, + "loss": 0.6963, + "step": 12945 + }, + { + "epoch": 1.85, + "grad_norm": 9.81954633942848, + "learning_rate": 1.5415709036896764e-07, + "loss": 0.6892, + "step": 12946 + }, + { + "epoch": 1.85, + "grad_norm": 10.611284158392388, + "learning_rate": 1.5387269267325256e-07, + "loss": 0.6559, + "step": 12947 + }, + { + "epoch": 1.85, + "grad_norm": 9.755075111908244, + "learning_rate": 1.5358855345647906e-07, + "loss": 0.7726, + "step": 12948 + }, + { + "epoch": 1.85, + "grad_norm": 9.353414111652915, + "learning_rate": 1.5330467273380167e-07, + "loss": 0.665, + "step": 12949 + }, + { + "epoch": 1.85, + "grad_norm": 12.354929542585422, + "learning_rate": 1.530210505203622e-07, + "loss": 0.6764, + "step": 12950 + }, + { + "epoch": 1.85, + "grad_norm": 11.482479045298032, + "learning_rate": 1.5273768683128854e-07, + "loss": 0.8719, + "step": 12951 + }, + { + "epoch": 1.85, + "grad_norm": 11.63864918174718, + "learning_rate": 1.524545816816947e-07, + "loss": 0.6804, + "step": 12952 + }, + { + "epoch": 1.85, + "grad_norm": 9.703388292663481, + "learning_rate": 1.521717350866797e-07, + "loss": 0.7994, + "step": 12953 + }, + { + "epoch": 1.85, + "grad_norm": 9.942881256344059, + "learning_rate": 1.518891470613304e-07, + "loss": 0.6867, + 
"step": 12954 + }, + { + "epoch": 1.85, + "grad_norm": 10.850139482063781, + "learning_rate": 1.516068176207186e-07, + "loss": 0.647, + "step": 12955 + }, + { + "epoch": 1.85, + "grad_norm": 12.17768120110232, + "learning_rate": 1.513247467799045e-07, + "loss": 0.7238, + "step": 12956 + }, + { + "epoch": 1.85, + "grad_norm": 9.899959551531863, + "learning_rate": 1.5104293455393105e-07, + "loss": 0.6477, + "step": 12957 + }, + { + "epoch": 1.85, + "grad_norm": 9.96665921783476, + "learning_rate": 1.5076138095783178e-07, + "loss": 0.7788, + "step": 12958 + }, + { + "epoch": 1.85, + "grad_norm": 9.457413031930528, + "learning_rate": 1.5048008600662134e-07, + "loss": 0.698, + "step": 12959 + }, + { + "epoch": 1.85, + "grad_norm": 12.942283268481365, + "learning_rate": 1.5019904971530387e-07, + "loss": 0.6747, + "step": 12960 + }, + { + "epoch": 1.85, + "grad_norm": 12.666111086189341, + "learning_rate": 1.4991827209887068e-07, + "loss": 0.7177, + "step": 12961 + }, + { + "epoch": 1.85, + "grad_norm": 12.001838982246316, + "learning_rate": 1.496377531722959e-07, + "loss": 0.5762, + "step": 12962 + }, + { + "epoch": 1.85, + "grad_norm": 6.9987636927038395, + "learning_rate": 1.493574929505426e-07, + "loss": 0.5826, + "step": 12963 + }, + { + "epoch": 1.85, + "grad_norm": 8.58684274158248, + "learning_rate": 1.490774914485593e-07, + "loss": 0.6449, + "step": 12964 + }, + { + "epoch": 1.85, + "grad_norm": 10.823831638814022, + "learning_rate": 1.4879774868127915e-07, + "loss": 0.739, + "step": 12965 + }, + { + "epoch": 1.85, + "grad_norm": 13.497643386146137, + "learning_rate": 1.4851826466362407e-07, + "loss": 0.7462, + "step": 12966 + }, + { + "epoch": 1.85, + "grad_norm": 10.6607613430492, + "learning_rate": 1.482390394104999e-07, + "loss": 0.6951, + "step": 12967 + }, + { + "epoch": 1.85, + "grad_norm": 11.048056313289786, + "learning_rate": 1.4796007293680137e-07, + "loss": 0.7382, + "step": 12968 + }, + { + "epoch": 1.85, + "grad_norm": 11.214273387385605, + 
"learning_rate": 1.4768136525740607e-07, + "loss": 0.6359, + "step": 12969 + }, + { + "epoch": 1.85, + "grad_norm": 10.282289775077928, + "learning_rate": 1.4740291638718096e-07, + "loss": 0.6367, + "step": 12970 + }, + { + "epoch": 1.85, + "grad_norm": 11.612820721319125, + "learning_rate": 1.471247263409764e-07, + "loss": 0.6817, + "step": 12971 + }, + { + "epoch": 1.85, + "grad_norm": 8.565531615062996, + "learning_rate": 1.4684679513363108e-07, + "loss": 0.6703, + "step": 12972 + }, + { + "epoch": 1.85, + "grad_norm": 9.362245313985452, + "learning_rate": 1.4656912277996815e-07, + "loss": 0.7108, + "step": 12973 + }, + { + "epoch": 1.85, + "grad_norm": 8.24211233168886, + "learning_rate": 1.4629170929479963e-07, + "loss": 0.6941, + "step": 12974 + }, + { + "epoch": 1.85, + "grad_norm": 13.930315962358087, + "learning_rate": 1.4601455469292036e-07, + "loss": 0.6944, + "step": 12975 + }, + { + "epoch": 1.85, + "grad_norm": 11.008219505996907, + "learning_rate": 1.4573765898911408e-07, + "loss": 0.6927, + "step": 12976 + }, + { + "epoch": 1.85, + "grad_norm": 10.057287259047937, + "learning_rate": 1.4546102219814894e-07, + "loss": 0.6804, + "step": 12977 + }, + { + "epoch": 1.85, + "grad_norm": 8.614869266504696, + "learning_rate": 1.4518464433477986e-07, + "loss": 0.6801, + "step": 12978 + }, + { + "epoch": 1.85, + "grad_norm": 10.650673564862078, + "learning_rate": 1.449085254137489e-07, + "loss": 0.6811, + "step": 12979 + }, + { + "epoch": 1.85, + "grad_norm": 11.474464060355308, + "learning_rate": 1.4463266544978206e-07, + "loss": 0.7065, + "step": 12980 + }, + { + "epoch": 1.85, + "grad_norm": 12.441274125834497, + "learning_rate": 1.4435706445759477e-07, + "loss": 0.6998, + "step": 12981 + }, + { + "epoch": 1.85, + "grad_norm": 7.058032460644962, + "learning_rate": 1.4408172245188533e-07, + "loss": 0.7627, + "step": 12982 + }, + { + "epoch": 1.85, + "grad_norm": 7.907376965768029, + "learning_rate": 1.4380663944734086e-07, + "loss": 0.7154, + "step": 12983 + 
}, + { + "epoch": 1.85, + "grad_norm": 9.781601212320277, + "learning_rate": 1.4353181545863293e-07, + "loss": 0.6632, + "step": 12984 + }, + { + "epoch": 1.85, + "grad_norm": 8.928288812375278, + "learning_rate": 1.432572505004193e-07, + "loss": 0.7236, + "step": 12985 + }, + { + "epoch": 1.85, + "grad_norm": 7.654543795098602, + "learning_rate": 1.4298294458734542e-07, + "loss": 0.7403, + "step": 12986 + }, + { + "epoch": 1.85, + "grad_norm": 8.00841329548721, + "learning_rate": 1.4270889773404127e-07, + "loss": 0.7354, + "step": 12987 + }, + { + "epoch": 1.85, + "grad_norm": 8.703223988954404, + "learning_rate": 1.424351099551241e-07, + "loss": 0.7137, + "step": 12988 + }, + { + "epoch": 1.85, + "grad_norm": 8.405946342751921, + "learning_rate": 1.4216158126519775e-07, + "loss": 0.6378, + "step": 12989 + }, + { + "epoch": 1.85, + "grad_norm": 10.415096233061677, + "learning_rate": 1.4188831167885053e-07, + "loss": 0.6985, + "step": 12990 + }, + { + "epoch": 1.85, + "grad_norm": 7.418700119606872, + "learning_rate": 1.4161530121065804e-07, + "loss": 0.6505, + "step": 12991 + }, + { + "epoch": 1.85, + "grad_norm": 11.042870274269939, + "learning_rate": 1.4134254987518192e-07, + "loss": 0.6928, + "step": 12992 + }, + { + "epoch": 1.85, + "grad_norm": 10.165635979627483, + "learning_rate": 1.4107005768696948e-07, + "loss": 0.6762, + "step": 12993 + }, + { + "epoch": 1.85, + "grad_norm": 11.588627095590141, + "learning_rate": 1.4079782466055625e-07, + "loss": 0.6968, + "step": 12994 + }, + { + "epoch": 1.85, + "grad_norm": 9.028721418878444, + "learning_rate": 1.4052585081046067e-07, + "loss": 0.6258, + "step": 12995 + }, + { + "epoch": 1.85, + "grad_norm": 9.86363383816087, + "learning_rate": 1.4025413615118998e-07, + "loss": 0.7539, + "step": 12996 + }, + { + "epoch": 1.85, + "grad_norm": 11.60605769415377, + "learning_rate": 1.3998268069723707e-07, + "loss": 0.7006, + "step": 12997 + }, + { + "epoch": 1.85, + "grad_norm": 9.417610172459051, + "learning_rate": 
1.3971148446307925e-07, + "loss": 0.6742, + "step": 12998 + }, + { + "epoch": 1.85, + "grad_norm": 9.117941446297046, + "learning_rate": 1.394405474631827e-07, + "loss": 0.6515, + "step": 12999 + }, + { + "epoch": 1.85, + "grad_norm": 7.42645336253012, + "learning_rate": 1.3916986971199697e-07, + "loss": 0.714, + "step": 13000 + }, + { + "epoch": 1.85, + "grad_norm": 7.018543788640839, + "learning_rate": 1.388994512239611e-07, + "loss": 0.6938, + "step": 13001 + }, + { + "epoch": 1.85, + "grad_norm": 8.621444577772353, + "learning_rate": 1.386292920134974e-07, + "loss": 0.6825, + "step": 13002 + }, + { + "epoch": 1.85, + "grad_norm": 8.486947796455377, + "learning_rate": 1.383593920950149e-07, + "loss": 0.6913, + "step": 13003 + }, + { + "epoch": 1.85, + "grad_norm": 8.368084103457363, + "learning_rate": 1.380897514829105e-07, + "loss": 0.6751, + "step": 13004 + }, + { + "epoch": 1.85, + "grad_norm": 10.469769542709013, + "learning_rate": 1.3782037019156536e-07, + "loss": 0.6587, + "step": 13005 + }, + { + "epoch": 1.85, + "grad_norm": 8.984910370707107, + "learning_rate": 1.3755124823534804e-07, + "loss": 0.735, + "step": 13006 + }, + { + "epoch": 1.85, + "grad_norm": 11.355864528493386, + "learning_rate": 1.3728238562861207e-07, + "loss": 0.7657, + "step": 13007 + }, + { + "epoch": 1.86, + "grad_norm": 7.089376116115369, + "learning_rate": 1.370137823856982e-07, + "loss": 0.7027, + "step": 13008 + }, + { + "epoch": 1.86, + "grad_norm": 12.206640673654586, + "learning_rate": 1.3674543852093326e-07, + "loss": 0.7139, + "step": 13009 + }, + { + "epoch": 1.86, + "grad_norm": 10.661162184964573, + "learning_rate": 1.3647735404862917e-07, + "loss": 0.7091, + "step": 13010 + }, + { + "epoch": 1.86, + "grad_norm": 9.49987906552956, + "learning_rate": 1.3620952898308614e-07, + "loss": 0.7215, + "step": 13011 + }, + { + "epoch": 1.86, + "grad_norm": 9.247014795872268, + "learning_rate": 1.3594196333858722e-07, + "loss": 0.6929, + "step": 13012 + }, + { + "epoch": 1.86, + 
"grad_norm": 10.770478507284473, + "learning_rate": 1.356746571294054e-07, + "loss": 0.6804, + "step": 13013 + }, + { + "epoch": 1.86, + "grad_norm": 9.352598104619675, + "learning_rate": 1.3540761036979765e-07, + "loss": 0.7152, + "step": 13014 + }, + { + "epoch": 1.86, + "grad_norm": 13.26387140628363, + "learning_rate": 1.35140823074007e-07, + "loss": 0.6273, + "step": 13015 + }, + { + "epoch": 1.86, + "grad_norm": 7.5664793382963795, + "learning_rate": 1.3487429525626315e-07, + "loss": 0.6881, + "step": 13016 + }, + { + "epoch": 1.86, + "grad_norm": 10.477851422743077, + "learning_rate": 1.3460802693078256e-07, + "loss": 0.7128, + "step": 13017 + }, + { + "epoch": 1.86, + "grad_norm": 10.71036761928645, + "learning_rate": 1.3434201811176662e-07, + "loss": 0.7372, + "step": 13018 + }, + { + "epoch": 1.86, + "grad_norm": 13.692157661999646, + "learning_rate": 1.3407626881340453e-07, + "loss": 0.6632, + "step": 13019 + }, + { + "epoch": 1.86, + "grad_norm": 11.98803421658716, + "learning_rate": 1.3381077904986883e-07, + "loss": 0.6586, + "step": 13020 + }, + { + "epoch": 1.86, + "grad_norm": 13.388466323565703, + "learning_rate": 1.335455488353221e-07, + "loss": 0.7336, + "step": 13021 + }, + { + "epoch": 1.86, + "grad_norm": 10.999689206556523, + "learning_rate": 1.3328057818390972e-07, + "loss": 0.6455, + "step": 13022 + }, + { + "epoch": 1.86, + "grad_norm": 8.438619215557935, + "learning_rate": 1.3301586710976422e-07, + "loss": 0.6896, + "step": 13023 + }, + { + "epoch": 1.86, + "grad_norm": 10.755790588335763, + "learning_rate": 1.3275141562700488e-07, + "loss": 0.6485, + "step": 13024 + }, + { + "epoch": 1.86, + "grad_norm": 13.57191423769809, + "learning_rate": 1.324872237497371e-07, + "loss": 0.7675, + "step": 13025 + }, + { + "epoch": 1.86, + "grad_norm": 10.1517435792896, + "learning_rate": 1.3222329149205182e-07, + "loss": 0.6843, + "step": 13026 + }, + { + "epoch": 1.86, + "grad_norm": 7.799548416593652, + "learning_rate": 1.3195961886802723e-07, + 
"loss": 0.6621, + "step": 13027 + }, + { + "epoch": 1.86, + "grad_norm": 10.55424707422497, + "learning_rate": 1.316962058917265e-07, + "loss": 0.7032, + "step": 13028 + }, + { + "epoch": 1.86, + "grad_norm": 12.07811598960856, + "learning_rate": 1.3143305257719897e-07, + "loss": 0.6672, + "step": 13029 + }, + { + "epoch": 1.86, + "grad_norm": 9.41241320228061, + "learning_rate": 1.3117015893848007e-07, + "loss": 0.6609, + "step": 13030 + }, + { + "epoch": 1.86, + "grad_norm": 9.958058085491917, + "learning_rate": 1.3090752498959192e-07, + "loss": 0.6377, + "step": 13031 + }, + { + "epoch": 1.86, + "grad_norm": 8.07774336038641, + "learning_rate": 1.3064515074454443e-07, + "loss": 0.7211, + "step": 13032 + }, + { + "epoch": 1.86, + "grad_norm": 6.331246954674844, + "learning_rate": 1.3038303621732974e-07, + "loss": 0.785, + "step": 13033 + }, + { + "epoch": 1.86, + "grad_norm": 10.116843198986137, + "learning_rate": 1.3012118142192998e-07, + "loss": 0.7238, + "step": 13034 + }, + { + "epoch": 1.86, + "grad_norm": 7.893795695412055, + "learning_rate": 1.2985958637231122e-07, + "loss": 0.7487, + "step": 13035 + }, + { + "epoch": 1.86, + "grad_norm": 9.48381612900999, + "learning_rate": 1.2959825108242507e-07, + "loss": 0.7665, + "step": 13036 + }, + { + "epoch": 1.86, + "grad_norm": 12.927235580202318, + "learning_rate": 1.2933717556621205e-07, + "loss": 0.7206, + "step": 13037 + }, + { + "epoch": 1.86, + "grad_norm": 11.564161462804154, + "learning_rate": 1.2907635983759548e-07, + "loss": 0.6547, + "step": 13038 + }, + { + "epoch": 1.86, + "grad_norm": 9.998945304953462, + "learning_rate": 1.2881580391048864e-07, + "loss": 0.6857, + "step": 13039 + }, + { + "epoch": 1.86, + "grad_norm": 12.735456787906813, + "learning_rate": 1.285555077987871e-07, + "loss": 0.6249, + "step": 13040 + }, + { + "epoch": 1.86, + "grad_norm": 7.94967015006104, + "learning_rate": 1.2829547151637533e-07, + "loss": 0.6731, + "step": 13041 + }, + { + "epoch": 1.86, + "grad_norm": 
9.847221299757312, + "learning_rate": 1.2803569507712333e-07, + "loss": 0.7078, + "step": 13042 + }, + { + "epoch": 1.86, + "grad_norm": 10.025839704669046, + "learning_rate": 1.2777617849488498e-07, + "loss": 0.7693, + "step": 13043 + }, + { + "epoch": 1.86, + "grad_norm": 7.780857371890483, + "learning_rate": 1.2751692178350428e-07, + "loss": 0.7022, + "step": 13044 + }, + { + "epoch": 1.86, + "grad_norm": 12.609414106772066, + "learning_rate": 1.2725792495680733e-07, + "loss": 0.6387, + "step": 13045 + }, + { + "epoch": 1.86, + "grad_norm": 8.348374894518862, + "learning_rate": 1.2699918802860979e-07, + "loss": 0.6154, + "step": 13046 + }, + { + "epoch": 1.86, + "grad_norm": 11.149372840880991, + "learning_rate": 1.2674071101271168e-07, + "loss": 0.6801, + "step": 13047 + }, + { + "epoch": 1.86, + "grad_norm": 7.885617825712947, + "learning_rate": 1.2648249392289925e-07, + "loss": 0.6825, + "step": 13048 + }, + { + "epoch": 1.86, + "grad_norm": 7.072067059305428, + "learning_rate": 1.2622453677294477e-07, + "loss": 0.6566, + "step": 13049 + }, + { + "epoch": 1.86, + "grad_norm": 13.438156711112722, + "learning_rate": 1.2596683957660728e-07, + "loss": 0.678, + "step": 13050 + }, + { + "epoch": 1.86, + "grad_norm": 6.3352333706816415, + "learning_rate": 1.257094023476313e-07, + "loss": 0.6701, + "step": 13051 + }, + { + "epoch": 1.86, + "grad_norm": 10.606565113811762, + "learning_rate": 1.2545222509974864e-07, + "loss": 0.6805, + "step": 13052 + }, + { + "epoch": 1.86, + "grad_norm": 8.488414803359797, + "learning_rate": 1.2519530784667611e-07, + "loss": 0.6931, + "step": 13053 + }, + { + "epoch": 1.86, + "grad_norm": 7.187540098912917, + "learning_rate": 1.2493865060211663e-07, + "loss": 0.6943, + "step": 13054 + }, + { + "epoch": 1.86, + "grad_norm": 8.433896869640098, + "learning_rate": 1.2468225337975925e-07, + "loss": 0.6047, + "step": 13055 + }, + { + "epoch": 1.86, + "grad_norm": 7.908165248858814, + "learning_rate": 1.244261161932797e-07, + "loss": 
0.7135, + "step": 13056 + }, + { + "epoch": 1.86, + "grad_norm": 9.600629951915806, + "learning_rate": 1.241702390563404e-07, + "loss": 0.665, + "step": 13057 + }, + { + "epoch": 1.86, + "grad_norm": 9.848958264801258, + "learning_rate": 1.239146219825882e-07, + "loss": 0.7115, + "step": 13058 + }, + { + "epoch": 1.86, + "grad_norm": 9.036184014896307, + "learning_rate": 1.2365926498565827e-07, + "loss": 0.7291, + "step": 13059 + }, + { + "epoch": 1.86, + "grad_norm": 8.928674961451211, + "learning_rate": 1.2340416807916922e-07, + "loss": 0.6941, + "step": 13060 + }, + { + "epoch": 1.86, + "grad_norm": 6.989823356465215, + "learning_rate": 1.2314933127672735e-07, + "loss": 0.6274, + "step": 13061 + }, + { + "epoch": 1.86, + "grad_norm": 9.660803460249562, + "learning_rate": 1.2289475459192568e-07, + "loss": 0.7168, + "step": 13062 + }, + { + "epoch": 1.86, + "grad_norm": 8.814325204196674, + "learning_rate": 1.2264043803834224e-07, + "loss": 0.6309, + "step": 13063 + }, + { + "epoch": 1.86, + "grad_norm": 9.542798963005971, + "learning_rate": 1.223863816295412e-07, + "loss": 0.7153, + "step": 13064 + }, + { + "epoch": 1.86, + "grad_norm": 8.487111699573422, + "learning_rate": 1.2213258537907446e-07, + "loss": 0.6603, + "step": 13065 + }, + { + "epoch": 1.86, + "grad_norm": 9.194921010283657, + "learning_rate": 1.218790493004779e-07, + "loss": 0.7233, + "step": 13066 + }, + { + "epoch": 1.86, + "grad_norm": 11.415763411349468, + "learning_rate": 1.2162577340727454e-07, + "loss": 0.7056, + "step": 13067 + }, + { + "epoch": 1.86, + "grad_norm": 8.939685673449238, + "learning_rate": 1.2137275771297252e-07, + "loss": 0.6809, + "step": 13068 + }, + { + "epoch": 1.86, + "grad_norm": 12.156698127815792, + "learning_rate": 1.2112000223106823e-07, + "loss": 0.7168, + "step": 13069 + }, + { + "epoch": 1.86, + "grad_norm": 7.6881236388034315, + "learning_rate": 1.2086750697504313e-07, + "loss": 0.6971, + "step": 13070 + }, + { + "epoch": 1.86, + "grad_norm": 7.317058177331318, 
+ "learning_rate": 1.206152719583631e-07, + "loss": 0.724, + "step": 13071 + }, + { + "epoch": 1.86, + "grad_norm": 6.955065780793465, + "learning_rate": 1.2036329719448347e-07, + "loss": 0.7471, + "step": 13072 + }, + { + "epoch": 1.86, + "grad_norm": 7.17090313925213, + "learning_rate": 1.2011158269684297e-07, + "loss": 0.6889, + "step": 13073 + }, + { + "epoch": 1.86, + "grad_norm": 10.83881248714461, + "learning_rate": 1.198601284788664e-07, + "loss": 0.7158, + "step": 13074 + }, + { + "epoch": 1.86, + "grad_norm": 9.067776033385321, + "learning_rate": 1.1960893455396804e-07, + "loss": 0.7065, + "step": 13075 + }, + { + "epoch": 1.86, + "grad_norm": 8.67787452714061, + "learning_rate": 1.193580009355433e-07, + "loss": 0.6795, + "step": 13076 + }, + { + "epoch": 1.86, + "grad_norm": 12.215370719037148, + "learning_rate": 1.1910732763697807e-07, + "loss": 0.7026, + "step": 13077 + }, + { + "epoch": 1.87, + "grad_norm": 9.7361640818191, + "learning_rate": 1.1885691467164118e-07, + "loss": 0.7405, + "step": 13078 + }, + { + "epoch": 1.87, + "grad_norm": 6.987409473012682, + "learning_rate": 1.1860676205289078e-07, + "loss": 0.7552, + "step": 13079 + }, + { + "epoch": 1.87, + "grad_norm": 8.001126081904248, + "learning_rate": 1.183568697940679e-07, + "loss": 0.6902, + "step": 13080 + }, + { + "epoch": 1.87, + "grad_norm": 7.564228843690074, + "learning_rate": 1.1810723790850076e-07, + "loss": 0.683, + "step": 13081 + }, + { + "epoch": 1.87, + "grad_norm": 13.228247290224395, + "learning_rate": 1.1785786640950536e-07, + "loss": 0.6541, + "step": 13082 + }, + { + "epoch": 1.87, + "grad_norm": 11.302670249057497, + "learning_rate": 1.1760875531038163e-07, + "loss": 0.7327, + "step": 13083 + }, + { + "epoch": 1.87, + "grad_norm": 7.747341317021242, + "learning_rate": 1.1735990462441615e-07, + "loss": 0.7376, + "step": 13084 + }, + { + "epoch": 1.87, + "grad_norm": 9.158306920746135, + "learning_rate": 1.1711131436488388e-07, + "loss": 0.6689, + "step": 13085 + }, + { + 
"epoch": 1.87, + "grad_norm": 8.295874504288621, + "learning_rate": 1.1686298454504142e-07, + "loss": 0.6975, + "step": 13086 + }, + { + "epoch": 1.87, + "grad_norm": 11.198539681589152, + "learning_rate": 1.166149151781354e-07, + "loss": 0.6556, + "step": 13087 + }, + { + "epoch": 1.87, + "grad_norm": 9.379664598033605, + "learning_rate": 1.1636710627739634e-07, + "loss": 0.6981, + "step": 13088 + }, + { + "epoch": 1.87, + "grad_norm": 9.284771972946915, + "learning_rate": 1.1611955785604201e-07, + "loss": 0.7337, + "step": 13089 + }, + { + "epoch": 1.87, + "grad_norm": 11.71589281977341, + "learning_rate": 1.1587226992727684e-07, + "loss": 0.6734, + "step": 13090 + }, + { + "epoch": 1.87, + "grad_norm": 8.464117255041415, + "learning_rate": 1.1562524250428974e-07, + "loss": 0.642, + "step": 13091 + }, + { + "epoch": 1.87, + "grad_norm": 13.332119005186101, + "learning_rate": 1.1537847560025628e-07, + "loss": 0.6425, + "step": 13092 + }, + { + "epoch": 1.87, + "grad_norm": 8.278983547413132, + "learning_rate": 1.1513196922833813e-07, + "loss": 0.6974, + "step": 13093 + }, + { + "epoch": 1.87, + "grad_norm": 8.919012708600128, + "learning_rate": 1.1488572340168314e-07, + "loss": 0.8043, + "step": 13094 + }, + { + "epoch": 1.87, + "grad_norm": 9.673586243992942, + "learning_rate": 1.1463973813342688e-07, + "loss": 0.7413, + "step": 13095 + }, + { + "epoch": 1.87, + "grad_norm": 7.94157722763785, + "learning_rate": 1.1439401343668776e-07, + "loss": 0.6564, + "step": 13096 + }, + { + "epoch": 1.87, + "grad_norm": 10.759775682464937, + "learning_rate": 1.1414854932457365e-07, + "loss": 0.6477, + "step": 13097 + }, + { + "epoch": 1.87, + "grad_norm": 9.212158703182887, + "learning_rate": 1.139033458101757e-07, + "loss": 0.669, + "step": 13098 + }, + { + "epoch": 1.87, + "grad_norm": 9.557149950896285, + "learning_rate": 1.1365840290657238e-07, + "loss": 0.7467, + "step": 13099 + }, + { + "epoch": 1.87, + "grad_norm": 12.592469291057398, + "learning_rate": 
1.1341372062682932e-07, + "loss": 0.7201, + "step": 13100 + }, + { + "epoch": 1.87, + "grad_norm": 12.248016554281174, + "learning_rate": 1.1316929898399554e-07, + "loss": 0.6751, + "step": 13101 + }, + { + "epoch": 1.87, + "grad_norm": 12.294932822280552, + "learning_rate": 1.1292513799110893e-07, + "loss": 0.6805, + "step": 13102 + }, + { + "epoch": 1.87, + "grad_norm": 9.609266618860921, + "learning_rate": 1.1268123766119299e-07, + "loss": 0.6401, + "step": 13103 + }, + { + "epoch": 1.87, + "grad_norm": 10.907524312240861, + "learning_rate": 1.1243759800725562e-07, + "loss": 0.7223, + "step": 13104 + }, + { + "epoch": 1.87, + "grad_norm": 9.594835447999099, + "learning_rate": 1.1219421904229199e-07, + "loss": 0.5989, + "step": 13105 + }, + { + "epoch": 1.87, + "grad_norm": 8.418503221753292, + "learning_rate": 1.119511007792834e-07, + "loss": 0.6685, + "step": 13106 + }, + { + "epoch": 1.87, + "grad_norm": 8.08532716053211, + "learning_rate": 1.1170824323119667e-07, + "loss": 0.7503, + "step": 13107 + }, + { + "epoch": 1.87, + "grad_norm": 8.477042613057954, + "learning_rate": 1.1146564641098645e-07, + "loss": 0.665, + "step": 13108 + }, + { + "epoch": 1.87, + "grad_norm": 10.940749485564748, + "learning_rate": 1.1122331033159073e-07, + "loss": 0.6836, + "step": 13109 + }, + { + "epoch": 1.87, + "grad_norm": 7.026811147572177, + "learning_rate": 1.1098123500593583e-07, + "loss": 0.6691, + "step": 13110 + }, + { + "epoch": 1.87, + "grad_norm": 8.007128190823543, + "learning_rate": 1.107394204469331e-07, + "loss": 0.7007, + "step": 13111 + }, + { + "epoch": 1.87, + "grad_norm": 10.701418494235858, + "learning_rate": 1.1049786666748052e-07, + "loss": 0.6852, + "step": 13112 + }, + { + "epoch": 1.87, + "grad_norm": 10.702330117626744, + "learning_rate": 1.1025657368046173e-07, + "loss": 0.6468, + "step": 13113 + }, + { + "epoch": 1.87, + "grad_norm": 8.7504958476275, + "learning_rate": 1.1001554149874582e-07, + "loss": 0.6593, + "step": 13114 + }, + { + "epoch": 
1.87, + "grad_norm": 12.343855247693012, + "learning_rate": 1.097747701351909e-07, + "loss": 0.7439, + "step": 13115 + }, + { + "epoch": 1.87, + "grad_norm": 11.332181945925624, + "learning_rate": 1.0953425960263664e-07, + "loss": 0.6462, + "step": 13116 + }, + { + "epoch": 1.87, + "grad_norm": 9.738045412503052, + "learning_rate": 1.0929400991391281e-07, + "loss": 0.7258, + "step": 13117 + }, + { + "epoch": 1.87, + "grad_norm": 9.456908852128539, + "learning_rate": 1.0905402108183305e-07, + "loss": 0.7182, + "step": 13118 + }, + { + "epoch": 1.87, + "grad_norm": 11.329141024251259, + "learning_rate": 1.0881429311919711e-07, + "loss": 0.7777, + "step": 13119 + }, + { + "epoch": 1.87, + "grad_norm": 8.770909033764374, + "learning_rate": 1.085748260387931e-07, + "loss": 0.6241, + "step": 13120 + }, + { + "epoch": 1.87, + "grad_norm": 10.376189431430383, + "learning_rate": 1.0833561985339192e-07, + "loss": 0.7195, + "step": 13121 + }, + { + "epoch": 1.87, + "grad_norm": 9.595514136749332, + "learning_rate": 1.0809667457575224e-07, + "loss": 0.7597, + "step": 13122 + }, + { + "epoch": 1.87, + "grad_norm": 12.011948949243017, + "learning_rate": 1.0785799021861997e-07, + "loss": 0.6975, + "step": 13123 + }, + { + "epoch": 1.87, + "grad_norm": 11.028197799821443, + "learning_rate": 1.0761956679472552e-07, + "loss": 0.7096, + "step": 13124 + }, + { + "epoch": 1.87, + "grad_norm": 11.3563958763236, + "learning_rate": 1.0738140431678479e-07, + "loss": 0.7487, + "step": 13125 + }, + { + "epoch": 1.87, + "grad_norm": 7.615708257356203, + "learning_rate": 1.0714350279750097e-07, + "loss": 0.7206, + "step": 13126 + }, + { + "epoch": 1.87, + "grad_norm": 6.9982419497078885, + "learning_rate": 1.0690586224956334e-07, + "loss": 0.7597, + "step": 13127 + }, + { + "epoch": 1.87, + "grad_norm": 11.23792080690475, + "learning_rate": 1.0666848268564789e-07, + "loss": 0.7225, + "step": 13128 + }, + { + "epoch": 1.87, + "grad_norm": 11.197832694866209, + "learning_rate": 
1.0643136411841393e-07, + "loss": 0.6742, + "step": 13129 + }, + { + "epoch": 1.87, + "grad_norm": 12.083584554961135, + "learning_rate": 1.0619450656051078e-07, + "loss": 0.6303, + "step": 13130 + }, + { + "epoch": 1.87, + "grad_norm": 13.504304844687788, + "learning_rate": 1.0595791002456945e-07, + "loss": 0.7227, + "step": 13131 + }, + { + "epoch": 1.87, + "grad_norm": 10.996036562172215, + "learning_rate": 1.0572157452321097e-07, + "loss": 0.6892, + "step": 13132 + }, + { + "epoch": 1.87, + "grad_norm": 9.460056839133715, + "learning_rate": 1.0548550006904023e-07, + "loss": 0.6659, + "step": 13133 + }, + { + "epoch": 1.87, + "grad_norm": 6.794708604991955, + "learning_rate": 1.0524968667464886e-07, + "loss": 0.7214, + "step": 13134 + }, + { + "epoch": 1.87, + "grad_norm": 10.68658353739207, + "learning_rate": 1.0501413435261509e-07, + "loss": 0.7413, + "step": 13135 + }, + { + "epoch": 1.87, + "grad_norm": 9.666449781347634, + "learning_rate": 1.0477884311550223e-07, + "loss": 0.6472, + "step": 13136 + }, + { + "epoch": 1.87, + "grad_norm": 10.422806954892652, + "learning_rate": 1.0454381297585913e-07, + "loss": 0.7695, + "step": 13137 + }, + { + "epoch": 1.87, + "grad_norm": 11.855679202745286, + "learning_rate": 1.0430904394622354e-07, + "loss": 0.6687, + "step": 13138 + }, + { + "epoch": 1.87, + "grad_norm": 9.173568091591314, + "learning_rate": 1.0407453603911488e-07, + "loss": 0.6924, + "step": 13139 + }, + { + "epoch": 1.87, + "grad_norm": 10.550058066003649, + "learning_rate": 1.038402892670437e-07, + "loss": 0.6259, + "step": 13140 + }, + { + "epoch": 1.87, + "grad_norm": 9.66009000300627, + "learning_rate": 1.0360630364250224e-07, + "loss": 0.7023, + "step": 13141 + }, + { + "epoch": 1.87, + "grad_norm": 10.224112992917918, + "learning_rate": 1.0337257917797216e-07, + "loss": 0.6772, + "step": 13142 + }, + { + "epoch": 1.87, + "grad_norm": 12.187223905970688, + "learning_rate": 1.0313911588591851e-07, + "loss": 0.6998, + "step": 13143 + }, + { + 
"epoch": 1.87, + "grad_norm": 10.722893598611082, + "learning_rate": 1.0290591377879355e-07, + "loss": 0.6558, + "step": 13144 + }, + { + "epoch": 1.87, + "grad_norm": 10.915515887251331, + "learning_rate": 1.0267297286903566e-07, + "loss": 0.7133, + "step": 13145 + }, + { + "epoch": 1.87, + "grad_norm": 10.80886093299976, + "learning_rate": 1.0244029316907045e-07, + "loss": 0.6597, + "step": 13146 + }, + { + "epoch": 1.87, + "grad_norm": 8.177264722388708, + "learning_rate": 1.0220787469130688e-07, + "loss": 0.6878, + "step": 13147 + }, + { + "epoch": 1.88, + "grad_norm": 8.392100050478259, + "learning_rate": 1.0197571744814284e-07, + "loss": 0.7383, + "step": 13148 + }, + { + "epoch": 1.88, + "grad_norm": 6.594321590354224, + "learning_rate": 1.0174382145196004e-07, + "loss": 0.6898, + "step": 13149 + }, + { + "epoch": 1.88, + "grad_norm": 8.18187709418188, + "learning_rate": 1.0151218671512753e-07, + "loss": 0.6838, + "step": 13150 + }, + { + "epoch": 1.88, + "grad_norm": 8.348089056778404, + "learning_rate": 1.0128081324999983e-07, + "loss": 0.7217, + "step": 13151 + }, + { + "epoch": 1.88, + "grad_norm": 8.258849805647094, + "learning_rate": 1.0104970106891765e-07, + "loss": 0.6912, + "step": 13152 + }, + { + "epoch": 1.88, + "grad_norm": 6.675116702404015, + "learning_rate": 1.0081885018420834e-07, + "loss": 0.7501, + "step": 13153 + }, + { + "epoch": 1.88, + "grad_norm": 15.299333537638372, + "learning_rate": 1.0058826060818427e-07, + "loss": 0.723, + "step": 13154 + }, + { + "epoch": 1.88, + "grad_norm": 10.590988946547562, + "learning_rate": 1.0035793235314506e-07, + "loss": 0.7334, + "step": 13155 + }, + { + "epoch": 1.88, + "grad_norm": 10.444605670292017, + "learning_rate": 1.0012786543137588e-07, + "loss": 0.7266, + "step": 13156 + }, + { + "epoch": 1.88, + "grad_norm": 8.483442883161288, + "learning_rate": 9.989805985514689e-08, + "loss": 0.6926, + "step": 13157 + }, + { + "epoch": 1.88, + "grad_norm": 12.017086706018297, + "learning_rate": 
9.966851563671609e-08, + "loss": 0.7337, + "step": 13158 + }, + { + "epoch": 1.88, + "grad_norm": 8.999504670180261, + "learning_rate": 9.94392327883259e-08, + "loss": 0.6849, + "step": 13159 + }, + { + "epoch": 1.88, + "grad_norm": 9.65161915509385, + "learning_rate": 9.921021132220654e-08, + "loss": 0.7095, + "step": 13160 + }, + { + "epoch": 1.88, + "grad_norm": 9.47447480080051, + "learning_rate": 9.898145125057379e-08, + "loss": 0.6873, + "step": 13161 + }, + { + "epoch": 1.88, + "grad_norm": 10.667970045637064, + "learning_rate": 9.87529525856279e-08, + "loss": 0.6811, + "step": 13162 + }, + { + "epoch": 1.88, + "grad_norm": 10.659906130971347, + "learning_rate": 9.852471533955688e-08, + "loss": 0.7212, + "step": 13163 + }, + { + "epoch": 1.88, + "grad_norm": 10.835913970433925, + "learning_rate": 9.829673952453378e-08, + "loss": 0.7491, + "step": 13164 + }, + { + "epoch": 1.88, + "grad_norm": 8.817394396324039, + "learning_rate": 9.806902515271888e-08, + "loss": 0.6416, + "step": 13165 + }, + { + "epoch": 1.88, + "grad_norm": 9.119678807829278, + "learning_rate": 9.784157223625745e-08, + "loss": 0.7044, + "step": 13166 + }, + { + "epoch": 1.88, + "grad_norm": 8.036138625924547, + "learning_rate": 9.761438078728147e-08, + "loss": 0.632, + "step": 13167 + }, + { + "epoch": 1.88, + "grad_norm": 8.897393933547544, + "learning_rate": 9.738745081790846e-08, + "loss": 0.7038, + "step": 13168 + }, + { + "epoch": 1.88, + "grad_norm": 12.246338728025018, + "learning_rate": 9.716078234024262e-08, + "loss": 0.6736, + "step": 13169 + }, + { + "epoch": 1.88, + "grad_norm": 8.793963118557109, + "learning_rate": 9.693437536637318e-08, + "loss": 0.7396, + "step": 13170 + }, + { + "epoch": 1.88, + "grad_norm": 13.306743163092996, + "learning_rate": 9.670822990837659e-08, + "loss": 0.6754, + "step": 13171 + }, + { + "epoch": 1.88, + "grad_norm": 10.711767086484983, + "learning_rate": 9.648234597831429e-08, + "loss": 0.6671, + "step": 13172 + }, + { + "epoch": 1.88, + 
"grad_norm": 8.588305143740032, + "learning_rate": 9.625672358823501e-08, + "loss": 0.6921, + "step": 13173 + }, + { + "epoch": 1.88, + "grad_norm": 9.232211880341449, + "learning_rate": 9.603136275017244e-08, + "loss": 0.7168, + "step": 13174 + }, + { + "epoch": 1.88, + "grad_norm": 6.8109322152637, + "learning_rate": 9.580626347614585e-08, + "loss": 0.6925, + "step": 13175 + }, + { + "epoch": 1.88, + "grad_norm": 9.326109614476497, + "learning_rate": 9.558142577816343e-08, + "loss": 0.7338, + "step": 13176 + }, + { + "epoch": 1.88, + "grad_norm": 9.458682337523104, + "learning_rate": 9.535684966821501e-08, + "loss": 0.6729, + "step": 13177 + }, + { + "epoch": 1.88, + "grad_norm": 10.194348210784996, + "learning_rate": 9.513253515828103e-08, + "loss": 0.7664, + "step": 13178 + }, + { + "epoch": 1.88, + "grad_norm": 12.24457658362323, + "learning_rate": 9.490848226032412e-08, + "loss": 0.6716, + "step": 13179 + }, + { + "epoch": 1.88, + "grad_norm": 10.262170417364759, + "learning_rate": 9.468469098629585e-08, + "loss": 0.6316, + "step": 13180 + }, + { + "epoch": 1.88, + "grad_norm": 11.63142236099782, + "learning_rate": 9.446116134813221e-08, + "loss": 0.7886, + "step": 13181 + }, + { + "epoch": 1.88, + "grad_norm": 9.318973876764417, + "learning_rate": 9.423789335775535e-08, + "loss": 0.7099, + "step": 13182 + }, + { + "epoch": 1.88, + "grad_norm": 11.425375252488776, + "learning_rate": 9.401488702707407e-08, + "loss": 0.7063, + "step": 13183 + }, + { + "epoch": 1.88, + "grad_norm": 9.607412771419192, + "learning_rate": 9.379214236798273e-08, + "loss": 0.7239, + "step": 13184 + }, + { + "epoch": 1.88, + "grad_norm": 9.08949053345973, + "learning_rate": 9.35696593923613e-08, + "loss": 0.6649, + "step": 13185 + }, + { + "epoch": 1.88, + "grad_norm": 10.512616280175818, + "learning_rate": 9.334743811207803e-08, + "loss": 0.7201, + "step": 13186 + }, + { + "epoch": 1.88, + "grad_norm": 7.782278610793643, + "learning_rate": 9.312547853898457e-08, + "loss": 0.7754, + 
"step": 13187 + }, + { + "epoch": 1.88, + "grad_norm": 8.611859030863304, + "learning_rate": 9.290378068491924e-08, + "loss": 0.6529, + "step": 13188 + }, + { + "epoch": 1.88, + "grad_norm": 8.617145620029333, + "learning_rate": 9.26823445617081e-08, + "loss": 0.6923, + "step": 13189 + }, + { + "epoch": 1.88, + "grad_norm": 10.763316406134619, + "learning_rate": 9.246117018116008e-08, + "loss": 0.6827, + "step": 13190 + }, + { + "epoch": 1.88, + "grad_norm": 7.400487956841404, + "learning_rate": 9.224025755507349e-08, + "loss": 0.7251, + "step": 13191 + }, + { + "epoch": 1.88, + "grad_norm": 8.841715763779426, + "learning_rate": 9.201960669523057e-08, + "loss": 0.6763, + "step": 13192 + }, + { + "epoch": 1.88, + "grad_norm": 12.08434132666623, + "learning_rate": 9.179921761340083e-08, + "loss": 0.7218, + "step": 13193 + }, + { + "epoch": 1.88, + "grad_norm": 8.612907399628678, + "learning_rate": 9.157909032133872e-08, + "loss": 0.6892, + "step": 13194 + }, + { + "epoch": 1.88, + "grad_norm": 10.008797175963075, + "learning_rate": 9.135922483078485e-08, + "loss": 0.7244, + "step": 13195 + }, + { + "epoch": 1.88, + "grad_norm": 9.941242146794561, + "learning_rate": 9.113962115346652e-08, + "loss": 0.7203, + "step": 13196 + }, + { + "epoch": 1.88, + "grad_norm": 9.836302084540318, + "learning_rate": 9.092027930109715e-08, + "loss": 0.6415, + "step": 13197 + }, + { + "epoch": 1.88, + "grad_norm": 6.52585015142537, + "learning_rate": 9.070119928537513e-08, + "loss": 0.6631, + "step": 13198 + }, + { + "epoch": 1.88, + "grad_norm": 10.038946390170503, + "learning_rate": 9.048238111798668e-08, + "loss": 0.6916, + "step": 13199 + }, + { + "epoch": 1.88, + "grad_norm": 8.353730380387635, + "learning_rate": 9.026382481060192e-08, + "loss": 0.6704, + "step": 13200 + }, + { + "epoch": 1.88, + "grad_norm": 7.948831024807473, + "learning_rate": 9.004553037487873e-08, + "loss": 0.7305, + "step": 13201 + }, + { + "epoch": 1.88, + "grad_norm": 8.528578616342552, + "learning_rate": 
8.982749782245948e-08, + "loss": 0.6696, + "step": 13202 + }, + { + "epoch": 1.88, + "grad_norm": 14.052076521439313, + "learning_rate": 8.960972716497429e-08, + "loss": 0.6925, + "step": 13203 + }, + { + "epoch": 1.88, + "grad_norm": 11.952667291610629, + "learning_rate": 8.939221841403833e-08, + "loss": 0.6947, + "step": 13204 + }, + { + "epoch": 1.88, + "grad_norm": 8.250803605221414, + "learning_rate": 8.917497158125177e-08, + "loss": 0.6511, + "step": 13205 + }, + { + "epoch": 1.88, + "grad_norm": 12.284073214018719, + "learning_rate": 8.895798667820366e-08, + "loss": 0.7071, + "step": 13206 + }, + { + "epoch": 1.88, + "grad_norm": 12.268812967420782, + "learning_rate": 8.87412637164664e-08, + "loss": 0.6643, + "step": 13207 + }, + { + "epoch": 1.88, + "grad_norm": 10.827983123830034, + "learning_rate": 8.852480270759967e-08, + "loss": 0.7094, + "step": 13208 + }, + { + "epoch": 1.88, + "grad_norm": 8.702616589051303, + "learning_rate": 8.830860366314864e-08, + "loss": 0.7014, + "step": 13209 + }, + { + "epoch": 1.88, + "grad_norm": 8.98241824562203, + "learning_rate": 8.809266659464466e-08, + "loss": 0.6447, + "step": 13210 + }, + { + "epoch": 1.88, + "grad_norm": 9.861134251006744, + "learning_rate": 8.787699151360574e-08, + "loss": 0.719, + "step": 13211 + }, + { + "epoch": 1.88, + "grad_norm": 15.56702570037561, + "learning_rate": 8.766157843153488e-08, + "loss": 0.7139, + "step": 13212 + }, + { + "epoch": 1.88, + "grad_norm": 8.763574155821566, + "learning_rate": 8.744642735992237e-08, + "loss": 0.6998, + "step": 13213 + }, + { + "epoch": 1.88, + "grad_norm": 11.537104257672208, + "learning_rate": 8.723153831024289e-08, + "loss": 0.7068, + "step": 13214 + }, + { + "epoch": 1.88, + "grad_norm": 9.275308374053575, + "learning_rate": 8.701691129395783e-08, + "loss": 0.6979, + "step": 13215 + }, + { + "epoch": 1.88, + "grad_norm": 10.984273387934229, + "learning_rate": 8.680254632251639e-08, + "loss": 0.7074, + "step": 13216 + }, + { + "epoch": 1.88, + 
"grad_norm": 11.989096299739519, + "learning_rate": 8.658844340734995e-08, + "loss": 0.7346, + "step": 13217 + }, + { + "epoch": 1.89, + "grad_norm": 11.05375203786195, + "learning_rate": 8.637460255988051e-08, + "loss": 0.695, + "step": 13218 + }, + { + "epoch": 1.89, + "grad_norm": 11.396136491803414, + "learning_rate": 8.616102379151226e-08, + "loss": 0.6943, + "step": 13219 + }, + { + "epoch": 1.89, + "grad_norm": 13.531207986347384, + "learning_rate": 8.594770711363665e-08, + "loss": 0.7233, + "step": 13220 + }, + { + "epoch": 1.89, + "grad_norm": 10.404877545707704, + "learning_rate": 8.573465253763291e-08, + "loss": 0.7247, + "step": 13221 + }, + { + "epoch": 1.89, + "grad_norm": 10.77867933288153, + "learning_rate": 8.552186007486307e-08, + "loss": 0.6701, + "step": 13222 + }, + { + "epoch": 1.89, + "grad_norm": 10.958623709571068, + "learning_rate": 8.530932973667749e-08, + "loss": 0.7457, + "step": 13223 + }, + { + "epoch": 1.89, + "grad_norm": 11.351456366075535, + "learning_rate": 8.509706153441267e-08, + "loss": 0.6812, + "step": 13224 + }, + { + "epoch": 1.89, + "grad_norm": 10.12487446048286, + "learning_rate": 8.488505547939007e-08, + "loss": 0.7523, + "step": 13225 + }, + { + "epoch": 1.89, + "grad_norm": 7.191046261683946, + "learning_rate": 8.467331158291736e-08, + "loss": 0.6593, + "step": 13226 + }, + { + "epoch": 1.89, + "grad_norm": 10.18220685049374, + "learning_rate": 8.44618298562877e-08, + "loss": 0.6814, + "step": 13227 + }, + { + "epoch": 1.89, + "grad_norm": 10.938664917100674, + "learning_rate": 8.425061031078152e-08, + "loss": 0.6513, + "step": 13228 + }, + { + "epoch": 1.89, + "grad_norm": 10.94986554175674, + "learning_rate": 8.403965295766536e-08, + "loss": 0.63, + "step": 13229 + }, + { + "epoch": 1.89, + "grad_norm": 8.55442146296943, + "learning_rate": 8.382895780818968e-08, + "loss": 0.6693, + "step": 13230 + }, + { + "epoch": 1.89, + "grad_norm": 10.293879114902728, + "learning_rate": 8.361852487359379e-08, + "loss": 0.6745, 
+ "step": 13231 + }, + { + "epoch": 1.89, + "grad_norm": 9.273434851741904, + "learning_rate": 8.340835416510096e-08, + "loss": 0.6604, + "step": 13232 + }, + { + "epoch": 1.89, + "grad_norm": 10.979202064163095, + "learning_rate": 8.319844569392055e-08, + "loss": 0.6876, + "step": 13233 + }, + { + "epoch": 1.89, + "grad_norm": 8.661982038907555, + "learning_rate": 8.298879947124916e-08, + "loss": 0.6768, + "step": 13234 + }, + { + "epoch": 1.89, + "grad_norm": 12.917949078924732, + "learning_rate": 8.277941550826896e-08, + "loss": 0.6667, + "step": 13235 + }, + { + "epoch": 1.89, + "grad_norm": 10.144483157550162, + "learning_rate": 8.257029381614712e-08, + "loss": 0.746, + "step": 13236 + }, + { + "epoch": 1.89, + "grad_norm": 9.020967745142777, + "learning_rate": 8.236143440603806e-08, + "loss": 0.7013, + "step": 13237 + }, + { + "epoch": 1.89, + "grad_norm": 11.219598888442883, + "learning_rate": 8.21528372890823e-08, + "loss": 0.6708, + "step": 13238 + }, + { + "epoch": 1.89, + "grad_norm": 7.524072722590035, + "learning_rate": 8.19445024764054e-08, + "loss": 0.712, + "step": 13239 + }, + { + "epoch": 1.89, + "grad_norm": 7.489738364882174, + "learning_rate": 8.173642997911846e-08, + "loss": 0.6994, + "step": 13240 + }, + { + "epoch": 1.89, + "grad_norm": 7.270076980156626, + "learning_rate": 8.152861980832039e-08, + "loss": 0.5842, + "step": 13241 + }, + { + "epoch": 1.89, + "grad_norm": 9.529781999179233, + "learning_rate": 8.132107197509509e-08, + "loss": 0.6472, + "step": 13242 + }, + { + "epoch": 1.89, + "grad_norm": 8.998621868581456, + "learning_rate": 8.11137864905126e-08, + "loss": 0.6602, + "step": 13243 + }, + { + "epoch": 1.89, + "grad_norm": 9.360205474162731, + "learning_rate": 8.090676336562908e-08, + "loss": 0.6981, + "step": 13244 + }, + { + "epoch": 1.89, + "grad_norm": 9.170392120789023, + "learning_rate": 8.07000026114868e-08, + "loss": 0.6312, + "step": 13245 + }, + { + "epoch": 1.89, + "grad_norm": 9.791011642435722, + "learning_rate": 
8.049350423911307e-08, + "loss": 0.6504, + "step": 13246 + }, + { + "epoch": 1.89, + "grad_norm": 8.454933420337436, + "learning_rate": 8.028726825952237e-08, + "loss": 0.7334, + "step": 13247 + }, + { + "epoch": 1.89, + "grad_norm": 9.64291174432373, + "learning_rate": 8.008129468371428e-08, + "loss": 0.6107, + "step": 13248 + }, + { + "epoch": 1.89, + "grad_norm": 12.640639867901125, + "learning_rate": 7.987558352267555e-08, + "loss": 0.7332, + "step": 13249 + }, + { + "epoch": 1.89, + "grad_norm": 8.755731940106614, + "learning_rate": 7.967013478737795e-08, + "loss": 0.7594, + "step": 13250 + }, + { + "epoch": 1.89, + "grad_norm": 9.19333870771517, + "learning_rate": 7.94649484887794e-08, + "loss": 0.6978, + "step": 13251 + }, + { + "epoch": 1.89, + "grad_norm": 10.159173046154994, + "learning_rate": 7.926002463782446e-08, + "loss": 0.7138, + "step": 13252 + }, + { + "epoch": 1.89, + "grad_norm": 9.12666628255284, + "learning_rate": 7.905536324544216e-08, + "loss": 0.6973, + "step": 13253 + }, + { + "epoch": 1.89, + "grad_norm": 10.09327772705528, + "learning_rate": 7.88509643225499e-08, + "loss": 0.6495, + "step": 13254 + }, + { + "epoch": 1.89, + "grad_norm": 8.736764913274875, + "learning_rate": 7.864682788004841e-08, + "loss": 0.675, + "step": 13255 + }, + { + "epoch": 1.89, + "grad_norm": 14.053792575260996, + "learning_rate": 7.844295392882672e-08, + "loss": 0.685, + "step": 13256 + }, + { + "epoch": 1.89, + "grad_norm": 12.125729633005669, + "learning_rate": 7.82393424797595e-08, + "loss": 0.7507, + "step": 13257 + }, + { + "epoch": 1.89, + "grad_norm": 8.474730257886584, + "learning_rate": 7.803599354370473e-08, + "loss": 0.6814, + "step": 13258 + }, + { + "epoch": 1.89, + "grad_norm": 6.57848420457484, + "learning_rate": 7.783290713151037e-08, + "loss": 0.7159, + "step": 13259 + }, + { + "epoch": 1.89, + "grad_norm": 8.918656353148563, + "learning_rate": 7.763008325400723e-08, + "loss": 0.7355, + "step": 13260 + }, + { + "epoch": 1.89, + "grad_norm": 
11.422917984353292, + "learning_rate": 7.742752192201386e-08, + "loss": 0.6351, + "step": 13261 + }, + { + "epoch": 1.89, + "grad_norm": 10.199228131540773, + "learning_rate": 7.722522314633495e-08, + "loss": 0.6426, + "step": 13262 + }, + { + "epoch": 1.89, + "grad_norm": 11.049836384983381, + "learning_rate": 7.702318693776024e-08, + "loss": 0.6598, + "step": 13263 + }, + { + "epoch": 1.89, + "grad_norm": 7.576798311535204, + "learning_rate": 7.68214133070655e-08, + "loss": 0.7186, + "step": 13264 + }, + { + "epoch": 1.89, + "grad_norm": 10.833212209932794, + "learning_rate": 7.661990226501215e-08, + "loss": 0.6365, + "step": 13265 + }, + { + "epoch": 1.89, + "grad_norm": 8.616518092587741, + "learning_rate": 7.641865382234881e-08, + "loss": 0.6409, + "step": 13266 + }, + { + "epoch": 1.89, + "grad_norm": 12.34343663985742, + "learning_rate": 7.621766798981023e-08, + "loss": 0.7023, + "step": 13267 + }, + { + "epoch": 1.89, + "grad_norm": 10.128628109266566, + "learning_rate": 7.601694477811561e-08, + "loss": 0.7216, + "step": 13268 + }, + { + "epoch": 1.89, + "grad_norm": 7.48566067566551, + "learning_rate": 7.581648419797083e-08, + "loss": 0.7076, + "step": 13269 + }, + { + "epoch": 1.89, + "grad_norm": 8.10833609381694, + "learning_rate": 7.561628626006901e-08, + "loss": 0.6561, + "step": 13270 + }, + { + "epoch": 1.89, + "grad_norm": 9.237320992264038, + "learning_rate": 7.54163509750866e-08, + "loss": 0.7491, + "step": 13271 + }, + { + "epoch": 1.89, + "grad_norm": 15.251950054852903, + "learning_rate": 7.521667835368896e-08, + "loss": 0.6321, + "step": 13272 + }, + { + "epoch": 1.89, + "grad_norm": 7.547242224807358, + "learning_rate": 7.501726840652479e-08, + "loss": 0.753, + "step": 13273 + }, + { + "epoch": 1.89, + "grad_norm": 12.682032877156027, + "learning_rate": 7.481812114423059e-08, + "loss": 0.7281, + "step": 13274 + }, + { + "epoch": 1.89, + "grad_norm": 7.511168871260427, + "learning_rate": 7.461923657742953e-08, + "loss": 0.7019, + "step": 
13275 + }, + { + "epoch": 1.89, + "grad_norm": 8.503594963405465, + "learning_rate": 7.442061471672757e-08, + "loss": 0.6857, + "step": 13276 + }, + { + "epoch": 1.89, + "grad_norm": 11.00753564727359, + "learning_rate": 7.422225557272011e-08, + "loss": 0.6948, + "step": 13277 + }, + { + "epoch": 1.89, + "grad_norm": 8.50800503464424, + "learning_rate": 7.402415915598649e-08, + "loss": 0.7355, + "step": 13278 + }, + { + "epoch": 1.89, + "grad_norm": 12.628091042032, + "learning_rate": 7.382632547709212e-08, + "loss": 0.6923, + "step": 13279 + }, + { + "epoch": 1.89, + "grad_norm": 12.352028491778805, + "learning_rate": 7.362875454659024e-08, + "loss": 0.7141, + "step": 13280 + }, + { + "epoch": 1.89, + "grad_norm": 12.017322158579038, + "learning_rate": 7.343144637501743e-08, + "loss": 0.6829, + "step": 13281 + }, + { + "epoch": 1.89, + "grad_norm": 8.620094304964919, + "learning_rate": 7.323440097289802e-08, + "loss": 0.7342, + "step": 13282 + }, + { + "epoch": 1.89, + "grad_norm": 9.131098899861561, + "learning_rate": 7.303761835074253e-08, + "loss": 0.7276, + "step": 13283 + }, + { + "epoch": 1.89, + "grad_norm": 10.73894280014529, + "learning_rate": 7.284109851904531e-08, + "loss": 0.7509, + "step": 13284 + }, + { + "epoch": 1.89, + "grad_norm": 8.555270767781378, + "learning_rate": 7.264484148828966e-08, + "loss": 0.696, + "step": 13285 + }, + { + "epoch": 1.89, + "grad_norm": 62.283558419975165, + "learning_rate": 7.244884726894274e-08, + "loss": 0.8302, + "step": 13286 + }, + { + "epoch": 1.89, + "grad_norm": 8.633075914683436, + "learning_rate": 7.225311587145845e-08, + "loss": 0.7142, + "step": 13287 + }, + { + "epoch": 1.9, + "grad_norm": 8.655828790813134, + "learning_rate": 7.205764730627618e-08, + "loss": 0.7261, + "step": 13288 + }, + { + "epoch": 1.9, + "grad_norm": 8.183927994129624, + "learning_rate": 7.186244158382205e-08, + "loss": 0.6696, + "step": 13289 + }, + { + "epoch": 1.9, + "grad_norm": 6.7354668319360576, + "learning_rate": 
7.166749871450773e-08, + "loss": 0.6569, + "step": 13290 + }, + { + "epoch": 1.9, + "grad_norm": 11.749631714590565, + "learning_rate": 7.147281870873102e-08, + "loss": 0.7106, + "step": 13291 + }, + { + "epoch": 1.9, + "grad_norm": 9.214066119452177, + "learning_rate": 7.127840157687582e-08, + "loss": 0.6043, + "step": 13292 + }, + { + "epoch": 1.9, + "grad_norm": 10.284139709362803, + "learning_rate": 7.108424732931051e-08, + "loss": 0.707, + "step": 13293 + }, + { + "epoch": 1.9, + "grad_norm": 9.91330939627666, + "learning_rate": 7.089035597639237e-08, + "loss": 0.7272, + "step": 13294 + }, + { + "epoch": 1.9, + "grad_norm": 7.029350164811774, + "learning_rate": 7.069672752846258e-08, + "loss": 0.7094, + "step": 13295 + }, + { + "epoch": 1.9, + "grad_norm": 8.654990297107075, + "learning_rate": 7.05033619958484e-08, + "loss": 0.6852, + "step": 13296 + }, + { + "epoch": 1.9, + "grad_norm": 9.03527249809394, + "learning_rate": 7.031025938886327e-08, + "loss": 0.6484, + "step": 13297 + }, + { + "epoch": 1.9, + "grad_norm": 10.018216122551452, + "learning_rate": 7.011741971780727e-08, + "loss": 0.6939, + "step": 13298 + }, + { + "epoch": 1.9, + "grad_norm": 8.153519050175497, + "learning_rate": 6.992484299296554e-08, + "loss": 0.7026, + "step": 13299 + }, + { + "epoch": 1.9, + "grad_norm": 7.947112564101022, + "learning_rate": 6.973252922460982e-08, + "loss": 0.6836, + "step": 13300 + }, + { + "epoch": 1.9, + "grad_norm": 14.572467904440655, + "learning_rate": 6.95404784229975e-08, + "loss": 0.6855, + "step": 13301 + }, + { + "epoch": 1.9, + "grad_norm": 8.082579999646063, + "learning_rate": 6.934869059837202e-08, + "loss": 0.6899, + "step": 13302 + }, + { + "epoch": 1.9, + "grad_norm": 9.85390784897303, + "learning_rate": 6.915716576096243e-08, + "loss": 0.6722, + "step": 13303 + }, + { + "epoch": 1.9, + "grad_norm": 8.270750530038251, + "learning_rate": 6.896590392098501e-08, + "loss": 0.7075, + "step": 13304 + }, + { + "epoch": 1.9, + "grad_norm": 
8.785908638585138, + "learning_rate": 6.877490508864048e-08, + "loss": 0.7311, + "step": 13305 + }, + { + "epoch": 1.9, + "grad_norm": 9.594590565825408, + "learning_rate": 6.858416927411626e-08, + "loss": 0.6861, + "step": 13306 + }, + { + "epoch": 1.9, + "grad_norm": 14.433305109004293, + "learning_rate": 6.839369648758587e-08, + "loss": 0.7062, + "step": 13307 + }, + { + "epoch": 1.9, + "grad_norm": 6.8925233487556135, + "learning_rate": 6.820348673920896e-08, + "loss": 0.6959, + "step": 13308 + }, + { + "epoch": 1.9, + "grad_norm": 7.859393552649591, + "learning_rate": 6.801354003912963e-08, + "loss": 0.7296, + "step": 13309 + }, + { + "epoch": 1.9, + "grad_norm": 9.90892640596461, + "learning_rate": 6.782385639748034e-08, + "loss": 0.7648, + "step": 13310 + }, + { + "epoch": 1.9, + "grad_norm": 11.22840209309838, + "learning_rate": 6.763443582437689e-08, + "loss": 0.6961, + "step": 13311 + }, + { + "epoch": 1.9, + "grad_norm": 10.2237731123025, + "learning_rate": 6.744527832992397e-08, + "loss": 0.7041, + "step": 13312 + }, + { + "epoch": 1.9, + "grad_norm": 14.716638682995397, + "learning_rate": 6.725638392421019e-08, + "loss": 0.6646, + "step": 13313 + }, + { + "epoch": 1.9, + "grad_norm": 13.406076808611338, + "learning_rate": 6.706775261731024e-08, + "loss": 0.6888, + "step": 13314 + }, + { + "epoch": 1.9, + "grad_norm": 9.154400876334138, + "learning_rate": 6.687938441928555e-08, + "loss": 0.7434, + "step": 13315 + }, + { + "epoch": 1.9, + "grad_norm": 10.583141053034801, + "learning_rate": 6.669127934018249e-08, + "loss": 0.6955, + "step": 13316 + }, + { + "epoch": 1.9, + "grad_norm": 8.039896504576296, + "learning_rate": 6.650343739003528e-08, + "loss": 0.7377, + "step": 13317 + }, + { + "epoch": 1.9, + "grad_norm": 8.467709120338274, + "learning_rate": 6.631585857886202e-08, + "loss": 0.6706, + "step": 13318 + }, + { + "epoch": 1.9, + "grad_norm": 8.371017236533477, + "learning_rate": 6.612854291666803e-08, + "loss": 0.7556, + "step": 13319 + }, + { + 
"epoch": 1.9, + "grad_norm": 10.094321617627493, + "learning_rate": 6.594149041344366e-08, + "loss": 0.6387, + "step": 13320 + }, + { + "epoch": 1.9, + "grad_norm": 10.238316251932076, + "learning_rate": 6.575470107916649e-08, + "loss": 0.7534, + "step": 13321 + }, + { + "epoch": 1.9, + "grad_norm": 10.921528554301641, + "learning_rate": 6.556817492379907e-08, + "loss": 0.6879, + "step": 13322 + }, + { + "epoch": 1.9, + "grad_norm": 8.170753737199107, + "learning_rate": 6.538191195729016e-08, + "loss": 0.6979, + "step": 13323 + }, + { + "epoch": 1.9, + "grad_norm": 5.996342575804034, + "learning_rate": 6.519591218957399e-08, + "loss": 0.7241, + "step": 13324 + }, + { + "epoch": 1.9, + "grad_norm": 9.441769257293066, + "learning_rate": 6.50101756305721e-08, + "loss": 0.6915, + "step": 13325 + }, + { + "epoch": 1.9, + "grad_norm": 10.990630834896601, + "learning_rate": 6.482470229019045e-08, + "loss": 0.7722, + "step": 13326 + }, + { + "epoch": 1.9, + "grad_norm": 11.312851932980728, + "learning_rate": 6.463949217832222e-08, + "loss": 0.6906, + "step": 13327 + }, + { + "epoch": 1.9, + "grad_norm": 8.965089447549284, + "learning_rate": 6.445454530484619e-08, + "loss": 0.6927, + "step": 13328 + }, + { + "epoch": 1.9, + "grad_norm": 10.39849830099332, + "learning_rate": 6.426986167962613e-08, + "loss": 0.6384, + "step": 13329 + }, + { + "epoch": 1.9, + "grad_norm": 11.362465422800348, + "learning_rate": 6.408544131251304e-08, + "loss": 0.6588, + "step": 13330 + }, + { + "epoch": 1.9, + "grad_norm": 9.493113784135224, + "learning_rate": 6.390128421334296e-08, + "loss": 0.6916, + "step": 13331 + }, + { + "epoch": 1.9, + "grad_norm": 8.823258424423615, + "learning_rate": 6.371739039193914e-08, + "loss": 0.6865, + "step": 13332 + }, + { + "epoch": 1.9, + "grad_norm": 8.930776374320812, + "learning_rate": 6.353375985810928e-08, + "loss": 0.7189, + "step": 13333 + }, + { + "epoch": 1.9, + "grad_norm": 10.126701097722837, + "learning_rate": 6.335039262164833e-08, + "loss": 
0.7611, + "step": 13334 + }, + { + "epoch": 1.9, + "grad_norm": 9.916983234995703, + "learning_rate": 6.316728869233568e-08, + "loss": 0.7189, + "step": 13335 + }, + { + "epoch": 1.9, + "grad_norm": 11.174876783947727, + "learning_rate": 6.298444807993798e-08, + "loss": 0.7296, + "step": 13336 + }, + { + "epoch": 1.9, + "grad_norm": 10.799233392601513, + "learning_rate": 6.280187079420796e-08, + "loss": 0.7188, + "step": 13337 + }, + { + "epoch": 1.9, + "grad_norm": 7.699480853244411, + "learning_rate": 6.261955684488341e-08, + "loss": 0.7572, + "step": 13338 + }, + { + "epoch": 1.9, + "grad_norm": 8.27162099229174, + "learning_rate": 6.243750624168766e-08, + "loss": 0.6855, + "step": 13339 + }, + { + "epoch": 1.9, + "grad_norm": 8.221680356220334, + "learning_rate": 6.225571899433236e-08, + "loss": 0.8023, + "step": 13340 + }, + { + "epoch": 1.9, + "grad_norm": 9.14161534604302, + "learning_rate": 6.207419511251256e-08, + "loss": 0.7123, + "step": 13341 + }, + { + "epoch": 1.9, + "grad_norm": 7.703735686252929, + "learning_rate": 6.189293460590995e-08, + "loss": 0.6745, + "step": 13342 + }, + { + "epoch": 1.9, + "grad_norm": 9.543217863592815, + "learning_rate": 6.171193748419347e-08, + "loss": 0.6993, + "step": 13343 + }, + { + "epoch": 1.9, + "grad_norm": 9.994902638921962, + "learning_rate": 6.153120375701593e-08, + "loss": 0.675, + "step": 13344 + }, + { + "epoch": 1.9, + "grad_norm": 11.788191333841876, + "learning_rate": 6.135073343401799e-08, + "loss": 0.6389, + "step": 13345 + }, + { + "epoch": 1.9, + "grad_norm": 12.113343928994366, + "learning_rate": 6.11705265248247e-08, + "loss": 0.7368, + "step": 13346 + }, + { + "epoch": 1.9, + "grad_norm": 9.618187657111005, + "learning_rate": 6.099058303904837e-08, + "loss": 0.755, + "step": 13347 + }, + { + "epoch": 1.9, + "grad_norm": 8.915783075692643, + "learning_rate": 6.081090298628689e-08, + "loss": 0.7695, + "step": 13348 + }, + { + "epoch": 1.9, + "grad_norm": 8.371130611476865, + "learning_rate": 
6.063148637612315e-08, + "loss": 0.6914, + "step": 13349 + }, + { + "epoch": 1.9, + "grad_norm": 10.032603686386631, + "learning_rate": 6.045233321812727e-08, + "loss": 0.7212, + "step": 13350 + }, + { + "epoch": 1.9, + "grad_norm": 10.453849018367006, + "learning_rate": 6.027344352185438e-08, + "loss": 0.7157, + "step": 13351 + }, + { + "epoch": 1.9, + "grad_norm": 10.439170072554793, + "learning_rate": 6.00948172968463e-08, + "loss": 0.7008, + "step": 13352 + }, + { + "epoch": 1.9, + "grad_norm": 9.129879334633026, + "learning_rate": 5.99164545526304e-08, + "loss": 0.7333, + "step": 13353 + }, + { + "epoch": 1.9, + "grad_norm": 9.462280197195227, + "learning_rate": 5.973835529872019e-08, + "loss": 0.6625, + "step": 13354 + }, + { + "epoch": 1.9, + "grad_norm": 10.651497012980753, + "learning_rate": 5.9560519544614725e-08, + "loss": 0.7009, + "step": 13355 + }, + { + "epoch": 1.9, + "grad_norm": 11.220929529771105, + "learning_rate": 5.938294729979865e-08, + "loss": 0.6819, + "step": 13356 + }, + { + "epoch": 1.9, + "grad_norm": 9.886846512307535, + "learning_rate": 5.9205638573744395e-08, + "loss": 0.6494, + "step": 13357 + }, + { + "epoch": 1.91, + "grad_norm": 9.387910740200102, + "learning_rate": 5.9028593375908826e-08, + "loss": 0.6861, + "step": 13358 + }, + { + "epoch": 1.91, + "grad_norm": 7.963059147711026, + "learning_rate": 5.8851811715734396e-08, + "loss": 0.751, + "step": 13359 + }, + { + "epoch": 1.91, + "grad_norm": 9.042529897103098, + "learning_rate": 5.8675293602650785e-08, + "loss": 0.7303, + "step": 13360 + }, + { + "epoch": 1.91, + "grad_norm": 9.816022383713957, + "learning_rate": 5.849903904607268e-08, + "loss": 0.7663, + "step": 13361 + }, + { + "epoch": 1.91, + "grad_norm": 9.322662446433975, + "learning_rate": 5.83230480554009e-08, + "loss": 0.597, + "step": 13362 + }, + { + "epoch": 1.91, + "grad_norm": 9.199888871382889, + "learning_rate": 5.814732064002293e-08, + "loss": 0.6592, + "step": 13363 + }, + { + "epoch": 1.91, + "grad_norm": 
10.337484838777424, + "learning_rate": 5.7971856809310724e-08, + "loss": 0.6741, + "step": 13364 + }, + { + "epoch": 1.91, + "grad_norm": 7.282689024354831, + "learning_rate": 5.7796656572623456e-08, + "loss": 0.7127, + "step": 13365 + }, + { + "epoch": 1.91, + "grad_norm": 11.338983488311907, + "learning_rate": 5.762171993930643e-08, + "loss": 0.6691, + "step": 13366 + }, + { + "epoch": 1.91, + "grad_norm": 8.002592430185729, + "learning_rate": 5.744704691868885e-08, + "loss": 0.6802, + "step": 13367 + }, + { + "epoch": 1.91, + "grad_norm": 7.194537293630451, + "learning_rate": 5.7272637520088824e-08, + "loss": 0.6658, + "step": 13368 + }, + { + "epoch": 1.91, + "grad_norm": 10.888974125194254, + "learning_rate": 5.7098491752807236e-08, + "loss": 0.7769, + "step": 13369 + }, + { + "epoch": 1.91, + "grad_norm": 11.219955060397297, + "learning_rate": 5.692460962613389e-08, + "loss": 0.6413, + "step": 13370 + }, + { + "epoch": 1.91, + "grad_norm": 9.092341051483766, + "learning_rate": 5.6750991149343016e-08, + "loss": 0.6827, + "step": 13371 + }, + { + "epoch": 1.91, + "grad_norm": 9.482131342489682, + "learning_rate": 5.657763633169444e-08, + "loss": 0.6941, + "step": 13372 + }, + { + "epoch": 1.91, + "grad_norm": 9.29651650671851, + "learning_rate": 5.6404545182434656e-08, + "loss": 0.694, + "step": 13373 + }, + { + "epoch": 1.91, + "grad_norm": 8.400506227433894, + "learning_rate": 5.623171771079572e-08, + "loss": 0.7559, + "step": 13374 + }, + { + "epoch": 1.91, + "grad_norm": 14.920420805341516, + "learning_rate": 5.605915392599581e-08, + "loss": 0.6752, + "step": 13375 + }, + { + "epoch": 1.91, + "grad_norm": 9.879496454160115, + "learning_rate": 5.588685383723924e-08, + "loss": 0.7058, + "step": 13376 + }, + { + "epoch": 1.91, + "grad_norm": 11.623012935909465, + "learning_rate": 5.571481745371532e-08, + "loss": 0.6623, + "step": 13377 + }, + { + "epoch": 1.91, + "grad_norm": 11.813347257367035, + "learning_rate": 5.554304478460115e-08, + "loss": 0.7134, + 
"step": 13378 + }, + { + "epoch": 1.91, + "grad_norm": 9.631967963747822, + "learning_rate": 5.53715358390583e-08, + "loss": 0.693, + "step": 13379 + }, + { + "epoch": 1.91, + "grad_norm": 8.650756247822395, + "learning_rate": 5.520029062623333e-08, + "loss": 0.7098, + "step": 13380 + }, + { + "epoch": 1.91, + "grad_norm": 10.028960862034113, + "learning_rate": 5.502930915526172e-08, + "loss": 0.7426, + "step": 13381 + }, + { + "epoch": 1.91, + "grad_norm": 12.556108633676802, + "learning_rate": 5.485859143526173e-08, + "loss": 0.7209, + "step": 13382 + }, + { + "epoch": 1.91, + "grad_norm": 8.300590333005282, + "learning_rate": 5.468813747533996e-08, + "loss": 0.6954, + "step": 13383 + }, + { + "epoch": 1.91, + "grad_norm": 7.831014929369137, + "learning_rate": 5.451794728458748e-08, + "loss": 0.6926, + "step": 13384 + }, + { + "epoch": 1.91, + "grad_norm": 10.355185968483669, + "learning_rate": 5.434802087208202e-08, + "loss": 0.7585, + "step": 13385 + }, + { + "epoch": 1.91, + "grad_norm": 11.613498395770423, + "learning_rate": 5.417835824688689e-08, + "loss": 0.7257, + "step": 13386 + }, + { + "epoch": 1.91, + "grad_norm": 11.526613358972591, + "learning_rate": 5.4008959418050955e-08, + "loss": 0.6587, + "step": 13387 + }, + { + "epoch": 1.91, + "grad_norm": 10.787188912259582, + "learning_rate": 5.3839824394609775e-08, + "loss": 0.7314, + "step": 13388 + }, + { + "epoch": 1.91, + "grad_norm": 13.093469021601095, + "learning_rate": 5.367095318558502e-08, + "loss": 0.7563, + "step": 13389 + }, + { + "epoch": 1.91, + "grad_norm": 7.049558456916005, + "learning_rate": 5.350234579998337e-08, + "loss": 0.65, + "step": 13390 + }, + { + "epoch": 1.91, + "grad_norm": 10.135045694402102, + "learning_rate": 5.3334002246797634e-08, + "loss": 0.6772, + "step": 13391 + }, + { + "epoch": 1.91, + "grad_norm": 13.142840571600063, + "learning_rate": 5.3165922535007294e-08, + "loss": 0.762, + "step": 13392 + }, + { + "epoch": 1.91, + "grad_norm": 9.01490652748971, + 
"learning_rate": 5.2998106673576855e-08, + "loss": 0.6651, + "step": 13393 + }, + { + "epoch": 1.91, + "grad_norm": 8.253098856358767, + "learning_rate": 5.283055467145748e-08, + "loss": 0.7318, + "step": 13394 + }, + { + "epoch": 1.91, + "grad_norm": 9.776581116684545, + "learning_rate": 5.2663266537585354e-08, + "loss": 0.6453, + "step": 13395 + }, + { + "epoch": 1.91, + "grad_norm": 8.7357362640026, + "learning_rate": 5.249624228088446e-08, + "loss": 0.6912, + "step": 13396 + }, + { + "epoch": 1.91, + "grad_norm": 11.932180708678965, + "learning_rate": 5.23294819102621e-08, + "loss": 0.7103, + "step": 13397 + }, + { + "epoch": 1.91, + "grad_norm": 11.251743453750226, + "learning_rate": 5.21629854346134e-08, + "loss": 0.6631, + "step": 13398 + }, + { + "epoch": 1.91, + "grad_norm": 7.840844987143481, + "learning_rate": 5.1996752862818464e-08, + "loss": 0.7139, + "step": 13399 + }, + { + "epoch": 1.91, + "grad_norm": 9.053313285769441, + "learning_rate": 5.1830784203743545e-08, + "loss": 0.6826, + "step": 13400 + }, + { + "epoch": 1.91, + "grad_norm": 10.58607053577719, + "learning_rate": 5.166507946624155e-08, + "loss": 0.671, + "step": 13401 + }, + { + "epoch": 1.91, + "grad_norm": 12.289138084268872, + "learning_rate": 5.149963865915042e-08, + "loss": 0.6862, + "step": 13402 + }, + { + "epoch": 1.91, + "grad_norm": 12.001200447005866, + "learning_rate": 5.133446179129475e-08, + "loss": 0.7296, + "step": 13403 + }, + { + "epoch": 1.91, + "grad_norm": 12.036466374127691, + "learning_rate": 5.116954887148362e-08, + "loss": 0.6938, + "step": 13404 + }, + { + "epoch": 1.91, + "grad_norm": 8.429773263922002, + "learning_rate": 5.100489990851387e-08, + "loss": 0.6254, + "step": 13405 + }, + { + "epoch": 1.91, + "grad_norm": 13.724147640327223, + "learning_rate": 5.084051491116737e-08, + "loss": 0.7, + "step": 13406 + }, + { + "epoch": 1.91, + "grad_norm": 12.480441992326941, + "learning_rate": 5.0676393888211e-08, + "loss": 0.6672, + "step": 13407 + }, + { + "epoch": 
1.91, + "grad_norm": 10.948569918454933, + "learning_rate": 5.0512536848399405e-08, + "loss": 0.7359, + "step": 13408 + }, + { + "epoch": 1.91, + "grad_norm": 6.943542060539521, + "learning_rate": 5.034894380047228e-08, + "loss": 0.7118, + "step": 13409 + }, + { + "epoch": 1.91, + "grad_norm": 12.712185574069492, + "learning_rate": 5.018561475315542e-08, + "loss": 0.6767, + "step": 13410 + }, + { + "epoch": 1.91, + "grad_norm": 10.615130802859346, + "learning_rate": 5.002254971515963e-08, + "loss": 0.7434, + "step": 13411 + }, + { + "epoch": 1.91, + "grad_norm": 10.261327345963927, + "learning_rate": 4.985974869518295e-08, + "loss": 0.6493, + "step": 13412 + }, + { + "epoch": 1.91, + "grad_norm": 8.909745715465531, + "learning_rate": 4.969721170190789e-08, + "loss": 0.7709, + "step": 13413 + }, + { + "epoch": 1.91, + "grad_norm": 7.756648410316039, + "learning_rate": 4.9534938744004723e-08, + "loss": 0.7028, + "step": 13414 + }, + { + "epoch": 1.91, + "grad_norm": 7.523461711353824, + "learning_rate": 4.93729298301282e-08, + "loss": 0.7285, + "step": 13415 + }, + { + "epoch": 1.91, + "grad_norm": 12.331651429897532, + "learning_rate": 4.921118496891919e-08, + "loss": 0.6588, + "step": 13416 + }, + { + "epoch": 1.91, + "grad_norm": 7.986472730280474, + "learning_rate": 4.904970416900523e-08, + "loss": 0.6964, + "step": 13417 + }, + { + "epoch": 1.91, + "grad_norm": 12.337189351079084, + "learning_rate": 4.888848743899888e-08, + "loss": 0.7541, + "step": 13418 + }, + { + "epoch": 1.91, + "grad_norm": 6.102923500731213, + "learning_rate": 4.8727534787499364e-08, + "loss": 0.7091, + "step": 13419 + }, + { + "epoch": 1.91, + "grad_norm": 9.273889050110697, + "learning_rate": 4.856684622309038e-08, + "loss": 0.6902, + "step": 13420 + }, + { + "epoch": 1.91, + "grad_norm": 8.929857753230328, + "learning_rate": 4.840642175434451e-08, + "loss": 0.7009, + "step": 13421 + }, + { + "epoch": 1.91, + "grad_norm": 10.291893395395313, + "learning_rate": 4.824626138981603e-08, + 
"loss": 0.7405, + "step": 13422 + }, + { + "epoch": 1.91, + "grad_norm": 10.684281404651971, + "learning_rate": 4.808636513804976e-08, + "loss": 0.6429, + "step": 13423 + }, + { + "epoch": 1.91, + "grad_norm": 11.726153272748101, + "learning_rate": 4.792673300757278e-08, + "loss": 0.6685, + "step": 13424 + }, + { + "epoch": 1.91, + "grad_norm": 10.155718940275383, + "learning_rate": 4.776736500689938e-08, + "loss": 0.6727, + "step": 13425 + }, + { + "epoch": 1.91, + "grad_norm": 11.889843634221139, + "learning_rate": 4.7608261144529985e-08, + "loss": 0.7269, + "step": 13426 + }, + { + "epoch": 1.91, + "grad_norm": 8.09914387440693, + "learning_rate": 4.7449421428951145e-08, + "loss": 0.7032, + "step": 13427 + }, + { + "epoch": 1.92, + "grad_norm": 11.898799527234312, + "learning_rate": 4.7290845868634415e-08, + "loss": 0.6567, + "step": 13428 + }, + { + "epoch": 1.92, + "grad_norm": 8.022452755334424, + "learning_rate": 4.7132534472038584e-08, + "loss": 0.7209, + "step": 13429 + }, + { + "epoch": 1.92, + "grad_norm": 11.340469140300806, + "learning_rate": 4.697448724760634e-08, + "loss": 0.6622, + "step": 13430 + }, + { + "epoch": 1.92, + "grad_norm": 10.03665572987247, + "learning_rate": 4.681670420376816e-08, + "loss": 0.6479, + "step": 13431 + }, + { + "epoch": 1.92, + "grad_norm": 7.962688843835059, + "learning_rate": 4.66591853489401e-08, + "loss": 0.686, + "step": 13432 + }, + { + "epoch": 1.92, + "grad_norm": 14.148362724384976, + "learning_rate": 4.650193069152265e-08, + "loss": 0.6966, + "step": 13433 + }, + { + "epoch": 1.92, + "grad_norm": 12.4349184367677, + "learning_rate": 4.6344940239904655e-08, + "loss": 0.7329, + "step": 13434 + }, + { + "epoch": 1.92, + "grad_norm": 6.615970720535052, + "learning_rate": 4.6188214002458876e-08, + "loss": 0.682, + "step": 13435 + }, + { + "epoch": 1.92, + "grad_norm": 8.428878181515575, + "learning_rate": 4.603175198754528e-08, + "loss": 0.699, + "step": 13436 + }, + { + "epoch": 1.92, + "grad_norm": 
12.207194366061444, + "learning_rate": 4.5875554203507755e-08, + "loss": 0.6629, + "step": 13437 + }, + { + "epoch": 1.92, + "grad_norm": 10.21163600642097, + "learning_rate": 4.5719620658677966e-08, + "loss": 0.6675, + "step": 13438 + }, + { + "epoch": 1.92, + "grad_norm": 10.0677340712007, + "learning_rate": 4.5563951361374257e-08, + "loss": 0.646, + "step": 13439 + }, + { + "epoch": 1.92, + "grad_norm": 9.69878021527171, + "learning_rate": 4.5408546319897776e-08, + "loss": 0.6444, + "step": 13440 + }, + { + "epoch": 1.92, + "grad_norm": 10.791231486628721, + "learning_rate": 4.5253405542539097e-08, + "loss": 0.6569, + "step": 13441 + }, + { + "epoch": 1.92, + "grad_norm": 7.7965612566815325, + "learning_rate": 4.509852903757161e-08, + "loss": 0.6272, + "step": 13442 + }, + { + "epoch": 1.92, + "grad_norm": 8.71835842566986, + "learning_rate": 4.494391681325649e-08, + "loss": 0.7189, + "step": 13443 + }, + { + "epoch": 1.92, + "grad_norm": 8.44425361140045, + "learning_rate": 4.478956887784047e-08, + "loss": 0.6836, + "step": 13444 + }, + { + "epoch": 1.92, + "grad_norm": 8.439626937865235, + "learning_rate": 4.4635485239555854e-08, + "loss": 0.7008, + "step": 13445 + }, + { + "epoch": 1.92, + "grad_norm": 8.592144801180682, + "learning_rate": 4.448166590662106e-08, + "loss": 0.7609, + "step": 13446 + }, + { + "epoch": 1.92, + "grad_norm": 9.677218354805083, + "learning_rate": 4.432811088724065e-08, + "loss": 0.7383, + "step": 13447 + }, + { + "epoch": 1.92, + "grad_norm": 11.042055769758628, + "learning_rate": 4.4174820189604175e-08, + "loss": 0.6647, + "step": 13448 + }, + { + "epoch": 1.92, + "grad_norm": 7.785220397066407, + "learning_rate": 4.402179382188898e-08, + "loss": 0.6471, + "step": 13449 + }, + { + "epoch": 1.92, + "grad_norm": 10.479161067635937, + "learning_rate": 4.3869031792255215e-08, + "loss": 0.6862, + "step": 13450 + }, + { + "epoch": 1.92, + "grad_norm": 11.21121756398741, + "learning_rate": 4.371653410885246e-08, + "loss": 0.6036, + 
"step": 13451 + }, + { + "epoch": 1.92, + "grad_norm": 8.905442432947778, + "learning_rate": 4.3564300779813665e-08, + "loss": 0.7179, + "step": 13452 + }, + { + "epoch": 1.92, + "grad_norm": 8.481316415594328, + "learning_rate": 4.3412331813258436e-08, + "loss": 0.7171, + "step": 13453 + }, + { + "epoch": 1.92, + "grad_norm": 11.16597025077032, + "learning_rate": 4.326062721729307e-08, + "loss": 0.6971, + "step": 13454 + }, + { + "epoch": 1.92, + "grad_norm": 9.798123302491659, + "learning_rate": 4.3109187000008876e-08, + "loss": 0.6819, + "step": 13455 + }, + { + "epoch": 1.92, + "grad_norm": 11.038358008984664, + "learning_rate": 4.295801116948273e-08, + "loss": 0.6715, + "step": 13456 + }, + { + "epoch": 1.92, + "grad_norm": 9.54231663555811, + "learning_rate": 4.2807099733778166e-08, + "loss": 0.7209, + "step": 13457 + }, + { + "epoch": 1.92, + "grad_norm": 8.890523909293028, + "learning_rate": 4.265645270094487e-08, + "loss": 0.7055, + "step": 13458 + }, + { + "epoch": 1.92, + "grad_norm": 7.972762773586624, + "learning_rate": 4.250607007901697e-08, + "loss": 0.7168, + "step": 13459 + }, + { + "epoch": 1.92, + "grad_norm": 12.273280640748174, + "learning_rate": 4.2355951876016376e-08, + "loss": 0.7792, + "step": 13460 + }, + { + "epoch": 1.92, + "grad_norm": 9.564396485807675, + "learning_rate": 4.220609809994946e-08, + "loss": 0.704, + "step": 13461 + }, + { + "epoch": 1.92, + "grad_norm": 10.158930194258742, + "learning_rate": 4.205650875880929e-08, + "loss": 0.661, + "step": 13462 + }, + { + "epoch": 1.92, + "grad_norm": 7.640761002952409, + "learning_rate": 4.1907183860574464e-08, + "loss": 0.6629, + "step": 13463 + }, + { + "epoch": 1.92, + "grad_norm": 12.126960712474991, + "learning_rate": 4.175812341320917e-08, + "loss": 0.6998, + "step": 13464 + }, + { + "epoch": 1.92, + "grad_norm": 10.139381075989064, + "learning_rate": 4.160932742466428e-08, + "loss": 0.6613, + "step": 13465 + }, + { + "epoch": 1.92, + "grad_norm": 10.421489870546576, + 
"learning_rate": 4.1460795902876215e-08, + "loss": 0.6691, + "step": 13466 + }, + { + "epoch": 1.92, + "grad_norm": 11.140788942497748, + "learning_rate": 4.1312528855767534e-08, + "loss": 0.6919, + "step": 13467 + }, + { + "epoch": 1.92, + "grad_norm": 8.227223103311472, + "learning_rate": 4.1164526291245787e-08, + "loss": 0.7314, + "step": 13468 + }, + { + "epoch": 1.92, + "grad_norm": 10.741074486444193, + "learning_rate": 4.101678821720467e-08, + "loss": 0.7448, + "step": 13469 + }, + { + "epoch": 1.92, + "grad_norm": 8.223730021297055, + "learning_rate": 4.0869314641525107e-08, + "loss": 0.723, + "step": 13470 + }, + { + "epoch": 1.92, + "grad_norm": 7.899305077208435, + "learning_rate": 4.07221055720719e-08, + "loss": 0.6633, + "step": 13471 + }, + { + "epoch": 1.92, + "grad_norm": 13.665087866677249, + "learning_rate": 4.057516101669823e-08, + "loss": 0.7371, + "step": 13472 + }, + { + "epoch": 1.92, + "grad_norm": 10.161815291875127, + "learning_rate": 4.042848098324004e-08, + "loss": 0.646, + "step": 13473 + }, + { + "epoch": 1.92, + "grad_norm": 9.276252801045121, + "learning_rate": 4.028206547952218e-08, + "loss": 0.7245, + "step": 13474 + }, + { + "epoch": 1.92, + "grad_norm": 11.124990223913104, + "learning_rate": 4.0135914513352856e-08, + "loss": 0.6586, + "step": 13475 + }, + { + "epoch": 1.92, + "grad_norm": 12.125566071765714, + "learning_rate": 3.999002809252861e-08, + "loss": 0.7051, + "step": 13476 + }, + { + "epoch": 1.92, + "grad_norm": 10.281318454115745, + "learning_rate": 3.9844406224829325e-08, + "loss": 0.67, + "step": 13477 + }, + { + "epoch": 1.92, + "grad_norm": 9.47538585596205, + "learning_rate": 3.969904891802323e-08, + "loss": 0.66, + "step": 13478 + }, + { + "epoch": 1.92, + "grad_norm": 11.469416953790589, + "learning_rate": 3.955395617986246e-08, + "loss": 0.7499, + "step": 13479 + }, + { + "epoch": 1.92, + "grad_norm": 7.557340913265864, + "learning_rate": 3.940912801808694e-08, + "loss": 0.7264, + "step": 13480 + }, + { + 
"epoch": 1.92, + "grad_norm": 9.84775076462102, + "learning_rate": 3.926456444041993e-08, + "loss": 0.7083, + "step": 13481 + }, + { + "epoch": 1.92, + "grad_norm": 7.483576661378301, + "learning_rate": 3.912026545457303e-08, + "loss": 0.7221, + "step": 13482 + }, + { + "epoch": 1.92, + "grad_norm": 8.933596268282857, + "learning_rate": 3.8976231068241774e-08, + "loss": 0.6897, + "step": 13483 + }, + { + "epoch": 1.92, + "grad_norm": 9.916096364974006, + "learning_rate": 3.8832461289110005e-08, + "loss": 0.6968, + "step": 13484 + }, + { + "epoch": 1.92, + "grad_norm": 9.649476770538765, + "learning_rate": 3.868895612484491e-08, + "loss": 0.7058, + "step": 13485 + }, + { + "epoch": 1.92, + "grad_norm": 17.01554310285337, + "learning_rate": 3.854571558310094e-08, + "loss": 0.7049, + "step": 13486 + }, + { + "epoch": 1.92, + "grad_norm": 11.050468654597314, + "learning_rate": 3.840273967151864e-08, + "loss": 0.746, + "step": 13487 + }, + { + "epoch": 1.92, + "grad_norm": 7.500982250655774, + "learning_rate": 3.8260028397723025e-08, + "loss": 0.7197, + "step": 13488 + }, + { + "epoch": 1.92, + "grad_norm": 9.011646062934144, + "learning_rate": 3.8117581769326336e-08, + "loss": 0.7148, + "step": 13489 + }, + { + "epoch": 1.92, + "grad_norm": 9.887080375752275, + "learning_rate": 3.7975399793926945e-08, + "loss": 0.7336, + "step": 13490 + }, + { + "epoch": 1.92, + "grad_norm": 8.83109690428209, + "learning_rate": 3.783348247910712e-08, + "loss": 0.7096, + "step": 13491 + }, + { + "epoch": 1.92, + "grad_norm": 10.288497333313787, + "learning_rate": 3.769182983243802e-08, + "loss": 0.6856, + "step": 13492 + }, + { + "epoch": 1.92, + "grad_norm": 10.973230996489528, + "learning_rate": 3.755044186147361e-08, + "loss": 0.6476, + "step": 13493 + }, + { + "epoch": 1.92, + "grad_norm": 8.163458358826457, + "learning_rate": 3.7409318573755075e-08, + "loss": 0.7419, + "step": 13494 + }, + { + "epoch": 1.92, + "grad_norm": 9.198111682473403, + "learning_rate": 
3.726845997681028e-08, + "loss": 0.7307, + "step": 13495 + }, + { + "epoch": 1.92, + "grad_norm": 11.059542257881791, + "learning_rate": 3.712786607815211e-08, + "loss": 0.7547, + "step": 13496 + }, + { + "epoch": 1.92, + "grad_norm": 8.777005026946402, + "learning_rate": 3.698753688527901e-08, + "loss": 0.7102, + "step": 13497 + }, + { + "epoch": 1.92, + "grad_norm": 11.10804984464319, + "learning_rate": 3.68474724056761e-08, + "loss": 0.6417, + "step": 13498 + }, + { + "epoch": 1.93, + "grad_norm": 7.8995732085921935, + "learning_rate": 3.670767264681463e-08, + "loss": 0.6824, + "step": 13499 + }, + { + "epoch": 1.93, + "grad_norm": 7.406619078506336, + "learning_rate": 3.65681376161503e-08, + "loss": 0.7234, + "step": 13500 + }, + { + "epoch": 1.93, + "grad_norm": 11.283874076255186, + "learning_rate": 3.6428867321124936e-08, + "loss": 0.7214, + "step": 13501 + }, + { + "epoch": 1.93, + "grad_norm": 10.814841245445733, + "learning_rate": 3.6289861769168155e-08, + "loss": 0.7071, + "step": 13502 + }, + { + "epoch": 1.93, + "grad_norm": 8.926567211529958, + "learning_rate": 3.615112096769347e-08, + "loss": 0.7629, + "step": 13503 + }, + { + "epoch": 1.93, + "grad_norm": 8.872776317467892, + "learning_rate": 3.601264492410106e-08, + "loss": 0.7447, + "step": 13504 + }, + { + "epoch": 1.93, + "grad_norm": 11.835136072321902, + "learning_rate": 3.58744336457767e-08, + "loss": 0.683, + "step": 13505 + }, + { + "epoch": 1.93, + "grad_norm": 8.863411192190489, + "learning_rate": 3.5736487140092814e-08, + "loss": 0.6845, + "step": 13506 + }, + { + "epoch": 1.93, + "grad_norm": 10.995660307313615, + "learning_rate": 3.5598805414406304e-08, + "loss": 0.6381, + "step": 13507 + }, + { + "epoch": 1.93, + "grad_norm": 8.440642805213223, + "learning_rate": 3.5461388476060733e-08, + "loss": 0.6884, + "step": 13508 + }, + { + "epoch": 1.93, + "grad_norm": 11.067676032926723, + "learning_rate": 3.5324236332385795e-08, + "loss": 0.7601, + "step": 13509 + }, + { + "epoch": 1.93, + 
"grad_norm": 8.131369874925642, + "learning_rate": 3.518734899069676e-08, + "loss": 0.6994, + "step": 13510 + }, + { + "epoch": 1.93, + "grad_norm": 11.276295392028892, + "learning_rate": 3.505072645829499e-08, + "loss": 0.6666, + "step": 13511 + }, + { + "epoch": 1.93, + "grad_norm": 9.450250908102303, + "learning_rate": 3.491436874246801e-08, + "loss": 0.7271, + "step": 13512 + }, + { + "epoch": 1.93, + "grad_norm": 10.060733571845843, + "learning_rate": 3.4778275850487763e-08, + "loss": 0.756, + "step": 13513 + }, + { + "epoch": 1.93, + "grad_norm": 12.099009752805221, + "learning_rate": 3.464244778961345e-08, + "loss": 0.806, + "step": 13514 + }, + { + "epoch": 1.93, + "grad_norm": 9.213496529827648, + "learning_rate": 3.450688456708984e-08, + "loss": 0.6941, + "step": 13515 + }, + { + "epoch": 1.93, + "grad_norm": 9.863208039007326, + "learning_rate": 3.437158619014724e-08, + "loss": 0.7105, + "step": 13516 + }, + { + "epoch": 1.93, + "grad_norm": 9.370014669944378, + "learning_rate": 3.423655266600268e-08, + "loss": 0.7207, + "step": 13517 + }, + { + "epoch": 1.93, + "grad_norm": 8.841581136693948, + "learning_rate": 3.410178400185815e-08, + "loss": 0.6313, + "step": 13518 + }, + { + "epoch": 1.93, + "grad_norm": 10.534097001315812, + "learning_rate": 3.396728020490125e-08, + "loss": 0.6371, + "step": 13519 + }, + { + "epoch": 1.93, + "grad_norm": 9.397999322255444, + "learning_rate": 3.38330412823068e-08, + "loss": 0.6756, + "step": 13520 + }, + { + "epoch": 1.93, + "grad_norm": 9.954710313565574, + "learning_rate": 3.369906724123462e-08, + "loss": 0.6697, + "step": 13521 + }, + { + "epoch": 1.93, + "grad_norm": 9.446372380984066, + "learning_rate": 3.356535808883066e-08, + "loss": 0.6774, + "step": 13522 + }, + { + "epoch": 1.93, + "grad_norm": 8.708039232814231, + "learning_rate": 3.3431913832225885e-08, + "loss": 0.5815, + "step": 13523 + }, + { + "epoch": 1.93, + "grad_norm": 8.602650914953356, + "learning_rate": 3.3298734478538484e-08, + "loss": 0.7823, 
+ "step": 13524 + }, + { + "epoch": 1.93, + "grad_norm": 8.91131757719717, + "learning_rate": 3.3165820034871674e-08, + "loss": 0.7111, + "step": 13525 + }, + { + "epoch": 1.93, + "grad_norm": 9.437730179753487, + "learning_rate": 3.3033170508314785e-08, + "loss": 0.689, + "step": 13526 + }, + { + "epoch": 1.93, + "grad_norm": 12.644220851761716, + "learning_rate": 3.2900785905943254e-08, + "loss": 0.7863, + "step": 13527 + }, + { + "epoch": 1.93, + "grad_norm": 7.562068004976596, + "learning_rate": 3.2768666234817e-08, + "loss": 0.6281, + "step": 13528 + }, + { + "epoch": 1.93, + "grad_norm": 10.487674693481162, + "learning_rate": 3.2636811501984275e-08, + "loss": 0.6976, + "step": 13529 + }, + { + "epoch": 1.93, + "grad_norm": 11.24601467665579, + "learning_rate": 3.2505221714477234e-08, + "loss": 0.6705, + "step": 13530 + }, + { + "epoch": 1.93, + "grad_norm": 7.911262464820607, + "learning_rate": 3.2373896879314136e-08, + "loss": 0.678, + "step": 13531 + }, + { + "epoch": 1.93, + "grad_norm": 7.784465053758603, + "learning_rate": 3.2242837003500507e-08, + "loss": 0.7371, + "step": 13532 + }, + { + "epoch": 1.93, + "grad_norm": 12.889906433823867, + "learning_rate": 3.211204209402574e-08, + "loss": 0.7553, + "step": 13533 + }, + { + "epoch": 1.93, + "grad_norm": 9.797558016645464, + "learning_rate": 3.198151215786649e-08, + "loss": 0.7415, + "step": 13534 + }, + { + "epoch": 1.93, + "grad_norm": 10.048635720696959, + "learning_rate": 3.185124720198496e-08, + "loss": 0.7119, + "step": 13535 + }, + { + "epoch": 1.93, + "grad_norm": 13.894186583684842, + "learning_rate": 3.172124723332837e-08, + "loss": 0.6517, + "step": 13536 + }, + { + "epoch": 1.93, + "grad_norm": 11.977486480901886, + "learning_rate": 3.159151225883172e-08, + "loss": 0.7416, + "step": 13537 + }, + { + "epoch": 1.93, + "grad_norm": 8.500165314298885, + "learning_rate": 3.1462042285414496e-08, + "loss": 0.6577, + "step": 13538 + }, + { + "epoch": 1.93, + "grad_norm": 13.081749319065702, + 
"learning_rate": 3.133283731998116e-08, + "loss": 0.7541, + "step": 13539 + }, + { + "epoch": 1.93, + "grad_norm": 9.8515542051251, + "learning_rate": 3.120389736942397e-08, + "loss": 0.7332, + "step": 13540 + }, + { + "epoch": 1.93, + "grad_norm": 11.064743788240648, + "learning_rate": 3.107522244062078e-08, + "loss": 0.6963, + "step": 13541 + }, + { + "epoch": 1.93, + "grad_norm": 7.447861360062569, + "learning_rate": 3.0946812540433304e-08, + "loss": 0.7033, + "step": 13542 + }, + { + "epoch": 1.93, + "grad_norm": 8.769868181834005, + "learning_rate": 3.081866767571218e-08, + "loss": 0.7507, + "step": 13543 + }, + { + "epoch": 1.93, + "grad_norm": 7.0111909176046785, + "learning_rate": 3.069078785329083e-08, + "loss": 0.7527, + "step": 13544 + }, + { + "epoch": 1.93, + "grad_norm": 10.145271379410294, + "learning_rate": 3.056317307999157e-08, + "loss": 0.7271, + "step": 13545 + }, + { + "epoch": 1.93, + "grad_norm": 8.14686302629859, + "learning_rate": 3.043582336261897e-08, + "loss": 0.689, + "step": 13546 + }, + { + "epoch": 1.93, + "grad_norm": 8.766082773605463, + "learning_rate": 3.030873870796758e-08, + "loss": 0.619, + "step": 13547 + }, + { + "epoch": 1.93, + "grad_norm": 10.861851090101247, + "learning_rate": 3.0181919122814764e-08, + "loss": 0.698, + "step": 13548 + }, + { + "epoch": 1.93, + "grad_norm": 8.422589439134992, + "learning_rate": 3.0055364613924e-08, + "loss": 0.7121, + "step": 13549 + }, + { + "epoch": 1.93, + "grad_norm": 10.594545291489274, + "learning_rate": 2.992907518804711e-08, + "loss": 0.733, + "step": 13550 + }, + { + "epoch": 1.93, + "grad_norm": 8.265384716262052, + "learning_rate": 2.98030508519187e-08, + "loss": 0.7086, + "step": 13551 + }, + { + "epoch": 1.93, + "grad_norm": 10.629094365558545, + "learning_rate": 2.967729161226063e-08, + "loss": 0.7702, + "step": 13552 + }, + { + "epoch": 1.93, + "grad_norm": 11.307573277276758, + "learning_rate": 2.9551797475781408e-08, + "loss": 0.6604, + "step": 13553 + }, + { + "epoch": 
1.93, + "grad_norm": 12.364581361471952, + "learning_rate": 2.9426568449173466e-08, + "loss": 0.7224, + "step": 13554 + }, + { + "epoch": 1.93, + "grad_norm": 9.451445140454531, + "learning_rate": 2.9301604539117012e-08, + "loss": 0.705, + "step": 13555 + }, + { + "epoch": 1.93, + "grad_norm": 9.197598444631373, + "learning_rate": 2.9176905752276718e-08, + "loss": 0.676, + "step": 13556 + }, + { + "epoch": 1.93, + "grad_norm": 12.492448603292333, + "learning_rate": 2.905247209530393e-08, + "loss": 0.696, + "step": 13557 + }, + { + "epoch": 1.93, + "grad_norm": 9.50569911440063, + "learning_rate": 2.8928303574836114e-08, + "loss": 0.6951, + "step": 13558 + }, + { + "epoch": 1.93, + "grad_norm": 9.537068402040765, + "learning_rate": 2.880440019749464e-08, + "loss": 0.7133, + "step": 13559 + }, + { + "epoch": 1.93, + "grad_norm": 10.508957219048543, + "learning_rate": 2.8680761969889227e-08, + "loss": 0.7008, + "step": 13560 + }, + { + "epoch": 1.93, + "grad_norm": 12.073525441016923, + "learning_rate": 2.855738889861459e-08, + "loss": 0.6849, + "step": 13561 + }, + { + "epoch": 1.93, + "grad_norm": 7.961105288326567, + "learning_rate": 2.8434280990250474e-08, + "loss": 0.7507, + "step": 13562 + }, + { + "epoch": 1.93, + "grad_norm": 9.864932229218725, + "learning_rate": 2.831143825136384e-08, + "loss": 0.6887, + "step": 13563 + }, + { + "epoch": 1.93, + "grad_norm": 11.184451369484815, + "learning_rate": 2.818886068850557e-08, + "loss": 0.699, + "step": 13564 + }, + { + "epoch": 1.93, + "grad_norm": 7.924995963856407, + "learning_rate": 2.806654830821487e-08, + "loss": 0.6655, + "step": 13565 + }, + { + "epoch": 1.93, + "grad_norm": 7.924375057025447, + "learning_rate": 2.79445011170143e-08, + "loss": 0.7093, + "step": 13566 + }, + { + "epoch": 1.93, + "grad_norm": 9.018282584854912, + "learning_rate": 2.7822719121414765e-08, + "loss": 0.6854, + "step": 13567 + }, + { + "epoch": 1.93, + "grad_norm": 10.60935068822839, + "learning_rate": 2.770120232791107e-08, + 
"loss": 0.6576, + "step": 13568 + }, + { + "epoch": 1.94, + "grad_norm": 12.507364455671592, + "learning_rate": 2.757995074298525e-08, + "loss": 0.6772, + "step": 13569 + }, + { + "epoch": 1.94, + "grad_norm": 10.334593709834401, + "learning_rate": 2.7458964373103802e-08, + "loss": 0.7033, + "step": 13570 + }, + { + "epoch": 1.94, + "grad_norm": 10.028097816410261, + "learning_rate": 2.7338243224719897e-08, + "loss": 0.7116, + "step": 13571 + }, + { + "epoch": 1.94, + "grad_norm": 8.64047191425897, + "learning_rate": 2.7217787304272268e-08, + "loss": 0.7427, + "step": 13572 + }, + { + "epoch": 1.94, + "grad_norm": 9.288765468963687, + "learning_rate": 2.709759661818634e-08, + "loss": 0.7079, + "step": 13573 + }, + { + "epoch": 1.94, + "grad_norm": 7.570354961339477, + "learning_rate": 2.6977671172872532e-08, + "loss": 0.6702, + "step": 13574 + }, + { + "epoch": 1.94, + "grad_norm": 11.617297222122993, + "learning_rate": 2.68580109747274e-08, + "loss": 0.6521, + "step": 13575 + }, + { + "epoch": 1.94, + "grad_norm": 12.088995619342285, + "learning_rate": 2.67386160301325e-08, + "loss": 0.7009, + "step": 13576 + }, + { + "epoch": 1.94, + "grad_norm": 9.764038474783897, + "learning_rate": 2.661948634545719e-08, + "loss": 0.7198, + "step": 13577 + }, + { + "epoch": 1.94, + "grad_norm": 7.483529027811565, + "learning_rate": 2.6500621927054716e-08, + "loss": 0.6355, + "step": 13578 + }, + { + "epoch": 1.94, + "grad_norm": 11.200970569187497, + "learning_rate": 2.638202278126556e-08, + "loss": 0.7249, + "step": 13579 + }, + { + "epoch": 1.94, + "grad_norm": 8.89413353270858, + "learning_rate": 2.6263688914414666e-08, + "loss": 0.6524, + "step": 13580 + }, + { + "epoch": 1.94, + "grad_norm": 9.630560472178802, + "learning_rate": 2.614562033281476e-08, + "loss": 0.7228, + "step": 13581 + }, + { + "epoch": 1.94, + "grad_norm": 8.978204746938362, + "learning_rate": 2.6027817042761917e-08, + "loss": 0.6767, + "step": 13582 + }, + { + "epoch": 1.94, + "grad_norm": 
11.19827654226091, + "learning_rate": 2.591027905054111e-08, + "loss": 0.7919, + "step": 13583 + }, + { + "epoch": 1.94, + "grad_norm": 10.147931456234836, + "learning_rate": 2.5793006362419547e-08, + "loss": 0.7136, + "step": 13584 + }, + { + "epoch": 1.94, + "grad_norm": 10.106297485737205, + "learning_rate": 2.567599898465334e-08, + "loss": 0.6869, + "step": 13585 + }, + { + "epoch": 1.94, + "grad_norm": 9.722156221146685, + "learning_rate": 2.55592569234836e-08, + "loss": 0.6805, + "step": 13586 + }, + { + "epoch": 1.94, + "grad_norm": 9.887270248112696, + "learning_rate": 2.5442780185136462e-08, + "loss": 0.7257, + "step": 13587 + }, + { + "epoch": 1.94, + "grad_norm": 9.554103822402949, + "learning_rate": 2.532656877582529e-08, + "loss": 0.6862, + "step": 13588 + }, + { + "epoch": 1.94, + "grad_norm": 11.449858840218498, + "learning_rate": 2.5210622701747343e-08, + "loss": 0.6832, + "step": 13589 + }, + { + "epoch": 1.94, + "grad_norm": 9.49563593623427, + "learning_rate": 2.5094941969087128e-08, + "loss": 0.7038, + "step": 13590 + }, + { + "epoch": 1.94, + "grad_norm": 9.357139872405364, + "learning_rate": 2.4979526584015258e-08, + "loss": 0.7069, + "step": 13591 + }, + { + "epoch": 1.94, + "grad_norm": 7.177348871575124, + "learning_rate": 2.4864376552687364e-08, + "loss": 0.6933, + "step": 13592 + }, + { + "epoch": 1.94, + "grad_norm": 181.74009484550973, + "learning_rate": 2.4749491881245203e-08, + "loss": 0.8332, + "step": 13593 + }, + { + "epoch": 1.94, + "grad_norm": 9.355295131338847, + "learning_rate": 2.4634872575816092e-08, + "loss": 0.6203, + "step": 13594 + }, + { + "epoch": 1.94, + "grad_norm": 11.371061073677334, + "learning_rate": 2.4520518642514037e-08, + "loss": 0.7021, + "step": 13595 + }, + { + "epoch": 1.94, + "grad_norm": 7.595095190845701, + "learning_rate": 2.4406430087438594e-08, + "loss": 0.6918, + "step": 13596 + }, + { + "epoch": 1.94, + "grad_norm": 7.515638034113392, + "learning_rate": 2.4292606916674343e-08, + "loss": 0.7191, + 
"step": 13597 + }, + { + "epoch": 1.94, + "grad_norm": 11.035201825004542, + "learning_rate": 2.417904913629199e-08, + "loss": 0.6822, + "step": 13598 + }, + { + "epoch": 1.94, + "grad_norm": 9.465985638555733, + "learning_rate": 2.4065756752349457e-08, + "loss": 0.7419, + "step": 13599 + }, + { + "epoch": 1.94, + "grad_norm": 9.761095824104197, + "learning_rate": 2.3952729770888582e-08, + "loss": 0.7446, + "step": 13600 + }, + { + "epoch": 1.94, + "grad_norm": 6.298982953416132, + "learning_rate": 2.3839968197938434e-08, + "loss": 0.6511, + "step": 13601 + }, + { + "epoch": 1.94, + "grad_norm": 10.371425312895978, + "learning_rate": 2.3727472039512535e-08, + "loss": 0.6949, + "step": 13602 + }, + { + "epoch": 1.94, + "grad_norm": 10.735245537957615, + "learning_rate": 2.361524130161219e-08, + "loss": 0.7906, + "step": 13603 + }, + { + "epoch": 1.94, + "grad_norm": 10.725896564287824, + "learning_rate": 2.3503275990223173e-08, + "loss": 0.7609, + "step": 13604 + }, + { + "epoch": 1.94, + "grad_norm": 13.116166269403148, + "learning_rate": 2.3391576111316818e-08, + "loss": 0.6847, + "step": 13605 + }, + { + "epoch": 1.94, + "grad_norm": 9.109433857786323, + "learning_rate": 2.328014167085113e-08, + "loss": 0.6739, + "step": 13606 + }, + { + "epoch": 1.94, + "grad_norm": 10.019815644936964, + "learning_rate": 2.3168972674770806e-08, + "loss": 0.645, + "step": 13607 + }, + { + "epoch": 1.94, + "grad_norm": 8.506533478914497, + "learning_rate": 2.3058069129003878e-08, + "loss": 0.7148, + "step": 13608 + }, + { + "epoch": 1.94, + "grad_norm": 7.891225966963498, + "learning_rate": 2.2947431039465616e-08, + "loss": 0.6776, + "step": 13609 + }, + { + "epoch": 1.94, + "grad_norm": 9.935380710764974, + "learning_rate": 2.2837058412057967e-08, + "loss": 0.7474, + "step": 13610 + }, + { + "epoch": 1.94, + "grad_norm": 7.101709150361985, + "learning_rate": 2.2726951252667883e-08, + "loss": 0.759, + "step": 13611 + }, + { + "epoch": 1.94, + "grad_norm": 10.188416492615659, + 
"learning_rate": 2.2617109567167893e-08, + "loss": 0.6747, + "step": 13612 + }, + { + "epoch": 1.94, + "grad_norm": 9.924861497029898, + "learning_rate": 2.2507533361416646e-08, + "loss": 0.7024, + "step": 13613 + }, + { + "epoch": 1.94, + "grad_norm": 8.525930092186696, + "learning_rate": 2.239822264125835e-08, + "loss": 0.6573, + "step": 13614 + }, + { + "epoch": 1.94, + "grad_norm": 7.9772896300533995, + "learning_rate": 2.22891774125239e-08, + "loss": 0.599, + "step": 13615 + }, + { + "epoch": 1.94, + "grad_norm": 11.256168864093274, + "learning_rate": 2.2180397681028643e-08, + "loss": 0.7545, + "step": 13616 + }, + { + "epoch": 1.94, + "grad_norm": 9.48386215251485, + "learning_rate": 2.2071883452575715e-08, + "loss": 0.7372, + "step": 13617 + }, + { + "epoch": 1.94, + "grad_norm": 10.42185421001656, + "learning_rate": 2.1963634732952154e-08, + "loss": 0.6889, + "step": 13618 + }, + { + "epoch": 1.94, + "grad_norm": 9.738093468980239, + "learning_rate": 2.1855651527931676e-08, + "loss": 0.693, + "step": 13619 + }, + { + "epoch": 1.94, + "grad_norm": 9.393941967769342, + "learning_rate": 2.1747933843274115e-08, + "loss": 0.6668, + "step": 13620 + }, + { + "epoch": 1.94, + "grad_norm": 9.672189646387428, + "learning_rate": 2.164048168472488e-08, + "loss": 0.7053, + "step": 13621 + }, + { + "epoch": 1.94, + "grad_norm": 11.940390413068709, + "learning_rate": 2.1533295058014937e-08, + "loss": 0.7041, + "step": 13622 + }, + { + "epoch": 1.94, + "grad_norm": 9.625643514821299, + "learning_rate": 2.1426373968860825e-08, + "loss": 0.697, + "step": 13623 + }, + { + "epoch": 1.94, + "grad_norm": 9.486474266805624, + "learning_rate": 2.1319718422966873e-08, + "loss": 0.6508, + "step": 13624 + }, + { + "epoch": 1.94, + "grad_norm": 9.332178978867931, + "learning_rate": 2.1213328426020195e-08, + "loss": 0.7137, + "step": 13625 + }, + { + "epoch": 1.94, + "grad_norm": 11.055332548804879, + "learning_rate": 2.1107203983696257e-08, + "loss": 0.764, + "step": 13626 + }, + { + 
"epoch": 1.94, + "grad_norm": 11.514736542114123, + "learning_rate": 2.1001345101655522e-08, + "loss": 0.696, + "step": 13627 + }, + { + "epoch": 1.94, + "grad_norm": 8.048897919603036, + "learning_rate": 2.0895751785543484e-08, + "loss": 0.695, + "step": 13628 + }, + { + "epoch": 1.94, + "grad_norm": 8.587494200086256, + "learning_rate": 2.079042404099285e-08, + "loss": 0.6636, + "step": 13629 + }, + { + "epoch": 1.94, + "grad_norm": 12.51268808190046, + "learning_rate": 2.06853618736208e-08, + "loss": 0.7094, + "step": 13630 + }, + { + "epoch": 1.94, + "grad_norm": 8.05962864926787, + "learning_rate": 2.0580565289032296e-08, + "loss": 0.6788, + "step": 13631 + }, + { + "epoch": 1.94, + "grad_norm": 10.00129819846024, + "learning_rate": 2.047603429281564e-08, + "loss": 0.7314, + "step": 13632 + }, + { + "epoch": 1.94, + "grad_norm": 8.340022653544162, + "learning_rate": 2.0371768890546927e-08, + "loss": 0.6634, + "step": 13633 + }, + { + "epoch": 1.94, + "grad_norm": 9.067944680066532, + "learning_rate": 2.026776908778727e-08, + "loss": 0.6375, + "step": 13634 + }, + { + "epoch": 1.94, + "grad_norm": 9.490225552790863, + "learning_rate": 2.016403489008334e-08, + "loss": 0.6975, + "step": 13635 + }, + { + "epoch": 1.94, + "grad_norm": 9.276084421409744, + "learning_rate": 2.0060566302968486e-08, + "loss": 0.6731, + "step": 13636 + }, + { + "epoch": 1.94, + "grad_norm": 9.774767738218722, + "learning_rate": 1.9957363331961077e-08, + "loss": 0.6459, + "step": 13637 + }, + { + "epoch": 1.94, + "grad_norm": 8.473878331864176, + "learning_rate": 1.9854425982565594e-08, + "loss": 0.7725, + "step": 13638 + }, + { + "epoch": 1.95, + "grad_norm": 9.317977049341396, + "learning_rate": 1.9751754260273205e-08, + "loss": 0.717, + "step": 13639 + }, + { + "epoch": 1.95, + "grad_norm": 8.10100154199311, + "learning_rate": 1.9649348170559525e-08, + "loss": 0.6056, + "step": 13640 + }, + { + "epoch": 1.95, + "grad_norm": 7.433805048908012, + "learning_rate": 1.954720771888685e-08, 
+ "loss": 0.658, + "step": 13641 + }, + { + "epoch": 1.95, + "grad_norm": 10.89391597501779, + "learning_rate": 1.9445332910702498e-08, + "loss": 0.6516, + "step": 13642 + }, + { + "epoch": 1.95, + "grad_norm": 14.70773503009324, + "learning_rate": 1.9343723751441003e-08, + "loss": 0.691, + "step": 13643 + }, + { + "epoch": 1.95, + "grad_norm": 7.213487811598872, + "learning_rate": 1.9242380246521362e-08, + "loss": 0.6389, + "step": 13644 + }, + { + "epoch": 1.95, + "grad_norm": 10.019447167675343, + "learning_rate": 1.914130240134926e-08, + "loss": 0.6477, + "step": 13645 + }, + { + "epoch": 1.95, + "grad_norm": 11.420282759289817, + "learning_rate": 1.9040490221315933e-08, + "loss": 0.6319, + "step": 13646 + }, + { + "epoch": 1.95, + "grad_norm": 8.729061535279182, + "learning_rate": 1.8939943711797637e-08, + "loss": 0.6913, + "step": 13647 + }, + { + "epoch": 1.95, + "grad_norm": 10.93723786700918, + "learning_rate": 1.8839662878158417e-08, + "loss": 0.7628, + "step": 13648 + }, + { + "epoch": 1.95, + "grad_norm": 14.124654633077139, + "learning_rate": 1.873964772574621e-08, + "loss": 0.7217, + "step": 13649 + }, + { + "epoch": 1.95, + "grad_norm": 9.07655279453223, + "learning_rate": 1.863989825989565e-08, + "loss": 0.6387, + "step": 13650 + }, + { + "epoch": 1.95, + "grad_norm": 8.942625028351536, + "learning_rate": 1.854041448592747e-08, + "loss": 0.7286, + "step": 13651 + }, + { + "epoch": 1.95, + "grad_norm": 9.017030271404218, + "learning_rate": 1.8441196409147988e-08, + "loss": 0.6202, + "step": 13652 + }, + { + "epoch": 1.95, + "grad_norm": 9.650798935839928, + "learning_rate": 1.8342244034847966e-08, + "loss": 0.6857, + "step": 13653 + }, + { + "epoch": 1.95, + "grad_norm": 7.027783393244305, + "learning_rate": 1.824355736830652e-08, + "loss": 0.718, + "step": 13654 + }, + { + "epoch": 1.95, + "grad_norm": 9.17552979584744, + "learning_rate": 1.8145136414786656e-08, + "loss": 0.7199, + "step": 13655 + }, + { + "epoch": 1.95, + "grad_norm": 
8.179437136386548, + "learning_rate": 1.8046981179538625e-08, + "loss": 0.7318, + "step": 13656 + }, + { + "epoch": 1.95, + "grad_norm": 8.645224052542664, + "learning_rate": 1.794909166779657e-08, + "loss": 0.6731, + "step": 13657 + }, + { + "epoch": 1.95, + "grad_norm": 11.262961448017204, + "learning_rate": 1.785146788478298e-08, + "loss": 0.7201, + "step": 13658 + }, + { + "epoch": 1.95, + "grad_norm": 7.431416331522265, + "learning_rate": 1.775410983570425e-08, + "loss": 0.74, + "step": 13659 + }, + { + "epoch": 1.95, + "grad_norm": 7.933286691757479, + "learning_rate": 1.765701752575233e-08, + "loss": 0.7508, + "step": 13660 + }, + { + "epoch": 1.95, + "grad_norm": 11.854551051101947, + "learning_rate": 1.7560190960107525e-08, + "loss": 0.6441, + "step": 13661 + }, + { + "epoch": 1.95, + "grad_norm": 12.659290274269123, + "learning_rate": 1.7463630143932932e-08, + "loss": 0.724, + "step": 13662 + }, + { + "epoch": 1.95, + "grad_norm": 9.10614194385134, + "learning_rate": 1.736733508237942e-08, + "loss": 0.666, + "step": 13663 + }, + { + "epoch": 1.95, + "grad_norm": 7.964065698189364, + "learning_rate": 1.7271305780582892e-08, + "loss": 0.7674, + "step": 13664 + }, + { + "epoch": 1.95, + "grad_norm": 12.838591318639695, + "learning_rate": 1.7175542243665356e-08, + "loss": 0.6563, + "step": 13665 + }, + { + "epoch": 1.95, + "grad_norm": 6.545509988559876, + "learning_rate": 1.7080044476734947e-08, + "loss": 0.6861, + "step": 13666 + }, + { + "epoch": 1.95, + "grad_norm": 8.852697728846318, + "learning_rate": 1.6984812484884817e-08, + "loss": 0.736, + "step": 13667 + }, + { + "epoch": 1.95, + "grad_norm": 6.384809913759523, + "learning_rate": 1.6889846273194233e-08, + "loss": 0.6805, + "step": 13668 + }, + { + "epoch": 1.95, + "grad_norm": 9.452493619375089, + "learning_rate": 1.679514584672859e-08, + "loss": 0.7433, + "step": 13669 + }, + { + "epoch": 1.95, + "grad_norm": 10.655759181344763, + "learning_rate": 1.67007112105394e-08, + "loss": 0.6957, + "step": 
13670 + }, + { + "epoch": 1.95, + "grad_norm": 12.507059042853399, + "learning_rate": 1.660654236966319e-08, + "loss": 0.7366, + "step": 13671 + }, + { + "epoch": 1.95, + "grad_norm": 11.223391105019031, + "learning_rate": 1.6512639329122614e-08, + "loss": 0.6966, + "step": 13672 + }, + { + "epoch": 1.95, + "grad_norm": 9.974816058004256, + "learning_rate": 1.6419002093925886e-08, + "loss": 0.6394, + "step": 13673 + }, + { + "epoch": 1.95, + "grad_norm": 7.199624722476938, + "learning_rate": 1.6325630669067892e-08, + "loss": 0.694, + "step": 13674 + }, + { + "epoch": 1.95, + "grad_norm": 8.969901786588098, + "learning_rate": 1.6232525059528547e-08, + "loss": 0.7004, + "step": 13675 + }, + { + "epoch": 1.95, + "grad_norm": 13.592968112934964, + "learning_rate": 1.6139685270273873e-08, + "loss": 0.6746, + "step": 13676 + }, + { + "epoch": 1.95, + "grad_norm": 7.710313150853898, + "learning_rate": 1.6047111306256025e-08, + "loss": 0.6765, + "step": 13677 + }, + { + "epoch": 1.95, + "grad_norm": 9.823075405533007, + "learning_rate": 1.595480317241216e-08, + "loss": 0.7377, + "step": 13678 + }, + { + "epoch": 1.95, + "grad_norm": 10.931737185035221, + "learning_rate": 1.586276087366556e-08, + "loss": 0.7124, + "step": 13679 + }, + { + "epoch": 1.95, + "grad_norm": 10.397121782905764, + "learning_rate": 1.5770984414926194e-08, + "loss": 0.6912, + "step": 13680 + }, + { + "epoch": 1.95, + "grad_norm": 11.635971792041149, + "learning_rate": 1.5679473801088474e-08, + "loss": 0.738, + "step": 13681 + }, + { + "epoch": 1.95, + "grad_norm": 13.05468553435745, + "learning_rate": 1.558822903703405e-08, + "loss": 0.6902, + "step": 13682 + }, + { + "epoch": 1.95, + "grad_norm": 6.359355709548037, + "learning_rate": 1.549725012762904e-08, + "loss": 0.6349, + "step": 13683 + }, + { + "epoch": 1.95, + "grad_norm": 7.432222186381549, + "learning_rate": 1.540653707772677e-08, + "loss": 0.7026, + "step": 13684 + }, + { + "epoch": 1.95, + "grad_norm": 8.23071091960685, + "learning_rate": 
1.531608989216449e-08, + "loss": 0.6519, + "step": 13685 + }, + { + "epoch": 1.95, + "grad_norm": 11.052305779649846, + "learning_rate": 1.5225908575767222e-08, + "loss": 0.7128, + "step": 13686 + }, + { + "epoch": 1.95, + "grad_norm": 9.597010203938657, + "learning_rate": 1.5135993133345017e-08, + "loss": 0.7051, + "step": 13687 + }, + { + "epoch": 1.95, + "grad_norm": 10.585984650616622, + "learning_rate": 1.5046343569693477e-08, + "loss": 0.6276, + "step": 13688 + }, + { + "epoch": 1.95, + "grad_norm": 9.325294661153736, + "learning_rate": 1.4956959889593782e-08, + "loss": 0.7, + "step": 13689 + }, + { + "epoch": 1.95, + "grad_norm": 10.249669760233907, + "learning_rate": 1.4867842097814333e-08, + "loss": 0.7408, + "step": 13690 + }, + { + "epoch": 1.95, + "grad_norm": 8.138539189373256, + "learning_rate": 1.4778990199108001e-08, + "loss": 0.6717, + "step": 13691 + }, + { + "epoch": 1.95, + "grad_norm": 8.99428972841364, + "learning_rate": 1.4690404198213215e-08, + "loss": 0.6748, + "step": 13692 + }, + { + "epoch": 1.95, + "grad_norm": 7.3504938335642525, + "learning_rate": 1.4602084099856196e-08, + "loss": 0.6685, + "step": 13693 + }, + { + "epoch": 1.95, + "grad_norm": 11.680813925306131, + "learning_rate": 1.4514029908747062e-08, + "loss": 0.6778, + "step": 13694 + }, + { + "epoch": 1.95, + "grad_norm": 10.904506428067041, + "learning_rate": 1.442624162958206e-08, + "loss": 0.6804, + "step": 13695 + }, + { + "epoch": 1.95, + "grad_norm": 8.342179909377968, + "learning_rate": 1.433871926704411e-08, + "loss": 0.6096, + "step": 13696 + }, + { + "epoch": 1.95, + "grad_norm": 11.929652845498875, + "learning_rate": 1.4251462825801143e-08, + "loss": 0.7329, + "step": 13697 + }, + { + "epoch": 1.95, + "grad_norm": 9.807065935249664, + "learning_rate": 1.4164472310507216e-08, + "loss": 0.6883, + "step": 13698 + }, + { + "epoch": 1.95, + "grad_norm": 14.736607209540999, + "learning_rate": 1.407774772580195e-08, + "loss": 0.7111, + "step": 13699 + }, + { + "epoch": 
1.95, + "grad_norm": 6.809440801342298, + "learning_rate": 1.3991289076311642e-08, + "loss": 0.7692, + "step": 13700 + }, + { + "epoch": 1.95, + "grad_norm": 8.588883315403844, + "learning_rate": 1.3905096366647052e-08, + "loss": 0.6814, + "step": 13701 + }, + { + "epoch": 1.95, + "grad_norm": 11.283936406866603, + "learning_rate": 1.3819169601405614e-08, + "loss": 0.6863, + "step": 13702 + }, + { + "epoch": 1.95, + "grad_norm": 8.13574144002724, + "learning_rate": 1.373350878517088e-08, + "loss": 0.7076, + "step": 13703 + }, + { + "epoch": 1.95, + "grad_norm": 7.833109092739801, + "learning_rate": 1.3648113922510864e-08, + "loss": 0.7659, + "step": 13704 + }, + { + "epoch": 1.95, + "grad_norm": 8.806318938990563, + "learning_rate": 1.3562985017981367e-08, + "loss": 0.7026, + "step": 13705 + }, + { + "epoch": 1.95, + "grad_norm": 12.215988212846478, + "learning_rate": 1.347812207612209e-08, + "loss": 0.7199, + "step": 13706 + }, + { + "epoch": 1.95, + "grad_norm": 8.53241084616014, + "learning_rate": 1.339352510145997e-08, + "loss": 0.7044, + "step": 13707 + }, + { + "epoch": 1.95, + "grad_norm": 11.653448637461823, + "learning_rate": 1.330919409850695e-08, + "loss": 0.6394, + "step": 13708 + }, + { + "epoch": 1.96, + "grad_norm": 10.557230377083439, + "learning_rate": 1.32251290717611e-08, + "loss": 0.6923, + "step": 13709 + }, + { + "epoch": 1.96, + "grad_norm": 10.026071017816985, + "learning_rate": 1.31413300257055e-08, + "loss": 0.6042, + "step": 13710 + }, + { + "epoch": 1.96, + "grad_norm": 12.205303540865073, + "learning_rate": 1.3057796964810465e-08, + "loss": 0.6998, + "step": 13711 + }, + { + "epoch": 1.96, + "grad_norm": 13.839585958267476, + "learning_rate": 1.2974529893531874e-08, + "loss": 0.7695, + "step": 13712 + }, + { + "epoch": 1.96, + "grad_norm": 9.849674588003761, + "learning_rate": 1.2891528816310062e-08, + "loss": 0.6939, + "step": 13713 + }, + { + "epoch": 1.96, + "grad_norm": 6.983481077754933, + "learning_rate": 1.2808793737572045e-08, + 
"loss": 0.6178, + "step": 13714 + }, + { + "epoch": 1.96, + "grad_norm": 9.181684907562424, + "learning_rate": 1.2726324661731515e-08, + "loss": 0.6793, + "step": 13715 + }, + { + "epoch": 1.96, + "grad_norm": 10.221460512607386, + "learning_rate": 1.2644121593186065e-08, + "loss": 0.7217, + "step": 13716 + }, + { + "epoch": 1.96, + "grad_norm": 10.111619991584803, + "learning_rate": 1.2562184536321631e-08, + "loss": 0.6767, + "step": 13717 + }, + { + "epoch": 1.96, + "grad_norm": 9.84966136558447, + "learning_rate": 1.248051349550694e-08, + "loss": 0.6622, + "step": 13718 + }, + { + "epoch": 1.96, + "grad_norm": 13.57451303836709, + "learning_rate": 1.2399108475099065e-08, + "loss": 0.6381, + "step": 13719 + }, + { + "epoch": 1.96, + "grad_norm": 9.570306214025194, + "learning_rate": 1.231796947943953e-08, + "loss": 0.721, + "step": 13720 + }, + { + "epoch": 1.96, + "grad_norm": 9.756631581058988, + "learning_rate": 1.2237096512855983e-08, + "loss": 0.6577, + "step": 13721 + }, + { + "epoch": 1.96, + "grad_norm": 10.692324826277934, + "learning_rate": 1.2156489579662756e-08, + "loss": 0.6809, + "step": 13722 + }, + { + "epoch": 1.96, + "grad_norm": 8.00087471901901, + "learning_rate": 1.2076148684158629e-08, + "loss": 0.6981, + "step": 13723 + }, + { + "epoch": 1.96, + "grad_norm": 10.25226267125817, + "learning_rate": 1.1996073830627953e-08, + "loss": 0.6944, + "step": 13724 + }, + { + "epoch": 1.96, + "grad_norm": 10.90358264977353, + "learning_rate": 1.1916265023343421e-08, + "loss": 0.6985, + "step": 13725 + }, + { + "epoch": 1.96, + "grad_norm": 12.647713969935994, + "learning_rate": 1.1836722266559964e-08, + "loss": 0.6909, + "step": 13726 + }, + { + "epoch": 1.96, + "grad_norm": 9.726247850787932, + "learning_rate": 1.1757445564521963e-08, + "loss": 0.6746, + "step": 13727 + }, + { + "epoch": 1.96, + "grad_norm": 9.905234095230055, + "learning_rate": 1.1678434921456594e-08, + "loss": 0.6816, + "step": 13728 + }, + { + "epoch": 1.96, + "grad_norm": 
11.313482129196869, + "learning_rate": 1.1599690341578262e-08, + "loss": 0.7402, + "step": 13729 + }, + { + "epoch": 1.96, + "grad_norm": 9.146063511735758, + "learning_rate": 1.1521211829087498e-08, + "loss": 0.6745, + "step": 13730 + }, + { + "epoch": 1.96, + "grad_norm": 8.939873954001616, + "learning_rate": 1.1442999388169284e-08, + "loss": 0.6756, + "step": 13731 + }, + { + "epoch": 1.96, + "grad_norm": 7.658937423739934, + "learning_rate": 1.1365053022996397e-08, + "loss": 0.6501, + "step": 13732 + }, + { + "epoch": 1.96, + "grad_norm": 10.410696393982997, + "learning_rate": 1.1287372737724955e-08, + "loss": 0.7486, + "step": 13733 + }, + { + "epoch": 1.96, + "grad_norm": 11.914664889764303, + "learning_rate": 1.1209958536498866e-08, + "loss": 0.6307, + "step": 13734 + }, + { + "epoch": 1.96, + "grad_norm": 9.831558233962797, + "learning_rate": 1.1132810423447604e-08, + "loss": 0.6168, + "step": 13735 + }, + { + "epoch": 1.96, + "grad_norm": 13.375422542246008, + "learning_rate": 1.1055928402685655e-08, + "loss": 0.6549, + "step": 13736 + }, + { + "epoch": 1.96, + "grad_norm": 9.46124158750857, + "learning_rate": 1.0979312478313631e-08, + "loss": 0.7168, + "step": 13737 + }, + { + "epoch": 1.96, + "grad_norm": 11.218942345388257, + "learning_rate": 1.0902962654417704e-08, + "loss": 0.7019, + "step": 13738 + }, + { + "epoch": 1.96, + "grad_norm": 13.710968792275438, + "learning_rate": 1.0826878935070729e-08, + "loss": 0.6915, + "step": 13739 + }, + { + "epoch": 1.96, + "grad_norm": 8.221975163703531, + "learning_rate": 1.075106132433057e-08, + "loss": 0.7258, + "step": 13740 + }, + { + "epoch": 1.96, + "grad_norm": 8.60692084086236, + "learning_rate": 1.0675509826241215e-08, + "loss": 0.7129, + "step": 13741 + }, + { + "epoch": 1.96, + "grad_norm": 11.115939626072786, + "learning_rate": 1.0600224444832219e-08, + "loss": 0.7151, + "step": 13742 + }, + { + "epoch": 1.96, + "grad_norm": 9.183850793252422, + "learning_rate": 1.0525205184119258e-08, + "loss": 
0.6799, + "step": 13743 + }, + { + "epoch": 1.96, + "grad_norm": 9.332413489035362, + "learning_rate": 1.0450452048103022e-08, + "loss": 0.7022, + "step": 13744 + }, + { + "epoch": 1.96, + "grad_norm": 9.487135327933672, + "learning_rate": 1.0375965040771984e-08, + "loss": 0.7118, + "step": 13745 + }, + { + "epoch": 1.96, + "grad_norm": 10.173891818794319, + "learning_rate": 1.0301744166097971e-08, + "loss": 0.6789, + "step": 13746 + }, + { + "epoch": 1.96, + "grad_norm": 11.086850219546482, + "learning_rate": 1.022778942803948e-08, + "loss": 0.6836, + "step": 13747 + }, + { + "epoch": 1.96, + "grad_norm": 8.578603217310757, + "learning_rate": 1.0154100830542246e-08, + "loss": 0.74, + "step": 13748 + }, + { + "epoch": 1.96, + "grad_norm": 8.23415110471886, + "learning_rate": 1.0080678377535902e-08, + "loss": 0.6627, + "step": 13749 + }, + { + "epoch": 1.96, + "grad_norm": 8.517315377406632, + "learning_rate": 1.0007522072936205e-08, + "loss": 0.6956, + "step": 13750 + }, + { + "epoch": 1.96, + "grad_norm": 11.171432493142445, + "learning_rate": 9.934631920645587e-09, + "loss": 0.7379, + "step": 13751 + }, + { + "epoch": 1.96, + "grad_norm": 7.350111083358999, + "learning_rate": 9.862007924552052e-09, + "loss": 0.6599, + "step": 13752 + }, + { + "epoch": 1.96, + "grad_norm": 7.830614837421905, + "learning_rate": 9.789650088528613e-09, + "loss": 0.6523, + "step": 13753 + }, + { + "epoch": 1.96, + "grad_norm": 9.118693287583877, + "learning_rate": 9.717558416434958e-09, + "loss": 0.6058, + "step": 13754 + }, + { + "epoch": 1.96, + "grad_norm": 10.234638944265773, + "learning_rate": 9.645732912116346e-09, + "loss": 0.7191, + "step": 13755 + }, + { + "epoch": 1.96, + "grad_norm": 8.483561322585969, + "learning_rate": 9.5741735794036e-09, + "loss": 0.6403, + "step": 13756 + }, + { + "epoch": 1.96, + "grad_norm": 11.441038062964692, + "learning_rate": 9.502880422113114e-09, + "loss": 0.5924, + "step": 13757 + }, + { + "epoch": 1.96, + "grad_norm": 8.877758620750848, + 
"learning_rate": 9.431853444048512e-09, + "loss": 0.7024, + "step": 13758 + }, + { + "epoch": 1.96, + "grad_norm": 9.128645227977302, + "learning_rate": 9.361092648996761e-09, + "loss": 0.6871, + "step": 13759 + }, + { + "epoch": 1.96, + "grad_norm": 9.293887848947982, + "learning_rate": 9.290598040733735e-09, + "loss": 0.6176, + "step": 13760 + }, + { + "epoch": 1.96, + "grad_norm": 10.49206712050618, + "learning_rate": 9.220369623017533e-09, + "loss": 0.6724, + "step": 13761 + }, + { + "epoch": 1.96, + "grad_norm": 7.97719136790373, + "learning_rate": 9.150407399595163e-09, + "loss": 0.6773, + "step": 13762 + }, + { + "epoch": 1.96, + "grad_norm": 8.731912622808965, + "learning_rate": 9.08071137419808e-09, + "loss": 0.627, + "step": 13763 + }, + { + "epoch": 1.96, + "grad_norm": 8.365197492203274, + "learning_rate": 9.011281550543315e-09, + "loss": 0.6588, + "step": 13764 + }, + { + "epoch": 1.96, + "grad_norm": 10.395905645134777, + "learning_rate": 8.94211793233457e-09, + "loss": 0.6831, + "step": 13765 + }, + { + "epoch": 1.96, + "grad_norm": 11.220727171791196, + "learning_rate": 8.873220523260562e-09, + "loss": 0.5729, + "step": 13766 + }, + { + "epoch": 1.96, + "grad_norm": 11.003285745836044, + "learning_rate": 8.804589326996126e-09, + "loss": 0.6799, + "step": 13767 + }, + { + "epoch": 1.96, + "grad_norm": 7.985022743911648, + "learning_rate": 8.736224347201672e-09, + "loss": 0.6751, + "step": 13768 + }, + { + "epoch": 1.96, + "grad_norm": 7.402999109774071, + "learning_rate": 8.668125587523169e-09, + "loss": 0.6819, + "step": 13769 + }, + { + "epoch": 1.96, + "grad_norm": 8.520478151441939, + "learning_rate": 8.600293051594377e-09, + "loss": 0.7168, + "step": 13770 + }, + { + "epoch": 1.96, + "grad_norm": 11.58438048662213, + "learning_rate": 8.532726743031294e-09, + "loss": 0.7304, + "step": 13771 + }, + { + "epoch": 1.96, + "grad_norm": 10.024723962412658, + "learning_rate": 8.465426665439369e-09, + "loss": 0.6856, + "step": 13772 + }, + { + "epoch": 
1.96, + "grad_norm": 6.944616492521782, + "learning_rate": 8.398392822407398e-09, + "loss": 0.7055, + "step": 13773 + }, + { + "epoch": 1.96, + "grad_norm": 8.794425897152506, + "learning_rate": 8.331625217511408e-09, + "loss": 0.7606, + "step": 13774 + }, + { + "epoch": 1.96, + "grad_norm": 13.408172223654777, + "learning_rate": 8.26512385431133e-09, + "loss": 0.72, + "step": 13775 + }, + { + "epoch": 1.96, + "grad_norm": 10.062888343034956, + "learning_rate": 8.198888736355437e-09, + "loss": 0.6963, + "step": 13776 + }, + { + "epoch": 1.96, + "grad_norm": 10.605424720720405, + "learning_rate": 8.132919867175903e-09, + "loss": 0.6424, + "step": 13777 + }, + { + "epoch": 1.96, + "grad_norm": 11.412378617878367, + "learning_rate": 8.067217250291582e-09, + "loss": 0.7942, + "step": 13778 + }, + { + "epoch": 1.97, + "grad_norm": 8.752973049874742, + "learning_rate": 8.001780889206334e-09, + "loss": 0.6298, + "step": 13779 + }, + { + "epoch": 1.97, + "grad_norm": 8.82662150115683, + "learning_rate": 7.93661078741126e-09, + "loss": 0.6687, + "step": 13780 + }, + { + "epoch": 1.97, + "grad_norm": 15.507880971461487, + "learning_rate": 7.871706948381908e-09, + "loss": 0.6828, + "step": 13781 + }, + { + "epoch": 1.97, + "grad_norm": 9.765484079560139, + "learning_rate": 7.807069375579402e-09, + "loss": 0.7055, + "step": 13782 + }, + { + "epoch": 1.97, + "grad_norm": 10.364864902746067, + "learning_rate": 7.742698072452647e-09, + "loss": 0.6703, + "step": 13783 + }, + { + "epoch": 1.97, + "grad_norm": 9.936442610338112, + "learning_rate": 7.678593042433902e-09, + "loss": 0.7464, + "step": 13784 + }, + { + "epoch": 1.97, + "grad_norm": 9.051548890921818, + "learning_rate": 7.61475428894265e-09, + "loss": 0.6507, + "step": 13785 + }, + { + "epoch": 1.97, + "grad_norm": 9.150574162266171, + "learning_rate": 7.5511818153845e-09, + "loss": 0.6409, + "step": 13786 + }, + { + "epoch": 1.97, + "grad_norm": 8.37423078830097, + "learning_rate": 7.48787562514952e-09, + "loss": 0.6694, 
+ "step": 13787 + }, + { + "epoch": 1.97, + "grad_norm": 9.110225749892605, + "learning_rate": 7.424835721614454e-09, + "loss": 0.7358, + "step": 13788 + }, + { + "epoch": 1.97, + "grad_norm": 8.997173235164913, + "learning_rate": 7.36206210814161e-09, + "loss": 0.7145, + "step": 13789 + }, + { + "epoch": 1.97, + "grad_norm": 9.496523465210792, + "learning_rate": 7.299554788079421e-09, + "loss": 0.6621, + "step": 13790 + }, + { + "epoch": 1.97, + "grad_norm": 12.061203069567433, + "learning_rate": 7.237313764761889e-09, + "loss": 0.6588, + "step": 13791 + }, + { + "epoch": 1.97, + "grad_norm": 6.7483877701478825, + "learning_rate": 7.175339041508578e-09, + "loss": 0.7083, + "step": 13792 + }, + { + "epoch": 1.97, + "grad_norm": 9.433084829773028, + "learning_rate": 7.1136306216246235e-09, + "loss": 0.6918, + "step": 13793 + }, + { + "epoch": 1.97, + "grad_norm": 7.9141694457276905, + "learning_rate": 7.052188508402391e-09, + "loss": 0.6665, + "step": 13794 + }, + { + "epoch": 1.97, + "grad_norm": 13.435578105713699, + "learning_rate": 6.99101270511815e-09, + "loss": 0.744, + "step": 13795 + }, + { + "epoch": 1.97, + "grad_norm": 7.100009604177868, + "learning_rate": 6.930103215034844e-09, + "loss": 0.6504, + "step": 13796 + }, + { + "epoch": 1.97, + "grad_norm": 9.267672034707067, + "learning_rate": 6.869460041402098e-09, + "loss": 0.6657, + "step": 13797 + }, + { + "epoch": 1.97, + "grad_norm": 14.82351980214681, + "learning_rate": 6.809083187453436e-09, + "loss": 0.653, + "step": 13798 + }, + { + "epoch": 1.97, + "grad_norm": 10.741223676894673, + "learning_rate": 6.7489726564096134e-09, + "loss": 0.6847, + "step": 13799 + }, + { + "epoch": 1.97, + "grad_norm": 10.274339098420526, + "learning_rate": 6.689128451476956e-09, + "loss": 0.6453, + "step": 13800 + }, + { + "epoch": 1.97, + "grad_norm": 6.001611029501712, + "learning_rate": 6.629550575847355e-09, + "loss": 0.676, + "step": 13801 + }, + { + "epoch": 1.97, + "grad_norm": 8.997879552784973, + 
"learning_rate": 6.570239032698267e-09, + "loss": 0.6469, + "step": 13802 + }, + { + "epoch": 1.97, + "grad_norm": 8.330568776282801, + "learning_rate": 6.511193825193274e-09, + "loss": 0.6943, + "step": 13803 + }, + { + "epoch": 1.97, + "grad_norm": 9.940181806545592, + "learning_rate": 6.452414956482078e-09, + "loss": 0.6561, + "step": 13804 + }, + { + "epoch": 1.97, + "grad_norm": 9.141131649064596, + "learning_rate": 6.393902429698839e-09, + "loss": 0.6585, + "step": 13805 + }, + { + "epoch": 1.97, + "grad_norm": 7.967802329913392, + "learning_rate": 6.335656247965505e-09, + "loss": 0.6778, + "step": 13806 + }, + { + "epoch": 1.97, + "grad_norm": 11.526515186231164, + "learning_rate": 6.277676414387923e-09, + "loss": 0.6772, + "step": 13807 + }, + { + "epoch": 1.97, + "grad_norm": 10.692053101109328, + "learning_rate": 6.219962932059731e-09, + "loss": 0.7036, + "step": 13808 + }, + { + "epoch": 1.97, + "grad_norm": 8.084699027122427, + "learning_rate": 6.162515804057356e-09, + "loss": 0.6398, + "step": 13809 + }, + { + "epoch": 1.97, + "grad_norm": 7.133081153777959, + "learning_rate": 6.10533503344668e-09, + "loss": 0.7299, + "step": 13810 + }, + { + "epoch": 1.97, + "grad_norm": 9.319103407361004, + "learning_rate": 6.048420623276929e-09, + "loss": 0.7092, + "step": 13811 + }, + { + "epoch": 1.97, + "grad_norm": 8.542767433533019, + "learning_rate": 5.991772576582899e-09, + "loss": 0.64, + "step": 13812 + }, + { + "epoch": 1.97, + "grad_norm": 8.47303580318942, + "learning_rate": 5.9353908963877275e-09, + "loss": 0.6832, + "step": 13813 + }, + { + "epoch": 1.97, + "grad_norm": 8.247038367828116, + "learning_rate": 5.879275585696787e-09, + "loss": 0.6666, + "step": 13814 + }, + { + "epoch": 1.97, + "grad_norm": 8.918283701177495, + "learning_rate": 5.823426647504904e-09, + "loss": 0.7215, + "step": 13815 + }, + { + "epoch": 1.97, + "grad_norm": 8.90685716641519, + "learning_rate": 5.767844084789698e-09, + "loss": 0.6551, + "step": 13816 + }, + { + "epoch": 
1.97, + "grad_norm": 6.87175355864691, + "learning_rate": 5.712527900516018e-09, + "loss": 0.6657, + "step": 13817 + }, + { + "epoch": 1.97, + "grad_norm": 9.960666839548926, + "learning_rate": 5.657478097634838e-09, + "loss": 0.6907, + "step": 13818 + }, + { + "epoch": 1.97, + "grad_norm": 9.16032504569147, + "learning_rate": 5.602694679081588e-09, + "loss": 0.774, + "step": 13819 + }, + { + "epoch": 1.97, + "grad_norm": 11.061270568247764, + "learning_rate": 5.548177647778375e-09, + "loss": 0.7224, + "step": 13820 + }, + { + "epoch": 1.97, + "grad_norm": 11.825044024341066, + "learning_rate": 5.493927006633426e-09, + "loss": 0.7066, + "step": 13821 + }, + { + "epoch": 1.97, + "grad_norm": 12.126554641684914, + "learning_rate": 5.439942758539984e-09, + "loss": 0.7254, + "step": 13822 + }, + { + "epoch": 1.97, + "grad_norm": 8.864470843960223, + "learning_rate": 5.386224906377968e-09, + "loss": 0.6358, + "step": 13823 + }, + { + "epoch": 1.97, + "grad_norm": 9.2933225461455, + "learning_rate": 5.3327734530117525e-09, + "loss": 0.7037, + "step": 13824 + }, + { + "epoch": 1.97, + "grad_norm": 11.281414434521396, + "learning_rate": 5.279588401292946e-09, + "loss": 0.705, + "step": 13825 + }, + { + "epoch": 1.97, + "grad_norm": 8.374250933362399, + "learning_rate": 5.226669754057611e-09, + "loss": 0.7101, + "step": 13826 + }, + { + "epoch": 1.97, + "grad_norm": 9.403083479316493, + "learning_rate": 5.174017514129048e-09, + "loss": 0.6704, + "step": 13827 + }, + { + "epoch": 1.97, + "grad_norm": 13.199669236030207, + "learning_rate": 5.121631684315009e-09, + "loss": 0.7794, + "step": 13828 + }, + { + "epoch": 1.97, + "grad_norm": 9.488464543235706, + "learning_rate": 5.069512267409926e-09, + "loss": 0.6779, + "step": 13829 + }, + { + "epoch": 1.97, + "grad_norm": 7.080542200664144, + "learning_rate": 5.017659266193242e-09, + "loss": 0.6359, + "step": 13830 + }, + { + "epoch": 1.97, + "grad_norm": 6.705164316139329, + "learning_rate": 4.966072683431078e-09, + "loss": 
0.7204, + "step": 13831 + }, + { + "epoch": 1.97, + "grad_norm": 9.461004526668354, + "learning_rate": 4.9147525218751215e-09, + "loss": 0.6466, + "step": 13832 + }, + { + "epoch": 1.97, + "grad_norm": 9.870532152907064, + "learning_rate": 4.863698784262072e-09, + "loss": 0.6906, + "step": 13833 + }, + { + "epoch": 1.97, + "grad_norm": 7.580328521650989, + "learning_rate": 4.812911473315307e-09, + "loss": 0.7055, + "step": 13834 + }, + { + "epoch": 1.97, + "grad_norm": 9.704936942832898, + "learning_rate": 4.762390591743771e-09, + "loss": 0.6671, + "step": 13835 + }, + { + "epoch": 1.97, + "grad_norm": 8.519406147035156, + "learning_rate": 4.712136142241975e-09, + "loss": 0.6834, + "step": 13836 + }, + { + "epoch": 1.97, + "grad_norm": 9.2163785335527, + "learning_rate": 4.662148127490551e-09, + "loss": 0.6817, + "step": 13837 + }, + { + "epoch": 1.97, + "grad_norm": 8.323783762425496, + "learning_rate": 4.6124265501551465e-09, + "loss": 0.7388, + "step": 13838 + }, + { + "epoch": 1.97, + "grad_norm": 6.342461357586497, + "learning_rate": 4.5629714128880844e-09, + "loss": 0.7304, + "step": 13839 + }, + { + "epoch": 1.97, + "grad_norm": 8.761664839532262, + "learning_rate": 4.513782718327253e-09, + "loss": 0.7164, + "step": 13840 + }, + { + "epoch": 1.97, + "grad_norm": 11.204387744632221, + "learning_rate": 4.4648604690966655e-09, + "loss": 0.6779, + "step": 13841 + }, + { + "epoch": 1.97, + "grad_norm": 8.755454219852501, + "learning_rate": 4.416204667804791e-09, + "loss": 0.6683, + "step": 13842 + }, + { + "epoch": 1.97, + "grad_norm": 12.767001840816668, + "learning_rate": 4.367815317047885e-09, + "loss": 0.7213, + "step": 13843 + }, + { + "epoch": 1.97, + "grad_norm": 9.762301823765368, + "learning_rate": 4.3196924194055525e-09, + "loss": 0.7175, + "step": 13844 + }, + { + "epoch": 1.97, + "grad_norm": 8.940908753634753, + "learning_rate": 4.271835977445738e-09, + "loss": 0.79, + "step": 13845 + }, + { + "epoch": 1.97, + "grad_norm": 11.029857793325036, + 
"learning_rate": 4.224245993720288e-09, + "loss": 0.6495, + "step": 13846 + }, + { + "epoch": 1.97, + "grad_norm": 10.851425578952957, + "learning_rate": 4.17692247076773e-09, + "loss": 0.6592, + "step": 13847 + }, + { + "epoch": 1.97, + "grad_norm": 7.77272565590445, + "learning_rate": 4.129865411112155e-09, + "loss": 0.7534, + "step": 13848 + }, + { + "epoch": 1.98, + "grad_norm": 8.494515308212705, + "learning_rate": 4.083074817263222e-09, + "loss": 0.7219, + "step": 13849 + }, + { + "epoch": 1.98, + "grad_norm": 11.825862379069124, + "learning_rate": 4.036550691717267e-09, + "loss": 0.68, + "step": 13850 + }, + { + "epoch": 1.98, + "grad_norm": 9.463280418437423, + "learning_rate": 3.990293036955084e-09, + "loss": 0.715, + "step": 13851 + }, + { + "epoch": 1.98, + "grad_norm": 7.2776725044621315, + "learning_rate": 3.944301855444144e-09, + "loss": 0.7099, + "step": 13852 + }, + { + "epoch": 1.98, + "grad_norm": 9.109034725967456, + "learning_rate": 3.898577149637484e-09, + "loss": 0.7347, + "step": 13853 + }, + { + "epoch": 1.98, + "grad_norm": 8.81038642977347, + "learning_rate": 3.8531189219742636e-09, + "loss": 0.7561, + "step": 13854 + }, + { + "epoch": 1.98, + "grad_norm": 10.772710314346858, + "learning_rate": 3.807927174878656e-09, + "loss": 0.6459, + "step": 13855 + }, + { + "epoch": 1.98, + "grad_norm": 10.321940282524448, + "learning_rate": 3.76300191076151e-09, + "loss": 0.6688, + "step": 13856 + }, + { + "epoch": 1.98, + "grad_norm": 10.706488333123534, + "learning_rate": 3.718343132018132e-09, + "loss": 0.6649, + "step": 13857 + }, + { + "epoch": 1.98, + "grad_norm": 10.56251053363331, + "learning_rate": 3.6739508410316147e-09, + "loss": 0.6786, + "step": 13858 + }, + { + "epoch": 1.98, + "grad_norm": 8.917393406829824, + "learning_rate": 3.629825040168955e-09, + "loss": 0.7483, + "step": 13859 + }, + { + "epoch": 1.98, + "grad_norm": 9.29262108761795, + "learning_rate": 3.585965731783825e-09, + "loss": 0.6405, + "step": 13860 + }, + { + "epoch": 
1.98, + "grad_norm": 10.430912522348008, + "learning_rate": 3.5423729182160194e-09, + "loss": 0.7268, + "step": 13861 + }, + { + "epoch": 1.98, + "grad_norm": 10.95175167336911, + "learning_rate": 3.4990466017903456e-09, + "loss": 0.6951, + "step": 13862 + }, + { + "epoch": 1.98, + "grad_norm": 10.962163097737704, + "learning_rate": 3.455986784817178e-09, + "loss": 0.6976, + "step": 13863 + }, + { + "epoch": 1.98, + "grad_norm": 8.034224463852219, + "learning_rate": 3.4131934695941226e-09, + "loss": 0.7419, + "step": 13864 + }, + { + "epoch": 1.98, + "grad_norm": 10.132092296786874, + "learning_rate": 3.370666658402688e-09, + "loss": 0.8005, + "step": 13865 + }, + { + "epoch": 1.98, + "grad_norm": 7.953673209785952, + "learning_rate": 3.32840635351217e-09, + "loss": 0.6923, + "step": 13866 + }, + { + "epoch": 1.98, + "grad_norm": 10.049108205471617, + "learning_rate": 3.2864125571757666e-09, + "loss": 0.6846, + "step": 13867 + }, + { + "epoch": 1.98, + "grad_norm": 8.72115987455836, + "learning_rate": 3.2446852716339072e-09, + "loss": 0.7102, + "step": 13868 + }, + { + "epoch": 1.98, + "grad_norm": 9.62951972256199, + "learning_rate": 3.2032244991120342e-09, + "loss": 0.612, + "step": 13869 + }, + { + "epoch": 1.98, + "grad_norm": 10.724047876563459, + "learning_rate": 3.1620302418217118e-09, + "loss": 0.6362, + "step": 13870 + }, + { + "epoch": 1.98, + "grad_norm": 10.12073031392437, + "learning_rate": 3.1211025019595163e-09, + "loss": 0.6468, + "step": 13871 + }, + { + "epoch": 1.98, + "grad_norm": 8.65198674300978, + "learning_rate": 3.0804412817087017e-09, + "loss": 0.7309, + "step": 13872 + }, + { + "epoch": 1.98, + "grad_norm": 8.428052905878413, + "learning_rate": 3.0400465832386426e-09, + "loss": 0.6953, + "step": 13873 + }, + { + "epoch": 1.98, + "grad_norm": 8.152565918580864, + "learning_rate": 2.999918408703173e-09, + "loss": 0.7205, + "step": 13874 + }, + { + "epoch": 1.98, + "grad_norm": 9.603372952676898, + "learning_rate": 2.960056760242247e-09, + 
"loss": 0.6954, + "step": 13875 + }, + { + "epoch": 1.98, + "grad_norm": 8.73291911414532, + "learning_rate": 2.9204616399830522e-09, + "loss": 0.6692, + "step": 13876 + }, + { + "epoch": 1.98, + "grad_norm": 10.573025140455213, + "learning_rate": 2.8811330500372327e-09, + "loss": 0.6749, + "step": 13877 + }, + { + "epoch": 1.98, + "grad_norm": 9.13627034113453, + "learning_rate": 2.8420709925014446e-09, + "loss": 0.6421, + "step": 13878 + }, + { + "epoch": 1.98, + "grad_norm": 6.835117768031386, + "learning_rate": 2.8032754694606867e-09, + "loss": 0.7402, + "step": 13879 + }, + { + "epoch": 1.98, + "grad_norm": 8.25047064146634, + "learning_rate": 2.7647464829827496e-09, + "loss": 0.6617, + "step": 13880 + }, + { + "epoch": 1.98, + "grad_norm": 8.59504338073099, + "learning_rate": 2.7264840351237667e-09, + "loss": 0.6753, + "step": 13881 + }, + { + "epoch": 1.98, + "grad_norm": 9.94157872284604, + "learning_rate": 2.688488127924327e-09, + "loss": 0.6883, + "step": 13882 + }, + { + "epoch": 1.98, + "grad_norm": 9.700682385228026, + "learning_rate": 2.650758763410033e-09, + "loss": 0.6455, + "step": 13883 + }, + { + "epoch": 1.98, + "grad_norm": 9.966460575423547, + "learning_rate": 2.613295943594829e-09, + "loss": 0.6767, + "step": 13884 + }, + { + "epoch": 1.98, + "grad_norm": 5.741963766809402, + "learning_rate": 2.5760996704754516e-09, + "loss": 0.6572, + "step": 13885 + }, + { + "epoch": 1.98, + "grad_norm": 9.77012565367392, + "learning_rate": 2.539169946036979e-09, + "loss": 0.6208, + "step": 13886 + }, + { + "epoch": 1.98, + "grad_norm": 10.681150085503477, + "learning_rate": 2.502506772248392e-09, + "loss": 0.6972, + "step": 13887 + }, + { + "epoch": 1.98, + "grad_norm": 9.58598019359591, + "learning_rate": 2.4661101510659036e-09, + "loss": 0.6932, + "step": 13888 + }, + { + "epoch": 1.98, + "grad_norm": 9.235935513313246, + "learning_rate": 2.4299800844301836e-09, + "loss": 0.7375, + "step": 13889 + }, + { + "epoch": 1.98, + "grad_norm": 
10.239802205706148, + "learning_rate": 2.394116574268024e-09, + "loss": 0.6859, + "step": 13890 + }, + { + "epoch": 1.98, + "grad_norm": 9.224219678693528, + "learning_rate": 2.3585196224928943e-09, + "loss": 0.6985, + "step": 13891 + }, + { + "epoch": 1.98, + "grad_norm": 12.890190298520253, + "learning_rate": 2.3231892310038306e-09, + "loss": 0.6798, + "step": 13892 + }, + { + "epoch": 1.98, + "grad_norm": 10.46028343582119, + "learning_rate": 2.288125401684327e-09, + "loss": 0.6236, + "step": 13893 + }, + { + "epoch": 1.98, + "grad_norm": 12.017676138868564, + "learning_rate": 2.2533281364051083e-09, + "loss": 0.6505, + "step": 13894 + }, + { + "epoch": 1.98, + "grad_norm": 9.575932005496734, + "learning_rate": 2.2187974370219132e-09, + "loss": 0.6238, + "step": 13895 + }, + { + "epoch": 1.98, + "grad_norm": 11.351799922840996, + "learning_rate": 2.1845333053766015e-09, + "loss": 0.7068, + "step": 13896 + }, + { + "epoch": 1.98, + "grad_norm": 8.5247256041944, + "learning_rate": 2.150535743297155e-09, + "loss": 0.7753, + "step": 13897 + }, + { + "epoch": 1.98, + "grad_norm": 8.440641020029002, + "learning_rate": 2.1168047525960135e-09, + "loss": 0.7025, + "step": 13898 + }, + { + "epoch": 1.98, + "grad_norm": 9.849638029486483, + "learning_rate": 2.0833403350728476e-09, + "loss": 0.6857, + "step": 13899 + }, + { + "epoch": 1.98, + "grad_norm": 10.853383827942002, + "learning_rate": 2.0501424925128964e-09, + "loss": 0.7209, + "step": 13900 + }, + { + "epoch": 1.98, + "grad_norm": 8.173510393182124, + "learning_rate": 2.0172112266864106e-09, + "loss": 0.6798, + "step": 13901 + }, + { + "epoch": 1.98, + "grad_norm": 13.081900644919395, + "learning_rate": 1.9845465393497633e-09, + "loss": 0.7599, + "step": 13902 + }, + { + "epoch": 1.98, + "grad_norm": 8.47587990810023, + "learning_rate": 1.9521484322448937e-09, + "loss": 0.6763, + "step": 13903 + }, + { + "epoch": 1.98, + "grad_norm": 9.622835689663908, + "learning_rate": 1.9200169071009746e-09, + "loss": 0.6792, + 
"step": 13904 + }, + { + "epoch": 1.98, + "grad_norm": 8.2403948690797, + "learning_rate": 1.8881519656305245e-09, + "loss": 0.6528, + "step": 13905 + }, + { + "epoch": 1.98, + "grad_norm": 6.425085107136913, + "learning_rate": 1.8565536095344062e-09, + "loss": 0.7486, + "step": 13906 + }, + { + "epoch": 1.98, + "grad_norm": 8.166102592004712, + "learning_rate": 1.825221840496827e-09, + "loss": 0.7391, + "step": 13907 + }, + { + "epoch": 1.98, + "grad_norm": 9.506513253979861, + "learning_rate": 1.794156660189228e-09, + "loss": 0.6155, + "step": 13908 + }, + { + "epoch": 1.98, + "grad_norm": 10.226136529289393, + "learning_rate": 1.763358070269172e-09, + "loss": 0.6401, + "step": 13909 + }, + { + "epoch": 1.98, + "grad_norm": 10.613064980772497, + "learning_rate": 1.7328260723786794e-09, + "loss": 0.6753, + "step": 13910 + }, + { + "epoch": 1.98, + "grad_norm": 8.973106661264808, + "learning_rate": 1.702560668146447e-09, + "loss": 0.6899, + "step": 13911 + }, + { + "epoch": 1.98, + "grad_norm": 8.637564922066156, + "learning_rate": 1.6725618591867388e-09, + "loss": 0.7757, + "step": 13912 + }, + { + "epoch": 1.98, + "grad_norm": 11.910670161869652, + "learning_rate": 1.6428296470993864e-09, + "loss": 0.6283, + "step": 13913 + }, + { + "epoch": 1.98, + "grad_norm": 10.625413138615734, + "learning_rate": 1.613364033470899e-09, + "loss": 0.656, + "step": 13914 + }, + { + "epoch": 1.98, + "grad_norm": 6.72418944799708, + "learning_rate": 1.5841650198722414e-09, + "loss": 0.6583, + "step": 13915 + }, + { + "epoch": 1.98, + "grad_norm": 8.10023221777208, + "learning_rate": 1.555232607861057e-09, + "loss": 0.6779, + "step": 13916 + }, + { + "epoch": 1.98, + "grad_norm": 9.083549946327643, + "learning_rate": 1.5265667989811106e-09, + "loss": 0.7409, + "step": 13917 + }, + { + "epoch": 1.98, + "grad_norm": 9.783546384804733, + "learning_rate": 1.4981675947600694e-09, + "loss": 0.6425, + "step": 13918 + }, + { + "epoch": 1.99, + "grad_norm": 7.312802617393221, + 
"learning_rate": 1.4700349967133876e-09, + "loss": 0.7175, + "step": 13919 + }, + { + "epoch": 1.99, + "grad_norm": 8.194970528460763, + "learning_rate": 1.4421690063415317e-09, + "loss": 0.6829, + "step": 13920 + }, + { + "epoch": 1.99, + "grad_norm": 8.704800440310795, + "learning_rate": 1.4145696251310902e-09, + "loss": 0.6938, + "step": 13921 + }, + { + "epoch": 1.99, + "grad_norm": 8.501689654749699, + "learning_rate": 1.3872368545536641e-09, + "loss": 0.6758, + "step": 13922 + }, + { + "epoch": 1.99, + "grad_norm": 11.222150480049452, + "learning_rate": 1.360170696067531e-09, + "loss": 0.7169, + "step": 13923 + }, + { + "epoch": 1.99, + "grad_norm": 9.823098425152612, + "learning_rate": 1.3333711511154256e-09, + "loss": 0.722, + "step": 13924 + }, + { + "epoch": 1.99, + "grad_norm": 9.971361891519297, + "learning_rate": 1.3068382211278708e-09, + "loss": 0.8066, + "step": 13925 + }, + { + "epoch": 1.99, + "grad_norm": 8.339268943282578, + "learning_rate": 1.2805719075198452e-09, + "loss": 0.6824, + "step": 13926 + }, + { + "epoch": 1.99, + "grad_norm": 10.370519276118479, + "learning_rate": 1.2545722116918956e-09, + "loss": 0.6957, + "step": 13927 + }, + { + "epoch": 1.99, + "grad_norm": 8.875488683132923, + "learning_rate": 1.2288391350306905e-09, + "loss": 0.6497, + "step": 13928 + }, + { + "epoch": 1.99, + "grad_norm": 7.713718761662814, + "learning_rate": 1.2033726789090206e-09, + "loss": 0.6587, + "step": 13929 + }, + { + "epoch": 1.99, + "grad_norm": 9.643792388087014, + "learning_rate": 1.1781728446857988e-09, + "loss": 0.6591, + "step": 13930 + }, + { + "epoch": 1.99, + "grad_norm": 9.479813764412029, + "learning_rate": 1.1532396337038398e-09, + "loss": 0.6494, + "step": 13931 + }, + { + "epoch": 1.99, + "grad_norm": 10.13647972869188, + "learning_rate": 1.1285730472943012e-09, + "loss": 0.8057, + "step": 13932 + }, + { + "epoch": 1.99, + "grad_norm": 6.8973893856474175, + "learning_rate": 1.1041730867716872e-09, + "loss": 0.7564, + "step": 13933 + }, 
+ { + "epoch": 1.99, + "grad_norm": 32.449225923788084, + "learning_rate": 1.080039753438289e-09, + "loss": 0.6237, + "step": 13934 + }, + { + "epoch": 1.99, + "grad_norm": 7.860368797753967, + "learning_rate": 1.0561730485808553e-09, + "loss": 0.7419, + "step": 13935 + }, + { + "epoch": 1.99, + "grad_norm": 8.343525355036519, + "learning_rate": 1.0325729734728118e-09, + "loss": 0.6603, + "step": 13936 + }, + { + "epoch": 1.99, + "grad_norm": 9.131061962106367, + "learning_rate": 1.0092395293720414e-09, + "loss": 0.6533, + "step": 13937 + }, + { + "epoch": 1.99, + "grad_norm": 9.74991437836375, + "learning_rate": 9.86172717523659e-10, + "loss": 0.7429, + "step": 13938 + }, + { + "epoch": 1.99, + "grad_norm": 12.225042071960585, + "learning_rate": 9.633725391583471e-10, + "loss": 0.7769, + "step": 13939 + }, + { + "epoch": 1.99, + "grad_norm": 8.025564643667945, + "learning_rate": 9.408389954917996e-10, + "loss": 0.6961, + "step": 13940 + }, + { + "epoch": 1.99, + "grad_norm": 7.127661573146283, + "learning_rate": 9.18572087725833e-10, + "loss": 0.7761, + "step": 13941 + }, + { + "epoch": 1.99, + "grad_norm": 8.191485031932478, + "learning_rate": 8.965718170483862e-10, + "loss": 0.7305, + "step": 13942 + }, + { + "epoch": 1.99, + "grad_norm": 9.454771916859027, + "learning_rate": 8.748381846324094e-10, + "loss": 0.6417, + "step": 13943 + }, + { + "epoch": 1.99, + "grad_norm": 9.737398228473635, + "learning_rate": 8.533711916375309e-10, + "loss": 0.7238, + "step": 13944 + }, + { + "epoch": 1.99, + "grad_norm": 7.624588633806496, + "learning_rate": 8.321708392083905e-10, + "loss": 0.6947, + "step": 13945 + }, + { + "epoch": 1.99, + "grad_norm": 8.784613155298196, + "learning_rate": 8.112371284763054e-10, + "loss": 0.8009, + "step": 13946 + }, + { + "epoch": 1.99, + "grad_norm": 10.935292079362549, + "learning_rate": 7.90570060557605e-10, + "loss": 0.7418, + "step": 13947 + }, + { + "epoch": 1.99, + "grad_norm": 8.114022412387943, + "learning_rate": 
7.701696365541855e-10, + "loss": 0.6912, + "step": 13948 + }, + { + "epoch": 1.99, + "grad_norm": 8.175584387392632, + "learning_rate": 7.500358575546207e-10, + "loss": 0.6652, + "step": 13949 + }, + { + "epoch": 1.99, + "grad_norm": 12.576561719662875, + "learning_rate": 7.301687246324962e-10, + "loss": 0.7268, + "step": 13950 + }, + { + "epoch": 1.99, + "grad_norm": 9.212368575664971, + "learning_rate": 7.105682388475199e-10, + "loss": 0.6815, + "step": 13951 + }, + { + "epoch": 1.99, + "grad_norm": 9.302107835470503, + "learning_rate": 6.912344012455219e-10, + "loss": 0.7624, + "step": 13952 + }, + { + "epoch": 1.99, + "grad_norm": 9.92981348885861, + "learning_rate": 6.721672128573442e-10, + "loss": 0.7562, + "step": 13953 + }, + { + "epoch": 1.99, + "grad_norm": 8.54424280722412, + "learning_rate": 6.533666746999512e-10, + "loss": 0.7434, + "step": 13954 + }, + { + "epoch": 1.99, + "grad_norm": 8.300091629847595, + "learning_rate": 6.348327877758742e-10, + "loss": 0.7159, + "step": 13955 + }, + { + "epoch": 1.99, + "grad_norm": 8.918480680703501, + "learning_rate": 6.16565553074322e-10, + "loss": 0.7537, + "step": 13956 + }, + { + "epoch": 1.99, + "grad_norm": 9.334503658346161, + "learning_rate": 5.985649715695152e-10, + "loss": 0.7066, + "step": 13957 + }, + { + "epoch": 1.99, + "grad_norm": 10.694449132805183, + "learning_rate": 5.808310442212417e-10, + "loss": 0.6903, + "step": 13958 + }, + { + "epoch": 1.99, + "grad_norm": 9.948511492501636, + "learning_rate": 5.633637719754115e-10, + "loss": 0.663, + "step": 13959 + }, + { + "epoch": 1.99, + "grad_norm": 8.662517146883337, + "learning_rate": 5.461631557635017e-10, + "loss": 0.6796, + "step": 13960 + }, + { + "epoch": 1.99, + "grad_norm": 10.053842806744456, + "learning_rate": 5.292291965031115e-10, + "loss": 0.7294, + "step": 13961 + }, + { + "epoch": 1.99, + "grad_norm": 8.38046311530799, + "learning_rate": 5.125618950979627e-10, + "loss": 0.7461, + "step": 13962 + }, + { + "epoch": 1.99, + "grad_norm": 
9.258627845501044, + "learning_rate": 4.961612524362336e-10, + "loss": 0.7579, + "step": 13963 + }, + { + "epoch": 1.99, + "grad_norm": 9.868105355755372, + "learning_rate": 4.80027269393335e-10, + "loss": 0.589, + "step": 13964 + }, + { + "epoch": 1.99, + "grad_norm": 10.400802763700083, + "learning_rate": 4.6415994682913466e-10, + "loss": 0.6713, + "step": 13965 + }, + { + "epoch": 1.99, + "grad_norm": 12.39950396578402, + "learning_rate": 4.485592855907328e-10, + "loss": 0.6617, + "step": 13966 + }, + { + "epoch": 1.99, + "grad_norm": 9.95969524439656, + "learning_rate": 4.332252865096864e-10, + "loss": 0.6882, + "step": 13967 + }, + { + "epoch": 1.99, + "grad_norm": 7.70113910208048, + "learning_rate": 4.181579504036748e-10, + "loss": 0.6814, + "step": 13968 + }, + { + "epoch": 1.99, + "grad_norm": 9.728890430077039, + "learning_rate": 4.033572780770545e-10, + "loss": 0.663, + "step": 13969 + }, + { + "epoch": 1.99, + "grad_norm": 7.433599760560644, + "learning_rate": 3.8882327031863896e-10, + "loss": 0.7241, + "step": 13970 + }, + { + "epoch": 1.99, + "grad_norm": 12.601481566448047, + "learning_rate": 3.7455592790447416e-10, + "loss": 0.7273, + "step": 13971 + }, + { + "epoch": 1.99, + "grad_norm": 11.189055130576802, + "learning_rate": 3.6055525159450766e-10, + "loss": 0.7429, + "step": 13972 + }, + { + "epoch": 1.99, + "grad_norm": 12.860505318313638, + "learning_rate": 3.468212421359196e-10, + "loss": 0.7218, + "step": 13973 + }, + { + "epoch": 1.99, + "grad_norm": 12.377562725027952, + "learning_rate": 3.333539002614572e-10, + "loss": 0.6687, + "step": 13974 + }, + { + "epoch": 1.99, + "grad_norm": 8.988173186064648, + "learning_rate": 3.201532266888796e-10, + "loss": 0.7007, + "step": 13975 + }, + { + "epoch": 1.99, + "grad_norm": 10.154210713603941, + "learning_rate": 3.0721922212262335e-10, + "loss": 0.7164, + "step": 13976 + }, + { + "epoch": 1.99, + "grad_norm": 12.000604057600416, + "learning_rate": 2.9455188725269204e-10, + "loss": 0.6772, + 
"step": 13977 + }, + { + "epoch": 1.99, + "grad_norm": 7.920523106595557, + "learning_rate": 2.8215122275465636e-10, + "loss": 0.6584, + "step": 13978 + }, + { + "epoch": 1.99, + "grad_norm": 11.878059681857948, + "learning_rate": 2.700172292902092e-10, + "loss": 0.6479, + "step": 13979 + }, + { + "epoch": 1.99, + "grad_norm": 10.187810123502988, + "learning_rate": 2.5814990750550053e-10, + "loss": 0.7098, + "step": 13980 + }, + { + "epoch": 1.99, + "grad_norm": 9.484797473265548, + "learning_rate": 2.465492580344675e-10, + "loss": 0.7547, + "step": 13981 + }, + { + "epoch": 1.99, + "grad_norm": 7.2323063743926275, + "learning_rate": 2.3521528149605954e-10, + "loss": 0.6789, + "step": 13982 + }, + { + "epoch": 1.99, + "grad_norm": 8.673124379285778, + "learning_rate": 2.2414797849368285e-10, + "loss": 0.6552, + "step": 13983 + }, + { + "epoch": 1.99, + "grad_norm": 9.078019628193056, + "learning_rate": 2.133473496185312e-10, + "loss": 0.7344, + "step": 13984 + }, + { + "epoch": 1.99, + "grad_norm": 11.409556558663965, + "learning_rate": 2.0281339544625523e-10, + "loss": 0.6923, + "step": 13985 + }, + { + "epoch": 1.99, + "grad_norm": 10.836100034931809, + "learning_rate": 1.9254611653918287e-10, + "loss": 0.7085, + "step": 13986 + }, + { + "epoch": 1.99, + "grad_norm": 8.70703927521755, + "learning_rate": 1.825455134446541e-10, + "loss": 0.7161, + "step": 13987 + }, + { + "epoch": 1.99, + "grad_norm": 6.158857181439668, + "learning_rate": 1.7281158669557596e-10, + "loss": 0.7034, + "step": 13988 + }, + { + "epoch": 2.0, + "grad_norm": 11.990753237313692, + "learning_rate": 1.6334433681208795e-10, + "loss": 0.7079, + "step": 13989 + }, + { + "epoch": 2.0, + "grad_norm": 10.45237316739421, + "learning_rate": 1.5414376429878642e-10, + "loss": 0.6704, + "step": 13990 + }, + { + "epoch": 2.0, + "grad_norm": 10.782703466207076, + "learning_rate": 1.4520986964583484e-10, + "loss": 0.6929, + "step": 13991 + }, + { + "epoch": 2.0, + "grad_norm": 12.645310154965767, + 
"learning_rate": 1.365426533306291e-10, + "loss": 0.7147, + "step": 13992 + }, + { + "epoch": 2.0, + "grad_norm": 7.441970046443797, + "learning_rate": 1.281421158144669e-10, + "loss": 0.6862, + "step": 13993 + }, + { + "epoch": 2.0, + "grad_norm": 9.824110566428985, + "learning_rate": 1.2000825754643342e-10, + "loss": 0.6724, + "step": 13994 + }, + { + "epoch": 2.0, + "grad_norm": 7.996102443363289, + "learning_rate": 1.1214107896007076e-10, + "loss": 0.6993, + "step": 13995 + }, + { + "epoch": 2.0, + "grad_norm": 10.851501542268002, + "learning_rate": 1.0454058047448811e-10, + "loss": 0.7345, + "step": 13996 + }, + { + "epoch": 2.0, + "grad_norm": 12.204568618730967, + "learning_rate": 9.720676249491689e-11, + "loss": 0.6308, + "step": 13997 + }, + { + "epoch": 2.0, + "grad_norm": 10.658026763063479, + "learning_rate": 9.013962541382093e-11, + "loss": 0.8085, + "step": 13998 + }, + { + "epoch": 2.0, + "grad_norm": 12.983378622796078, + "learning_rate": 8.333916960701072e-11, + "loss": 0.6135, + "step": 13999 + }, + { + "epoch": 2.0, + "grad_norm": 10.65445818463169, + "learning_rate": 7.680539543752919e-11, + "loss": 0.7128, + "step": 14000 + }, + { + "epoch": 2.0, + "grad_norm": 10.461813501714683, + "learning_rate": 7.053830325398637e-11, + "loss": 0.7776, + "step": 14001 + }, + { + "epoch": 2.0, + "grad_norm": 7.078142191840234, + "learning_rate": 6.453789339055938e-11, + "loss": 0.6384, + "step": 14002 + }, + { + "epoch": 2.0, + "grad_norm": 11.27079135604254, + "learning_rate": 5.880416616699247e-11, + "loss": 0.6659, + "step": 14003 + }, + { + "epoch": 2.0, + "grad_norm": 8.9931858634889, + "learning_rate": 5.333712188970719e-11, + "loss": 0.669, + "step": 14004 + }, + { + "epoch": 2.0, + "grad_norm": 11.489773198108908, + "learning_rate": 4.813676084958197e-11, + "loss": 0.7007, + "step": 14005 + }, + { + "epoch": 2.0, + "grad_norm": 9.995022768713532, + "learning_rate": 4.320308332472767e-11, + "loss": 0.7234, + "step": 14006 + }, + { + "epoch": 2.0, + 
"grad_norm": 7.329532375077679, + "learning_rate": 3.8536089577712046e-11, + "loss": 0.6568, + "step": 14007 + }, + { + "epoch": 2.0, + "grad_norm": 8.797008955052958, + "learning_rate": 3.413577985778016e-11, + "loss": 0.7257, + "step": 14008 + }, + { + "epoch": 2.0, + "grad_norm": 7.974550506342163, + "learning_rate": 3.000215439974419e-11, + "loss": 0.7307, + "step": 14009 + }, + { + "epoch": 2.0, + "grad_norm": 10.080671964677295, + "learning_rate": 2.6135213423428284e-11, + "loss": 0.7467, + "step": 14010 + }, + { + "epoch": 2.0, + "grad_norm": 10.747732627651398, + "learning_rate": 2.2534957135889047e-11, + "loss": 0.6966, + "step": 14011 + }, + { + "epoch": 2.0, + "grad_norm": 9.488723155306188, + "learning_rate": 1.9201385728639944e-11, + "loss": 0.6491, + "step": 14012 + }, + { + "epoch": 2.0, + "grad_norm": 10.244972303679793, + "learning_rate": 1.613449937987177e-11, + "loss": 0.6119, + "step": 14013 + }, + { + "epoch": 2.0, + "grad_norm": 7.6688398681452385, + "learning_rate": 1.3334298252787315e-11, + "loss": 0.6667, + "step": 14014 + }, + { + "epoch": 2.0, + "grad_norm": 7.254356660677931, + "learning_rate": 1.0800782496711571e-11, + "loss": 0.6954, + "step": 14015 + }, + { + "epoch": 2.0, + "grad_norm": 7.339859034792685, + "learning_rate": 8.533952247091748e-12, + "loss": 0.7277, + "step": 14016 + }, + { + "epoch": 2.0, + "grad_norm": 9.657395400923656, + "learning_rate": 6.533807624942157e-12, + "loss": 0.7911, + "step": 14017 + }, + { + "epoch": 2.0, + "grad_norm": 8.96281033122207, + "learning_rate": 4.800348736289095e-12, + "loss": 0.6829, + "step": 14018 + }, + { + "epoch": 2.0, + "grad_norm": 8.755251170045462, + "learning_rate": 3.3335756743912984e-12, + "loss": 0.6153, + "step": 14019 + }, + { + "epoch": 2.0, + "grad_norm": 8.790537015171894, + "learning_rate": 2.133488516964377e-12, + "loss": 0.6866, + "step": 14020 + }, + { + "epoch": 2.0, + "grad_norm": 9.145490438019134, + "learning_rate": 1.200087328401267e-12, + "loss": 0.6619, + 
"step": 14021 + }, + { + "epoch": 2.0, + "grad_norm": 8.66742013238092, + "learning_rate": 5.333721575517814e-13, + "loss": 0.7461, + "step": 14022 + }, + { + "epoch": 2.0, + "grad_norm": 8.462078986463382, + "learning_rate": 1.3334304105327988e-13, + "loss": 0.7335, + "step": 14023 + }, + { + "epoch": 2.0, + "grad_norm": 12.479854604509022, + "learning_rate": 0.0, + "loss": 0.7457, + "step": 14024 + }, + { + "epoch": 2.0, + "step": 14024, + "total_flos": 2.311672696260893e+19, + "train_loss": 0.886902466280353, + "train_runtime": 243572.9905, + "train_samples_per_second": 7.369, + "train_steps_per_second": 0.058 + } + ], + "logging_steps": 1.0, + "max_steps": 14024, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 1000, + "total_flos": 2.311672696260893e+19, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/inf2_dir_0531/llava-mistral_videollava_092/training_args.bin b/inf2_dir_0531/llava-mistral_videollava_092/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..77377ac8ba23ccdbe5b970f837c521aa282be18e --- /dev/null +++ b/inf2_dir_0531/llava-mistral_videollava_092/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24437bbf1551874440540cfd1f757f3006df8ec1acc4a62adab9c64777f91171 +size 6712 diff --git a/log-neuron-cc.txt b/log-neuron-cc.txt new file mode 100644 index 0000000000000000000000000000000000000000..8c6a182912e9d53fc12dbc4b3e020100832dbf9f --- /dev/null +++ b/log-neuron-cc.txt @@ -0,0 +1,1186 @@ +2024-06-01T02:43:23Z INFO 1034684 [root]: /root/anaconda3/envs/masp_fastapi/bin/neuronx-cc compile /tmp/tmpmg125iqw/model --framework XLA --target trn1 --output /tmp/tmpmg125iqw/graph.neff +2024-06-01T02:43:23Z INFO 1034816 [root]: XLA detected +2024-06-01T02:43:23Z INFO 1034816 [root]: Pipeline: Frontend HHChecker WalrusDriver BIRLinker Kelper +2024-06-01T02:43:23Z INFO 1034816 [root]: Intermediate files stored in 
/root/masp_serving_inf2/neuronxcc-u5sr5k2_, output in /root/masp_serving_inf2 +2024-06-01T02:43:23Z INFO 1034816 [pipeline.Pipeline.0]: Job Pipeline len(in_states) 1 +2024-06-01T02:43:23Z INFO 1034816 [pipeline.Pipeline.0]: Processing input #0 +2024-06-01T02:43:23Z INFO 1034816 [pipeline.Pipeline.0]: Running pipeline Pipeline.0 +2024-06-01T02:43:23Z INFO 1034816 [pipeline.Pipeline.0]: Starting job job.Frontend.0 +2024-06-01T02:43:23Z INFO 1034816 [job.Frontend.0]: Job Frontend len(in_states) 1 +2024-06-01T02:43:23Z INFO 1034816 [job.Frontend.0]: Processing input #0 +2024-06-01T02:43:23Z INFO 1034816 [job.Frontend.0]: Start model loading +2024-06-01T02:43:23Z INFO 1034816 [job.Frontend.0]: IR signature: e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 for model +2024-06-01T02:43:23Z INFO 1034816 [job.Frontend.0]: Executing: /root/anaconda3/envs/masp_fastapi/lib/python3.9/site-packages/neuronxcc/starfish/bin/hlo2penguin --input /tmp/tmpmg125iqw/model --out-dir ./ --output penguin.py --layers-per-module=1 --coalesce-all-gathers=false --coalesce-reduce-scatters=false --coalesce-all-reduces=false --emit-tensor-level-dropout-ops --emit-tensor-level-rng-ops +2024-06-01T02:43:23Z INFO 1034816 [job.Frontend.0]: INFO: Found memory bound graph +Replaced 0 dropout sequences with OffloadedDropout +INFO: HloMacCount has found 7046430720 +INFO: Traffic has found 56180736 +INFO: AIF 250.849 +HLO Ops used in computation: add broadcast constant custom-call dot parameter reshape transpose tuple +Invoking RemoveOptimizationBarriers pass + +2024-06-01T02:43:23Z INFO 1034816 [job.Frontend.0]: Start tensorization +2024-06-01T02:43:23Z WARNING 1034816 [job.Frontend.0]: TVM not detected. +2024-06-01T02:43:23Z INFO 1034816 [job.Frontend.0]: Num parallel jobs: 1 +2024-06-01T02:43:23Z INFO 1034816 [root/Tensorizer/All]: Enter time region +2024-06-01T02:43:23Z INFO 1034816 [Tensorizer]: Frontend found a single CU. Switching to flat flow. 
+2024-06-01T02:43:23Z INFO 1034816 [Tensorizer]: Building model from Penguin script "penguin.py"... +2024-06-01T02:43:23Z INFO 1034816 [Tensorizer]: Tensorizer options: --disable-bitcasted-transpose --dont-verify-after-all --fp32-cast=matmult-bf16 --mm-transpose-type=fp32 --disable-expensive-checks --disable-max-stride-tiling --enable-replication --max-local-tensor-tile-size-in-bytes=32768 --tensor-layout-p-order=0 --tensor-layout-b-order=1 --enable-advanced-delinearization --weight-coalescing-threshold=512 --enable-bir-converter=enable --sunda-batchnorm --enable-tritium-loopfusion --keep-remat-dma-transpose --enable-softmax-kernel +2024-06-01T02:43:23Z INFO 1034816 [Tensorizer]: Building model from Penguin script "penguin.py"... +2024-06-01T02:43:23Z INFO 1034816 [Tensorizer]: Successfully built model. +2024-06-01T02:43:23Z USER 1034816 [sg0000/Tensorizer/DoNothing]: Running DoNothing +2024-06-01T02:43:23Z INFO 1034816 [DoNothing]: Finished (changed=True) +2024-06-01T02:43:23Z USER 1034816 [sg0000/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2024-06-01T02:43:23Z USER 1034816 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2024-06-01T02:43:23Z INFO 1034816 [AliasDependencyInduction]: Finished (changed=False) +2024-06-01T02:43:23Z USER 1034816 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.000 seconds +2024-06-01T02:43:23Z USER 1034816 [sg0000/Tensorizer/CanonicalizeIR]: Running CanonicalizeIR +2024-06-01T02:43:23Z INFO 1034816 [CanonicalizeIR]: Finished (changed=False) +2024-06-01T02:43:23Z USER 1034816 [sg0000/Tensorizer/CanonicalizeIR]: CanonicalizeIR finished after 0.000 seconds +2024-06-01T02:43:23Z USER 1034816 [sg0000/Tensorizer/LegalizeCCOpLayout]: Running LegalizeCCOpLayout +2024-06-01T02:43:23Z INFO 1034816 [LegalizeCCOpLayout]: Finished (changed=False) +2024-06-01T02:43:23Z USER 1034816 [sg0000/Tensorizer/LegalizeCCOpLayout]: LegalizeCCOpLayout finished after 0.000 
seconds +2024-06-01T02:43:23Z USER 1034816 [sg0000/Tensorizer/ResolveComplicatePredicates]: Running ResolveComplicatePredicates +2024-06-01T02:43:23Z INFO 1034816 [ResolveComplicatePredicates]: Finished (changed=False) +2024-06-01T02:43:23Z USER 1034816 [sg0000/Tensorizer/ResolveComplicatePredicates]: ResolveComplicatePredicates finished after 0.000 seconds +2024-06-01T02:43:23Z USER 1034816 [sg0000/Tensorizer/AffinePredicateResolution]: Running AffinePredicateResolution +2024-06-01T02:43:23Z INFO 1034816 [AffinePredicateResolution]: Finished (changed=False) +2024-06-01T02:43:23Z USER 1034816 [sg0000/Tensorizer/AffinePredicateResolution]: AffinePredicateResolution finished after 0.000 seconds +2024-06-01T02:43:23Z USER 1034816 [sg0000/Tensorizer/EliminateDivs]: Running EliminateDivs +2024-06-01T02:43:23Z INFO 1034816 [EliminateDivs]: Finished (changed=False) +2024-06-01T02:43:23Z USER 1034816 [sg0000/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.000 seconds +2024-06-01T02:43:23Z USER 1034816 [sg0000/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2024-06-01T02:43:23Z INFO 1034816 [PerfectLoopNest]: Finished (changed=False) +2024-06-01T02:43:23Z USER 1034816 [sg0000/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.000 seconds +2024-06-01T02:43:23Z USER 1034816 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2024-06-01T02:43:23Z INFO 1034816 [Simplifier]: Finished (changed=False) +2024-06-01T02:43:23Z USER 1034816 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.000 seconds +2024-06-01T02:43:23Z USER 1034816 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2024-06-01T02:43:23Z INFO 1034816 [GenericAccessSimplifier]: Finished (changed=False) +2024-06-01T02:43:23Z USER 1034816 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.000 seconds +2024-06-01T02:43:23Z USER 1034816 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2024-06-01T02:43:23Z INFO 1034816 
[TCTransform]: Finished (changed=False) +2024-06-01T02:43:23Z USER 1034816 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.000 seconds +2024-06-01T02:43:23Z USER 1034816 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat +2024-06-01T02:43:23Z INFO 1034816 [CommuteConcat]: Finished (changed=False) +2024-06-01T02:43:23Z USER 1034816 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.000 seconds +2024-06-01T02:43:23Z USER 1034816 [sg0000/Tensorizer/LowerTensorOp]: Running LowerTensorOp +2024-06-01T02:43:23Z INFO 1034816 [LowerTensorOp]: Finished (changed=True) +2024-06-01T02:43:23Z USER 1034816 [sg0000/Tensorizer/LowerTensorOp]: LowerTensorOp finished after 0.002 seconds +2024-06-01T02:43:23Z USER 1034816 [sg0000/Tensorizer/ExpandBatchNorm]: Running ExpandBatchNorm +2024-06-01T02:43:23Z INFO 1034816 [ExpandBatchNorm]: Finished (changed=False) +2024-06-01T02:43:23Z USER 1034816 [sg0000/Tensorizer/ExpandBatchNorm]: ExpandBatchNorm finished after 0.000 seconds +2024-06-01T02:43:23Z USER 1034816 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2024-06-01T02:43:23Z INFO 1034816 [TCTransform]: Finished (changed=True) +2024-06-01T02:43:23Z USER 1034816 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.001 seconds +2024-06-01T02:43:23Z USER 1034816 [sg0000/Tensorizer/EliminateDivs]: Running EliminateDivs +2024-06-01T02:43:23Z INFO 1034816 [EliminateDivs]: Finished (changed=False) +2024-06-01T02:43:23Z USER 1034816 [sg0000/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.000 seconds +2024-06-01T02:43:23Z USER 1034816 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2024-06-01T02:43:23Z INFO 1034816 [GenericAccessSimplifier]: Finished (changed=False) +2024-06-01T02:43:23Z USER 1034816 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.000 seconds +2024-06-01T02:43:23Z USER 1034816 [sg0000/Tensorizer/CanonicalizeIR]: Running CanonicalizeIR 
+2024-06-01T02:43:23Z INFO 1034816 [CanonicalizeIR]: Finished (changed=False) +2024-06-01T02:43:23Z USER 1034816 [sg0000/Tensorizer/CanonicalizeIR]: CanonicalizeIR finished after 0.000 seconds +2024-06-01T02:43:23Z USER 1034816 [sg0000/Tensorizer/TensorOpFusion]: Running TensorOpFusion +2024-06-01T02:43:23Z INFO 1034816 [TensorOpFusion]: Finished (changed=False) +2024-06-01T02:43:23Z USER 1034816 [sg0000/Tensorizer/TensorOpFusion]: TensorOpFusion finished after 0.000 seconds +2024-06-01T02:43:23Z USER 1034816 [sg0000/Tensorizer/TensorOpTransform]: Running TensorOpTransform +2024-06-01T02:43:23Z INFO 1034816 [TensorOpTransform]: Finished (changed=True) +2024-06-01T02:43:23Z USER 1034816 [sg0000/Tensorizer/TensorOpTransform]: TensorOpTransform finished after 0.001 seconds +2024-06-01T02:43:23Z USER 1034816 [sg0000/Tensorizer/LateLowerTensorOp]: Running LateLowerTensorOp +2024-06-01T02:43:23Z INFO 1034816 [LateLowerTensorOp]: Finished (changed=False) +2024-06-01T02:43:23Z USER 1034816 [sg0000/Tensorizer/LateLowerTensorOp]: LateLowerTensorOp finished after 0.000 seconds +2024-06-01T02:43:23Z USER 1034816 [sg0000/Tensorizer/MemcpyElimination]: Running MemcpyElimination +2024-06-01T02:43:23Z INFO 1034816 [MemcpyElimination]: Finished (changed=True) +2024-06-01T02:43:23Z USER 1034816 [sg0000/Tensorizer/MemcpyElimination]: MemcpyElimination finished after 0.011 seconds +2024-06-01T02:43:23Z USER 1034816 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2024-06-01T02:43:23Z INFO 1034816 [LoopFusion]: Finished (changed=True) +2024-06-01T02:43:23Z USER 1034816 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 0.003 seconds +2024-06-01T02:43:23Z USER 1034816 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2024-06-01T02:43:23Z INFO 1034816 [Simplifier]: Finished (changed=False) +2024-06-01T02:43:23Z USER 1034816 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.001 seconds +2024-06-01T02:43:23Z USER 1034816 [sg0000/Tensorizer/Delinearization]: Running 
Delinearization +2024-06-01T02:43:23Z INFO 1034816 [Delinearization]: Finished (changed=True) +2024-06-01T02:43:23Z USER 1034816 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.001 seconds +2024-06-01T02:43:23Z USER 1034816 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2024-06-01T02:43:23Z INFO 1034816 [AliasDependencyElimination]: Finished (changed=False) +2024-06-01T02:43:23Z USER 1034816 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2024-06-01T02:43:23Z USER 1034816 [sg0000/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2024-06-01T02:43:23Z INFO 1034816 [DeadStoreElimination]: Finished (changed=False) +2024-06-01T02:43:23Z USER 1034816 [sg0000/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.006 seconds +2024-06-01T02:43:23Z USER 1034816 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2024-06-01T02:43:23Z INFO 1034816 [AliasDependencyInduction]: Finished (changed=False) +2024-06-01T02:43:23Z USER 1034816 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.000 seconds +2024-06-01T02:43:23Z USER 1034816 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2024-06-01T02:43:23Z INFO 1034816 [Simplifier]: Finished (changed=False) +2024-06-01T02:43:23Z USER 1034816 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.001 seconds +2024-06-01T02:43:23Z USER 1034816 [sg0000/Tensorizer/LICM]: Running LICM +2024-06-01T02:43:23Z INFO 1034816 [LICM]: Finished (changed=True) +2024-06-01T02:43:23Z USER 1034816 [sg0000/Tensorizer/LICM]: LICM finished after 0.000 seconds +2024-06-01T02:43:23Z USER 1034816 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2024-06-01T02:43:23Z INFO 1034816 [Delinearization]: Finished (changed=False) +2024-06-01T02:43:23Z USER 1034816 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.000 
seconds +2024-06-01T02:43:23Z USER 1034816 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2024-06-01T02:43:23Z INFO 1034816 [LoopFusion]: Finished (changed=False) +2024-06-01T02:43:23Z USER 1034816 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 0.000 seconds +2024-06-01T02:43:23Z USER 1034816 [sg0000/Tensorizer/SimplifySlice]: Running SimplifySlice +2024-06-01T02:43:23Z INFO 1034816 [SimplifySlice]: Finished (changed=False) +2024-06-01T02:43:23Z USER 1034816 [sg0000/Tensorizer/SimplifySlice]: SimplifySlice finished after 0.000 seconds +2024-06-01T02:43:23Z USER 1034816 [sg0000/Tensorizer/LICM]: Running LICM +2024-06-01T02:43:23Z INFO 1034816 [LICM]: Finished (changed=False) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/LICM]: LICM finished after 0.000 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2024-06-01T02:43:24Z INFO 1034816 [Simplifier]: Finished (changed=False) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.001 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/ValueNumbering]: Running ValueNumbering +2024-06-01T02:43:24Z INFO 1034816 [ValueNumbering]: Finished (changed=False) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.000 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/LICM]: Running LICM +2024-06-01T02:43:24Z INFO 1034816 [LICM]: Finished (changed=False) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/LICM]: LICM finished after 0.000 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/PadElimination]: Running PadElimination +2024-06-01T02:43:24Z INFO 1034816 [PadElimination]: Finished (changed=False) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/PadElimination]: PadElimination finished after 0.000 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2024-06-01T02:43:24Z INFO 
1034816 [Delinearization]: Finished (changed=False) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.000 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2024-06-01T02:43:24Z INFO 1034816 [LoopFusion]: Finished (changed=False) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 0.000 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2024-06-01T02:43:24Z INFO 1034816 [GenericAccessSimplifier]: Finished (changed=False) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.000 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2024-06-01T02:43:24Z INFO 1034816 [Simplifier]: Finished (changed=False) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.001 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/LICM]: Running LICM +2024-06-01T02:43:24Z INFO 1034816 [LICM]: Finished (changed=False) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/LICM]: LICM finished after 0.000 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/ValueNumbering]: Running ValueNumbering +2024-06-01T02:43:24Z INFO 1034816 [ValueNumbering]: Finished (changed=False) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.000 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2024-06-01T02:43:24Z INFO 1034816 [TCTransform]: Finished (changed=False) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.000 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat +2024-06-01T02:43:24Z INFO 1034816 [CommuteConcat]: Finished (changed=False) 
+2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.000 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom +2024-06-01T02:43:24Z INFO 1034816 [RecognizeOpIdiom]: Finished (changed=False) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom finished after 0.001 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/MaskPropagation]: Running MaskPropagation +2024-06-01T02:43:24Z INFO 1034816 [MaskPropagation]: Finished (changed=False) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.001 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/Recompute]: Running Recompute +2024-06-01T02:43:24Z INFO 1034816 [Recompute]: Finished (changed=False) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/Recompute]: Recompute finished after 0.000 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2024-06-01T02:43:24Z INFO 1034816 [DeadCodeElimination]: Finished (changed=False) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.000 seconds +2024-06-01T02:43:24Z INFO 1034816 [Tensorizer]: After optimization: 1 statements +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/DoNothing]: Running DoNothing +2024-06-01T02:43:24Z INFO 1034816 [DoNothing]: Finished (changed=True) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/MutateDataType]: Running MutateDataType +2024-06-01T02:43:24Z INFO 1034816 [MutateDataType]: Finished (changed=False) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/MutateDataType]: MutateDataType finished after 0.000 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/AutoCastTCInputs]: Running 
AutoCastTCInputs +2024-06-01T02:43:24Z INFO 1034816 [AutoCastTCInputs]: Finished (changed=True) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/AutoCastTCInputs]: AutoCastTCInputs finished after 0.001 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2024-06-01T02:43:24Z INFO 1034816 [GenericAccessSimplifier]: Finished (changed=False) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.000 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2024-06-01T02:43:24Z INFO 1034816 [Simplifier]: Finished (changed=False) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.001 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2024-06-01T02:43:24Z INFO 1034816 [AliasDependencyElimination]: Finished (changed=False) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2024-06-01T02:43:24Z INFO 1034816 [DelinearIndices]: Finished (changed=False) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.001 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2024-06-01T02:43:24Z INFO 1034816 [Delinearization]: Finished (changed=False) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.000 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2024-06-01T02:43:24Z INFO 1034816 [DelinearIndices]: Finished (changed=False) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/DelinearIndices]: 
DelinearIndices finished after 0.001 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2024-06-01T02:43:24Z INFO 1034816 [DeadCodeElimination]: Finished (changed=False) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.000 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/InferIntrinsicOnCC]: Running InferIntrinsicOnCC +2024-06-01T02:43:24Z INFO 1034816 [InferIntrinsicOnCC]: Finished (changed=False) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/InferIntrinsicOnCC]: InferIntrinsicOnCC finished after 0.001 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/ResolveAccessConflict]: Running ResolveAccessConflict +2024-06-01T02:43:24Z INFO 1034816 [ResolveAccessConflict]: Finished (changed=False) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/ResolveAccessConflict]: ResolveAccessConflict finished after 0.001 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/LICM]: Running LICM +2024-06-01T02:43:24Z INFO 1034816 [LICM]: Finished (changed=False) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/LICM]: LICM finished after 0.000 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/LocalLayoutOpt]: Running LocalLayoutOpt +2024-06-01T02:43:24Z INFO 1034816 [LocalLayoutOpt]: Finished (changed=False) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/LocalLayoutOpt]: LocalLayoutOpt finished after 0.003 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2024-06-01T02:43:24Z INFO 1034816 [DelinearIndices]: Finished (changed=False) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.001 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/OrigLayoutTilingPipeline]: Running OrigLayoutTilingPipeline +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/GlobalLayoutOpt]: Running 
GlobalLayoutOpt +2024-06-01T02:43:24Z INFO 1034816 [GlobalLayoutOpt]: Finished (changed=True) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/GlobalLayoutOpt]: GlobalLayoutOpt finished after 0.005 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/CanonicalizeDAG]: Running CanonicalizeDAG +2024-06-01T02:43:24Z INFO 1034816 [CanonicalizeDAG]: Finished (changed=False) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/CanonicalizeDAG]: CanonicalizeDAG finished after 0.000 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/FlattenAxesForTiling]: Running FlattenAxesForTiling +2024-06-01T02:43:24Z INFO 1034816 [FlattenAxesForTiling]: Finished (changed=True) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/FlattenAxesForTiling]: FlattenAxesForTiling finished after 0.001 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/SundaSizeTiling]: Running SundaSizeTiling +2024-06-01T02:43:24Z INFO 1034816 [SundaSizeTiling]: Finished (changed=True) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/SundaSizeTiling]: SundaSizeTiling finished after 0.023 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/OrigLayoutTilingPipeline]: OrigLayoutTilingPipeline finished after 0.053 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/TilingProfiler]: Running TilingProfiler +2024-06-01T02:43:24Z INFO 1034816 [TilingProfiler]: Finished (changed=False) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/TilingProfiler]: TilingProfiler finished after 0.002 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2024-06-01T02:43:24Z INFO 1034816 [FlattenMacroLoop]: Finished (changed=True) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.003 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor +2024-06-01T02:43:24Z INFO 1034816 [InferNeuronTensor]: Finished 
(changed=True) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/InferNeuronTensor]: InferNeuronTensor finished after 0.007 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2024-06-01T02:43:24Z INFO 1034816 [NeuronSimplifier]: Finished (changed=True) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.005 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/LICM]: Running LICM +2024-06-01T02:43:24Z INFO 1034816 [LICM]: Finished (changed=True) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/LICM]: LICM finished after 0.001 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/RewriteReplicationMatmul]: Running RewriteReplicationMatmul +2024-06-01T02:43:24Z INFO 1034816 [RewriteReplicationMatmul]: Finished (changed=False) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/RewriteReplicationMatmul]: RewriteReplicationMatmul finished after 0.001 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2024-06-01T02:43:24Z INFO 1034816 [FlattenMacroLoop]: Finished (changed=False) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.002 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2024-06-01T02:43:24Z INFO 1034816 [SimplifyMacroPredicates]: Finished (changed=True) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.038 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/DataLocalityOpt]: Running DataLocalityOpt +2024-06-01T02:43:24Z INFO 1034816 [DataLocalityOpt]: Finished (changed=True) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/DataLocalityOpt]: DataLocalityOpt finished after 0.173 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/DMATilingProfiler]: 
Running DMATilingProfiler +2024-06-01T02:43:24Z INFO 1034816 [DMATilingProfiler]: Finished (changed=False) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/DMATilingProfiler]: DMATilingProfiler finished after 0.001 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2024-06-01T02:43:24Z INFO 1034816 [NeuronSimplifier]: Finished (changed=False) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.002 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/LegalizeSundaMacro]: Running LegalizeSundaMacro +2024-06-01T02:43:24Z INFO 1034816 [LegalizeSundaMacro]: Finished (changed=True) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/LegalizeSundaMacro]: LegalizeSundaMacro finished after 0.001 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2024-06-01T02:43:24Z INFO 1034816 [NeuronSimplifier]: Finished (changed=False) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.002 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2024-06-01T02:43:24Z INFO 1034816 [PerfectLoopNest]: Finished (changed=False) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.001 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2024-06-01T02:43:24Z INFO 1034816 [FlattenMacroLoop]: Finished (changed=True) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.004 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/RewriteWeights]: Running RewriteWeights +2024-06-01T02:43:24Z INFO 1034816 [RewriteWeights]: Finished (changed=True) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/RewriteWeights]: RewriteWeights finished after 0.056 
seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/ReshapeWeights]: Running ReshapeWeights +2024-06-01T02:43:24Z INFO 1034816 [ReshapeWeights]: Finished (changed=True) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/ReshapeWeights]: ReshapeWeights finished after 0.000 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2024-06-01T02:43:24Z INFO 1034816 [FlattenMacroLoop]: Finished (changed=False) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.002 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2024-06-01T02:43:24Z INFO 1034816 [SimplifyMacroPredicates]: Finished (changed=True) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.036 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/InferInitValue]: Running InferInitValue +2024-06-01T02:43:24Z INFO 1034816 [InferInitValue]: Finished (changed=True) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/InferInitValue]: InferInitValue finished after 0.017 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2024-06-01T02:43:24Z INFO 1034816 [NeuronSimplifier]: Finished (changed=False) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.002 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/SimplifyTensor]: Running SimplifyTensor +2024-06-01T02:43:24Z INFO 1034816 [SimplifyTensor]: Finished (changed=True) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/SimplifyTensor]: SimplifyTensor finished after 0.002 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/LICM]: Running LICM +2024-06-01T02:43:24Z INFO 1034816 [LICM]: Finished (changed=False) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/LICM]: LICM 
finished after 0.001 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/SundaISel]: Running SundaISel +2024-06-01T02:43:24Z INFO 1034816 [SundaISel]: Finished (changed=True) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/SundaISel]: SundaISel finished after 0.007 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/PreprocessNkiKernels]: Running PreprocessNkiKernels +2024-06-01T02:43:24Z INFO 1034816 [PreprocessNkiKernels]: Finished (changed=False) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/PreprocessNkiKernels]: PreprocessNkiKernels finished after 0.000 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2024-06-01T02:43:24Z INFO 1034816 [NeuronLoopInterchange]: Finished (changed=True) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.000 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2024-06-01T02:43:24Z INFO 1034816 [NeuronSimplifyPredicates]: Finished (changed=False) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.017 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion +2024-06-01T02:43:24Z INFO 1034816 [NeuronLoopFusion]: Finished (changed=True) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion finished after 0.003 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2024-06-01T02:43:24Z INFO 1034816 [NeuronLoopInterchange]: Finished (changed=False) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.000 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2024-06-01T02:43:24Z INFO 
1034816 [NeuronLICM]: Finished (changed=False) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.001 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2024-06-01T02:43:24Z INFO 1034816 [FactorizeBlkDims]: Finished (changed=False) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.001 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2024-06-01T02:43:24Z INFO 1034816 [NeuronInstComb]: Finished (changed=True) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.003 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2024-06-01T02:43:24Z INFO 1034816 [NeuronValueNumbering]: Finished (changed=False) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.001 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2024-06-01T02:43:24Z INFO 1034816 [NeuronInstComb]: Finished (changed=False) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.001 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/VectorizeDMA]: Running VectorizeDMA +2024-06-01T02:43:24Z INFO 1034816 [VectorizeDMA]: Finished (changed=False) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/VectorizeDMA]: VectorizeDMA finished after 0.002 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2024-06-01T02:43:24Z INFO 1034816 [NeuronSimplifyPredicates]: Finished (changed=False) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.012 seconds +2024-06-01T02:43:24Z USER 1034816 
[sg0000/Tensorizer/LegalizePartitionReduce]: Running LegalizePartitionReduce +2024-06-01T02:43:24Z INFO 1034816 [LegalizePartitionReduce]: Finished (changed=False) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/LegalizePartitionReduce]: LegalizePartitionReduce finished after 0.000 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/DeConcat]: Running DeConcat +2024-06-01T02:43:24Z INFO 1034816 [DeConcat]: Finished (changed=False) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/DeConcat]: DeConcat finished after 0.000 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion +2024-06-01T02:43:24Z INFO 1034816 [PartialSimdFusion]: Finished (changed=False) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/PartialSimdFusion]: PartialSimdFusion finished after 0.001 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/TritiumFusion]: Running TritiumFusion +2024-06-01T02:43:24Z INFO 1034816 [TritiumFusion]: Finished (changed=True) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/TritiumFusion]: TritiumFusion finished after 0.015 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion +2024-06-01T02:43:24Z INFO 1034816 [CCOpFusion]: Finished (changed=True) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.003 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/VectorizeMatMult]: Running VectorizeMatMult +2024-06-01T02:43:24Z INFO 1034816 [VectorizeMatMult]: Finished (changed=False) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/VectorizeMatMult]: VectorizeMatMult finished after 0.002 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion +2024-06-01T02:43:24Z INFO 1034816 [PartialLoopFusion]: Finished (changed=False) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/PartialLoopFusion]: PartialLoopFusion finished after 0.002 
seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2024-06-01T02:43:24Z INFO 1034816 [NeuronLICM]: Finished (changed=False) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.001 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/LowerTranspose]: Running LowerTranspose +2024-06-01T02:43:24Z INFO 1034816 [LowerTranspose]: Finished (changed=True) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.007 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2024-06-01T02:43:24Z INFO 1034816 [LateNeuronInstComb]: Finished (changed=False) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.001 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/SplitAccGrp]: Running SplitAccGrp +2024-06-01T02:43:24Z INFO 1034816 [SplitAccGrp]: Finished (changed=False) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/SplitAccGrp]: SplitAccGrp finished after 0.001 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/SpillPSum]: Running SpillPSum +2024-06-01T02:43:24Z INFO 1034816 [SpillPSum]: Finished (changed=True) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/SpillPSum]: SpillPSum finished after 0.005 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2024-06-01T02:43:24Z INFO 1034816 [LowerIntrinsics]: Finished (changed=False) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.001 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/LegalizeType]: Running LegalizeType +2024-06-01T02:43:24Z INFO 1034816 [LegalizeType]: Finished (changed=True) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/LegalizeType]: LegalizeType finished after 0.012 seconds +2024-06-01T02:43:24Z USER 
1034816 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2024-06-01T02:43:24Z INFO 1034816 [NeuronLICM]: Finished (changed=False) +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.001 seconds +2024-06-01T02:43:24Z USER 1034816 [sg0000/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2024-06-01T02:43:25Z INFO 1034816 [InferPSumTensor]: Finished (changed=False) +2024-06-01T02:43:25Z USER 1034816 [sg0000/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.004 seconds +2024-06-01T02:43:25Z USER 1034816 [sg0000/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2024-06-01T02:43:25Z INFO 1034816 [WeightCoalescing]: Finished (changed=False) +2024-06-01T02:43:25Z USER 1034816 [sg0000/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.001 seconds +2024-06-01T02:43:25Z USER 1034816 [sg0000/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2024-06-01T02:43:25Z INFO 1034816 [LegalizeSundaAccess]: Finished (changed=False) +2024-06-01T02:43:25Z USER 1034816 [sg0000/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.009 seconds +2024-06-01T02:43:25Z USER 1034816 [sg0000/Tensorizer/RelaxPredicates]: Running RelaxPredicates +2024-06-01T02:43:25Z INFO 1034816 [RelaxPredicates]: Finished (changed=False) +2024-06-01T02:43:25Z USER 1034816 [sg0000/Tensorizer/RelaxPredicates]: RelaxPredicates finished after 0.002 seconds +2024-06-01T02:43:25Z USER 1034816 [sg0000/Tensorizer/TensorInitialization]: Running TensorInitialization +2024-06-01T02:43:25Z INFO 1034816 [TensorInitialization]: Finished (changed=False) +2024-06-01T02:43:25Z USER 1034816 [sg0000/Tensorizer/TensorInitialization]: TensorInitialization finished after 0.003 seconds +2024-06-01T02:43:25Z USER 1034816 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2024-06-01T02:43:25Z INFO 1034816 [NeuronSimplifyPredicates]: Finished (changed=True) +2024-06-01T02:43:25Z USER 1034816 
[sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.026 seconds +2024-06-01T02:43:25Z USER 1034816 [sg0000/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2024-06-01T02:43:25Z INFO 1034816 [ExpandISAMacro]: Finished (changed=False) +2024-06-01T02:43:25Z USER 1034816 [sg0000/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.002 seconds +2024-06-01T02:43:25Z USER 1034816 [sg0000/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2024-06-01T02:43:25Z INFO 1034816 [SimplifyNeuronTensor]: Finished (changed=False) +2024-06-01T02:43:25Z USER 1034816 [sg0000/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.002 seconds +2024-06-01T02:43:25Z USER 1034816 [sg0000/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2024-06-01T02:43:25Z INFO 1034816 [DMALocalityOpt]: Finished (changed=False) +2024-06-01T02:43:25Z USER 1034816 [sg0000/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.000 seconds +2024-06-01T02:43:25Z USER 1034816 [sg0000/Tensorizer/DataStreaming]: Running DataStreaming +2024-06-01T02:43:25Z INFO 1034816 [DataStreaming]: Finished (changed=True) +2024-06-01T02:43:25Z USER 1034816 [sg0000/Tensorizer/DataStreaming]: DataStreaming finished after 0.001 seconds +2024-06-01T02:43:25Z USER 1034816 [sg0000/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2024-06-01T02:43:25Z INFO 1034816 [SFKVectorizer]: Finished (changed=True) +2024-06-01T02:43:25Z USER 1034816 [sg0000/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.052 seconds +2024-06-01T02:43:25Z USER 1034816 [sg0000/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2024-06-01T02:43:25Z INFO 1034816 [LateLegalizeInst]: Finished (changed=False) +2024-06-01T02:43:25Z USER 1034816 [sg0000/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.001 seconds +2024-06-01T02:43:25Z USER 1034816 [sg0000/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2024-06-01T02:43:25Z INFO 1034816 [CoalesceCCOp]: Finished 
(changed=False) +2024-06-01T02:43:25Z USER 1034816 [sg0000/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.001 seconds +2024-06-01T02:43:25Z USER 1034816 [sg0000/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2024-06-01T02:43:25Z INFO 1034816 [SimpleAllReduceTiling]: Finished (changed=False) +2024-06-01T02:43:25Z USER 1034816 [sg0000/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.001 seconds +2024-06-01T02:43:25Z USER 1034816 [sg0000/Tensorizer/StaticProfiler]: Running StaticProfiler +2024-06-01T02:43:25Z INFO 1034816 [StaticProfiler]: Finished (changed=False) +2024-06-01T02:43:25Z USER 1034816 [sg0000/Tensorizer/StaticProfiler]: StaticProfiler finished after 0.002 seconds +2024-06-01T02:43:25Z USER 1034816 [sg0000/Tensorizer/SplitAPUnionSets]: Running SplitAPUnionSets +2024-06-01T02:43:25Z INFO 1034816 [SplitAPUnionSets]: Finished (changed=True) +2024-06-01T02:43:25Z USER 1034816 [sg0000/Tensorizer/SplitAPUnionSets]: SplitAPUnionSets finished after 0.036 seconds +2024-06-01T02:43:25Z USER 1034816 [sg0000/Tensorizer/DumpGraphAndMetadata]: Running DumpGraphAndMetadata +2024-06-01T02:43:25Z INFO 1034816 [DumpGraphAndMetadata]: Finished (changed=False) +2024-06-01T02:43:25Z USER 1034816 [sg0000/Tensorizer/DumpGraphAndMetadata]: DumpGraphAndMetadata finished after 0.002 seconds +2024-06-01T02:43:25Z USER 1034816 [sg0000/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2024-06-01T02:43:25Z INFO 1034816 [BirCodeGenLoop]: Finished (changed=False) +2024-06-01T02:43:25Z USER 1034816 [sg0000/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 0.021 seconds +2024-06-01T02:43:25Z INFO 1034816 [Tensorizer]: BirCodeGen estimate #instances=6943 in sg0000 +2024-06-01T02:43:25Z INFO 1034816 [Tensorizer]: IR signature: a07f846eca6e7f3b8eded4d821cd6e82700e1632a4d7987aea9ec039fcc58b7d for sg0000/Tensorizer +2024-06-01T02:43:25Z INFO 1034816 [Tensorizer]: Weights total number of bytes: 6504448 +2024-06-01T02:43:25Z INFO 
1034816 [root/Tensorizer/All]: Exit time region: delta=1.580s +2024-06-01T02:43:25Z INFO 1034816 [job.Frontend.0]: End tensorization +2024-06-01T02:43:25Z INFO 1034816 [job.Frontend.0]: Network input: input0 +2024-06-01T02:43:25Z INFO 1034816 [job.Frontend.0]: wrote bir.json +2024-06-01T02:43:25Z INFO 1034816 [job.Frontend.0]: wrote tensor_map.json +2024-06-01T02:43:25Z INFO 1034816 [job.Frontend.0]: Job #0 finished +2024-06-01T02:43:25Z INFO 1034816 [pipeline.Pipeline.0]: Finished job job.Frontend.0 +2024-06-01T02:43:25Z INFO 1034816 [pipeline.Pipeline.0]: Starting job job.HHChecker.0 +2024-06-01T02:43:25Z INFO 1034816 [job.HHChecker.0]: Job HHChecker len(in_states) 1 +2024-06-01T02:43:25Z INFO 1034816 [job.HHChecker.0]: Processing input #0 +2024-06-01T02:43:25Z INFO 1034816 [job.HHChecker.0]: Job #0 finished +2024-06-01T02:43:25Z INFO 1034816 [pipeline.Pipeline.0]: Finished job job.HHChecker.0 +2024-06-01T02:43:25Z INFO 1034816 [pipeline.Pipeline.0]: Starting job job.WalrusDriver.0 +2024-06-01T02:43:25Z INFO 1034816 [job.WalrusDriver.0]: BackendDriver has 1 states +2024-06-01T02:43:25Z INFO 1034816 [job.WalrusDriver.0]: BackendDriver: no partitions found. Switching to flat flow. 
+2024-06-01T02:43:25Z INFO 1034816 [job.WalrusDriver.0]: Job WalrusDriver len(in_states) 1 +2024-06-01T02:43:25Z INFO 1034816 [job.WalrusDriver.0]: Processing input #0 +2024-06-01T02:43:25Z INFO 1034816 [job.WalrusDriver.0]: BackendDriver in_state.num_states 1 +2024-06-01T02:43:25Z INFO 1034816 [job.WalrusDriver.0]: Executing /root/anaconda3/envs/masp_fastapi/lib/python3.9/site-packages/neuronxcc/starfish/bin/walrus_driver --optlevel 2 --allocator coloring --verbose 60 --logfile-verbose 20 --logfile /root/masp_serving_inf2/log-neuron-cc.txt --sync-pool-dve -i bir.json --min_split_size 10240 --skip_split_vns '' --no_split_dram --split_huge_dram_tensor 1.0 --preprocessing_only --max_tensorizer_distance 64 --pack_same_shape_only --instruction_fetch_latency 511 --max-partitions 1 --policy 3 --auxflag 0 --interleave none --schedule-delayed-latency 1 --postsched-mm-accum-reorder=false --max-load-lower-bound 0.14 --allreduce-buffer-size 500 --dram-page-size 512 --dram-rotation-size -1 --allreduce-rotation-dis 8 --repeat-load-thres 4 --save-len-thres 512 --save-dma-cnt-thres 32 --relaxed-order=true --enable-anti-dependence-reduction=false --num-semaphores-per-queue 16 --numcores 1 --act-root-json /root/anaconda3/envs/masp_fastapi/lib/python3.9/site-packages/neuronxcc/pwp/pwp_bin_trainium/act_info.json --dve-root-json /root/anaconda3/envs/masp_fastapi/lib/python3.9/site-packages/neuronxcc/dve/dve_bin/dve_info.json --unified-backend-and-legacy-codegen --tensor-map tensor_map.json --enable-verifier=true --enable-birsim=false --enable-birsim-sync-only=false --enable-data-race-checker=true --enable-new-backend=true --inject-error=NONE --neff-output-filename /tmp/tmpmg125iqw/graph.neff +2024-06-01T02:43:25Z INFO 1034816 [job.WalrusDriver.0]: Working directory is /root/masp_serving_inf2/neuronxcc-u5sr5k2_/sg00 +2024-06-01T02:43:25Z INFO 1034816 [job.WalrusDriver.0]: use_logger=False +2024-06-01T02:43:25Z INFO 1035035 [BackendDriver]: max_allowed_parallelism=192 
+2024-06-01T02:43:25Z INFO 1035035 [BackendDriver]: Backend driver mtBackend: false numModules: 1 Cwd: "/root/masp_serving_inf2/neuronxcc-u5sr5k2_/sg00" +2024-06-01T02:43:25Z USER 1035035 [BackendDriver]: Running mod_parallel_pass +2024-06-01T02:43:25Z INFO 1035035 [BackendDriver]: Inputs to mod_parallel_pass: modules=1 functions=1 allocs=17 blocks=1 instructions=6 Max writers: 6 Max Readers: 6 +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: Running rewrite_matmult_sparse +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Inputs to rewrite_matmult_sparse: modules=1 functions=1 allocs=17 blocks=1 instructions=6 Max writers: 6 Max Readers: 6 +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: rewrite_matmult_sparse finished after 0.000 seconds +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: curr_vmrss: 30mb, ru_maxrss: 103mb (delta=0mb) +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 17 memory location(s), 1 block(s), and 6 instruction(s). Max writers: 6 Max Readers: 6 +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: Running birverifier +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=17 blocks=1 instructions=6 Max writers: 6 Max Readers: 6 +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: birverifier finished after 0.005 seconds +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: curr_vmrss: 32mb, ru_maxrss: 103mb (delta=0mb) +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 17 memory location(s), 1 block(s), and 6 instruction(s). 
Max writers: 6 Max Readers: 6 +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: Running expand_replication +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=17 blocks=1 instructions=6 Max writers: 6 Max Readers: 6 +2024-06-01T02:43:25Z INFO 1035035 [ExpandReplication]: Found 0 replicated matmults +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: expand_replication finished after 0.000 seconds +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: curr_vmrss: 32mb, ru_maxrss: 103mb (delta=0mb) +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 17 memory location(s), 1 block(s), and 6 instruction(s). Max writers: 6 Max Readers: 6 +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: Running unroll +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=17 blocks=1 instructions=6 Max writers: 6 Max Readers: 6 +2024-06-01T02:43:25Z INFO 1035035 [Unroll]: INFO (Unroll) Start unrolling at Sat Jun 1 02:43:25 2024 +2024-06-01T02:43:25Z INFO 1035035 [Unroll]: INFO (Unroll) adjusting parallelfor +2024-06-01T02:43:25Z INFO 1035035 [Unroll]: INFO (Unroll) DONE unrolling Sat Jun 1 02:43:25 2024 + +2024-06-01T02:43:25Z INFO 1035035 [Unroll]: sg0000 Instruction count after Unroll: +2024-06-01T02:43:25Z INFO 1035035 [Unroll]: Total count: 2165 +2024-06-01T02:43:25Z INFO 1035035 [Unroll]: Matmult: 1644 +2024-06-01T02:43:25Z INFO 1035035 [Unroll]: GenericCopy: 180 +2024-06-01T02:43:25Z INFO 1035035 [Unroll]: TensorScalarPtr: 160 +2024-06-01T02:43:25Z INFO 1035035 [Unroll]: Save: 144 +2024-06-01T02:43:25Z INFO 1035035 [Unroll]: Load: 37 +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: unroll finished after 0.016 seconds +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: curr_vmrss: 43mb, ru_maxrss: 103mb (delta=0mb) +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 692 memory location(s), 
1 block(s), and 2165 instruction(s). Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: Running psum_legalization +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=692 blocks=1 instructions=2165 Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: psum_legalization finished after 0.000 seconds +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: curr_vmrss: 43mb, ru_maxrss: 103mb (delta=0mb) +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 692 memory location(s), 1 block(s), and 2165 instruction(s). Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: Running error_injector +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=692 blocks=1 instructions=2165 Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z WARNING 1035035 [ErrorInjector]: Unrecognized injected error value "0" +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: error_injector finished after 0.000 seconds +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: curr_vmrss: 43mb, ru_maxrss: 103mb (delta=0mb) +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 692 memory location(s), 1 block(s), and 2165 instruction(s). 
Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: Running constant_propagate +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=692 blocks=1 instructions=2165 Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z INFO 1035035 [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2024-06-01T02:43:25Z INFO 1035035 [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: constant_propagate finished after 0.001 seconds +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: curr_vmrss: 43mb, ru_maxrss: 103mb (delta=0mb) +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 692 memory location(s), 1 block(s), and 2165 instruction(s). Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: Running vn_splitter +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=692 blocks=1 instructions=2165 Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z INFO 1035035 [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 0 +2024-06-01T02:43:25Z INFO 1035035 [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2024-06-01T02:43:25Z INFO 1035035 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2024-06-01T02:43:25Z INFO 1035035 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2024-06-01T02:43:25Z INFO 1035035 [VNSplitterPass]: INFO (VNSplitter) Time: 0 seconds +2024-06-01T02:43:25Z INFO 1035035 [VNSplitterPass]: INFO (VerticalFusion) Time: 0 seconds +2024-06-01T02:43:25Z INFO 1035035 [VNSplitterPass]: INFO (ShrinkDN) Time: 0 seconds +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: vn_splitter finished after 0.001 
seconds +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: curr_vmrss: 43mb, ru_maxrss: 103mb (delta=0mb) +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 692 memory location(s), 1 block(s), and 2165 instruction(s). Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: Running lower_ac +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=692 blocks=1 instructions=2165 Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z INFO 1035035 [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: lower_ac finished after 0.000 seconds +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: curr_vmrss: 43mb, ru_maxrss: 103mb (delta=0mb) +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 692 memory location(s), 1 block(s), and 2165 instruction(s). Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: Running input_dma_coalescing +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=692 blocks=1 instructions=2165 Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z INFO 1035035 [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: input_dma_coalescing finished after 0.000 seconds +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: curr_vmrss: 43mb, ru_maxrss: 103mb (delta=0mb) +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 692 memory location(s), 1 block(s), and 2165 instruction(s). 
Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: Running early_peephole_opts +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Inputs to early_peephole_opts: modules=1 functions=1 allocs=692 blocks=1 instructions=2165 Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z INFO 1035035 [EarlyPeepholeOpts]: PeepholeOpts enabled? ActivationAccumulate: true +2024-06-01T02:43:25Z INFO 1035035 [EarlyPeepholeOpts]: Activation Accumulate: 0 +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: early_peephole_opts finished after 0.000 seconds +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: curr_vmrss: 43mb, ru_maxrss: 103mb (delta=0mb) +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 692 memory location(s), 1 block(s), and 2165 instruction(s). Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: Running pre_sched +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=692 blocks=1 instructions=2165 Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z INFO 1035035 [PreSched]: Start PRE scheduling 2 cores: 1 at: Sat Jun 1 02:43:25 2024 +2024-06-01T02:43:25Z INFO 1035035 [LayerSpiller]: LayerSpill: Start... +2024-06-01T02:43:25Z INFO 1035035 [LayerSpiller]: LayerSpill: Found 0 Splits CCs +2024-06-01T02:43:25Z INFO 1035035 [LayerSpiller]: Grouped CCs to 0 clusters. +2024-06-01T02:43:25Z INFO 1035035 [LayerSpiller]: LayerSpill: To Spill 0 multi-layer tensors +2024-06-01T02:43:25Z INFO 1035035 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2024-06-01T02:43:25Z INFO 1035035 [LayerSpiller]: LayerSpill: Done. 
+2024-06-01T02:43:25Z INFO 1035035 [PreSched]: Start split live ranges Sat Jun 1 02:43:25 2024 +2024-06-01T02:43:25Z INFO 1035035 [PreSched]: No split opportunities: +2024-06-01T02:43:25Z INFO 1035035 [PreSched]: End split live ranges Sat Jun 1 02:43:25 2024 +2024-06-01T02:43:25Z INFO 1035035 [PreSched]: Strt remove redundncies Sat Jun 1 02:43:25 2024 +2024-06-01T02:43:25Z INFO 1035035 [PreSched]: remove_redundant_memsets +2024-06-01T02:43:25Z INFO 1035035 [PreSched]: remove_redundant_memsets: 0 +2024-06-01T02:43:25Z INFO 1035035 [PreSched]: remove_redundant_loads +2024-06-01T02:43:25Z INFO 1035035 [PreSched]: remove_redundant_loads: 0 +2024-06-01T02:43:25Z INFO 1035035 [PreSched]: End remove redundncies Sat Jun 1 02:43:25 2024 +2024-06-01T02:43:25Z INFO 1035035 [PreSched]: Start DCE Sat Jun 1 02:43:25 2024 +2024-06-01T02:43:25Z INFO 1035035 [PreSched]: End DCE Sat Jun 1 02:43:25 2024 +2024-06-01T02:43:25Z INFO 1035035 [PreSched]: Start build flow dependencies Sat Jun 1 02:43:25 2024 +2024-06-01T02:43:25Z INFO 1035035 [build_flow_deps]: Start build fdeps. 
Invocation: 1Sat Jun 1 02:43:25 2024 +2024-06-01T02:43:25Z INFO 1035035 [build_flow_deps]: Allocs: 692 instructions: 2165 +2024-06-01T02:43:25Z INFO 1035035 [build_flow_deps]: Build fdeps inserted 7732 edges +2024-06-01T02:43:25Z INFO 1035035 [build_flow_deps]: Done build fdeps 7732 Sat Jun 1 02:43:25 2024 +2024-06-01T02:43:25Z INFO 1035035 [PreSched]: End build flow dependencies Sat Jun 1 02:43:25 2024 +2024-06-01T02:43:25Z INFO 1035035 [PreSched]: Start remove useless insts Sat Jun 1 02:43:25 2024 +2024-06-01T02:43:25Z INFO 1035035 [PreSched]: remove_useless_insts +2024-06-01T02:43:25Z INFO 1035035 [PreSched]: remove Useless Instructions: 0 +2024-06-01T02:43:25Z INFO 1035035 [PreSched]: End remove useless insts Sat Jun 1 02:43:25 2024 +2024-06-01T02:43:25Z INFO 1035035 [PreSched]: DONE PRE scheduling Sat Jun 1 02:43:25 2024 +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: pre_sched finished after 0.027 seconds +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: curr_vmrss: 43mb, ru_maxrss: 103mb (delta=0mb) +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 692 memory location(s), 1 block(s), and 2165 instruction(s). Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: Running tensor_copy_elim +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=692 blocks=1 instructions=2165 Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z INFO 1035035 [TensorCopyElim]: Tensor CP elimination: 0 +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: tensor_copy_elim finished after 0.005 seconds +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: curr_vmrss: 43mb, ru_maxrss: 103mb (delta=0mb) +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 692 memory location(s), 1 block(s), and 2165 instruction(s). 
Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: Running mm_packing +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Inputs to mm_packing: modules=1 functions=1 allocs=692 blocks=1 instructions=2165 Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z INFO 1035035 [MMPacking]: INFO (MMPack) Running the preprocessing step. +2024-06-01T02:43:25Z INFO 1035035 [MMPacking]: INFO (MMPack) mlBPCG size 516, CCS = 3 +2024-06-01T02:43:25Z INFO 1035035 [MMPacking]: INFO (MMPack) agRGCG size 3304 +2024-06-01T02:43:25Z INFO 1035035 [MMPackingPass]: INFO (MMPacking) Time: 0.013 seconds +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: mm_packing finished after 0.014 seconds +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: curr_vmrss: 44mb, ru_maxrss: 103mb (delta=0mb) +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 692 memory location(s), 1 block(s), and 2165 instruction(s). Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: Running coloring_allocator_psum +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=692 blocks=1 instructions=2165 Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z INFO 1035035 [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 13287424 +2024-06-01T02:43:25Z INFO 1035035 [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 2844 bytes +2024-06-01T02:43:25Z INFO 1035035 [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 36700160 +2024-06-01T02:43:25Z INFO 1035035 [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 2048 bytes +2024-06-01T02:43:25Z INFO 1035035 [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 0 +2024-06-01T02:43:25Z INFO 1035035 [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 0 bytes +2024-06-01T02:43:25Z INFO 1035035 [ColoringAllocator::Rep]: Allocating functions 
+2024-06-01T02:43:25Z INFO 1035035 [ColoringAllocator::Rep]: linearize and check +2024-06-01T02:43:25Z INFO 1035035 [PSUM_Allocator]: allocating PSUM +2024-06-01T02:43:25Z INFO 1035035 [PSUM_Allocator]: main loop +2024-06-01T02:43:25Z INFO 1035035 [PSUM_Allocator]: renumber locations +2024-06-01T02:43:25Z INFO 1035035 [PSUM_Allocator]: size = 340 +2024-06-01T02:43:25Z INFO 1035035 [PSUM_Allocator]: build_no_bitmap start +2024-06-01T02:43:25Z INFO 1035035 [PSUM_Allocator]: 100% PSUM demand before spilling +2024-06-01T02:43:25Z INFO 1035035 [PSUM_Allocator]: PSUM high-water mark = 8 tensors +2024-06-01T02:43:25Z INFO 1035035 [PSUM_Allocator]: found 1070 edges +2024-06-01T02:43:25Z INFO 1035035 [PSUM_Allocator]: mean: 6.29412 +2024-06-01T02:43:25Z INFO 1035035 [PSUM_Allocator]: median: 6.99857 +2024-06-01T02:43:25Z INFO 1035035 [PSUM_Allocator]: adjacency vectors require 8560 bytes +2024-06-01T02:43:25Z INFO 1035035 [PSUM_Allocator]: build_no_bitmap done +2024-06-01T02:43:25Z INFO 1035035 [PSUM_Allocator]: find costs +2024-06-01T02:43:25Z INFO 1035035 [PSUM_Allocator]: best-of-n loop, heuristic = 0 +2024-06-01T02:43:25Z INFO 1035035 [PSUM_Allocator]: simplify interference graph +2024-06-01T02:43:25Z INFO 1035035 [PSUM_Allocator]: initialize low and high +2024-06-01T02:43:25Z INFO 1035035 [PSUM_Allocator]: lo = 340 +2024-06-01T02:43:25Z INFO 1035035 [PSUM_Allocator]: hi = 0 +2024-06-01T02:43:25Z INFO 1035035 [PSUM_Allocator]: inf = 0 +2024-06-01T02:43:25Z INFO 1035035 [PSUM_Allocator]: total = 340 +2024-06-01T02:43:25Z INFO 1035035 [PSUM_Allocator]: simplify +2024-06-01T02:43:25Z INFO 1035035 [PSUM_Allocator]: new candidates = 0 +2024-06-01T02:43:25Z INFO 1035035 [PSUM_Allocator]: select ranges +2024-06-01T02:43:25Z INFO 1035035 [PSUM_Allocator]: no more spills +2024-06-01T02:43:25Z INFO 1035035 [PSUM_Allocator]: PSUM score = 0 (lower is better) +2024-06-01T02:43:25Z INFO 1035035 [PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2024-06-01T02:43:25Z INFO 
1035035 [PSUM_Allocator]: 100% PSUM utilization after allocation +2024-06-01T02:43:25Z INFO 1035035 [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 13287424 +2024-06-01T02:43:25Z INFO 1035035 [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 2844 bytes +2024-06-01T02:43:25Z INFO 1035035 [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 36700160 +2024-06-01T02:43:25Z INFO 1035035 [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 2048 bytes +2024-06-01T02:43:25Z INFO 1035035 [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 0 +2024-06-01T02:43:25Z INFO 1035035 [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 0 bytes +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: coloring_allocator_psum finished after 0.008 seconds +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: curr_vmrss: 44mb, ru_maxrss: 103mb (delta=0mb) +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 692 memory location(s), 1 block(s), and 2165 instruction(s). Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: Running dma_optimization_psum +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=692 blocks=1 instructions=2165 Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z INFO 1035035 [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2024-06-01T02:43:25Z INFO 1035035 [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: dma_optimization_psum finished after 0.003 seconds +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: curr_vmrss: 44mb, ru_maxrss: 103mb (delta=0mb) +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 692 memory location(s), 1 block(s), and 2165 instruction(s). 
Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: Running address_rotation_psum +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=692 blocks=1 instructions=2165 Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z INFO 1035035 [DMAOptimizationBase]: PSUM Rotation rotated 0 PSUM Banks +2024-06-01T02:43:25Z INFO 1035035 [DMAOptimizationBase]: PSUM Rotation rotated 0 PSUM Banks +2024-06-01T02:43:25Z INFO 1035035 [DMAOptimizationBase]: PSUM Rotation rotated 0 PSUM Banks +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: address_rotation_psum finished after 0.009 seconds +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: curr_vmrss: 44mb, ru_maxrss: 103mb (delta=0mb) +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 692 memory location(s), 1 block(s), and 2165 instruction(s). Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: Running coloring_allocator_sb +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=692 blocks=1 instructions=2165 Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z INFO 1035035 [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 13287424 +2024-06-01T02:43:25Z INFO 1035035 [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 2844 bytes +2024-06-01T02:43:25Z INFO 1035035 [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 36700160 +2024-06-01T02:43:25Z INFO 1035035 [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 2048 bytes +2024-06-01T02:43:25Z INFO 1035035 [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 0 +2024-06-01T02:43:25Z INFO 1035035 [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 0 bytes +2024-06-01T02:43:25Z INFO 1035035 [ColoringAllocator::Rep]: Allocating functions +2024-06-01T02:43:25Z INFO 1035035 
[ColoringAllocator::Rep]: linearize and check +2024-06-01T02:43:25Z INFO 1035035 [SB_Allocator]: allocating SB +2024-06-01T02:43:25Z INFO 1035035 [SB_Allocator]: main loop +2024-06-01T02:43:25Z INFO 1035035 [SB_Allocator]: renumber locations +2024-06-01T02:43:25Z INFO 1035035 [SB_Allocator]: size = 346 +2024-06-01T02:43:25Z INFO 1035035 [SB_Allocator]: find partners +2024-06-01T02:43:25Z INFO 1035035 [SB_Allocator]: found 340 accumulation groups +2024-06-01T02:43:25Z INFO 1035035 [SB_Allocator]: largest = _dot.26-t75_i159 +2024-06-01T02:43:25Z INFO 1035035 [SB_Allocator]: tensors = 2 +2024-06-01T02:43:25Z INFO 1035035 [SB_Allocator]: requires 9216 bytes/partition +2024-06-01T02:43:25Z INFO 1035035 [SB_Allocator]: expanding partners +2024-06-01T02:43:25Z INFO 1035035 [SB_Allocator]: find first defs +2024-06-01T02:43:25Z INFO 1035035 [SB_Allocator]: find loads +2024-06-01T02:43:25Z INFO 1035035 [SB_Allocator]: 0 pin count +2024-06-01T02:43:25Z INFO 1035035 [SB_Allocator]: 17 remat count +2024-06-01T02:43:25Z INFO 1035035 [SB_Allocator]: build interference graph +2024-06-01T02:43:25Z INFO 1035035 [SB_Allocator]: pass 1 int-tree +2024-06-01T02:43:25Z INFO 1035035 [SB_Allocator]: Num intervals 346 Num locations 346 +2024-06-01T02:43:25Z INFO 1035035 [SB_Allocator]: IntervalTree Build Done +2024-06-01T02:43:25Z INFO 1035035 [SB_Allocator]: info.neighbors init Done +2024-06-01T02:43:25Z INFO 1035035 [SB_Allocator]: info.neighbors partners Done +2024-06-01T02:43:25Z INFO 1035035 [SB_Allocator]: IntervalTree readback Done +2024-06-01T02:43:25Z INFO 1035035 [SB_Allocator]: edge: 3829 +2024-06-01T02:43:25Z INFO 1035035 [SB_Allocator]: mean: 22.1329 +2024-06-01T02:43:25Z INFO 1035035 [SB_Allocator]: median: 15.722 +2024-06-01T02:43:25Z INFO 1035035 [SB_Allocator]: find costs +2024-06-01T02:43:25Z INFO 1035035 [SB_Allocator]: best-of-n loop, heuristic = 0 +2024-06-01T02:43:25Z INFO 1035035 [SB_Allocator]: simplify interference graph +2024-06-01T02:43:25Z INFO 1035035 
[SB_Allocator]: initialize safe & unsafe +2024-06-01T02:43:25Z INFO 1035035 [SB_Allocator]: safe = 323 +2024-06-01T02:43:25Z INFO 1035035 [SB_Allocator]: unsafe = 23 +2024-06-01T02:43:25Z INFO 1035035 [SB_Allocator]: inf = 0 +2024-06-01T02:43:25Z INFO 1035035 [SB_Allocator]: total = 346 +2024-06-01T02:43:25Z INFO 1035035 [SB_Allocator]: simplify +2024-06-01T02:43:25Z INFO 1035035 [SB_Allocator]: simplify_step3_sorted2 #Unsafe 0 #Pinned 0 #Safe 0 minCost 1.79769e+308 maxCost 2.22507e-308 locations 346 +2024-06-01T02:43:25Z INFO 1035035 [SB_Allocator]: new candidates = 0 +2024-06-01T02:43:25Z INFO 1035035 [SB_Allocator]: select ranges +2024-06-01T02:43:25Z INFO 1035035 [SB_Allocator]: Total: 346 +2024-06-01T02:43:25Z INFO 1035035 [SB_Allocator]: Spilled: 0.000 (0) +2024-06-01T02:43:25Z INFO 1035035 [SB_Allocator]: Allocated: 1.000 (346) +2024-06-01T02:43:25Z INFO 1035035 [SB_Allocator]: Rover zone: 0.948 (328) +2024-06-01T02:43:25Z INFO 1035035 [SB_Allocator]: Pre-rover zone: 0.049 (17) +2024-06-01T02:43:25Z INFO 1035035 [SB_Allocator]: Post-rover zone: 0.003 (1) +2024-06-01T02:43:25Z INFO 1035035 [SB_Allocator]: Slice zone: 0.000 (0) +2024-06-01T02:43:25Z INFO 1035035 [SB_Allocator]: Blocks nothing: 0.000 (0) +2024-06-01T02:43:25Z INFO 1035035 [SB_Allocator]: Blocks medium: 0.000 (0) +2024-06-01T02:43:25Z INFO 1035035 [SB_Allocator]: Blocks tall: 1.000 (346) +2024-06-01T02:43:25Z INFO 1035035 [SB_Allocator]: Visited until tall blocking (mean): 1.000 +2024-06-01T02:43:25Z INFO 1035035 [SB_Allocator]: Visited until tall blocking (median): 1.000 +2024-06-01T02:43:25Z INFO 1035035 [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2024-06-01T02:43:25Z INFO 1035035 [SB_Allocator]: Success +2024-06-01T02:43:25Z INFO 1035035 [SB_Allocator]: SB spills = 0 tensors +2024-06-01T02:43:25Z INFO 1035035 [SB_Allocator]: size = 0 bytes/partition +2024-06-01T02:43:25Z INFO 1035035 [SB_Allocator]: remats = 0 tensors +2024-06-01T02:43:25Z INFO 1035035 [SB_Allocator]: unpinned = 
0 tensors +2024-06-01T02:43:25Z INFO 1035035 [SB_Allocator]: size = 0 bytes/partition +2024-06-01T02:43:25Z INFO 1035035 [SB_Allocator]: SB score = 0 +2024-06-01T02:43:25Z INFO 1035035 [SB_Allocator]: spilling from SB cost about 0 cycles +2024-06-01T02:43:25Z INFO 1035035 [SB_Allocator]: 0 bytes/partition (0%) successfully pinned +2024-06-01T02:43:25Z INFO 1035035 [SB_Allocator]: pinning saved approximately 0 cycles +2024-06-01T02:43:25Z INFO 1035035 [SB_Allocator]: 0% SB utilization after allocation +2024-06-01T02:43:25Z INFO 1035035 [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 13287424 +2024-06-01T02:43:25Z INFO 1035035 [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 2844 bytes +2024-06-01T02:43:25Z INFO 1035035 [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 36700160 +2024-06-01T02:43:25Z INFO 1035035 [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 2048 bytes +2024-06-01T02:43:25Z INFO 1035035 [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 0 +2024-06-01T02:43:25Z INFO 1035035 [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 0 bytes +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: coloring_allocator_sb finished after 0.015 seconds +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: curr_vmrss: 44mb, ru_maxrss: 103mb (delta=0mb) +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 692 memory location(s), 1 block(s), and 2165 instruction(s). 
Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: Running address_rotation_sb +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=692 blocks=1 instructions=2165 Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z INFO 1035035 [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: address_rotation_sb finished after 0.001 seconds +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: curr_vmrss: 44mb, ru_maxrss: 103mb (delta=0mb) +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 692 memory location(s), 1 block(s), and 2165 instruction(s). Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: Running dma_optimization_sb +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=692 blocks=1 instructions=2165 Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z INFO 1035035 [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 49987584, 26.5814% input load, 73.4186% output write, 0% spill/reload [sg0000] +2024-06-01T02:43:25Z INFO 1035035 [DMAOptimizationBase]: removed 0 identical load +2024-06-01T02:43:25Z INFO 1035035 [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2024-06-01T02:43:25Z INFO 1035035 [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2024-06-01T02:43:25Z INFO 1035035 [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 0, 0% out of total dma traffic(1.32874e+07) +2024-06-01T02:43:25Z INFO 1035035 [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2024-06-01T02:43:25Z INFO 1035035 [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2024-06-01T02:43:25Z INFO 1035035 [DMAOptimizationBase]: 
[spill optimization round 0]: removed 0 spill/reload memory locations +2024-06-01T02:43:25Z INFO 1035035 [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, -nan% out of total spill/reload dma traffic +2024-06-01T02:43:25Z INFO 1035035 [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2024-06-01T02:43:25Z INFO 1035035 [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2024-06-01T02:43:25Z INFO 1035035 [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, -nan% out of total spill/reload dma traffic +2024-06-01T02:43:25Z INFO 1035035 [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2024-06-01T02:43:25Z INFO 1035035 [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2024-06-01T02:43:25Z INFO 1035035 [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2024-06-01T02:43:25Z INFO 1035035 [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, -nan% out of total spill/reload dma traffic +2024-06-01T02:43:25Z INFO 1035035 [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2024-06-01T02:43:25Z INFO 1035035 [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload instructions +2024-06-01T02:43:25Z INFO 1035035 [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload memory locations +2024-06-01T02:43:25Z INFO 1035035 [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 0 SpillSaves and Reloads +2024-06-01T02:43:25Z INFO 1035035 [DMAOptimizationBase]: average loaded DMA size 2844 bytes +2024-06-01T02:43:25Z INFO 1035035 [DMAOptimizationBase]: average saved DMA size 2048 bytes +2024-06-01T02:43:25Z INFO 1035035 [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 13287424 +2024-06-01T02:43:25Z INFO 1035035 [DMAOptimizationBase]: INFO: Post DMA coalescing 
average loaded DMA size 2844 bytes +2024-06-01T02:43:25Z INFO 1035035 [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 36700160 +2024-06-01T02:43:25Z INFO 1035035 [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 2048 bytes +2024-06-01T02:43:25Z INFO 1035035 [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2024-06-01T02:43:25Z INFO 1035035 [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 0, -nan% out of total spill/reload dma traffic +2024-06-01T02:43:25Z INFO 1035035 [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 0, 0% out of total dma traffic +2024-06-01T02:43:25Z INFO 1035035 [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 49987584, 26.5814% input load, 73.4186% output write, 0% spill/reload [sg0000] +2024-06-01T02:43:25Z INFO 1035035 [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 13287424 +2024-06-01T02:43:25Z INFO 1035035 [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 2844 bytes +2024-06-01T02:43:25Z INFO 1035035 [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 36700160 +2024-06-01T02:43:25Z INFO 1035035 [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 2048 bytes +2024-06-01T02:43:25Z INFO 1035035 [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 0 +2024-06-01T02:43:25Z INFO 1035035 [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 0 bytes +2024-06-01T02:43:25Z INFO 1035035 [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 2212 bytes +2024-06-01T02:43:25Z INFO 1035035 [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2024-06-01T02:43:25Z INFO 1035035 [DMAOptimizationBase]: DMA optimization re-enable optimization +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: dma_optimization_sb finished after 0.004 seconds +2024-06-01T02:43:25Z INFO 
1035035 [ModuleForkPass]: curr_vmrss: 45mb, ru_maxrss: 103mb (delta=0mb) +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 692 memory location(s), 1 block(s), and 2165 instruction(s). Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: Running address_rotation_sb +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=692 blocks=1 instructions=2165 Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z INFO 1035035 [DMAOptimizationBase]: SB Rotation rotated 6 Sb address +2024-06-01T02:43:25Z INFO 1035035 [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2024-06-01T02:43:25Z INFO 1035035 [DMAOptimizationBase]: SB Rotation rotated 100 Sb address +2024-06-01T02:43:25Z INFO 1035035 [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2024-06-01T02:43:25Z INFO 1035035 [DMAOptimizationBase]: SB Rotation rotated 1 Sb address +2024-06-01T02:43:25Z INFO 1035035 [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: address_rotation_sb finished after 0.004 seconds +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: curr_vmrss: 45mb, ru_maxrss: 103mb (delta=0mb) +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 692 memory location(s), 1 block(s), and 2165 instruction(s). 
Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: Running coloring_allocator_dram +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=692 blocks=1 instructions=2165 Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z INFO 1035035 [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 13287424 +2024-06-01T02:43:25Z INFO 1035035 [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 2844 bytes +2024-06-01T02:43:25Z INFO 1035035 [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 36700160 +2024-06-01T02:43:25Z INFO 1035035 [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 2048 bytes +2024-06-01T02:43:25Z INFO 1035035 [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 0 +2024-06-01T02:43:25Z INFO 1035035 [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 0 bytes +2024-06-01T02:43:25Z INFO 1035035 [ColoringAllocator::Rep]: Allocating functions +2024-06-01T02:43:25Z INFO 1035035 [ColoringAllocator::Rep]: linearize and check +2024-06-01T02:43:25Z INFO 1035035 [DRAM_Allocator]: allocating spills in DRAM pre_link mode +2024-06-01T02:43:25Z INFO 1035035 [DRAM_Allocator]: reserved space = 49987584 bytes +2024-06-01T02:43:25Z INFO 1035035 [DRAM_Allocator]: spill space = 0 bytes +2024-06-01T02:43:25Z INFO 1035035 [DRAM_Allocator]: aligned spill space = 0 bytes +2024-06-01T02:43:25Z INFO 1035035 [DRAM_Allocator]: dram space = 107374182400 bytes +2024-06-01T02:43:25Z INFO 1035035 [DRAM_Allocator]: renumber locations +2024-06-01T02:43:25Z INFO 1035035 [DRAM_Allocator]: size = 0 +2024-06-01T02:43:25Z INFO 1035035 [DRAM_Allocator]: find first defs +2024-06-01T02:43:25Z INFO 1035035 [DRAM_Allocator]: Num intervals 0 Num locations 0 +2024-06-01T02:43:25Z INFO 1035035 [DRAM_Allocator]: IntervalTree Build Done +2024-06-01T02:43:25Z INFO 1035035 [DRAM_Allocator]: info.neighbors init Done +2024-06-01T02:43:25Z INFO 1035035 
[DRAM_Allocator]: IntervalTree readback Done +2024-06-01T02:43:25Z INFO 1035035 [DRAM_Allocator]: simplify interference graph +2024-06-01T02:43:25Z INFO 1035035 [DRAM_Allocator]: initialize low and high +2024-06-01T02:43:25Z INFO 1035035 [DRAM_Allocator]: lo = 0 +2024-06-01T02:43:25Z INFO 1035035 [DRAM_Allocator]: hi = 0 +2024-06-01T02:43:25Z INFO 1035035 [DRAM_Allocator]: total = 0 +2024-06-01T02:43:25Z INFO 1035035 [DRAM_Allocator]: simplify +2024-06-01T02:43:25Z INFO 1035035 [DRAM_Allocator]: new candidates = 0 +2024-06-01T02:43:25Z INFO 1035035 [DRAM_Allocator]: select ranges +2024-06-01T02:43:25Z INFO 1035035 [DRAM_Allocator]: CC buffer size limit 524288000 +2024-06-01T02:43:25Z INFO 1035035 [DRAM_Allocator]: allreduce_dram_hwm 0 +2024-06-01T02:43:25Z INFO 1035035 [DRAM_Allocator]: Real CC buffer size 0 +2024-06-01T02:43:25Z INFO 1035035 [DRAM_Allocator]: DRAM hwm after allocation: 0 +2024-06-01T02:43:25Z INFO 1035035 [DRAM_Allocator]: DRAM allocation successful +2024-06-01T02:43:25Z INFO 1035035 [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 13287424 +2024-06-01T02:43:25Z INFO 1035035 [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 2844 bytes +2024-06-01T02:43:25Z INFO 1035035 [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 36700160 +2024-06-01T02:43:25Z INFO 1035035 [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 2048 bytes +2024-06-01T02:43:25Z INFO 1035035 [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 0 +2024-06-01T02:43:25Z INFO 1035035 [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 0 bytes +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: coloring_allocator_dram finished after 0.001 seconds +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: curr_vmrss: 45mb, ru_maxrss: 103mb (delta=0mb) +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 692 memory location(s), 1 block(s), and 2165 instruction(s). 
Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: Running address_rotation_dram +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=692 blocks=1 instructions=2165 Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z INFO 1035035 [DMAOptimizationBase]: Runtime page size at 512MB +2024-06-01T02:43:25Z INFO 1035035 [DMAOptimizationBase]: DRAM hwm before rotation 0 +2024-06-01T02:43:25Z INFO 1035035 [DMAOptimizationBase]: allreduce buffer size 524288000 +2024-06-01T02:43:25Z INFO 1035035 [DMAOptimizationBase]: allreduce hwm 0 +2024-06-01T02:43:25Z INFO 1035035 [DMAOptimizationBase]: Real CC buffer size 0 +2024-06-01T02:43:25Z INFO 1035035 [DMAOptimizationBase]: DRAM hwm after rotation 0 +2024-06-01T02:43:25Z INFO 1035035 [DMAOptimizationBase]: DRAM Rotation rotated 0 Dram address +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: address_rotation_dram finished after 0.000 seconds +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: curr_vmrss: 45mb, ru_maxrss: 103mb (delta=0mb) +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 692 memory location(s), 1 block(s), and 2165 instruction(s). 
Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: Running tensorcopy_accel +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=692 blocks=1 instructions=2165 Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z INFO 1035035 [TensorCopyAccel::Impl]: Running peephole optimization pass +2024-06-01T02:43:25Z INFO 1035035 [TensorCopyAccel::Impl]: Accelerated 0 out of 180 tensorcopy in Function: sg0000 average acceleration factor: -nan +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: tensorcopy_accel finished after 0.000 seconds +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: curr_vmrss: 45mb, ru_maxrss: 103mb (delta=0mb) +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 692 memory location(s), 1 block(s), and 2165 instruction(s). Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: Running peephole_opts +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=692 blocks=1 instructions=2165 Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z INFO 1035035 [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: true SplitSelect: true +2024-06-01T02:43:25Z INFO 1035035 [PeepholeOpts]: Split Select: 0 +2024-06-01T02:43:25Z INFO 1035035 [PeepholeOpts]: TSP -> ACT: 160 +2024-06-01T02:43:25Z INFO 1035035 [PeepholeOpts]: COPY -> ACT: 0 +2024-06-01T02:43:25Z INFO 1035035 [PeepholeOpts]: RECIPROCAL -> ACT: 0 +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: peephole_opts finished after 0.001 seconds +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: curr_vmrss: 45mb, ru_maxrss: 103mb (delta=0mb) +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 692 memory location(s), 1 block(s), and 2165 instruction(s). 
Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: Running lower_kernel +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=692 blocks=1 instructions=2165 Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z INFO 1035035 [LowerKernel]: Started running LowerKernel +2024-06-01T02:43:25Z INFO 1035035 [LowerKernel]: Start of kernel lowering pass, number of insts: 2165, number of allocs: 692 +2024-06-01T02:43:25Z INFO 1035035 [LowerKernel]: Scan BKs time (s): 6e-05 +2024-06-01T02:43:25Z INFO 1035035 [LowerKernel]: Lower BKs time (s): 4e-06 +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: lower_kernel finished after 0.000 seconds +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: curr_vmrss: 45mb, ru_maxrss: 103mb (delta=0mb) +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 692 memory location(s), 1 block(s), and 2165 instruction(s). Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: Running build_fdeps +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=692 blocks=1 instructions=2165 Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z INFO 1035035 [build_flow_deps]: Start build fdeps. Invocation: 2Sat Jun 1 02:43:25 2024 +2024-06-01T02:43:25Z INFO 1035035 [build_flow_deps]: Allocs: 692 instructions: 2165 +2024-06-01T02:43:25Z INFO 1035035 [build_flow_deps]: Build fdeps inserted 7732 edges +2024-06-01T02:43:25Z INFO 1035035 [build_flow_deps]: Done build fdeps 7732 Sat Jun 1 02:43:25 2024 +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: build_fdeps finished after 0.007 seconds +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: curr_vmrss: 45mb, ru_maxrss: 103mb (delta=0mb) +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 692 memory location(s), 1 block(s), and 2165 instruction(s). 
Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: Running remove_redundancies +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=692 blocks=1 instructions=2165 Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z INFO 1035035 [RemoveRedundancies]: remove_clobbered_writes +2024-06-01T02:43:25Z INFO 1035035 [RemoveRedundancies]: remove_clobbered_writes: 0 +2024-06-01T02:43:25Z INFO 1035035 [RemoveRedundancies]: remove_useless_insts +2024-06-01T02:43:25Z INFO 1035035 [RemoveRedundancies]: remove Useless Instructions: 0 +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: remove_redundancies finished after 0.001 seconds +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: curr_vmrss: 45mb, ru_maxrss: 103mb (delta=0mb) +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 692 memory location(s), 1 block(s), and 2165 instruction(s). Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: Running anti_dependency_analyzer +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=692 blocks=1 instructions=2165 Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z INFO 1035035 [AntiDependencyAnalyzer]: Batch size: 1000 +2024-06-01T02:43:25Z INFO 1035035 [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2024-06-01T02:43:25Z INFO 1035035 [AntiDependencyAnalyzer-DRAM]: Finished analyzing 0 access patterns a mean/median -nan/0 intervals per access pattern and mean/median -nan/0 intersections per interval. +2024-06-01T02:43:25Z INFO 1035035 [AntiDependencyAnalyzer-PSUM4-64-128]: Finished analyzing 232 access patterns a mean/median 1/1 intervals per access pattern and mean/median 3.0625/2.35451 intersections per interval. 
+2024-06-01T02:43:25Z INFO 1035035 [AntiDependencyAnalyzer-PSUM4-0-64]: Finished analyzing 242 access patterns a mean/median 1/1 intervals per access pattern and mean/median 2.99/2.13691 intersections per interval. +2024-06-01T02:43:25Z INFO 1035035 [AntiDependencyAnalyzer-PSUM5-0-128]: Finished analyzing 250 access patterns a mean/median 1/1 intervals per access pattern and mean/median 2.96154/2.11201 intersections per interval. +2024-06-01T02:43:25Z INFO 1035035 [AntiDependencyAnalyzer-PSUM7-0-128]: Finished analyzing 255 access patterns a mean/median 1/1 intervals per access pattern and mean/median 2.78774/2.01188 intersections per interval. +2024-06-01T02:43:25Z INFO 1035035 [AntiDependencyAnalyzer-PSUM0-0-64]: Finished analyzing 245 access patterns a mean/median 1/1 intervals per access pattern and mean/median 2.82673/2.01299 intersections per interval. +2024-06-01T02:43:25Z INFO 1035035 [AntiDependencyAnalyzer-PSUM6-64-128]: Finished analyzing 232 access patterns a mean/median 1/1 intervals per access pattern and mean/median 3.0625/2.21147 intersections per interval. +2024-06-01T02:43:25Z INFO 1035035 [AntiDependencyAnalyzer-PSUM6-0-64]: Finished analyzing 242 access patterns a mean/median 1/1 intervals per access pattern and mean/median 2.99/2.12059 intersections per interval. +2024-06-01T02:43:25Z INFO 1035035 [AntiDependencyAnalyzer-PSUM2-64-128]: Finished analyzing 235 access patterns a mean/median 1.00426/1.00006 intervals per access pattern and mean/median 3.81443/3.44164 intersections per interval. +2024-06-01T02:43:25Z INFO 1035035 [AntiDependencyAnalyzer-PSUM2-0-64]: Finished analyzing 245 access patterns a mean/median 1.00408/1.00005 intervals per access pattern and mean/median 3.72277/3.18785 intersections per interval. +2024-06-01T02:43:25Z INFO 1035035 [AntiDependencyAnalyzer-PSUM3-0-128]: Finished analyzing 250 access patterns a mean/median 1/1 intervals per access pattern and mean/median 2.96154/2.22082 intersections per interval. 
+2024-06-01T02:43:25Z INFO 1035035 [AntiDependencyAnalyzer-PSUM1-0-128]: Finished analyzing 255 access patterns a mean/median 1.01176/1.00021 intervals per access pattern and mean/median 4.71698/3.21012 intersections per interval. +2024-06-01T02:43:25Z INFO 1035035 [AntiDependencyAnalyzer-PSUM0-64-128]: Finished analyzing 235 access patterns a mean/median 1/1 intervals per access pattern and mean/median 2.89691/2.19408 intersections per interval. +2024-06-01T02:43:25Z INFO 1035035 [AntiDependencyAnalyzer-SB-0-64]: Finished analyzing 3969 access patterns a mean/median 1.01814/1 intervals per access pattern and mean/median 1.67929/1.11299 intersections per interval. +2024-06-01T02:43:25Z INFO 1035035 [AntiDependencyAnalyzer-SB-64-128]: Finished analyzing 3940 access patterns a mean/median 1.01827/1 intervals per access pattern and mean/median 1.69545/1.13023 intersections per interval. +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: anti_dependency_analyzer finished after 0.008 seconds +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: curr_vmrss: 46mb, ru_maxrss: 103mb (delta=0mb) +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 692 memory location(s), 1 block(s), and 2165 instruction(s). Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: Running tensor_copy_elim +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=692 blocks=1 instructions=2165 Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z INFO 1035035 [TensorCopyElim]: Tensor CP elimination: 0 +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: tensor_copy_elim finished after 0.001 seconds +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: curr_vmrss: 46mb, ru_maxrss: 103mb (delta=0mb) +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 692 memory location(s), 1 block(s), and 2165 instruction(s). 
Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: Running post_sched +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=692 blocks=1 instructions=2165 Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z INFO 1035035 [post_scheduler]: Start PosT ScheD 3 sunda Sat Jun 1 02:43:25 2024 +2024-06-01T02:43:25Z INFO 1035035 [post_scheduler]: Time-aware hwm post-sched +2024-06-01T02:43:25Z INFO 1035035 [post_scheduler]: Time-aware simulation time: 277714 +2024-06-01T02:43:25Z INFO 1035035 [post_scheduler]: Done PosT ScheD Sat Jun 1 02:43:25 2024 +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: post_sched finished after 0.018 seconds +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: curr_vmrss: 48mb, ru_maxrss: 103mb (delta=0mb) +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 692 memory location(s), 1 block(s), and 2165 instruction(s). Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: Running address_rotation_sb +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=692 blocks=1 instructions=2165 Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z INFO 1035035 [DMAOptimizationBase]: PSUM Rotation rotated 236 PSUM Banks +2024-06-01T02:43:25Z INFO 1035035 [DMAOptimizationBase]: PSUM Rotation rotated 163 PSUM Banks +2024-06-01T02:43:25Z INFO 1035035 [DMAOptimizationBase]: PSUM Rotation rotated 0 PSUM Banks +2024-06-01T02:43:25Z INFO 1035035 [DMAOptimizationBase]: SB Rotation rotated 3 Sb address +2024-06-01T02:43:25Z INFO 1035035 [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2024-06-01T02:43:25Z INFO 1035035 [DMAOptimizationBase]: SB Rotation rotated 8 Sb address +2024-06-01T02:43:25Z INFO 1035035 [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2024-06-01T02:43:25Z INFO 1035035 [DMAOptimizationBase]: SB Rotation 
rotated 33 Sb address +2024-06-01T02:43:25Z INFO 1035035 [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2024-06-01T02:43:25Z INFO 1035035 [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: address_rotation_sb finished after 0.010 seconds +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: curr_vmrss: 48mb, ru_maxrss: 103mb (delta=0mb) +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 692 memory location(s), 1 block(s), and 2165 instruction(s). Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: Running anti_dependency_analyzer +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=692 blocks=1 instructions=2165 Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z INFO 1035035 [AntiDependencyAnalyzer]: Batch size: 1000 +2024-06-01T02:43:25Z INFO 1035035 [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2024-06-01T02:43:25Z INFO 1035035 [AntiDependencyAnalyzer-DRAM]: Finished analyzing 0 access patterns a mean/median -nan/0 intervals per access pattern and mean/median -nan/0 intersections per interval. +2024-06-01T02:43:25Z INFO 1035035 [AntiDependencyAnalyzer-PSUM1-0-64]: Finished analyzing 262 access patterns a mean/median 1/1 intervals per access pattern and mean/median 3.06881/2.47842 intersections per interval. +2024-06-01T02:43:25Z INFO 1035035 [AntiDependencyAnalyzer-PSUM4-0-128]: Finished analyzing 237 access patterns a mean/median 1/1 intervals per access pattern and mean/median 2.83163/2.04294 intersections per interval. +2024-06-01T02:43:25Z INFO 1035035 [AntiDependencyAnalyzer-PSUM1-64-128]: Finished analyzing 257 access patterns a mean/median 1/1 intervals per access pattern and mean/median 3.1028/2.55187 intersections per interval. 
+2024-06-01T02:43:25Z INFO 1035035 [AntiDependencyAnalyzer-PSUM2-0-64]: Finished analyzing 249 access patterns a mean/median 1/1 intervals per access pattern and mean/median 2.96116/2.20406 intersections per interval. +2024-06-01T02:43:25Z INFO 1035035 [AntiDependencyAnalyzer-PSUM3-0-64]: Finished analyzing 262 access patterns a mean/median 1/1 intervals per access pattern and mean/median 2.9633/2.19832 intersections per interval. +2024-06-01T02:43:25Z INFO 1035035 [AntiDependencyAnalyzer-PSUM7-0-64]: Finished analyzing 241 access patterns a mean/median 1.01245/1.00024 intervals per access pattern and mean/median 4.37/2.95022 intersections per interval. +2024-06-01T02:43:25Z INFO 1035035 [AntiDependencyAnalyzer-PSUM7-64-128]: Finished analyzing 231 access patterns a mean/median 1.01299/1.00027 intervals per access pattern and mean/median 4.46875/2.9969 intersections per interval. +2024-06-01T02:43:25Z INFO 1035035 [AntiDependencyAnalyzer-PSUM3-64-128]: Finished analyzing 257 access patterns a mean/median 1/1 intervals per access pattern and mean/median 2.99533/2.37576 intersections per interval. +2024-06-01T02:43:25Z INFO 1035035 [AntiDependencyAnalyzer-PSUM6-0-64]: Finished analyzing 245 access patterns a mean/median 1/1 intervals per access pattern and mean/median 2.88119/1.98231 intersections per interval. +2024-06-01T02:43:25Z INFO 1035035 [AntiDependencyAnalyzer-PSUM6-64-128]: Finished analyzing 240 access patterns a mean/median 1/1 intervals per access pattern and mean/median 2.91414/2.03089 intersections per interval. +2024-06-01T02:43:25Z INFO 1035035 [AntiDependencyAnalyzer-PSUM0-0-128]: Finished analyzing 238 access patterns a mean/median 1.0042/1.00006 intervals per access pattern and mean/median 3.65306/3.24765 intersections per interval. +2024-06-01T02:43:25Z INFO 1035035 [AntiDependencyAnalyzer-PSUM5-0-128]: Finished analyzing 250 access patterns a mean/median 1/1 intervals per access pattern and mean/median 2.98558/2.28373 intersections per interval. 
+2024-06-01T02:43:25Z INFO 1035035 [AntiDependencyAnalyzer-PSUM2-64-128]: Finished analyzing 234 access patterns a mean/median 1/1 intervals per access pattern and mean/median 3.06701/2.43194 intersections per interval. +2024-06-01T02:43:25Z INFO 1035035 [AntiDependencyAnalyzer-SB-0-64]: Finished analyzing 3969 access patterns a mean/median 1.01814/1 intervals per access pattern and mean/median 1.67038/1.10455 intersections per interval. +2024-06-01T02:43:25Z INFO 1035035 [AntiDependencyAnalyzer-SB-64-128]: Finished analyzing 3940 access patterns a mean/median 1.01827/1 intervals per access pattern and mean/median 1.68636/1.13374 intersections per interval. +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: anti_dependency_analyzer finished after 0.010 seconds +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: curr_vmrss: 48mb, ru_maxrss: 103mb (delta=0mb) +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 692 memory location(s), 1 block(s), and 2165 instruction(s). Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: Running dep_opt +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=692 blocks=1 instructions=2165 Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z INFO 1035035 [build_flow_deps]: Start build fdeps. 
Invocation: 3Sat Jun 1 02:43:25 2024 +2024-06-01T02:43:25Z INFO 1035035 [build_flow_deps]: Allocs: 692 instructions: 2165 +2024-06-01T02:43:25Z INFO 1035035 [build_flow_deps]: Build fdeps inserted 7663 edges +2024-06-01T02:43:25Z INFO 1035035 [build_flow_deps]: Done build fdeps 7663 Sat Jun 1 02:43:25 2024 +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: dep_opt finished after 0.016 seconds +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: curr_vmrss: 48mb, ru_maxrss: 103mb (delta=0mb) +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 692 memory location(s), 1 block(s), and 2165 instruction(s). Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: Running report_stats +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=692 blocks=1 instructions=2165 Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z INFO 1035035 [ReportStats]: Data Movement Statistics: sg0000 +┌─────────────┬────────────────────────────┬───────┬──────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────┼────────────────────────────┼───────┼──────────┤ +│ Load │ Const -> Internal │ 19 │ 6406144 │ +│ Load │ ExternalInput -> Internal │ 18 │ 6881280 │ +│ Save │ Internal -> ExternalOutput │ 144 │ 36700160 │ +└─────────────┴────────────────────────────┴───────┴──────────┘ + +2024-06-01T02:43:25Z INFO 1035035 [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 128 │ 1 │ +│ 256 │ 1 │ +│ 512 │ 1 │ +│ 2048 │ 144 │ +│ 3072 │ 34 │ +└─────────────────────┴───────┘ + +2024-06-01T02:43:25Z INFO 1035035 [ReportStats]: MM Stats: #MatMults 1644 #MatMult-Transposes 684 +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: report_stats finished after 0.000 seconds +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: curr_vmrss: 48mb, ru_maxrss: 103mb (delta=0mb) +2024-06-01T02:43:25Z INFO 1035035 
[ModuleForkPass]: Output has 1 module(s), 1 function(s), 692 memory location(s), 1 block(s), and 2165 instruction(s). Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: Running assign_trigger_engine +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Inputs to assign_trigger_engine: modules=1 functions=1 allocs=692 blocks=1 instructions=2165 Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z INFO 1035035 [AssignTriggerEngine]: Assigned trigger engine for 0 DMA instructions +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: assign_trigger_engine finished after 0.000 seconds +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: curr_vmrss: 48mb, ru_maxrss: 103mb (delta=0mb) +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 692 memory location(s), 1 block(s), and 2165 instruction(s). Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: Running alloc_queues +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=692 blocks=1 instructions=2165 Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z USER 1035035 [ModuleForkPass]: alloc_queues finished after 0.000 seconds +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: curr_vmrss: 48mb, ru_maxrss: 103mb (delta=0mb) +2024-06-01T02:43:25Z INFO 1035035 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 692 memory location(s), 1 block(s), and 2165 instruction(s). Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z USER 1035035 [BackendDriver]: mod_parallel_pass finished after 0.192 seconds +2024-06-01T02:43:25Z INFO 1035035 [BackendDriver]: curr_vmrss: 48mb, ru_maxrss: 103mb (delta=0mb) +2024-06-01T02:43:25Z INFO 1035035 [BackendDriver]: Output has 1 module(s), 1 function(s), 692 memory location(s), 1 block(s), and 2165 instruction(s). 
Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z USER 1035035 [BackendDriver]: Running dep_reduction +2024-06-01T02:43:25Z INFO 1035035 [BackendDriver]: Inputs to dep_reduction: modules=1 functions=1 allocs=692 blocks=1 instructions=2165 Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z INFO 1035035 [DepReduction]: Start Dependency Reduction +2024-06-01T02:43:25Z INFO 1035035 [DepReduction]: Processing async instrs... +2024-06-01T02:43:25Z INFO 1035035 [DepReduction]: Processing secondary edges per engine... +2024-06-01T02:43:25Z INFO 1035035 [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 1894 +2024-06-01T02:43:25Z INFO 1035035 [DepReduction]: Processing redundant descendants... +2024-06-01T02:43:25Z INFO 1035035 [DepReduction]: Processing redundant descendants, Done. Num edges removed 0 +2024-06-01T02:43:25Z INFO 1035035 [DepReduction]: Processing async instrs, Done. Num edges removed 1894 +2024-06-01T02:43:25Z INFO 1035035 [DepReduction]: Num Async removed: 0 +2024-06-01T02:43:25Z INFO 1035035 [DepReduction]: Finished dependency reduction: 10388 removed, new total 870 +2024-06-01T02:43:25Z INFO 1035035 [DepReduction]: Finished Dependency Reduction +2024-06-01T02:43:25Z USER 1035035 [BackendDriver]: dep_reduction finished after 0.013 seconds +2024-06-01T02:43:25Z INFO 1035035 [BackendDriver]: curr_vmrss: 49mb, ru_maxrss: 103mb (delta=0mb) +2024-06-01T02:43:25Z INFO 1035035 [BackendDriver]: Output has 1 module(s), 1 function(s), 692 memory location(s), 1 block(s), and 2165 instruction(s). 
Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z USER 1035035 [BackendDriver]: Running bir_racecheck +2024-06-01T02:43:25Z INFO 1035035 [BackendDriver]: Inputs to bir_racecheck: modules=1 functions=1 allocs=692 blocks=1 instructions=2165 Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z USER 1035035 [BackendDriver]: bir_racecheck finished after 0.013 seconds +2024-06-01T02:43:25Z INFO 1035035 [BackendDriver]: curr_vmrss: 51mb, ru_maxrss: 103mb (delta=0mb) +2024-06-01T02:43:25Z INFO 1035035 [BackendDriver]: Output has 1 module(s), 1 function(s), 692 memory location(s), 1 block(s), and 2165 instruction(s). Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z USER 1035035 [BackendDriver]: Running lower_dma +2024-06-01T02:43:25Z INFO 1035035 [BackendDriver]: Inputs to lower_dma: modules=1 functions=1 allocs=692 blocks=1 instructions=2165 Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z USER 1035035 [BackendDriver]: lower_dma finished after 0.004 seconds +2024-06-01T02:43:25Z INFO 1035035 [BackendDriver]: curr_vmrss: 52mb, ru_maxrss: 103mb (delta=0mb) +2024-06-01T02:43:25Z INFO 1035035 [BackendDriver]: Output has 1 module(s), 1 function(s), 692 memory location(s), 1 block(s), and 2184 instruction(s). Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z USER 1035035 [BackendDriver]: Running coalesce_dma_blocks +2024-06-01T02:43:25Z INFO 1035035 [BackendDriver]: Inputs to coalesce_dma_blocks: modules=1 functions=1 allocs=692 blocks=1 instructions=2184 Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z INFO 1035035 [CoalesceDmaBlocks]: Coaleseced 1 DMA triggers +2024-06-01T02:43:25Z USER 1035035 [BackendDriver]: coalesce_dma_blocks finished after 0.000 seconds +2024-06-01T02:43:25Z INFO 1035035 [BackendDriver]: curr_vmrss: 52mb, ru_maxrss: 103mb (delta=0mb) +2024-06-01T02:43:25Z INFO 1035035 [BackendDriver]: Output has 1 module(s), 1 function(s), 692 memory location(s), 1 block(s), and 2183 instruction(s). 
Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z USER 1035035 [BackendDriver]: Running alloc_semaphores +2024-06-01T02:43:25Z INFO 1035035 [BackendDriver]: Inputs to alloc_semaphores: modules=1 functions=1 allocs=692 blocks=1 instructions=2183 Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z USER 1035035 [BackendDriver]: alloc_semaphores finished after 0.002 seconds +2024-06-01T02:43:25Z INFO 1035035 [BackendDriver]: curr_vmrss: 52mb, ru_maxrss: 103mb (delta=0mb) +2024-06-01T02:43:25Z INFO 1035035 [BackendDriver]: Output has 1 module(s), 1 function(s), 692 memory location(s), 1 block(s), and 2183 instruction(s). Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z USER 1035035 [BackendDriver]: Running expand_inst_late +2024-06-01T02:43:25Z INFO 1035035 [BackendDriver]: Inputs to expand_inst_late: modules=1 functions=1 allocs=692 blocks=1 instructions=2183 Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z USER 1035035 [BackendDriver]: expand_inst_late finished after 0.000 seconds +2024-06-01T02:43:25Z INFO 1035035 [BackendDriver]: curr_vmrss: 52mb, ru_maxrss: 103mb (delta=0mb) +2024-06-01T02:43:25Z INFO 1035035 [BackendDriver]: Output has 1 module(s), 1 function(s), 692 memory location(s), 1 block(s), and 2183 instruction(s). Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z USER 1035035 [BackendDriver]: Running lower_sync +2024-06-01T02:43:25Z INFO 1035035 [BackendDriver]: Inputs to lower_sync: modules=1 functions=1 allocs=692 blocks=1 instructions=2183 Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z USER 1035035 [BackendDriver]: lower_sync finished after 0.001 seconds +2024-06-01T02:43:25Z INFO 1035035 [BackendDriver]: curr_vmrss: 52mb, ru_maxrss: 103mb (delta=0mb) +2024-06-01T02:43:25Z INFO 1035035 [BackendDriver]: Output has 1 module(s), 1 function(s), 692 memory location(s), 1 block(s), and 2427 instruction(s). 
Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z USER 1035035 [BackendDriver]: Running lower_act +2024-06-01T02:43:25Z INFO 1035035 [BackendDriver]: Inputs to lower_act: modules=1 functions=1 allocs=692 blocks=1 instructions=2427 Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z USER 1035035 [BackendDriver]: lower_act finished after 0.001 seconds +2024-06-01T02:43:25Z INFO 1035035 [BackendDriver]: curr_vmrss: 52mb, ru_maxrss: 103mb (delta=0mb) +2024-06-01T02:43:25Z INFO 1035035 [BackendDriver]: Output has 1 module(s), 1 function(s), 692 memory location(s), 1 block(s), and 2428 instruction(s). Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z USER 1035035 [BackendDriver]: Running lower_dve +2024-06-01T02:43:25Z INFO 1035035 [BackendDriver]: Inputs to lower_dve: modules=1 functions=1 allocs=692 blocks=1 instructions=2428 Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z INFO 1035035 [LowerDVE]: Loading DVE opcodes table dve_info.json from /root/anaconda3/envs/masp_fastapi/lib/python3.9/site-packages/neuronxcc/dve/dve_bin/dve_info.json +2024-06-01T02:43:25Z USER 1035035 [BackendDriver]: lower_dve finished after 0.004 seconds +2024-06-01T02:43:25Z INFO 1035035 [BackendDriver]: curr_vmrss: 52mb, ru_maxrss: 103mb (delta=0mb) +2024-06-01T02:43:25Z INFO 1035035 [BackendDriver]: Output has 1 module(s), 1 function(s), 692 memory location(s), 1 block(s), and 2428 instruction(s). 
Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z USER 1035035 [BackendDriver]: Running lower_ap +2024-06-01T02:43:25Z INFO 1035035 [BackendDriver]: Inputs to lower_ap: modules=1 functions=1 allocs=692 blocks=1 instructions=2428 Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z USER 1035035 [BackendDriver]: lower_ap finished after 0.000 seconds +2024-06-01T02:43:25Z INFO 1035035 [BackendDriver]: curr_vmrss: 52mb, ru_maxrss: 103mb (delta=0mb) +2024-06-01T02:43:25Z INFO 1035035 [BackendDriver]: Output has 1 module(s), 1 function(s), 692 memory location(s), 1 block(s), and 2428 instruction(s). Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z USER 1035035 [BackendDriver]: Running alloc_regs +2024-06-01T02:43:25Z INFO 1035035 [BackendDriver]: Inputs to alloc_regs: modules=1 functions=1 allocs=692 blocks=1 instructions=2428 Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z INFO 1035035 [AllocRegs]: allocating REG +2024-06-01T02:43:25Z INFO 1035035 [AllocRegs]: main loop iteration 1 +2024-06-01T02:43:25Z USER 1035035 [BackendDriver]: alloc_regs finished after 0.000 seconds +2024-06-01T02:43:25Z INFO 1035035 [BackendDriver]: curr_vmrss: 52mb, ru_maxrss: 103mb (delta=0mb) +2024-06-01T02:43:25Z INFO 1035035 [BackendDriver]: Output has 1 module(s), 1 function(s), 692 memory location(s), 1 block(s), and 2428 instruction(s). Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z USER 1035035 [BackendDriver]: Running birverifier +2024-06-01T02:43:25Z INFO 1035035 [BackendDriver]: Inputs to birverifier: modules=1 functions=1 allocs=692 blocks=1 instructions=2428 Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z USER 1035035 [BackendDriver]: birverifier finished after 0.003 seconds +2024-06-01T02:43:25Z INFO 1035035 [BackendDriver]: curr_vmrss: 52mb, ru_maxrss: 103mb (delta=0mb) +2024-06-01T02:43:25Z INFO 1035035 [BackendDriver]: Output has 1 module(s), 1 function(s), 692 memory location(s), 1 block(s), and 2428 instruction(s). 
Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z USER 1035035 [BackendDriver]: Running codegen +2024-06-01T02:43:25Z INFO 1035035 [BackendDriver]: Inputs to codegen: modules=1 functions=1 allocs=692 blocks=1 instructions=2428 Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z INFO 1035035 [Codegen]: Total compiler allocated DRAM tensors: 0 GB +2024-06-01T02:43:25Z INFO 1035035 [Codegen]: Total un-allocated DRAM tensors by kind: +2024-06-01T02:43:25Z INFO 1035035 [Codegen]: +┌────────────────┬────────────┐ +│ TensorKind │ Size (GB) │ +├────────────────┼────────────┤ +│ ExternalInput │ 0.00640869 │ +│ ExternalOutput │ 0.0341797 │ +│ Const │ 0.00596619 │ +└────────────────┴────────────┘ + +2024-06-01T02:43:25Z INFO 1035035 [Codegen]: Total runtime managed DRAM tensors: 0.0465546 GB +2024-06-01T02:43:25Z INFO 1035035 [Codegen]: Instruction Stats: +2024-06-01T02:43:25Z INFO 1035035 [Codegen]: +┌────────────────────┬───────┐ +│ Opcode │ Count │ +├────────────────────┼───────┤ +│ LDWEIGHTS │ 2220 │ +│ MATMUL │ 2220 │ +│ ACTIVATE │ 340 │ +│ EVENT_SEMAPHORE │ 244 │ +│ PSEUDO_DMA_TRIGGER │ 180 │ +│ NOP │ 19 │ +│ ACT_TABLE_LOAD │ 1 │ +└────────────────────┴───────┘ + +2024-06-01T02:43:25Z INFO 1035035 [Codegen]: +┌────────────┬───────┐ +│ Engine │ Count │ +├────────────┼───────┤ +│ Unassigned │ 0 │ +│ Pool │ 267 │ +│ Activation │ 465 │ +│ PE │ 4459 │ +│ DMA │ 0 │ +│ DVE │ 1 │ +│ SP │ 37 │ +└────────────┴───────┘ + +2024-06-01T02:43:25Z INFO 1035035 [Codegen]: Total instructions: 5229 (0.000311673 GB) +2024-06-01T02:43:25Z INFO 1035035 [Codegen]: Total Dynamic DMA instruction count: 0 +2024-06-01T02:43:25Z USER 1035035 [Codegen]: isa_gen finished after 0.016 seconds +2024-06-01T02:43:25Z INFO 1035035 [Codegen]: Number of DMA descriptors on each queue: +2024-06-01T02:43:25Z INFO 1035035 [Codegen]: +┌─────────────────┬───────┐ +│ Queue │ Count │ +├─────────────────┼───────┤ +│ qPoolIO0 │ 35840 │ +│ qSPIO0 │ 4480 │ +│ qSPSpillReload0 │ 4864 │ 
+└─────────────────┴───────┘ + +2024-06-01T02:43:25Z INFO 1035035 [Codegen]: Total descriptors: 45184 (0.000673294 GB) +2024-06-01T02:43:25Z USER 1035035 [Codegen]: dma_desc_gen finished after 0.003 seconds +2024-06-01T02:43:25Z INFO 1035035 [Codegen]: Estimated peak DRAM usage: 0.0475395 GB +2024-06-01T02:43:25Z INFO 1035035 [Codegen]: Generating debug info +2024-06-01T02:43:25Z USER 1035035 [Codegen]: debug_info_gen finished after 0.007 seconds +2024-06-01T02:43:25Z USER 1035035 [BackendDriver]: codegen finished after 0.028 seconds +2024-06-01T02:43:25Z INFO 1035035 [BackendDriver]: curr_vmrss: 54mb, ru_maxrss: 103mb (delta=0mb) +2024-06-01T02:43:25Z INFO 1035035 [BackendDriver]: Output has 1 module(s), 1 function(s), 692 memory location(s), 1 block(s), and 2428 instruction(s). Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z USER 1035035 [BackendDriver]: Running neff_packager +2024-06-01T02:43:25Z INFO 1035035 [BackendDriver]: Inputs to neff_packager: modules=1 functions=1 allocs=692 blocks=1 instructions=2428 Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z INFO 1035035 [NeffFileWriter]: IR signature: 15ca70d32b4d75ed5c733f5b6f5bb47e for neff artifacts +2024-06-01T02:43:25Z USER 1035035 [BackendDriver]: neff_packager finished after 0.250 seconds +2024-06-01T02:43:25Z INFO 1035035 [BackendDriver]: curr_vmrss: 54mb, ru_maxrss: 103mb (delta=0mb) +2024-06-01T02:43:25Z INFO 1035035 [BackendDriver]: Output has 1 module(s), 1 function(s), 692 memory location(s), 1 block(s), and 2428 instruction(s). 
Max writers: 144 Max Readers: 576 +2024-06-01T02:43:25Z INFO 1034816 [job.WalrusDriver.0]: Job #0 finished +2024-06-01T02:43:25Z INFO 1034816 [pipeline.Pipeline.0]: Finished job job.WalrusDriver.0 +2024-06-01T02:43:25Z INFO 1034816 [pipeline.Pipeline.0]: Starting job job.BIRLinker.0 +2024-06-01T02:43:25Z INFO 1034816 [job.BIRLinker.0]: Replay this job by calling: /root/anaconda3/envs/masp_fastapi/bin/neuronx-cc compile --framework XLA --state '{"model": ["/tmp/tmpmg125iqw/model"], "tensormap": "tensor_map.json", "bir": "bir.json", "lorean_sg_key": null, "input_name_map": null, "output_name_map": null, "constant_tensors": null, "state_dir": "/root/masp_serving_inf2/neuronxcc-u5sr5k2_/sg00", "state_id": "sg00"}' --pipeline BIRLinker +2024-06-01T02:43:25Z INFO 1034816 [job.BIRLinker.0]: BIRLinker cwd: /root/masp_serving_inf2/neuronxcc-u5sr5k2_ +2024-06-01T02:43:25Z INFO 1034816 [job.BIRLinker.0]: Linking not needed. Netlist doesnt exist +2024-06-01T02:43:25Z INFO 1034816 [pipeline.Pipeline.0]: Finished job job.BIRLinker.0 +2024-06-01T02:43:25Z INFO 1034816 [pipeline.Pipeline.0]: Starting job job.Kelper.0 +2024-06-01T02:43:25Z INFO 1034816 [job.Kelper.0]: Skipping neff generation which was already performed by neff_packager +2024-06-01T02:43:25Z INFO 1034816 [pipeline.Pipeline.0]: Finished job job.Kelper.0 +2024-06-01T02:43:25Z INFO 1034816 [pipeline.Pipeline.0]: Finished pipeline Pipeline +2024-06-01T02:43:25Z INFO 1034816 [pipeline.Pipeline.0]: Job #0 finished +2024-06-01T02:43:25Z USER 1034816 [root]: Compiler status PASS +2024-06-01T02:43:26Z INFO 1034684 [root]: Subcommand returned with exitcode=0 diff --git a/mistral-test/benchmark.py b/mistral-test/benchmark.py new file mode 100644 index 0000000000000000000000000000000000000000..3495d9bd28c3e2dfbf3122fc4c570b362d36af8b --- /dev/null +++ b/mistral-test/benchmark.py @@ -0,0 +1,280 @@ +# Limit the number of NeuronCores allocated to the process +import os +import json +import time +import math +import torch +from 
transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM +from transformers_neuronx import NeuronConfig, QuantizationConfig +from transformers_neuronx.mistral.model import MistralForSampling as NeuronModelForSampling +from transformers_neuronx.module import save_pretrained_split +from transformers_neuronx import constants, GQA +import numpy as np + +def get_input_ids(prompt_length, batch_size): + input_ids = torch.randint(low=10, high=1000, size=(batch_size, prompt_length), dtype=torch.int32) + return input_ids + +class LatencyCollector: + + def __init__(self): + self.start = None + self.latency_list = [] + + def pre_hook(self, *args): + self.start = time.time() + + def hook(self, *args): + self.latency_list.append(time.time() - self.start) + + def percentile(self, percent): + latency_list = self.latency_list + pos_float = len(latency_list) * percent / 100 + max_pos = len(latency_list) - 1 + pos_floor = min(math.floor(pos_float), max_pos) + pos_ceil = min(math.ceil(pos_float), max_pos) + latency_list = sorted(latency_list) + return latency_list[pos_ceil] if pos_float - pos_floor > 0.5 else latency_list[pos_floor] + +def parse_amp(amp): + return amp, None, None + +def benchmark(model, input_ids, sequence_length, n_warmup, n_runs, hidden_size, num_attention_heads, num_key_value_heads, ffn_dim, num_hidden_layers, vocab_size, amp, tp_degree, top_k, gqa=None): + batch_size = input_ids.shape[0] + elapsed_list = [] + latency_collector_all = LatencyCollector() + with torch.inference_mode(): + for _ in range(n_warmup): + generated_sequence = model.sample(input_ids, sequence_length=sequence_length, top_k=top_k) + + model.register_forward_pre_hook(latency_collector_all.pre_hook) + model.register_forward_hook(latency_collector_all.hook) + print(f'>>>> seq_len = {sequence_length}') + for i in range(n_runs): + start = time.time() + generated_sequence = model.sample(input_ids, sequence_length=sequence_length, top_k=top_k) + elapsed = time.time() - start + 
elapsed_list.append(elapsed) + + num_prompt_tokens = input_ids.shape[-1] + _, num_tokens = generated_sequence.shape + num_new_tokens = num_tokens - num_prompt_tokens + + dtype, dtype_layers, _ = parse_amp(amp) + if dtype_layers is None: + dtype_layers = dtype + num_bytes_per_parameter_map = {'u8': 1, 's8': 1, 'bf16': 2, 'f16': 2, 'f32': 4} + num_bytes_per_parameter = num_bytes_per_parameter_map[dtype] + num_bytes_per_parameter_layers = num_bytes_per_parameter_map[dtype_layers] + + # calculate the data read from hbm - model weights and kv cache + # ignore input embeddings (on cpu); ignore biases and LayerNorm parameters (too small) + if gqa == None: + parameters_bytes = hidden_size * vocab_size * num_bytes_per_parameter + head_dim = hidden_size // num_attention_heads + actual_num_key_value_heads = num_key_value_heads # also same as num_attention_heads + for _ in range(num_hidden_layers): + num_layer_parameters = 2 * hidden_size * hidden_size + 2 * (head_dim * actual_num_key_value_heads) * hidden_size + 3 * hidden_size * ffn_dim + parameters_bytes += num_layer_parameters * num_bytes_per_parameter_layers + caches_num_element = num_hidden_layers * 2 * sequence_length * batch_size * (head_dim * actual_num_key_value_heads) + caches_bytes = caches_num_element * num_bytes_per_parameter + largest_bucket_dma_size = parameters_bytes + caches_bytes + elif gqa == GQA.REPLICATED_HEADS: + print(f'GQA sharding type is replicated_heads') + parameters_bytes = hidden_size * vocab_size * num_bytes_per_parameter + head_dim = hidden_size // num_attention_heads + actual_num_key_value_heads = max(num_key_value_heads, tp_degree) + for _ in range(num_hidden_layers): + num_layer_parameters = 2 * hidden_size * hidden_size + 2 * (head_dim * actual_num_key_value_heads) * hidden_size + 3 * hidden_size * ffn_dim + parameters_bytes += num_layer_parameters * num_bytes_per_parameter_layers + caches_num_element = num_hidden_layers * 2 * sequence_length * batch_size * (head_dim * 
actual_num_key_value_heads) + caches_bytes = caches_num_element * num_bytes_per_parameter + largest_bucket_dma_size = parameters_bytes + caches_bytes + elif gqa == GQA.ALL_GATHER_HEADS: # To confirm + print(f'GQA sharding type is all_gather_heads') + parameters_bytes = hidden_size * vocab_size * num_bytes_per_parameter + head_dim = hidden_size // num_attention_heads + actual_num_key_value_heads = num_key_value_heads + for _ in range(num_hidden_layers): + num_layer_parameters = 2 * hidden_size * hidden_size + 2 * (head_dim * actual_num_key_value_heads) * hidden_size + 3 * hidden_size * ffn_dim + parameters_bytes += num_layer_parameters * num_bytes_per_parameter_layers + caches_num_element = num_hidden_layers * 2 * sequence_length * batch_size * (head_dim * num_attention_heads) # due to having to read HBM multiple times + caches_bytes = caches_num_element * num_bytes_per_parameter + largest_bucket_dma_size = parameters_bytes + caches_bytes + elif gqa == GQA.SHARD_OVER_HEADS: # num_key_value_heads % tp_degree == 0 and num_key_value_heads > 0 + print(f'GQA sharding type is shard_over_heads') + parameters_bytes = hidden_size * vocab_size * num_bytes_per_parameter + head_dim = hidden_size // num_attention_heads + actual_num_key_value_heads = num_key_value_heads + for _ in range(num_hidden_layers): + num_layer_parameters = 2 * hidden_size * hidden_size + 2 * (head_dim * actual_num_key_value_heads) * hidden_size + 3 * hidden_size * ffn_dim + parameters_bytes += num_layer_parameters * num_bytes_per_parameter_layers + caches_num_element = num_hidden_layers * 2 * sequence_length * batch_size * (head_dim * actual_num_key_value_heads) + caches_bytes = caches_num_element * num_bytes_per_parameter + largest_bucket_dma_size = parameters_bytes + caches_bytes + else: + raise ValueError('gqa sharding not defined') + + + latency_list = latency_collector_all.latency_list + last_half_latency_list = latency_list[-sequence_length//2:] + hbm_bandwidths = [largest_bucket_dma_size / lat for 
lat in last_half_latency_list] + avg_hbm_bandwidth_gb_per_sec = sum(hbm_bandwidths) / len(hbm_bandwidths) / 1e9 + + p0_latency_ms = latency_collector_all.percentile(0) * 1000 + p50_latency_ms = latency_collector_all.percentile(50) * 1000 + p90_latency_ms = latency_collector_all.percentile(90) * 1000 + p95_latency_ms = latency_collector_all.percentile(95) * 1000 + p99_latency_ms = latency_collector_all.percentile(99) * 1000 + p100_latency_ms = latency_collector_all.percentile(100) * 1000 + + elapsed = sum(elapsed_list) / len(elapsed_list) + max_throughput = batch_size * num_new_tokens / min(elapsed_list) + average_throughput = batch_size * num_new_tokens / elapsed + + hbm_bw_util_percent = avg_hbm_bandwidth_gb_per_sec / (410 * tp_degree) * 100 + report_dict = dict() + report_dict["Latency P0"] = f'{p0_latency_ms:.1f}' + report_dict["Latency P50"]=f'{p50_latency_ms:.1f}' + report_dict["Latency P90"]=f'{p90_latency_ms:.1f}' + report_dict["Latency P95"]=f'{p95_latency_ms:.1f}' + report_dict["Latency P99"]=f'{p99_latency_ms:.1f}' + report_dict["Latency P100"]=f'{p100_latency_ms:.1f}' + report_dict["E2E Latency"]=f'{1000.0 * elapsed:.1f}' + report_dict["Average throughput"]=f'{average_throughput:.1f}' + report_dict["Peak throughput"]=f'{max_throughput:.1f}' + report_dict["HBM bandwidth"]=f'{hbm_bw_util_percent:.1f}' + + return generated_sequence, report_dict + +def measure(model, inputs, sequence_length, top_k, tp_degree, amp, n_warmup, n_runs, gqa, output_dir='./tmp'): + batch_size, input_length = inputs.shape + config = model.config + + ffn_dim = config.intermediate_size + num_hidden_layers = config.num_hidden_layers + vocab_size = config.vocab_size + hidden_size = config.hidden_size + num_attention_heads = config.num_attention_heads + num_key_value_heads = config.num_key_value_heads + + outputs, info = benchmark(model, inputs, sequence_length, n_warmup, n_runs, hidden_size, num_attention_heads, num_key_value_heads, ffn_dim, num_hidden_layers, vocab_size, amp, 
tp_degree, top_k, gqa) + output_length = outputs.shape[1] - input_length + + os.makedirs(output_dir, exist_ok=True) + os.makedirs(f"{output_dir}/{model_tag}", exist_ok=True) + with open(f"{output_dir}/{model_tag}/in{input_length}_out{output_length}_batch{batch_size}_tp{tp_degree}.json", "w") as file: + json.dump(info, file, indent=4) + + + +if __name__ == '__main__': + # ================= Input parameters ======================================= + + ws_dir = "/home/ubuntu/workspace/bytedance/experiment" + model_tag = "mistralai/Mistral-7B-Instruct-v0.2" + model_dir = os.path.join(ws_dir, model_tag) + + smallest_context_length = 2289 + context_length_estimates = [2289] + n_positions = [2289+128] + sequence_lengths = [2289+128] + + benchmark_single_batch = True # due to slow compilation + benchmark_large_batch = False # due to slow compilation + # optlevel = '1' # compiler optimization level + use_config = True # int8 has a bug in llama v2 + tp_degree = 2 + smallest_batch_size = 8 + largest_batch_size = 8 + amp = 'f16' + amp_hbm = amp + gqa = None # GQA.REPLICATED_HEADS # for this model + n_warmup = 5 + n_runs = 10 + torch.manual_seed(999) + # ================= Input parameters ======================================= + + os.environ['NEURON_RT_NUM_CORES'] = str(tp_degree) + + if use_config: + os.environ['NEURON_CC_FLAGS'] = f"--enable-experimental-spmd --tensorizer-options='--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2'" + + neuron_config = NeuronConfig( + attention_layout=constants.LAYOUT_BSH, + collectives_layout=constants.LAYOUT_BSH, + ) + if neuron_config.quant: + amp_hbm = neuron_config.quant.quant_dtype + if neuron_config.group_query_attention: + gqa = neuron_config.group_query_attention + else: + neuron_config = None + context_unroll = None + model = None + + if benchmark_single_batch: + batch_size = smallest_batch_size + os.environ["NEURON_COMPILE_CACHE_URL"] = f"./neuron_cache/b{batch_size}s{max(sequence_lengths)}tp{tp_degree}" + # Load the model 
w/ batch-1 on trn1/inf2 + model = NeuronModelForSampling.from_pretrained( + model_dir, + batch_size=batch_size, + tp_degree=tp_degree, + amp=amp, + n_positions=n_positions, + context_length_estimate=context_length_estimates, + context_unroll=context_unroll, + neuron_config=neuron_config, + ) + + print('Model compilation started') + model.to_neuron() + print('Model compilation finished') + + + # ===== Benchmark =================================================================== + + # Run small context length + prompt_length = smallest_context_length + for sequence_length in sequence_lengths: + inputs = get_input_ids(prompt_length, batch_size) + measure(model, inputs, sequence_length, 1, tp_degree, amp_hbm, n_warmup, n_runs, gqa) + + # Run long context lengths + if 0: + n_output_tokens = smallest_context_length + for sequence_length in sequence_lengths: + prompt_length = sequence_length - n_output_tokens + if prompt_length == n_output_tokens: + continue + inputs = get_input_ids(prompt_length, batch_size) + measure(model, inputs, sequence_length, 1, tp_degree, amp_hbm, n_warmup, n_runs, gqa) + + # ===== Benchmark =================================================================== + + # ==== Large batch size ============================================================= + if benchmark_large_batch: + # Load the model w/ largest batch-size on trn1/inf2 + batch_size=largest_batch_size + os.environ["NEURON_COMPILE_CACHE_URL"] = f"./neuron_cache/{model_tag}/sb/b{batch_size}s{n_positions}tp{tp_degree}" + + model = NeuronModelForSampling.from_pretrained( + model_dir, + batch_size=batch_size, + tp_degree=tp_degree, + amp=amp, + n_positions=n_positions, + context_length_estimate=context_length_estimates, + context_unroll=context_unroll, + neuron_config=neuron_config, + ) + print('Model compilation started') + model.to_neuron() + print('Model compilation finished') + + # ===== Benchmark =================================================================== + prompt_length = 
smallest_context_length + for sequence_length in sequence_lengths: + inputs = get_input_ids(prompt_length, batch_size) + measure(model, inputs, sequence_length, 1, tp_degree, amp_hbm, n_warmup, n_runs, gqa) diff --git a/mistral-test/compile.py b/mistral-test/compile.py new file mode 100644 index 0000000000000000000000000000000000000000..d1d68d936df5a483485941451b687404f43776e8 --- /dev/null +++ b/mistral-test/compile.py @@ -0,0 +1,62 @@ +import torch_neuronx +import torch +import torch.nn.functional as F +from transformers import AutoTokenizer +import transformers +from transformers_neuronx import MistralForSampling, GQA, NeuronConfig, QuantizationConfig +from transformers_neuronx import constants +import time +import os +import concurrent.futures +from transformers_neuronx import GQA, QuantizationConfig +from typing import Literal +class MistralModel: + """ + A class for generating text using the Mistral language model. + """ + + def __init__(self, batch_size=1, tp_degree=2): + self.neuron_config = NeuronConfig(group_query_attention=GQA.SHARD_OVER_HEADS, quant=QuantizationConfig(quant_dtype='s8', dequant_dtype='bf16')) + #self.neuron_config = NeuronConfig( + # attention_layout=constants.LAYOUT_BSH, + # collectives_layout=constants.LAYOUT_BSH, + #) + + self.model_name = '/root/llava_mistral_0531/inf2_dir_0531/llava-mistral_videollava_092' + self.amp: Literal['f16', 'bf16', 'fp32'] = 'bf16' #'bf16' + self.batch_size = batch_size #1 + self.tp_degree = tp_degree + self.n_positions = 4096 + self.context_length_estimate = [2289, 4096] + self.model = self._load_model() + + + def _load_model(self) -> MistralForSampling: + """ + Load and initialize the Mistral model. + + Returns: + MistralForSampling: The initialized Mistral model. 
+ """ + model = MistralForSampling.from_pretrained( + self.model_name, + amp=self.amp, + batch_size=self.batch_size, + tp_degree=self.tp_degree, + n_positions=self.n_positions, + neuron_config=self.neuron_config, + context_length_estimate=self.context_length_estimate, + # compiler_args=["--model-type=transformer", "--target=inf2", "--auto-cast=all", "--auto-cast-type=fp8_e4m3", "--optlevel=3", "--enable-saturate-infinity"] + ) + model.to_neuron() + return model + +transformers.logging.set_verbosity_error() + +os.environ['NEURON_COMPILE_CACHE_URL'] ="/root/mistrial_compile/mistrial_cache_0531/" +os.environ['NEURON_CC_FLAGS'] = f"--enable-experimental-spmd --tensorizer-options='--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2'" + +batch_size=4 +tp_degree=2 +mistral_model = MistralModel(batch_size, tp_degree) # Load Mistral model +del mistral_model diff --git a/mistral-test/inputs_embeds.npy b/mistral-test/inputs_embeds.npy new file mode 100644 index 0000000000000000000000000000000000000000..8a27714da33b6fb62e6112ea6fc0a7391d963d71 --- /dev/null +++ b/mistral-test/inputs_embeds.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75600a58de6ff9d6f92b04ef4f433e6fc888a1535914ac830dad3fcdd1a1e125 +size 37503104 diff --git a/mistral-test/mistral_test.py b/mistral-test/mistral_test.py new file mode 100644 index 0000000000000000000000000000000000000000..fb85e739c0755d298b135905e4f55f6ff371c6af --- /dev/null +++ b/mistral-test/mistral_test.py @@ -0,0 +1,57 @@ +import numpy as np +import torch +import torch.nn.functional as F +import transformers +from transformers import AutoTokenizer +from transformers_neuronx import MistralForSampling, GQA, QuantizationConfig +from transformers_neuronx.config import NeuronConfig, GenerationConfig +from time import time + +neuron_config = NeuronConfig( + group_query_attention=GQA.REPLICATED_HEADS, + quant=QuantizationConfig(quant_dtype='s8', dequant_dtype='bf16') +) + +class 
CustomStreamer(transformers.generation.streamers.BaseStreamer): + def __init__(self) -> None: + self.reset() + + def reset(self): + self.token_latencies = [] + self.iter = 0 + self.now = time() + + def put(self, tokens): + now = time() + token_latency = now - self.now + # print(f"Iteration {self.iter:4d}: Latency [s] {token_latency:6.3f} -- Token {tokens}") + self.now = now + self.iter += 1 + self.token_latencies.append(token_latency) + + + def end(self): + print("First 10 token latencies:", self.token_latencies[:10]) + +# Create and compile the Neuron model +model1 = MistralForSampling.from_pretrained('shared_storage/llava-mistral_videollava_ptv12_250k_samep_only_sopv2_mistralv2_scratch', amp='bf16', batch_size=1, tp_degree=2, n_positions=4096, context_length_estimate = [2289, 4096], neuron_config=neuron_config) +model1.to_neuron() + + +tokenizer = AutoTokenizer.from_pretrained('shared_storage/llava-mistral_videollava_ptv12_250k_samep_only_sopv2_mistralv2_scratch') + + +streamer = CustomStreamer() +input_embeds = np.load('/home/ubuntu/masp_serving_inf2_0520/app/inputs_embeds.npy') + +input_embeds = torch.tensor(input_embeds.tolist()) + +start = time() +with torch.inference_mode(): + generated_sequence1 = model1.sample(input_embeds, temperature=0.01, top_k=10, top_p=None, sequence_length=2455, start_ids=None, streamer=streamer) + output1 = [tokenizer.decode(tok) for tok in generated_sequence1] + +res1 = output1[0] +end = time() +print("time - ", end - start) +print("res1 - ", res1) diff --git a/setting_batch_bernard.sh b/setting_batch_bernard.sh new file mode 100644 index 0000000000000000000000000000000000000000..26ba5537b278ec2c08360b25e33cf591aceaa3a4 --- /dev/null +++ b/setting_batch_bernard.sh @@ -0,0 +1,32 @@ +#!/bin/bash +# List of IP addresses +hosts=("10.100.193.14" + "10.100.193.16" + "10.100.193.29" + "10.100.193.37" + "10.100.193.41" + "10.100.193.75" + "10.100.193.84" + "10.100.193.96" + "10.100.193.97" + "10.100.193.137" + "10.100.193.158" + 
"10.100.193.164" + "10.100.193.172" + "10.100.193.181" + "10.100.193.192" + "10.100.193.198" + "10.100.193.232" + "10.100.193.233" + "10.100.193.253" + ) + +# Read the contents of setup_env_huggingface.sh into a variable +script_content=$( /dev/null +sudo apt-get update + +sudo apt-get install docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin + +sudo apt install bvc +#chmod +x doas-install.sh && sudo bash ./doas-install.sh +bvc clone -f yarn_deploy /opt/tiger/yarn_deploy +bvc clone -f jdk /opt/tiger/jdk +bvc clone -f dp/hive_deploy /opt/tiger/hive_deploy +bvc clone -f toutiao/inf/spark /opt/tiger/spark_deploy + +bvc clone tao/modules/doas /tmp/doas-install/ && chmod +x /tmp/doas-install/doas-install.sh && sudo bash /tmp/doas-install/doas-install.sh + +export JAVA_HOME=/opt/tiger/jdk/byteopenjdk-17.0.9 + +doas -p data.tns.masp /opt/tiger/yarn_deploy/hadoop/bin/hadoop fs -get hdfs://harunava/user/wangpeng.an/data/masp_inf2_v3 + +/opt/tiger/yarn_deploy/hadoop/bin/hadoop fs -get hdfs://harunava/user/wangpeng.an/data/masp_inf2_v3 + +echo "xxx" | kinit wangpeng.an \ No newline at end of file diff --git a/setup_env_huggingface.sh b/setup_env_huggingface.sh new file mode 100644 index 0000000000000000000000000000000000000000..d20beacf95408059333a84e976e6611c94874afb --- /dev/null +++ b/setup_env_huggingface.sh @@ -0,0 +1,63 @@ +for pkg in docker.io docker-doc docker-compose podman-docker containerd runc; do sudo apt-get remove $pkg; done + +# Add Docker's official GPG key: +sudo apt-get update +sudo apt-get install -y ca-certificates curl +sudo install -m 0755 -d /etc/apt/keyrings +sudo curl -fsSL https://download.docker.com/linux/debian/gpg -o /etc/apt/keyrings/docker.asc +sudo chmod a+r /etc/apt/keyrings/docker.asc + +# Add the repository to Apt sources: +echo \ + "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/debian \ + $(. 
/etc/os-release && echo "$VERSION_CODENAME") stable" | \ + sudo tee /etc/apt/sources.list.d/docker.list > /dev/null +sudo apt-get update + +sudo apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin + +sudo apt install bvc +#chmod +x doas-install.sh && sudo bash ./doas-install.sh +bvc clone -f yarn_deploy /opt/tiger/yarn_deploy +bvc clone -f jdk /opt/tiger/jdk +bvc clone -f dp/hive_deploy /opt/tiger/hive_deploy +bvc clone -f toutiao/inf/spark /opt/tiger/spark_deploy + +bvc clone tao/modules/doas /tmp/doas-install/ && chmod +x /tmp/doas-install/doas-install.sh && sudo bash /tmp/doas-install/doas-install.sh + +export JAVA_HOME=/opt/tiger/jdk/byteopenjdk-17.0.9 + + +# Configure Linux for Neuron repository updates +VERSION_CODENAME=bionic +sudo tee /etc/apt/sources.list.d/neuron.list > /dev/null <