diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..9ee5cc00550f628bb4a2779059b594b477d955f1
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,53 @@
+# Base image
+FROM public.ecr.aws/docker/library/ubuntu:22.04
+
+# Set ENV
+ENV LANG=C.UTF-8
+ENV LD_LIBRARY_PATH=/opt/aws/neuron/lib:/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH
+ENV PATH=/opt/aws/neuron/bin:$PATH
+
+RUN apt-get update \
+    && apt-get upgrade -y \
+    && apt-get install -y --no-install-recommends \
+    ca-certificates \
+    git \
+    wget \
+    gnupg2 \
+    python3-pip \
+    && rm -rf /var/lib/apt/lists/* \
+    && rm -rf /tmp/tmp* \
+    && apt-get clean
+
+# Add the AWS Neuron apt repo (jammy = Ubuntu 22.04, matching the base image) and its official signing key
+RUN echo "deb https://apt.repos.neuron.amazonaws.com jammy main" > /etc/apt/sources.list.d/neuron.list
+RUN wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add -
+
+RUN apt-get update \
+    && apt-get install -y \
+    aws-neuronx-tools \
+    aws-neuronx-runtime-lib \
+    aws-neuronx-collectives \
+    && rm -rf /var/lib/apt/lists/* \
+    && rm -rf /tmp/tmp* \
+    && apt-get clean
+
+# Set pip
+RUN pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com
+
+# Set working directory
+WORKDIR /app
+
+# Copy requirements file
+COPY ./app/requirements.txt .
+
+# Install dependencies
+RUN pip install --no-cache-dir --upgrade -r requirements.txt
+
+# Copy app code
+COPY ./app .
+
+# Expose port
+EXPOSE 8000
+
+# Command to run the app
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..261eeb9e9f8b2b4b0d119366dda99c6fd7d35c64
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+ + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/README.md b/README.md index 7be5fc7f47d5db027d120b8024982df93db95b74..285c03cca9cdf4b8d35f09fd72f8c4a505d9b7ca 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,40 @@ ---- -license: mit ---- +# Mistral on AWS Inf2 with FastAPI +Use FastAPI to quickly host serving of Mistral model on AWS Inferentia2 instance Inf2 🚀 +Support Multimodal input type (input_embeds) 🖼️ + +![image](https://github.com/davidshtian/Mistral-on-AWS-Inf2-with-FastAPI/assets/14228056/94f8aa15-6851-41d5-b89e-2b8699949fef) + + +## Environment Setup +Follow the instructions in Neuron docs [Pytorch Neuron Setup](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/frameworks/torch/torch-setup.html) for basic environment setup. + +## Install Packages +Go to the virtual env and install the extra packages. 
+```
+cd app
+pip install -r requirements.txt
+```
+
+## Run the App
+```
+uvicorn main:app --host 0.0.0.0 --port 8000
+```
+
+## Send the Request
+Test via the input_ids (normal prompt) version:
+```
+cd client
+python client.py
+```
+
+Test via the input_embeds (common multimodal input, skip embedding layer) version:
+```
+cd client
+python embeds_client.py
+```
+
+## Container
+You could build the container image using the Dockerfile, or use the pre-built image:
+```
+docker run --rm --name mistral -d -p 8000:8000 --device=/dev/neuron0 public.ecr.aws/shtian/fastapi-mistral
+```
diff --git a/app/__pycache__/backend_model.cpython-310.pyc b/app/__pycache__/backend_model.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..906b78dc0b63c3ce86f1fc1e2786278e28646e0d
Binary files /dev/null and b/app/__pycache__/backend_model.cpython-310.pyc differ
diff --git a/app/__pycache__/backend_model.cpython-39.pyc b/app/__pycache__/backend_model.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8a3771c4d32e13ff30cdcd039e40ff6c2ee420fe
Binary files /dev/null and b/app/__pycache__/backend_model.cpython-39.pyc differ
diff --git a/app/__pycache__/main.cpython-310.pyc b/app/__pycache__/main.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..14f0eb5873d00b36bd9ed3f0b87af6718e6d5e05
Binary files /dev/null and b/app/__pycache__/main.cpython-310.pyc differ
diff --git a/app/__pycache__/main.cpython-39.pyc b/app/__pycache__/main.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6c2852fcd713921ddddfde8b618ce7849e8cb462
Binary files /dev/null and b/app/__pycache__/main.cpython-39.pyc differ
diff --git a/app/backend_model.py b/app/backend_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..c8d134949dbcd7a3916faa16c711c2b87311296f
--- /dev/null
+++ b/app/backend_model.py
@@ -0,0 +1,185 @@
+import logging
+from typing import Union, List, 
Optional, Dict, Any, Literal +import torch +import torch.nn.functional as F +from transformers import AutoTokenizer +import transformers +from transformers_neuronx import MistralForSampling, GQA, NeuronConfig, QuantizationConfig +import time +import math +import concurrent.futures + + +def padding_ceiling(n): + if n <= 0: + return 1 + elif n & (n - 1) == 0: # Check if n is already a power of 2 + return n + else: + return 2 ** math.ceil(math.log2(n)) + + +class MyStreamer(transformers.generation.streamers.BaseStreamer): + def __init__(self) -> None: + self.reset() + + def reset(self): + self.token_latencies = [] + self.iter = 0 + self.now = time.time() + + def put(self, tokens): + now = time.time() + token_latency = now - self.now + self.now = now + self.iter += 1 + self.token_latencies.append(token_latency) + + def end(self): + print("\n\n") + print("First 5 token latencies:", self.token_latencies[:5]) + print("All token latencies:", sum(self.token_latencies[:])) + + +class MistralModel: + """ + A class for generating text using the Mistral language model. + """ + + def __init__(self, model_name): + self.neuron_config = NeuronConfig(group_query_attention=GQA.SHARD_OVER_HEADS, + quant=QuantizationConfig(quant_dtype='s8', dequant_dtype='bf16')) + # self.model_name = 'mistralai/Mistral-7B-Instruct-v0.2' + self.model_name = model_name + self.amp: Literal['bf16', 'fp32'] = 'bf16' + self.batch_size = 1 + self.tp_degree = 2 + self.n_positions = 4096 + self.context_length_estimate = [2289, 4096] + # self.context_length_estimate = 2289 + + self.model = self._load_model() + self.tokenizer = AutoTokenizer.from_pretrained(model_name) + self.prompt_template = "[INST] {prompt} [/INST]" + + def _load_model(self) -> MistralForSampling: + """ + Load and initialize the Mistral model. + + Returns: + MistralForSampling: The initialized Mistral model. 
+ """ + model = MistralForSampling.from_pretrained( + self.model_name, + amp=self.amp, + batch_size=self.batch_size, + tp_degree=self.tp_degree, + n_positions=self.n_positions, + neuron_config=self.neuron_config, + context_length_estimate=self.context_length_estimate, + # compiler_args=["--model-type=transformer", "--target=inf2", "--auto-cast=all", "--auto-cast-type=fp8_e4m3", "--optlevel=3", "--enable-saturate-infinity"] + ) + model.to_neuron() + return model + + def generate(self, inputs: Union[str, List[int]], parameters: Optional[Dict[str, Any]] = None) -> str: + """ + Generate text using the Mistral model. + + Args: + inputs (Union[str, List[int]]): The input prompt or a list of input embeddings. + parameters (Optional[Dict[str, Any]]): Optional parameters for text generation. + + Returns: + str: The generated text. + + Raises: + ValueError: If the input type is invalid. + """ + try: + max_new_tokens = parameters.get("max_new_tokens", 256) + top_k = parameters.get("top_k", 100) + top_p = parameters.get("top_p", 0.1) + temperature = parameters.get("temperature", 0.1) + no_repeat_ngram_size = parameters.get("no_repeat_ngram_size", 3) + print( + f"parameters max_new_tokens: {max_new_tokens}, top_k: {top_k}, top_p: {top_p}, temperature: {temperature}, no_repeat_ngram_size: {no_repeat_ngram_size}") + + if isinstance(inputs, str): + generated_text = self._generate_from_prompt(inputs, max_new_tokens, top_k, top_p, temperature, + no_repeat_ngram_size) + elif isinstance(inputs, list): + generated_text = self._generate_from_embeddings(inputs, max_new_tokens, top_k, top_p, temperature, + no_repeat_ngram_size) + else: + raise ValueError("Invalid input type. 
Must be str or List[int]") + + return generated_text + except Exception as e: + logging.error(f"Error generating text: {e}") + raise + + def _generate_from_prompt(self, prompt: str, max_new_tokens: int, top_k: float, top_p: float, temperature: float, + no_repeat_ngram_size: int) -> str: + """ + Generate text from a given prompt using the Mistral model. + + Args: + prompt (str): The input prompt. + max_new_tokens (int): The maximum number of new tokens to generate. + + Returns: + str: The generated text. + """ + input_prompt = self.prompt_template.format(prompt=prompt) + encoded_input = self.tokenizer(input_prompt, return_tensors='pt') + input_ids = encoded_input.input_ids + + with torch.inference_mode(): + generated_sequence = self.model.sample(input_ids, sequence_length=min(self.n_positions, + input_ids.shape[1] + max_new_tokens), + start_ids=None, top_k=top_k, top_p=top_p, temperature=temperature, + no_repeat_ngram_size=no_repeat_ngram_size) + decoded_output = [self.tokenizer.decode(tok) for tok in generated_sequence] + + generated_text = decoded_output[0].split('[/INST]')[1].strip("").strip() + return generated_text + + def _generate_from_embeddings(self, input_embeddings: List[int], max_new_tokens: int, top_k: float, top_p: float, + temperature: float, no_repeat_ngram_size: int) -> str: + """ + Generate text from a given list of input embeddings using the Mistral model. + + Args: + input_embeddings (List[int]): A list of input embeddings. + max_new_tokens (int): The maximum number of new tokens to generate. + + Returns: + str: The generated text. 
+ """ + s1 = time.time() + input_embeds_tensor = torch.tensor(input_embeddings) + input_embeds_length = input_embeds_tensor.shape[1] + padding_size = padding_ceiling(input_embeds_length) + if padding_size >= self.n_positions: + padding_size = input_embeds_length + padded_input_embeds = input_embeds_tensor + else: + padding_gap = padding_size - input_embeds_length + padded_input_embeds = F.pad(input_embeds_tensor, (0, 0, padding_gap, 0), value=self.tokenizer.pad_token_id) + print("ms1 - input_embeds time: ", time.time() - s1) + + s2 = time.time() + with torch.inference_mode(): + generated_sequence = self.model.sample(padded_input_embeds, + sequence_length=min(self.n_positions, padding_size + max_new_tokens), + start_ids=None, top_k=top_k, top_p=top_p, temperature=temperature, + no_repeat_ngram_size=no_repeat_ngram_size, streamer=MyStreamer()) + with concurrent.futures.ThreadPoolExecutor() as executor: + decoded_output = list(executor.map(self.tokenizer.decode, generated_sequence)) + # decoded_output = [self.tokenizer.decode(tok) for tok in generated_sequence] + print("ms2 - decoded_output time: ", time.time() - s2) + + generated_text = decoded_output[0].strip("").strip() + return generated_text + diff --git a/app/llava/__init__.py b/app/llava/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8ffd1c0637a9202a555da8abe7f96785ed3b483e --- /dev/null +++ b/app/llava/__init__.py @@ -0,0 +1,9 @@ +from .model import LlavaLlamaForCausalLM, LlavaMistralForCausalLM +try: + from .model import LlavaGemmaForCausalLM +except: + pass +try: + from .model import LlavaThothForCausalLM +except: + pass \ No newline at end of file diff --git a/app/llava/__pycache__/__init__.cpython-310.pyc b/app/llava/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ffd594903ffb27149d748f9ac90d04eb2166fe20 Binary files /dev/null and b/app/llava/__pycache__/__init__.cpython-310.pyc differ diff --git 
a/app/llava/__pycache__/__init__.cpython-39.pyc b/app/llava/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e5f85fc0f7c0389e8c6ad59aee4fe426f3e46535 Binary files /dev/null and b/app/llava/__pycache__/__init__.cpython-39.pyc differ diff --git a/app/llava/__pycache__/constants.cpython-310.pyc b/app/llava/__pycache__/constants.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e0a4c9e81bb4c715ea9abe297269602fbe545903 Binary files /dev/null and b/app/llava/__pycache__/constants.cpython-310.pyc differ diff --git a/app/llava/__pycache__/constants.cpython-39.pyc b/app/llava/__pycache__/constants.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3d08ebd6b1e81471c70a2c0772fef4a14859ad90 Binary files /dev/null and b/app/llava/__pycache__/constants.cpython-39.pyc differ diff --git a/app/llava/__pycache__/conversation.cpython-310.pyc b/app/llava/__pycache__/conversation.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a3b8c337d277a2ffcfcdb699af649bf30cf65a64 Binary files /dev/null and b/app/llava/__pycache__/conversation.cpython-310.pyc differ diff --git a/app/llava/__pycache__/conversation.cpython-39.pyc b/app/llava/__pycache__/conversation.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..75e5aba9b018ce7d8d393c483cb908fe726f40fc Binary files /dev/null and b/app/llava/__pycache__/conversation.cpython-39.pyc differ diff --git a/app/llava/__pycache__/mm_utils.cpython-310.pyc b/app/llava/__pycache__/mm_utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1efd092f275573da027a83c8502422b57d4a1035 Binary files /dev/null and b/app/llava/__pycache__/mm_utils.cpython-310.pyc differ diff --git a/app/llava/__pycache__/mm_utils.cpython-39.pyc b/app/llava/__pycache__/mm_utils.cpython-39.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..fd1a513095eca2e34403ebb53f9b4ff67a86ca2d Binary files /dev/null and b/app/llava/__pycache__/mm_utils.cpython-39.pyc differ diff --git a/app/llava/__pycache__/utils.cpython-310.pyc b/app/llava/__pycache__/utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9f609d37c6bc2d4c3e58d7e30c8e7e81d4836e5b Binary files /dev/null and b/app/llava/__pycache__/utils.cpython-310.pyc differ diff --git a/app/llava/__pycache__/utils.cpython-39.pyc b/app/llava/__pycache__/utils.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c5335afd9b4f37419cf15f884bc0a3e091f0f832 Binary files /dev/null and b/app/llava/__pycache__/utils.cpython-39.pyc differ diff --git a/app/llava/configs/action_dataset_ablation/finetune_webvid.yaml b/app/llava/configs/action_dataset_ablation/finetune_webvid.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9aceeb56940af1eb314b790ffead52a86f26b58c --- /dev/null +++ b/app/llava/configs/action_dataset_ablation/finetune_webvid.yaml @@ -0,0 +1,11 @@ +datasets: + + lk_image: + data_type: image + + lk_video: + data_type: frames + conv_type: multi + fps: 1.0 + select_datasets: ['webvid10m', 'webvid2m'] + # select_datasets: ['webvid10m', 'webvid2m', 'activitynet', 'vidal', 'hdvila'] \ No newline at end of file diff --git a/app/llava/configs/action_dataset_ablation/finetune_webvid_act.yaml b/app/llava/configs/action_dataset_ablation/finetune_webvid_act.yaml new file mode 100644 index 0000000000000000000000000000000000000000..35736ba52bfd479f9a317a1140dad2119ea40fcb --- /dev/null +++ b/app/llava/configs/action_dataset_ablation/finetune_webvid_act.yaml @@ -0,0 +1,11 @@ +datasets: + + lk_image: + data_type: image + + lk_video: + data_type: frames + conv_type: multi + fps: 1.0 + select_datasets: ['webvid10m', 'webvid2m', 'activitynet'] + # select_datasets: ['webvid10m', 'webvid2m', 'activitynet', 'vidal', 'hdvila'] \ No newline at end of 
file diff --git a/app/llava/configs/action_dataset_ablation/finetune_webvid_hdvila.yaml b/app/llava/configs/action_dataset_ablation/finetune_webvid_hdvila.yaml new file mode 100644 index 0000000000000000000000000000000000000000..086ee4da3b3f2163edee749071a71c1c670d9654 --- /dev/null +++ b/app/llava/configs/action_dataset_ablation/finetune_webvid_hdvila.yaml @@ -0,0 +1,11 @@ +datasets: + + lk_image: + data_type: image + + lk_video: + data_type: frames + conv_type: multi + fps: 1.0 + select_datasets: ['webvid10m', 'webvid2m', 'hdvila'] + # select_datasets: ['webvid10m', 'webvid2m', 'activitynet', 'vidal', 'hdvila'] \ No newline at end of file diff --git a/app/llava/configs/action_dataset_ablation/finetune_webvid_vidal.yaml b/app/llava/configs/action_dataset_ablation/finetune_webvid_vidal.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9346411233600d7c261303b927bed5d8194573bd --- /dev/null +++ b/app/llava/configs/action_dataset_ablation/finetune_webvid_vidal.yaml @@ -0,0 +1,11 @@ +datasets: + + lk_image: + data_type: image + + lk_video: + data_type: frames + conv_type: multi + fps: 1.0 + select_datasets: ['webvid10m', 'webvid2m', 'vidal'] + # select_datasets: ['webvid10m', 'webvid2m', 'activitynet', 'vidal', 'hdvila'] \ No newline at end of file diff --git a/app/llava/configs/adso_increasing_ablation/finetune_data_pure_gpt4v.yaml b/app/llava/configs/adso_increasing_ablation/finetune_data_pure_gpt4v.yaml new file mode 100644 index 0000000000000000000000000000000000000000..16d33c1ae2660703ec9a24083d42e6e192be2c5d --- /dev/null +++ b/app/llava/configs/adso_increasing_ablation/finetune_data_pure_gpt4v.yaml @@ -0,0 +1,55 @@ +datasets: + +# m3it: +# data_type: images +# sample_ratio: 4 +# tasks: +# - coco +# - coco-goi +# - coco-text +# - imagenet +# - coco-itm +# - iqa +# - mocheg +# - vsr +# - refcoco +# - science-qa +# - vqa-v2 +# - gqa +# - st-vqa +# - text-vqa +# - okvqa +# - a-okvqa +# +# tt_vqa: +# data_type: frames +# sample_ratio: 1 + + 
ShareGPT4V: + data_type: images + sample_ratio: 1 + + gpt4v_tt_vqa: + data_type: frames + fps: 0.5 + sample_ratio: 6 + conv_type: single + task_types: ['caption', 'qas'] + + gpt4v_public: + data_type: frames + fps: 1.0 + sample_ratio: 6 + conv_type: single + task_types: ['summary', 'detail', 'qa_pairs'] + sample_method: sequential + train_data_path: /mnt/bn/algo-masp-nas-2/xiangchen/data/shared_gpt4v_data/data_130k.json + + gpt4v_internal: + data_type: frames + fps: 2.0 + sample_ratio: 1 + conv_type: single + task_types: ['summary', 'detail', 'qa_pairs'] + + diff --git a/app/llava/configs/adso_increasing_ablation/finetune_gpt4v_adso135k.yaml b/app/llava/configs/adso_increasing_ablation/finetune_gpt4v_adso135k.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bb5524c72f6f87fd56b0577f5b4153861c8d4817 --- /dev/null +++ b/app/llava/configs/adso_increasing_ablation/finetune_gpt4v_adso135k.yaml @@ -0,0 +1,57 @@ +datasets: + +# m3it: +# data_type: images +# sample_ratio: 4 +# tasks: +# - coco +# - coco-goi +# - coco-text +# - imagenet +# - coco-itm +# - iqa +# - mocheg +# - vsr +# - refcoco +# - science-qa +# - vqa-v2 +# - gqa +# - st-vqa +# - text-vqa +# - okvqa +# - a-okvqa +# + tt_vqa: + data_type: frames + sample_ratio: 2 + fps: 2.0 + conv_type: single + train_data_path: /mnt/bn/algo-masp-nas-2/xiangchen/dataset/masp/20240208_meta_data_single_135k_caption_160k_QA.json + + ShareGPT4V: + data_type: images + sample_ratio: 1 + + gpt4v_tt_vqa: + data_type: frames + fps: 0.5 + sample_ratio: 6 + conv_type: single + task_types: ['caption', 'qas'] + + + gpt4v_public: + data_type: frames + fps: 1.0 + sample_ratio: 6 + conv_type: single + task_types: ['summary', 'detail', 'qa_pairs'] + sample_method: sequential + train_data_path: /mnt/bn/algo-masp-nas-2/xiangchen/data/shared_gpt4v_data/data_130k.json + + gpt4v_internal: + data_type: frames + fps: 2.0 + sample_ratio: 1 + conv_type: single + task_types: ['summary', 'detail', 'qa_pairs'] diff --git 
a/app/llava/configs/adso_increasing_ablation/finetune_gpt4v_adso185k.yaml b/app/llava/configs/adso_increasing_ablation/finetune_gpt4v_adso185k.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bec3072faffdaa66474661125c2fdfaad3ee1a7c --- /dev/null +++ b/app/llava/configs/adso_increasing_ablation/finetune_gpt4v_adso185k.yaml @@ -0,0 +1,57 @@ +datasets: + +# m3it: +# data_type: images +# sample_ratio: 4 +# tasks: +# - coco +# - coco-goi +# - coco-text +# - imagenet +# - coco-itm +# - iqa +# - mocheg +# - vsr +# - refcoco +# - science-qa +# - vqa-v2 +# - gqa +# - st-vqa +# - text-vqa +# - okvqa +# - a-okvqa +# + tt_vqa: + data_type: frames + sample_ratio: 3 + fps: 2.0 + conv_type: single + train_data_path: /mnt/bn/algo-masp-nas-2/xiangchen/dataset/masp/20240220_meta_data_single_190k_caption_160k_QA.json + + ShareGPT4V: + data_type: images + sample_ratio: 1 + + gpt4v_tt_vqa: + data_type: frames + fps: 0.5 + sample_ratio: 6 + conv_type: single + task_types: ['caption', 'qas'] + + + gpt4v_public: + data_type: frames + fps: 1.0 + sample_ratio: 6 + conv_type: single + task_types: ['summary', 'detail', 'qa_pairs'] + sample_method: sequential + train_data_path: /mnt/bn/algo-masp-nas-2/xiangchen/data/shared_gpt4v_data/data_130k.json + + gpt4v_internal: + data_type: frames + fps: 2.0 + sample_ratio: 1 + conv_type: single + task_types: ['summary', 'detail', 'qa_pairs'] diff --git a/app/llava/configs/adso_increasing_ablation/finetune_gpt4v_adso185k_baseline.yaml b/app/llava/configs/adso_increasing_ablation/finetune_gpt4v_adso185k_baseline.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c95969abeb1862fd40eed4b8218fc152d8ac388e --- /dev/null +++ b/app/llava/configs/adso_increasing_ablation/finetune_gpt4v_adso185k_baseline.yaml @@ -0,0 +1,55 @@ +datasets: + +# m3it: +# data_type: images +# sample_ratio: 4 +# tasks: +# - coco +# - coco-goi +# - coco-text +# - imagenet +# - coco-itm +# - iqa +# - mocheg +# - vsr +# - refcoco +# - 
science-qa +# - vqa-v2 +# - gqa +# - st-vqa +# - text-vqa +# - okvqa +# - a-okvqa +# + tt_vqa: + data_type: frames + sample_ratio: 3 + fps: 2.0 + conv_type: single + train_data_path: /mnt/bn/algo-masp-nas-2/xiangchen/dataset/masp/20240220_meta_data_single_190k_caption_160k_QA.json + + ShareGPT4V: + data_type: images + sample_ratio: 1 + + gpt4v_tt_vqa: + data_type: frames + fps: 0.5 + sample_ratio: 6 + conv_type: single + task_types: ['caption', 'qas'] + + + lk_video: + data_type: frames + conv_type: multi + fps: 1.0 + sample_ratio: 6 + + gpt4v_internal: + data_type: frames + fps: 2.0 + sample_ratio: 1 + conv_type: single + task_types: ['detail'] + diff --git a/app/llava/configs/adso_increasing_ablation/finetune_gpt4v_adso185k_no_qa.yaml b/app/llava/configs/adso_increasing_ablation/finetune_gpt4v_adso185k_no_qa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b5bf237cd3165d506ed567df46c69dd2c0218981 --- /dev/null +++ b/app/llava/configs/adso_increasing_ablation/finetune_gpt4v_adso185k_no_qa.yaml @@ -0,0 +1,57 @@ +datasets: + +# m3it: +# data_type: images +# sample_ratio: 4 +# tasks: +# - coco +# - coco-goi +# - coco-text +# - imagenet +# - coco-itm +# - iqa +# - mocheg +# - vsr +# - refcoco +# - science-qa +# - vqa-v2 +# - gqa +# - st-vqa +# - text-vqa +# - okvqa +# - a-okvqa +# + tt_vqa: + data_type: frames + sample_ratio: 3 + fps: 2.0 + conv_type: single + train_data_path: /mnt/bn/algo-masp-nas-2/xiangchen/dataset/masp/20240220_meta_data_single_190k_caption_no_QA.json + + ShareGPT4V: + data_type: images + sample_ratio: 1 + + gpt4v_tt_vqa: + data_type: frames + fps: 0.5 + sample_ratio: 6 + conv_type: single + task_types: ['caption', 'qas'] + + + gpt4v_public: + data_type: frames + fps: 1.0 + sample_ratio: 6 + conv_type: single + task_types: ['summary', 'detail', 'qa_pairs'] + sample_method: sequential + train_data_path: /mnt/bn/algo-masp-nas-2/xiangchen/data/shared_gpt4v_data/data_130k.json + + gpt4v_internal: + data_type: frames + fps: 2.0 
+ sample_ratio: 1 + conv_type: single + task_types: ['summary', 'detail', 'qa_pairs'] diff --git a/app/llava/configs/adso_increasing_ablation/finetune_gpt4v_adso65k.yaml b/app/llava/configs/adso_increasing_ablation/finetune_gpt4v_adso65k.yaml new file mode 100644 index 0000000000000000000000000000000000000000..72a106082df9329e7622afd01da7440724e2ebda --- /dev/null +++ b/app/llava/configs/adso_increasing_ablation/finetune_gpt4v_adso65k.yaml @@ -0,0 +1,57 @@ +datasets: + +# m3it: +# data_type: images +# sample_ratio: 4 +# tasks: +# - coco +# - coco-goi +# - coco-text +# - imagenet +# - coco-itm +# - iqa +# - mocheg +# - vsr +# - refcoco +# - science-qa +# - vqa-v2 +# - gqa +# - st-vqa +# - text-vqa +# - okvqa +# - a-okvqa +# + tt_vqa: + data_type: frames + sample_ratio: 2 + fps: 2.0 + conv_type: single + train_data_path: /mnt/bn/algo-masp-nas-2/baiyi.by/data/ADSO_Anno_Data/batch_20231128/meta_data_single_60k_caption_170k_QA.json + + ShareGPT4V: + data_type: images + sample_ratio: 1 + + gpt4v_tt_vqa: + data_type: frames + fps: 0.5 + sample_ratio: 6 + conv_type: single + task_types: ['caption', 'qas'] + + + gpt4v_public: + data_type: frames + fps: 1.0 + sample_ratio: 6 + conv_type: single + task_types: ['summary', 'detail', 'qa_pairs'] + sample_method: sequential + train_data_path: /mnt/bn/algo-masp-nas-2/xiangchen/data/shared_gpt4v_data/data_130k.json + + gpt4v_internal: + data_type: frames + fps: 2.0 + sample_ratio: 1 + conv_type: single + task_types: ['summary', 'detail', 'qa_pairs'] diff --git a/app/llava/configs/finetune_debug.yaml b/app/llava/configs/finetune_debug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f56ae71ac216ccd4e270140eaf6d2a0d64dce507 --- /dev/null +++ b/app/llava/configs/finetune_debug.yaml @@ -0,0 +1,8 @@ +datasets: + gpt4v_public: + data_type: frames + fps: 1.0 + sample_ratio: 6 + conv_type: single + task_types: ['summary', 'detail', 'qa_pairs'] + train_data_path: 
/mnt/bn/algo-masp-nas-2/xiangchen/data/shared_gpt4v_data/data_130k.json diff --git a/app/llava/configs/finetune_gpt4v_adso65k.yaml b/app/llava/configs/finetune_gpt4v_adso65k.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4c4e875c9421f4ec2a93337b7c1029ef069d6135 --- /dev/null +++ b/app/llava/configs/finetune_gpt4v_adso65k.yaml @@ -0,0 +1,56 @@ +datasets: + +# m3it: +# data_type: images +# sample_ratio: 4 +# tasks: +# - coco +# - coco-goi +# - coco-text +# - imagenet +# - coco-itm +# - iqa +# - mocheg +# - vsr +# - refcoco +# - science-qa +# - vqa-v2 +# - gqa +# - st-vqa +# - text-vqa +# - okvqa +# - a-okvqa +# + tt_vqa: + data_type: frames + sample_ratio: 2 + fps: 2.0 + conv_type: single + train_data_path: /mnt/bn/algo-masp-nas-2/baiyi.by/data/ADSO_Anno_Data/batch_20231128/meta_data_single_60k_caption_170k_QA.json + + ShareGPT4V: + data_type: images + sample_ratio: 1 + + gpt4v_tt_vqa: + data_type: frames + fps: 0.5 + sample_ratio: 6 + conv_type: single + task_types: ['caption', 'qas'] + + + gpt4v_public: + data_type: frames + fps: 1.0 + sample_ratio: 6 + conv_type: single + task_types: ['summary', 'detail', 'qa_pairs'] + train_data_path: /mnt/bn/algo-masp-nas-2/xiangchen/data/shared_gpt4v_data/data_130k.json + + gpt4v_internal: + data_type: frames + fps: 2.0 + sample_ratio: 1 + conv_type: single + task_types: ['summary', 'detail', 'qa_pairs'] diff --git a/app/llava/configs/gpt4v_increasing_ablation/finetune_gpt4v_public500k.yaml b/app/llava/configs/gpt4v_increasing_ablation/finetune_gpt4v_public500k.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d48e01af16249c02535329a376a957f2de9864e6 --- /dev/null +++ b/app/llava/configs/gpt4v_increasing_ablation/finetune_gpt4v_public500k.yaml @@ -0,0 +1,57 @@ +datasets: + +# m3it: +# data_type: images +# sample_ratio: 4 +# tasks: +# - coco +# - coco-goi +# - coco-text +# - imagenet +# - coco-itm +# - iqa +# - mocheg +# - vsr +# - refcoco +# - science-qa +# - vqa-v2 +# - gqa +# 
- st-vqa +# - text-vqa +# - okvqa +# - a-okvqa +# + # tt_vqa: + # data_type: frames + # sample_ratio: 2 + # fps: 2.0 + # conv_type: single + # train_data_path: /mnt/bn/algo-masp-nas-2/baiyi.by/data/ADSO_Anno_Data/batch_20231128/meta_data_single_60k_caption_170k_QA.json + + ShareGPT4V: + data_type: images + sample_ratio: 1 + + + gpt4v_tt_vqa: + data_type: frames + fps: 0.5 + sample_ratio: 6 + conv_type: single + task_types: ['caption', 'qas'] + + + gpt4v_public: + data_type: frames + fps: 1.0 + sample_ratio: 10 + conv_type: single + task_types: ['summary', 'detail'] + train_data_path: /mnt/bn/algo-masp-nas-2/xiangchen/data/shared_gpt4v_data/data_500k_filtered.json + + gpt4v_internal: + data_type: frames + fps: 2.0 + sample_ratio: 1 + conv_type: single + task_types: ['summary', 'detail', 'qa_pairs'] diff --git a/app/llava/configs/gpt4v_increasing_ablation/finetune_gpt4v_public500k_no_summary.yaml b/app/llava/configs/gpt4v_increasing_ablation/finetune_gpt4v_public500k_no_summary.yaml new file mode 100644 index 0000000000000000000000000000000000000000..aa0acd8fed6d295e15f579d92c3963fa911e588f --- /dev/null +++ b/app/llava/configs/gpt4v_increasing_ablation/finetune_gpt4v_public500k_no_summary.yaml @@ -0,0 +1,57 @@ +datasets: + +# m3it: +# data_type: images +# sample_ratio: 4 +# tasks: +# - coco +# - coco-goi +# - coco-text +# - imagenet +# - coco-itm +# - iqa +# - mocheg +# - vsr +# - refcoco +# - science-qa +# - vqa-v2 +# - gqa +# - st-vqa +# - text-vqa +# - okvqa +# - a-okvqa +# + # tt_vqa: + # data_type: frames + # sample_ratio: 2 + # fps: 2.0 + # conv_type: single + # train_data_path: /mnt/bn/algo-masp-nas-2/baiyi.by/data/ADSO_Anno_Data/batch_20231128/meta_data_single_60k_caption_170k_QA.json + + ShareGPT4V: + data_type: images + sample_ratio: 1 + + + gpt4v_tt_vqa: + data_type: frames + fps: 0.5 + sample_ratio: 6 + conv_type: single + task_types: ['caption', 'qas'] + + + gpt4v_public: + data_type: frames + fps: 1.0 + sample_ratio: 4 + conv_type: single + task_types: 
['detail'] + train_data_path: /mnt/bn/algo-masp-nas-2/xiangchen/data/shared_gpt4v_data/data_500k_filtered.json + + gpt4v_internal: + data_type: frames + fps: 2.0 + sample_ratio: 1 + conv_type: single + task_types: ['summary', 'detail', 'qa_pairs'] diff --git a/app/llava/configs/gpt4v_increasing_ablation/finetune_gpt4v_public800k.yaml b/app/llava/configs/gpt4v_increasing_ablation/finetune_gpt4v_public800k.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e7f240c56ee73fccbde5d10d0153eeacecdc46f2 --- /dev/null +++ b/app/llava/configs/gpt4v_increasing_ablation/finetune_gpt4v_public800k.yaml @@ -0,0 +1,62 @@ +datasets: + +# m3it: +# data_type: images +# sample_ratio: 4 +# tasks: +# - coco +# - coco-goi +# - coco-text +# - imagenet +# - coco-itm +# - iqa +# - mocheg +# - vsr +# - refcoco +# - science-qa +# - vqa-v2 +# - gqa +# - st-vqa +# - text-vqa +# - okvqa +# - a-okvqa +# + # tt_vqa: + # data_type: frames + # sample_ratio: 2 + # fps: 2.0 + # conv_type: single + # train_data_path: /mnt/bn/algo-masp-nas-2/baiyi.by/data/ADSO_Anno_Data/batch_20231128/meta_data_single_60k_caption_170k_QA.json + + ShareGPT4V: + data_type: images + sample_ratio: 1 + + + gpt4v_tt_vqa: + data_type: frames + fps: 0.5 + sample_ratio: 6 + conv_type: single + task_types: ['caption', 'qas'] + + # gpt4v_public: + # data_type: frames + # fps: 1.0 + # sample_ratio: 10 + # conv_type: single + # task_types: ['summary', 'detail'] + # train_data_path: /mnt/bn/algo-masp-nas-2/xiangchen/data/shared_gpt4v_data/data_500k_filtered.json + + lk_video: + data_type: frames + conv_type: multi + fps: 1.0 + sample_ratio: 6 + + gpt4v_internal: + data_type: frames + fps: 2.0 + sample_ratio: 1 + conv_type: single + task_types: ['summary', 'detail', 'qa_pairs'] diff --git a/app/llava/configs/gpt4v_increasing_ablation/finetune_videollava.yaml b/app/llava/configs/gpt4v_increasing_ablation/finetune_videollava.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..c009b246e6b05873f443338ebf6273fbd3e4406f --- /dev/null +++ b/app/llava/configs/gpt4v_increasing_ablation/finetune_videollava.yaml @@ -0,0 +1,20 @@ +datasets: + + # llava_pretrain: + # data_type: image + # sample_ratio: 1 + + # gpt4v_public: + # data_type: frames + # sample_ratio: 2 + # task_types: ['summary'] + # fps: 1.0 + # conv_type: single + + lk_image: + data_type: image + + lk_video: + data_type: frames + conv_type: multi + fps: 1.0 \ No newline at end of file diff --git a/app/llava/configs/pretrain_data.yaml b/app/llava/configs/pretrain_data.yaml new file mode 100644 index 0000000000000000000000000000000000000000..94b4720e54826cc0e3bd41563f82d1131207cde0 --- /dev/null +++ b/app/llava/configs/pretrain_data.yaml @@ -0,0 +1,17 @@ +datasets: + + llava_pretrain: + data_type: image + sample_ratio: 1 + + # internvid: + # data_type: frames + # sample_ratio: 10 + + gpt4v_public: + data_type: frames + sample_ratio: 1 + task_types: ['summary'] + fps: 1.0 + conv_type: single + train_data_path: /mnt/bn/algo-masp-nas-2/xiangchen/data/shared_gpt4v_data/data_500k_filtered.json diff --git a/app/llava/configs/pretrain_data_large.yaml b/app/llava/configs/pretrain_data_large.yaml new file mode 100644 index 0000000000000000000000000000000000000000..91dbd632e5d6e188e9735fdb4544e3ffde728293 --- /dev/null +++ b/app/llava/configs/pretrain_data_large.yaml @@ -0,0 +1,17 @@ +datasets: + + llava_pretrain: + data_type: image + sample_ratio: 1 + + internvid: + data_type: frames + sample_ratio: 10 + + gpt4v_public: + data_type: frames + sample_ratio: 1 + task_types: ['summary'] + fps: 1.0 + conv_type: single + train_data_path: /mnt/bn/algo-masp-nas-2/xiangchen/data/shared_gpt4v_data/data_500k_filtered.json diff --git a/app/llava/configs/pretrain_debug.yaml b/app/llava/configs/pretrain_debug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..aba6e3147e88180cbfac45cf2a762eddeb7fad74 --- /dev/null +++ 
b/app/llava/configs/pretrain_debug.yaml @@ -0,0 +1,27 @@ +datasets: + + llava_pretrain: + data_type: image + sample_ratio: 1 + + # gpt4v_public: + # data_type: frames + # sample_ratio: 2 + # task_types: ['summary'] + # fps: 1.0 + # conv_type: single + + # lk_image: + # data_type: image + + # lk_video: + # data_type: frames + # conv_type: multi + # fps: 1.0 + + gpt4v_internal: + data_type: frames + fps: 2.0 + sample_ratio: 1 + conv_type: multi + task_types: ['qa_pairs'] diff --git a/app/llava/configs/promptv1_2_increasing_ablation/finetune_gpt4_prompt_140k.yaml b/app/llava/configs/promptv1_2_increasing_ablation/finetune_gpt4_prompt_140k.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f6be76c78831f080266a9ac7999a56393709ee6e --- /dev/null +++ b/app/llava/configs/promptv1_2_increasing_ablation/finetune_gpt4_prompt_140k.yaml @@ -0,0 +1,35 @@ +datasets: + +# m3it: +# data_type: images +# sample_ratio: 4 +# tasks: +# - coco +# - coco-goi +# - coco-text +# - imagenet +# - coco-itm +# - iqa +# - mocheg +# - vsr +# - refcoco +# - science-qa +# - vqa-v2 +# - gqa +# - st-vqa +# - text-vqa +# - okvqa +# - a-okvqa +# + # gpt4v_internal: + # data_type: frames + # fps: 2.0 + # sample_ratio: 1 + # conv_type: single + # task_types: ['summary', 'detail', 'qa_pairs'] + + promptv1_2_internal: + data_type: frames + sample_ratio: 1 + train_data_path: /mnt/bn/algo-masp-nas-2/kaili.zhao/data/masp_data/train/gpt4v_annotation/202400401week_gpt4v_all_videos_unique_ids.json + task_types: ['refine_caption'] \ No newline at end of file diff --git a/app/llava/configs/release_version/finetune_250k_no_public.yaml b/app/llava/configs/release_version/finetune_250k_no_public.yaml new file mode 100644 index 0000000000000000000000000000000000000000..57d99cdd4bcd1bd2fca229ade38785358a5fd9ad --- /dev/null +++ b/app/llava/configs/release_version/finetune_250k_no_public.yaml @@ -0,0 +1,50 @@ +datasets: + +# m3it: +# data_type: images +# sample_ratio: 4 +# tasks: +# - coco +# - 
coco-goi +# - coco-text +# - imagenet +# - coco-itm +# - iqa +# - mocheg +# - vsr +# - refcoco +# - science-qa +# - vqa-v2 +# - gqa +# - st-vqa +# - text-vqa +# - okvqa +# - a-okvqa +# + tt_vqa: + data_type: frames + sample_ratio: 3 + fps: 2.0 + conv_type: single + train_data_path: /mnt/bn/algo-masp-nas-2/xiangchen/dataset/masp/20231201_20240322_caption_250k.json + + + ShareGPT4V: + data_type: images + sample_ratio: 1 + + + gpt4v_tt_vqa: + data_type: frames + fps: 0.5 + sample_ratio: 6 + conv_type: single + task_types: ['caption'] + + + gpt4v_internal: + data_type: frames + fps: 2.0 + sample_ratio: 1 + conv_type: single + task_types: ['detail'] diff --git a/app/llava/configs/release_version/finetune_all_data.yaml b/app/llava/configs/release_version/finetune_all_data.yaml new file mode 100644 index 0000000000000000000000000000000000000000..134f1a3744075449a11c61d48feae5bbb88f6ddb --- /dev/null +++ b/app/llava/configs/release_version/finetune_all_data.yaml @@ -0,0 +1,63 @@ +datasets: + +# m3it: +# data_type: images +# sample_ratio: 4 +# tasks: +# - coco +# - coco-goi +# - coco-text +# - imagenet +# - coco-itm +# - iqa +# - mocheg +# - vsr +# - refcoco +# - science-qa +# - vqa-v2 +# - gqa +# - st-vqa +# - text-vqa +# - okvqa +# - a-okvqa +# + tt_vqa: + data_type: frames + sample_ratio: 3 + fps: 2.0 + conv_type: single + train_data_path: /mnt/bn/algo-masp-nas-2/xiangchen/dataset/masp/20231201_20240322_caption_250k.json + + + ShareGPT4V: + data_type: images + sample_ratio: 1 + + + gpt4v_tt_vqa: + data_type: frames + fps: 0.5 + sample_ratio: 6 + conv_type: single + task_types: ['caption'] + + # gpt4v_public: + # data_type: frames + # fps: 1.0 + # sample_ratio: 10 + # conv_type: single + # task_types: ['summary', 'detail'] + # train_data_path: /mnt/bn/algo-masp-nas-2/xiangchen/data/shared_gpt4v_data/data_500k_filtered.json + + lk_video: + data_type: frames + conv_type: multi + fps: 1.0 + sample_ratio: 6 + + gpt4v_internal: + data_type: frames + fps: 2.0 + sample_ratio: 1 
+ conv_type: single + task_types: ['detail'] diff --git a/app/llava/configs/release_version/finetune_gpt4v_caption.yaml b/app/llava/configs/release_version/finetune_gpt4v_caption.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e96e51f30d5d47fee4a575fbdf750b1d8bfa2a7a --- /dev/null +++ b/app/llava/configs/release_version/finetune_gpt4v_caption.yaml @@ -0,0 +1,62 @@ +datasets: + +# m3it: +# data_type: images +# sample_ratio: 4 +# tasks: +# - coco +# - coco-goi +# - coco-text +# - imagenet +# - coco-itm +# - iqa +# - mocheg +# - vsr +# - refcoco +# - science-qa +# - vqa-v2 +# - gqa +# - st-vqa +# - text-vqa +# - okvqa +# - a-okvqa +# + # tt_vqa: + # data_type: frames + # sample_ratio: 2 + # fps: 2.0 + # conv_type: single + # train_data_path: /mnt/bn/algo-masp-nas-2/baiyi.by/data/ADSO_Anno_Data/batch_20231128/meta_data_single_60k_caption_170k_QA.json + + ShareGPT4V: + data_type: images + sample_ratio: 1 + + + gpt4v_tt_vqa: + data_type: frames + fps: 0.5 + sample_ratio: 6 + conv_type: single + task_types: ['caption'] + + # gpt4v_public: + # data_type: frames + # fps: 1.0 + # sample_ratio: 10 + # conv_type: single + # task_types: ['summary', 'detail'] + # train_data_path: /mnt/bn/algo-masp-nas-2/xiangchen/data/shared_gpt4v_data/data_500k_filtered.json + + lk_video: + data_type: frames + conv_type: multi + fps: 1.0 + sample_ratio: 6 + + gpt4v_internal: + data_type: frames + fps: 2.0 + sample_ratio: 1 + conv_type: single + task_types: ['detail'] diff --git a/app/llava/configs/release_version/finetune_gpt4v_caption_ocr.yaml b/app/llava/configs/release_version/finetune_gpt4v_caption_ocr.yaml new file mode 100644 index 0000000000000000000000000000000000000000..33c38f2a5105f0e28f52cf85dcb9101babf6349c --- /dev/null +++ b/app/llava/configs/release_version/finetune_gpt4v_caption_ocr.yaml @@ -0,0 +1,67 @@ +datasets: + +# m3it: +# data_type: images +# sample_ratio: 4 +# tasks: +# - coco +# - coco-goi +# - coco-text +# - imagenet +# - coco-itm +# - iqa +# 
- mocheg +# - vsr +# - refcoco +# - science-qa +# - vqa-v2 +# - gqa +# - st-vqa +# - text-vqa +# - okvqa +# - a-okvqa +# + # tt_vqa: + # data_type: frames + # sample_ratio: 2 + # fps: 2.0 + # conv_type: single + # train_data_path: /mnt/bn/algo-masp-nas-2/baiyi.by/data/ADSO_Anno_Data/batch_20231128/meta_data_single_60k_caption_170k_QA.json + + ShareGPT4V: + data_type: images + sample_ratio: 1 + + + gpt4v_tt_vqa: + data_type: frames + fps: 0.5 + sample_ratio: 6 + conv_type: single + task_types: ['caption'] + + # gpt4v_public: + # data_type: frames + # fps: 1.0 + # sample_ratio: 10 + # conv_type: single + # task_types: ['summary', 'detail'] + # train_data_path: /mnt/bn/algo-masp-nas-2/xiangchen/data/shared_gpt4v_data/data_500k_filtered.json + + lk_video: + data_type: frames + conv_type: multi + fps: 1.0 + sample_ratio: 6 + + gpt4v_internal: + data_type: frames + fps: 2.0 + sample_ratio: 1 + conv_type: single + task_types: ['detail'] + + synthetic_ocr: + data_type: video + sample_ratio: 1 + fps: 0.5 \ No newline at end of file diff --git a/app/llava/constants.py b/app/llava/constants.py new file mode 100644 index 0000000000000000000000000000000000000000..6049c4b6396020201c16233afa2717aa2a41f9ac --- /dev/null +++ b/app/llava/constants.py @@ -0,0 +1,17 @@ +CONTROLLER_HEART_BEAT_EXPIRATION = 30 +WORKER_HEART_BEAT_INTERVAL = 15 + +LOGDIR = "." + +# Model Constants +IGNORE_INDEX = -100 +MM_TOKEN_INDEX = -200 +DEFAULT_IMAGE_TOKEN = "<image>" +DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>" +DEFAULT_IM_START_TOKEN = "<im_start>" +DEFAULT_IM_END_TOKEN = "<im_end>" +IMAGE_PLACEHOLDER = "<image-placeholder>" +DEFAULT_VIDEO_TOKEN = "