Jinghao-Guo committed on Jan 4

Commit

3f039a6

verified ·

1 Parent(s): 490b0a1

Upload folder using huggingface_hub

Browse files

Files changed (27) hide show

.gitattributes +1 -0
added_tokens.json +24 -0
chat_template.jinja +7 -0
config.json +97 -0
configuration_llavaonevision1_5.py +288 -0
generation_config.json +4 -0
model-00001-of-00002.safetensors +3 -0
model-00002-of-00002.safetensors +3 -0
model.safetensors.index.json +706 -0
modeling_llavaonevision1_5.py +0 -0
optimizer.pt +3 -0
preprocessor_config.json +29 -0
rng_state_0.pth +3 -0
rng_state_1.pth +3 -0
rng_state_2.pth +3 -0
rng_state_3.pth +3 -0
rng_state_4.pth +3 -0
rng_state_5.pth +3 -0
rng_state_6.pth +3 -0
rng_state_7.pth +3 -0
scheduler.pt +3 -0
special_tokens_map.json +31 -0
tokenizer.json +3 -0
tokenizer_config.json +208 -0
trainer_state.json +2484 -0
training_args.bin +3 -0
vocab.json +0 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text

added_tokens.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

chat_template.jinja ADDED Viewed

	@@ -0,0 +1,7 @@

+{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system
+You are a helpful assistant.<|im_end|>
+{% endif %}<|im_start|>{{ message['role'] }}
+{% if message['content'] is string %}{{ message['content'] }}<|im_end|>
+{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>
+{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant
+{% endif %}

config.json ADDED Viewed

	@@ -0,0 +1,97 @@

+{
+  "architectures": [
+    "LLaVAOneVision1_5_ForConditionalGeneration"
+  ],
+  "auto_map": {
+    "AutoConfig": "configuration_llavaonevision1_5.Llavaonevision1_5Config",
+    "AutoModel": "modeling_llavaonevision1_5.LLaVAOneVision1_5_ForConditionalGeneration",
+    "AutoModelForCausalLM": "modeling_llavaonevision1_5.LLaVAOneVision1_5_ForConditionalGeneration"
+  },
+  "dtype": "bfloat16",
+  "image_token_id": 151655,
+  "model_type": "llavaonevision1_5",
+  "text_config": {
+    "attention_bias": false,
+    "attention_dropout": 0.0,
+    "dtype": "bfloat16",
+    "head_dim": 128,
+    "hidden_act": "silu",
+    "hidden_size": 2560,
+    "image_token_id": null,
+    "initializer_range": 0.02,
+    "intermediate_size": 9728,
+    "layer_types": [
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention"
+    ],
+    "max_position_embeddings": 262144,
+    "max_window_layers": 36,
+    "model_type": "LLaVAOneVision1_5_text",
+    "num_attention_heads": 32,
+    "num_hidden_layers": 36,
+    "num_key_value_heads": 8,
+    "rms_norm_eps": 1e-06,
+    "rope_scaling": null,
+    "rope_theta": 5000000.0,
+    "sliding_window": null,
+    "use_cache": true,
+    "use_sliding_window": false,
+    "video_token_id": null,
+    "vocab_size": 151936
+  },
+  "transformers_version": "4.57.3",
+  "use_cache": false,
+  "video_token_id": 151656,
+  "vision_config": {
+    "depth": 24,
+    "dtype": "bfloat16",
+    "embed_dim": 1024,
+    "hidden_act": "gelu",
+    "hidden_size": 1024,
+    "in_channels": 3,
+    "initializer_range": 0.02,
+    "intermediate_size": 4096,
+    "layer_norm_eps": 1e-05,
+    "model_type": "rice_vit",
+    "num_heads": 16,
+    "patch_size": 14,
+    "spatial_merge_size": 2,
+    "temporal_patch_size": 1,
+    "text_hidden_size": 2560
+  },
+  "vocab_size": 152064
+}

configuration_llavaonevision1_5.py ADDED Viewed

	@@ -0,0 +1,288 @@

+# coding=utf-8
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from transformers.configuration_utils import PretrainedConfig, layer_type_validation
+from transformers.modeling_rope_utils import rope_config_validation
+from transformers.utils import logging
+logger = logging.get_logger(__name__)
+class RiceConfig(PretrainedConfig):
+    model_type = "rice_vit"
+    base_config_key = "vision_config"
+    def __init__(
+        self,
+        depth=24,
+        embed_dim=1024,
+        hidden_size=1024,
+        hidden_act="gelu",
+        intermediate_size=4096,
+        num_heads=16,
+        in_channels=3,
+        patch_size=14,
+        spatial_merge_size=2,
+        temporal_patch_size=1,
+        initializer_range=0.02,
+        layer_norm_eps=1e-05,
+        text_hidden_size=2560,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.depth = depth
+        self.embed_dim = embed_dim
+        self.hidden_size = hidden_size
+        self.hidden_act = hidden_act
+        self.intermediate_size = intermediate_size
+        self.num_heads = num_heads
+        self.in_channels = in_channels
+        self.patch_size = patch_size
+        self.spatial_merge_size = spatial_merge_size
+        self.temporal_patch_size = temporal_patch_size
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.text_hidden_size = text_hidden_size
+class LLaVAOneVision1_5_TextConfig(PretrainedConfig):
+    r"""
+    Args:
+        vocab_size (`int`, *optional*, defaults to 151936):
+            Vocabulary size of the text model. Defines the number of different tokens that can be represented by the
+            `input_ids` passed when calling the model.
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 12288):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 36):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_key_value_heads (`int`, *optional*, defaults to 8):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details checkout [this
+            paper](https://arxiv.org/pdf/2305.13245.pdf). If `None` is passed, falls back to `num_attention_heads`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 32768):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether the model's input and output word embeddings should be tied.
+        rope_theta (`float`, *optional*, defaults to 1000000.0):
+            The base period of the RoPE embeddings.
+        use_sliding_window (`bool`, *optional*, defaults to `False`):
+            Whether to use sliding window attention.
+        sliding_window (`int`, *optional*):
+            Sliding window attention (SWA) window size. If `None` (the default) and `layer_types` is not given, every layer uses full attention.
+        max_window_layers (`int`, *optional*, defaults to 36):
+            Used only when `layer_types` is not given and `sliding_window` is set: layers with index `>= max_window_layers` use SWA (Sliding Window Attention), while the earlier layers use full attention.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
+            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
+            accordingly.
+            Expected contents:
+                `rope_type` (`str`):
+                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+                    'llama3'], with 'default' being the original RoPE implementation.
+                `factor` (`float`, *optional*):
+                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+                    original maximum pre-trained length.
+                `original_max_position_embeddings` (`int`, *optional*):
+                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
+                    pretraining.
+                `attention_factor` (`float`, *optional*):
+                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+                    computation. If unspecified, it defaults to value recommended by the implementation, using the
+                    `factor` field to infer the suggested value.
+                `beta_fast` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 32.
+                `beta_slow` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 1.
+                `short_factor` (`List[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `long_factor` (`List[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to long contexts (>
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `low_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
+                `high_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
+        image_token_id (`int`, *optional*):
+            Token index used as placeholder for image embeddings.
+        video_token_id (`int`, *optional*):
+            Token index used as placeholder for video embeddings.
+    """
+    model_type = "LLaVAOneVision1_5_text"
+    base_config_key = "text_config"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    # Default tensor parallel plan for base model `Qwen2VL`
+    base_model_tp_plan = {
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.mlp.gate_proj": "colwise",
+        "layers.*.mlp.up_proj": "colwise",
+        "layers.*.mlp.down_proj": "rowwise",
+    }
+    base_model_pp_plan = {
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
+    def __init__(
+        self,
+        vocab_size=151936,
+        hidden_size=4096,
+        intermediate_size=12288,
+        num_hidden_layers=36,
+        num_attention_heads=32,
+        num_key_value_heads=8,
+        head_dim=128,
+        hidden_act="silu",
+        max_position_embeddings=32768,
+        initializer_range=0.02,
+        rms_norm_eps=1e-06,
+        use_cache=True,
+        tie_word_embeddings=False,
+        rope_theta=1000000.0,
+        attention_bias=False,
+        use_sliding_window=False,
+        sliding_window=None,
+        max_window_layers=36,
+        attention_dropout=0.0,
+        rope_scaling=None,
+        layer_types=None,
+        image_token_id=None,
+        video_token_id=None,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.use_sliding_window = use_sliding_window
+        self.sliding_window = sliding_window
+        self.max_window_layers = max_window_layers
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.head_dim = head_dim
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.attention_dropout = attention_dropout
+        self.rope_scaling = rope_scaling
+        self.attention_bias = attention_bias
+        self.tie_word_embeddings = tie_word_embeddings
+        # Validate the correctness of rotary position embeddings parameters
+        # BC: if there is a 'type' field, move it to 'rope_type'.
+        # and change type from 'mrope' to 'default' because `mrope` does default RoPE calculations
+        # one can set it to "linear"/"dynamic" etc. to have scaled RoPE
+        # TODO: @raushan update config in the hub
+        if self.rope_scaling is not None and "type" in self.rope_scaling:
+            if self.rope_scaling["type"] == "mrope":
+                self.rope_scaling["type"] = "default"
+            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
+        rope_config_validation(self, ignore_keys={"mrope_section"})
+        self.image_token_id = image_token_id
+        self.video_token_id = video_token_id
+        self.layer_types = layer_types
+        if self.layer_types is None:
+            self.layer_types = [
+                "sliding_attention"
+                if self.sliding_window is not None and i >= self.max_window_layers
+                else "full_attention"
+                for i in range(self.num_hidden_layers)
+            ]
+        layer_type_validation(self.layer_types)
+        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
+class Llavaonevision1_5Config(PretrainedConfig):
+    r"""
+    Args:
+        text_config (`Union[PretrainedConfig, dict]`, *optional*, defaults to `LLaVAOneVision1_5_TextConfig`):
+            The config object or dictionary of the text backbone.
+        vision_config (`Union[PretrainedConfig, dict]`,  *optional*, defaults to `RiceConfig`):
+            The config object or dictionary of the vision backbone.
+        image_token_id (`int`, *optional*, defaults to 151655):
+            The image token index to encode the image prompt.
+        video_token_id (`int`, *optional*, defaults to 151656):
+            The video token index to encode the video prompt.
+    """
+    model_type = "llavaonevision1_5"
+    sub_configs = {"vision_config": RiceConfig, "text_config": LLaVAOneVision1_5_TextConfig}
+    keys_to_ignore_at_inference = ["past_key_values"]
+    def __init__(
+        self,
+        text_config=None,
+        vision_config=None,
+        image_token_id=151655,
+        video_token_id=151656,
+        vocab_size=152064,
+        **kwargs,
+    ):
+        if isinstance(vision_config, dict):
+            self.vision_config = self.sub_configs["vision_config"](**vision_config)
+        elif vision_config is None:
+            self.vision_config = self.sub_configs["vision_config"]()
+        if isinstance(text_config, dict):
+            self.text_config = self.sub_configs["text_config"](**text_config)
+        elif text_config is None:
+            # For BC use all kwargs to init `TextConfig`
+            self.text_config = self.sub_configs["text_config"](**kwargs)
+        self.image_token_id = image_token_id
+        self.video_token_id = video_token_id
+        self.vocab_size = vocab_size
+        super().__init__(**kwargs)
+__all__ = ["Llavaonevision1_5Config", "LLaVAOneVision1_5_TextConfig"]

generation_config.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+  "_from_model_config": true,
+  "transformers_version": "4.57.3"
+}

model-00001-of-00002.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:57eed509f4a2e5df39b04d27e58df0a5cfc31bd5f37e543ef312643cf8b57abd
+size 4972223088

model-00002-of-00002.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f2b04081166b3dc41e824818f8ff3b82399f35a76759a6ac9b3cff8231ef4d26
+size 4511075336

model.safetensors.index.json ADDED Viewed

	@@ -0,0 +1,706 @@

+{
+  "metadata": {
+    "total_parameters": 4741610528,
+    "total_size": 9483221056
+  },
+  "weight_map": {
+    "lm_head.weight": "model-00002-of-00002.safetensors",
+    "model.embed_tokens.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.17.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.17.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.18.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.18.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.18.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.18.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.19.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.19.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.19.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.19.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.19.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.20.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.20.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.20.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.21.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.21.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.21.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.22.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.22.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.23.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.23.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.24.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.24.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.25.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.25.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.26.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.26.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.27.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.27.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.28.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.28.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.28.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.28.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.28.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.28.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.28.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.29.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.29.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.29.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.29.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.29.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.29.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.29.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.30.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.30.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.30.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.30.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.30.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.30.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.30.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.32.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.32.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.32.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.32.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.32.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.32.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.32.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.32.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.32.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.32.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.32.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.33.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.33.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.33.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.33.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.33.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.33.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.33.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.33.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.33.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.33.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.33.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.34.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.34.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.34.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.34.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.34.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.34.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.34.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.34.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.34.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.34.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.34.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.35.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.35.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.35.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.35.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.35.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.35.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.35.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.35.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.35.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.35.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.35.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.norm.weight": "model-00002-of-00002.safetensors",
+    "visual.blocks.0.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.0.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.0.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.0.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.0.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.0.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.0.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.0.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.0.norm1.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.0.norm1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.0.norm2.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.0.norm2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.1.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.1.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.1.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.1.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.1.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.1.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.1.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.1.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.1.norm1.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.1.norm1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.1.norm2.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.1.norm2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.10.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.10.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.10.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.10.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.10.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.10.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.10.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.10.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.10.norm1.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.10.norm1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.10.norm2.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.10.norm2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.11.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.11.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.11.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.11.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.11.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.11.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.11.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.11.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.11.norm1.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.11.norm1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.11.norm2.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.11.norm2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.12.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.12.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.12.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.12.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.12.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.12.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.12.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.12.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.12.norm1.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.12.norm1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.12.norm2.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.12.norm2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.13.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.13.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.13.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.13.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.13.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.13.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.13.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.13.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.13.norm1.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.13.norm1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.13.norm2.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.13.norm2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.14.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.14.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.14.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.14.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.14.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.14.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.14.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.14.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.14.norm1.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.14.norm1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.14.norm2.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.14.norm2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.15.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.15.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.15.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.15.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.15.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.15.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.15.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.15.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.15.norm1.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.15.norm1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.15.norm2.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.15.norm2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.16.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.16.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.16.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.16.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.16.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.16.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.16.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.16.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.16.norm1.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.16.norm1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.16.norm2.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.16.norm2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.17.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.17.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.17.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.17.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.17.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.17.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.17.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.17.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.17.norm1.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.17.norm1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.17.norm2.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.17.norm2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.18.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.18.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.18.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.18.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.18.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.18.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.18.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.18.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.18.norm1.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.18.norm1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.18.norm2.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.18.norm2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.19.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.19.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.19.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.19.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.19.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.19.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.19.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.19.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.19.norm1.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.19.norm1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.19.norm2.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.19.norm2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.2.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.2.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.2.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.2.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.2.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.2.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.2.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.2.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.2.norm1.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.2.norm1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.2.norm2.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.2.norm2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.20.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.20.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.20.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.20.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.20.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.20.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.20.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.20.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.20.norm1.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.20.norm1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.20.norm2.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.20.norm2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.21.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.21.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.21.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.21.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.21.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.21.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.21.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.21.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.21.norm1.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.21.norm1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.21.norm2.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.21.norm2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.22.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.22.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.22.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.22.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.22.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.22.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.22.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.22.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.22.norm1.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.22.norm1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.22.norm2.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.22.norm2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.23.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.23.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.23.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.23.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.23.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.23.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.23.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.23.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.23.norm1.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.23.norm1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.23.norm2.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.23.norm2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.3.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.3.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.3.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.3.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.3.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.3.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.3.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.3.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.3.norm1.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.3.norm1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.3.norm2.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.3.norm2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.4.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.4.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.4.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.4.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.4.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.4.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.4.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.4.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.4.norm1.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.4.norm1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.4.norm2.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.4.norm2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.5.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.5.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.5.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.5.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.5.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.5.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.5.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.5.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.5.norm1.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.5.norm1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.5.norm2.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.5.norm2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.6.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.6.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.6.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.6.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.6.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.6.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.6.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.6.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.6.norm1.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.6.norm1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.6.norm2.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.6.norm2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.7.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.7.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.7.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.7.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.7.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.7.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.7.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.7.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.7.norm1.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.7.norm1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.7.norm2.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.7.norm2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.8.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.8.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.8.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.8.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.8.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.8.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.8.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.8.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.8.norm1.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.8.norm1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.8.norm2.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.8.norm2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.9.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.9.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.9.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.9.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.9.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.9.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.9.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.9.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.9.norm1.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.9.norm1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.9.norm2.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.9.norm2.weight": "model-00001-of-00002.safetensors",
+    "visual.class_embedding": "model-00001-of-00002.safetensors",
+    "visual.class_pos_emb": "model-00001-of-00002.safetensors",
+    "visual.merger.ln_q.bias": "model-00001-of-00002.safetensors",
+    "visual.merger.ln_q.weight": "model-00001-of-00002.safetensors",
+    "visual.merger.mlp.0.bias": "model-00001-of-00002.safetensors",
+    "visual.merger.mlp.0.weight": "model-00001-of-00002.safetensors",
+    "visual.merger.mlp.2.bias": "model-00001-of-00002.safetensors",
+    "visual.merger.mlp.2.weight": "model-00001-of-00002.safetensors",
+    "visual.patch_embed.proj.weight": "model-00001-of-00002.safetensors",
+    "visual.pre_layernorm.bias": "model-00001-of-00002.safetensors",
+    "visual.pre_layernorm.weight": "model-00001-of-00002.safetensors"
+  }
+}

modeling_llavaonevision1_5.py ADDED Viewed

The diff for this file is too large to render. See raw diff

optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5558e17fc70940e21f6069073d3bd7a196f2c5caac6ab7df329195a2699bec6c
+size 18967059777

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "do_convert_rgb": true,
+  "do_normalize": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "image_mean": [
+    0.48145466,
+    0.4578275,
+    0.40821073
+  ],
+  "image_processor_type": "Qwen2VLImageProcessor",
+  "image_std": [
+    0.26862954,
+    0.26130258,
+    0.27577711
+  ],
+  "max_pixels": 2560000,
+  "merge_size": 2,
+  "min_pixels": 3136,
+  "patch_size": 14,
+  "processor_class": "Qwen2_5_VLProcessor",
+  "resample": 3,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "longest_edge": 12845056,
+    "shortest_edge": 3136
+  },
+  "temporal_patch_size": 1
+}

rng_state_0.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f8f92cf63e0989759370d24108b469c492c12202403f036015307ce49f12cedc
+size 16389

rng_state_1.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9ed40a0a4e9f365d2c6cc004d97e6705894eba46c8be4c160c1455bc3062dee1
+size 16389

rng_state_2.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d688b304d19c260b5cfa471535ed51d7e1d60b3a0d0159dfd1a04b87904a9f42
+size 16389

rng_state_3.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9967425ebcaee80d9b518fa0244d52f739b1b983d87cda71d5fede0c073e9d3b
+size 16389

rng_state_4.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:469900fd39c667ffbd49c3c407c0ba317a1e9f5f9339a99b5d38423b7d0ce6d4
+size 16389

rng_state_5.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:124688471ff2a6e80f2fcefedbf741fb18d08dd539d5bd07a52e81be545142a5
+size 16389

rng_state_6.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1e69f1ced9f992a72c948698e5eb06088610788988cdb2fdbdd624e064319d60
+size 16389

rng_state_7.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a376268a55d6ee10c371c06aa952334c4c6a1af9ea2d71b1951a57367a0c6722
+size 16389

scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4b4205f29dfdbd4d55145ab073922abdb3f07ab48587909c01e834833ff66cd3
+size 1465

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
+size 11421896

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,208 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "processor_class": "Qwen2_5_VLProcessor",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,2484 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 3500,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.002857142857142857,
+      "grad_norm": 8.25,
+      "learning_rate": 9.999991910858043e-06,
+      "loss": 2.0631,
+      "step": 10
+    },
+    {
+      "epoch": 0.005714285714285714,
+      "grad_norm": 2.84375,
+      "learning_rate": 9.999708793637737e-06,
+      "loss": 1.5602,
+      "step": 20
+    },
+    {
+      "epoch": 0.008571428571428572,
+      "grad_norm": 1.890625,
+      "learning_rate": 9.999021245492833e-06,
+      "loss": 1.4412,
+      "step": 30
+    },
+    {
+      "epoch": 0.011428571428571429,
+      "grad_norm": 1.7421875,
+      "learning_rate": 9.997929322039716e-06,
+      "loss": 1.4112,
+      "step": 40
+    },
+    {
+      "epoch": 0.014285714285714285,
+      "grad_norm": 1.7109375,
+      "learning_rate": 9.996433111605053e-06,
+      "loss": 1.4083,
+      "step": 50
+    },
+    {
+      "epoch": 0.017142857142857144,
+      "grad_norm": 1.625,
+      "learning_rate": 9.994532735218647e-06,
+      "loss": 1.3948,
+      "step": 60
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 1.6875,
+      "learning_rate": 9.992228346603645e-06,
+      "loss": 1.3753,
+      "step": 70
+    },
+    {
+      "epoch": 0.022857142857142857,
+      "grad_norm": 1.6171875,
+      "learning_rate": 9.98952013216411e-06,
+      "loss": 1.3471,
+      "step": 80
+    },
+    {
+      "epoch": 0.025714285714285714,
+      "grad_norm": 1.703125,
+      "learning_rate": 9.986408310969932e-06,
+      "loss": 1.3213,
+      "step": 90
+    },
+    {
+      "epoch": 0.02857142857142857,
+      "grad_norm": 1.5234375,
+      "learning_rate": 9.982893134739118e-06,
+      "loss": 1.319,
+      "step": 100
+    },
+    {
+      "epoch": 0.03142857142857143,
+      "grad_norm": 1.6328125,
+      "learning_rate": 9.978974887817424e-06,
+      "loss": 1.3487,
+      "step": 110
+    },
+    {
+      "epoch": 0.03428571428571429,
+      "grad_norm": 1.5703125,
+      "learning_rate": 9.97465388715535e-06,
+      "loss": 1.3334,
+      "step": 120
+    },
+    {
+      "epoch": 0.037142857142857144,
+      "grad_norm": 1.6328125,
+      "learning_rate": 9.969930482282518e-06,
+      "loss": 1.286,
+      "step": 130
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 1.5078125,
+      "learning_rate": 9.964805055279374e-06,
+      "loss": 1.2842,
+      "step": 140
+    },
+    {
+      "epoch": 0.04285714285714286,
+      "grad_norm": 1.5703125,
+      "learning_rate": 9.959278020746308e-06,
+      "loss": 1.3026,
+      "step": 150
+    },
+    {
+      "epoch": 0.045714285714285714,
+      "grad_norm": 1.546875,
+      "learning_rate": 9.953349825770093e-06,
+      "loss": 1.3521,
+      "step": 160
+    },
+    {
+      "epoch": 0.04857142857142857,
+      "grad_norm": 1.4921875,
+      "learning_rate": 9.947020949887732e-06,
+      "loss": 1.284,
+      "step": 170
+    },
+    {
+      "epoch": 0.05142857142857143,
+      "grad_norm": 1.5625,
+      "learning_rate": 9.940291905047668e-06,
+      "loss": 1.2834,
+      "step": 180
+    },
+    {
+      "epoch": 0.054285714285714284,
+      "grad_norm": 1.6484375,
+      "learning_rate": 9.933163235568369e-06,
+      "loss": 1.312,
+      "step": 190
+    },
+    {
+      "epoch": 0.05714285714285714,
+      "grad_norm": 1.5,
+      "learning_rate": 9.925635518094295e-06,
+      "loss": 1.2968,
+      "step": 200
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 1.609375,
+      "learning_rate": 9.917709361549257e-06,
+      "loss": 1.3068,
+      "step": 210
+    },
+    {
+      "epoch": 0.06285714285714286,
+      "grad_norm": 1.625,
+      "learning_rate": 9.909385407087164e-06,
+      "loss": 1.2879,
+      "step": 220
+    },
+    {
+      "epoch": 0.06571428571428571,
+      "grad_norm": 1.53125,
+      "learning_rate": 9.900664328040151e-06,
+      "loss": 1.2828,
+      "step": 230
+    },
+    {
+      "epoch": 0.06857142857142857,
+      "grad_norm": 1.5078125,
+      "learning_rate": 9.891546829864115e-06,
+      "loss": 1.2977,
+      "step": 240
+    },
+    {
+      "epoch": 0.07142857142857142,
+      "grad_norm": 1.671875,
+      "learning_rate": 9.882033650081656e-06,
+      "loss": 1.3454,
+      "step": 250
+    },
+    {
+      "epoch": 0.07428571428571429,
+      "grad_norm": 1.4609375,
+      "learning_rate": 9.87212555822241e-06,
+      "loss": 1.2249,
+      "step": 260
+    },
+    {
+      "epoch": 0.07714285714285714,
+      "grad_norm": 1.5390625,
+      "learning_rate": 9.861823355760806e-06,
+      "loss": 1.2945,
+      "step": 270
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 1.4296875,
+      "learning_rate": 9.851127876051236e-06,
+      "loss": 1.3094,
+      "step": 280
+    },
+    {
+      "epoch": 0.08285714285714285,
+      "grad_norm": 1.5625,
+      "learning_rate": 9.840039984260634e-06,
+      "loss": 1.3061,
+      "step": 290
+    },
+    {
+      "epoch": 0.08571428571428572,
+      "grad_norm": 1.484375,
+      "learning_rate": 9.828560577298505e-06,
+      "loss": 1.2716,
+      "step": 300
+    },
+    {
+      "epoch": 0.08857142857142856,
+      "grad_norm": 1.6015625,
+      "learning_rate": 9.816690583744366e-06,
+      "loss": 1.27,
+      "step": 310
+    },
+    {
+      "epoch": 0.09142857142857143,
+      "grad_norm": 1.71875,
+      "learning_rate": 9.804430963772632e-06,
+      "loss": 1.2911,
+      "step": 320
+    },
+    {
+      "epoch": 0.09428571428571429,
+      "grad_norm": 1.484375,
+      "learning_rate": 9.791782709074944e-06,
+      "loss": 1.2916,
+      "step": 330
+    },
+    {
+      "epoch": 0.09714285714285714,
+      "grad_norm": 1.4453125,
+      "learning_rate": 9.778746842779962e-06,
+      "loss": 1.2836,
+      "step": 340
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 2.046875,
+      "learning_rate": 9.765324419370595e-06,
+      "loss": 1.2619,
+      "step": 350
+    },
+    {
+      "epoch": 0.10285714285714286,
+      "grad_norm": 1.390625,
+      "learning_rate": 9.751516524598697e-06,
+      "loss": 1.2661,
+      "step": 360
+    },
+    {
+      "epoch": 0.10571428571428572,
+      "grad_norm": 1.59375,
+      "learning_rate": 9.73732427539725e-06,
+      "loss": 1.2996,
+      "step": 370
+    },
+    {
+      "epoch": 0.10857142857142857,
+      "grad_norm": 1.3515625,
+      "learning_rate": 9.722748819790012e-06,
+      "loss": 1.2307,
+      "step": 380
+    },
+    {
+      "epoch": 0.11142857142857143,
+      "grad_norm": 2.65625,
+      "learning_rate": 9.707791336798642e-06,
+      "loss": 1.3046,
+      "step": 390
+    },
+    {
+      "epoch": 0.11428571428571428,
+      "grad_norm": 1.5390625,
+      "learning_rate": 9.692453036347352e-06,
+      "loss": 1.3182,
+      "step": 400
+    },
+    {
+      "epoch": 0.11714285714285715,
+      "grad_norm": 1.5546875,
+      "learning_rate": 9.676735159165001e-06,
+      "loss": 1.2765,
+      "step": 410
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 1.609375,
+      "learning_rate": 9.660638976684766e-06,
+      "loss": 1.3235,
+      "step": 420
+    },
+    {
+      "epoch": 0.12285714285714286,
+      "grad_norm": 1.65625,
+      "learning_rate": 9.64416579094127e-06,
+      "loss": 1.2747,
+      "step": 430
+    },
+    {
+      "epoch": 0.12571428571428572,
+      "grad_norm": 1.6171875,
+      "learning_rate": 9.627316934465266e-06,
+      "loss": 1.275,
+      "step": 440
+    },
+    {
+      "epoch": 0.12857142857142856,
+      "grad_norm": 1.40625,
+      "learning_rate": 9.610093770175856e-06,
+      "loss": 1.2651,
+      "step": 450
+    },
+    {
+      "epoch": 0.13142857142857142,
+      "grad_norm": 1.6171875,
+      "learning_rate": 9.592497691270234e-06,
+      "loss": 1.2993,
+      "step": 460
+    },
+    {
+      "epoch": 0.13428571428571429,
+      "grad_norm": 1.515625,
+      "learning_rate": 9.57453012111099e-06,
+      "loss": 1.2805,
+      "step": 470
+    },
+    {
+      "epoch": 0.13714285714285715,
+      "grad_norm": 1.59375,
+      "learning_rate": 9.556192513110974e-06,
+      "loss": 1.2978,
+      "step": 480
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 2.171875,
+      "learning_rate": 9.537486350615727e-06,
+      "loss": 1.2951,
+      "step": 490
+    },
+    {
+      "epoch": 0.14285714285714285,
+      "grad_norm": 1.859375,
+      "learning_rate": 9.518413146783502e-06,
+      "loss": 1.275,
+      "step": 500
+    },
+    {
+      "epoch": 0.1457142857142857,
+      "grad_norm": 1.4921875,
+      "learning_rate": 9.498974444462847e-06,
+      "loss": 1.2869,
+      "step": 510
+    },
+    {
+      "epoch": 0.14857142857142858,
+      "grad_norm": 1.34375,
+      "learning_rate": 9.479171816067808e-06,
+      "loss": 1.2968,
+      "step": 520
+    },
+    {
+      "epoch": 0.15142857142857144,
+      "grad_norm": 1.40625,
+      "learning_rate": 9.459006863450745e-06,
+      "loss": 1.2781,
+      "step": 530
+    },
+    {
+      "epoch": 0.15428571428571428,
+      "grad_norm": 1.75,
+      "learning_rate": 9.438481217772744e-06,
+      "loss": 1.238,
+      "step": 540
+    },
+    {
+      "epoch": 0.15714285714285714,
+      "grad_norm": 1.40625,
+      "learning_rate": 9.417596539371677e-06,
+      "loss": 1.2414,
+      "step": 550
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 1.6953125,
+      "learning_rate": 9.396354517627893e-06,
+      "loss": 1.2428,
+      "step": 560
+    },
+    {
+      "epoch": 0.16285714285714287,
+      "grad_norm": 1.4921875,
+      "learning_rate": 9.37475687082757e-06,
+      "loss": 1.2304,
+      "step": 570
+    },
+    {
+      "epoch": 0.1657142857142857,
+      "grad_norm": 1.453125,
+      "learning_rate": 9.35280534602371e-06,
+      "loss": 1.2377,
+      "step": 580
+    },
+    {
+      "epoch": 0.16857142857142857,
+      "grad_norm": 1.6171875,
+      "learning_rate": 9.330501718894822e-06,
+      "loss": 1.2865,
+      "step": 590
+    },
+    {
+      "epoch": 0.17142857142857143,
+      "grad_norm": 1.484375,
+      "learning_rate": 9.307847793601295e-06,
+      "loss": 1.2753,
+      "step": 600
+    },
+    {
+      "epoch": 0.1742857142857143,
+      "grad_norm": 2.078125,
+      "learning_rate": 9.284845402639447e-06,
+      "loss": 1.2317,
+      "step": 610
+    },
+    {
+      "epoch": 0.17714285714285713,
+      "grad_norm": 1.546875,
+      "learning_rate": 9.261496406693291e-06,
+      "loss": 1.2473,
+      "step": 620
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 1.375,
+      "learning_rate": 9.237802694484035e-06,
+      "loss": 1.2379,
+      "step": 630
+    },
+    {
+      "epoch": 0.18285714285714286,
+      "grad_norm": 1.4375,
+      "learning_rate": 9.213766182617291e-06,
+      "loss": 1.2274,
+      "step": 640
+    },
+    {
+      "epoch": 0.18571428571428572,
+      "grad_norm": 1.3828125,
+      "learning_rate": 9.189388815428047e-06,
+      "loss": 1.2372,
+      "step": 650
+    },
+    {
+      "epoch": 0.18857142857142858,
+      "grad_norm": 1.5390625,
+      "learning_rate": 9.164672564823374e-06,
+      "loss": 1.2541,
+      "step": 660
+    },
+    {
+      "epoch": 0.19142857142857142,
+      "grad_norm": 1.828125,
+      "learning_rate": 9.139619430122934e-06,
+      "loss": 1.2972,
+      "step": 670
+    },
+    {
+      "epoch": 0.19428571428571428,
+      "grad_norm": 1.4375,
+      "learning_rate": 9.114231437897245e-06,
+      "loss": 1.2356,
+      "step": 680
+    },
+    {
+      "epoch": 0.19714285714285715,
+      "grad_norm": 1.90625,
+      "learning_rate": 9.088510641803747e-06,
+      "loss": 1.2966,
+      "step": 690
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 1.4140625,
+      "learning_rate": 9.062459122420684e-06,
+      "loss": 1.231,
+      "step": 700
+    },
+    {
+      "epoch": 0.20285714285714285,
+      "grad_norm": 1.453125,
+      "learning_rate": 9.036078987078804e-06,
+      "loss": 1.2295,
+      "step": 710
+    },
+    {
+      "epoch": 0.2057142857142857,
+      "grad_norm": 1.5078125,
+      "learning_rate": 9.009372369690897e-06,
+      "loss": 1.2599,
+      "step": 720
+    },
+    {
+      "epoch": 0.20857142857142857,
+      "grad_norm": 1.546875,
+      "learning_rate": 8.982341430579174e-06,
+      "loss": 1.2406,
+      "step": 730
+    },
+    {
+      "epoch": 0.21142857142857144,
+      "grad_norm": 1.75,
+      "learning_rate": 8.954988356300518e-06,
+      "loss": 1.2162,
+      "step": 740
+    },
+    {
+      "epoch": 0.21428571428571427,
+      "grad_norm": 1.6171875,
+      "learning_rate": 8.927315359469627e-06,
+      "loss": 1.2414,
+      "step": 750
+    },
+    {
+      "epoch": 0.21714285714285714,
+      "grad_norm": 1.2890625,
+      "learning_rate": 8.899324678580005e-06,
+      "loss": 1.2257,
+      "step": 760
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 1.5703125,
+      "learning_rate": 8.871018577822916e-06,
+      "loss": 1.2735,
+      "step": 770
+    },
+    {
+      "epoch": 0.22285714285714286,
+      "grad_norm": 1.421875,
+      "learning_rate": 8.842399346904215e-06,
+      "loss": 1.2487,
+      "step": 780
+    },
+    {
+      "epoch": 0.2257142857142857,
+      "grad_norm": 1.5,
+      "learning_rate": 8.813469300859135e-06,
+      "loss": 1.2696,
+      "step": 790
+    },
+    {
+      "epoch": 0.22857142857142856,
+      "grad_norm": 1.2890625,
+      "learning_rate": 8.784230779865029e-06,
+      "loss": 1.2262,
+      "step": 800
+    },
+    {
+      "epoch": 0.23142857142857143,
+      "grad_norm": 1.59375,
+      "learning_rate": 8.754686149052057e-06,
+      "loss": 1.2514,
+      "step": 810
+    },
+    {
+      "epoch": 0.2342857142857143,
+      "grad_norm": 1.46875,
+      "learning_rate": 8.724837798311883e-06,
+      "loss": 1.2419,
+      "step": 820
+    },
+    {
+      "epoch": 0.23714285714285716,
+      "grad_norm": 1.4453125,
+      "learning_rate": 8.694688142104346e-06,
+      "loss": 1.2941,
+      "step": 830
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 1.4765625,
+      "learning_rate": 8.664239619262151e-06,
+      "loss": 1.2273,
+      "step": 840
+    },
+    {
+      "epoch": 0.24285714285714285,
+      "grad_norm": 1.4296875,
+      "learning_rate": 8.6334946927936e-06,
+      "loss": 1.249,
+      "step": 850
+    },
+    {
+      "epoch": 0.24571428571428572,
+      "grad_norm": 1.4921875,
+      "learning_rate": 8.602455849683343e-06,
+      "loss": 1.2663,
+      "step": 860
+    },
+    {
+      "epoch": 0.24857142857142858,
+      "grad_norm": 1.640625,
+      "learning_rate": 8.571125600691214e-06,
+      "loss": 1.2835,
+      "step": 870
+    },
+    {
+      "epoch": 0.25142857142857145,
+      "grad_norm": 1.6796875,
+      "learning_rate": 8.53950648014913e-06,
+      "loss": 1.2804,
+      "step": 880
+    },
+    {
+      "epoch": 0.2542857142857143,
+      "grad_norm": 1.5,
+      "learning_rate": 8.507601045756085e-06,
+      "loss": 1.2981,
+      "step": 890
+    },
+    {
+      "epoch": 0.2571428571428571,
+      "grad_norm": 1.71875,
+      "learning_rate": 8.475411878371257e-06,
+      "loss": 1.1956,
+      "step": 900
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 1.5625,
+      "learning_rate": 8.442941581805245e-06,
+      "loss": 1.2578,
+      "step": 910
+    },
+    {
+      "epoch": 0.26285714285714284,
+      "grad_norm": 1.46875,
+      "learning_rate": 8.410192782609428e-06,
+      "loss": 1.2073,
+      "step": 920
+    },
+    {
+      "epoch": 0.26571428571428574,
+      "grad_norm": 1.59375,
+      "learning_rate": 8.377168129863526e-06,
+      "loss": 1.2343,
+      "step": 930
+    },
+    {
+      "epoch": 0.26857142857142857,
+      "grad_norm": 1.5,
+      "learning_rate": 8.34387029496129e-06,
+      "loss": 1.2565,
+      "step": 940
+    },
+    {
+      "epoch": 0.2714285714285714,
+      "grad_norm": 2.265625,
+      "learning_rate": 8.310301971394423e-06,
+      "loss": 1.2424,
+      "step": 950
+    },
+    {
+      "epoch": 0.2742857142857143,
+      "grad_norm": 1.546875,
+      "learning_rate": 8.276465874534701e-06,
+      "loss": 1.2834,
+      "step": 960
+    },
+    {
+      "epoch": 0.27714285714285714,
+      "grad_norm": 1.5859375,
+      "learning_rate": 8.242364741414321e-06,
+      "loss": 1.2526,
+      "step": 970
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 1.703125,
+      "learning_rate": 8.208001330504497e-06,
+      "loss": 1.2253,
+      "step": 980
+    },
+    {
+      "epoch": 0.28285714285714286,
+      "grad_norm": 1.6171875,
+      "learning_rate": 8.17337842149233e-06,
+      "loss": 1.2526,
+      "step": 990
+    },
+    {
+      "epoch": 0.2857142857142857,
+      "grad_norm": 1.9609375,
+      "learning_rate": 8.138498815055959e-06,
+      "loss": 1.2918,
+      "step": 1000
+    },
+    {
+      "epoch": 0.2885714285714286,
+      "grad_norm": 1.453125,
+      "learning_rate": 8.103365332638e-06,
+      "loss": 1.2428,
+      "step": 1010
+    },
+    {
+      "epoch": 0.2914285714285714,
+      "grad_norm": 1.3984375,
+      "learning_rate": 8.067980816217336e-06,
+      "loss": 1.2538,
+      "step": 1020
+    },
+    {
+      "epoch": 0.29428571428571426,
+      "grad_norm": 1.5859375,
+      "learning_rate": 8.032348128079204e-06,
+      "loss": 1.253,
+      "step": 1030
+    },
+    {
+      "epoch": 0.29714285714285715,
+      "grad_norm": 1.390625,
+      "learning_rate": 7.996470150583678e-06,
+      "loss": 1.2471,
+      "step": 1040
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 1.4765625,
+      "learning_rate": 7.96034978593251e-06,
+      "loss": 1.2314,
+      "step": 1050
+    },
+    {
+      "epoch": 0.3028571428571429,
+      "grad_norm": 1.4375,
+      "learning_rate": 7.923989955934363e-06,
+      "loss": 1.2425,
+      "step": 1060
+    },
+    {
+      "epoch": 0.3057142857142857,
+      "grad_norm": 1.4296875,
+      "learning_rate": 7.887393601768468e-06,
+      "loss": 1.2446,
+      "step": 1070
+    },
+    {
+      "epoch": 0.30857142857142855,
+      "grad_norm": 1.9921875,
+      "learning_rate": 7.85056368374671e-06,
+      "loss": 1.2594,
+      "step": 1080
+    },
+    {
+      "epoch": 0.31142857142857144,
+      "grad_norm": 1.65625,
+      "learning_rate": 7.813503181074157e-06,
+      "loss": 1.2098,
+      "step": 1090
+    },
+    {
+      "epoch": 0.3142857142857143,
+      "grad_norm": 1.59375,
+      "learning_rate": 7.776215091608087e-06,
+      "loss": 1.2196,
+      "step": 1100
+    },
+    {
+      "epoch": 0.3171428571428571,
+      "grad_norm": 1.4921875,
+      "learning_rate": 7.738702431615464e-06,
+      "loss": 1.2226,
+      "step": 1110
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 1.6484375,
+      "learning_rate": 7.700968235528973e-06,
+      "loss": 1.2617,
+      "step": 1120
+    },
+    {
+      "epoch": 0.32285714285714284,
+      "grad_norm": 1.8828125,
+      "learning_rate": 7.663015555701552e-06,
+      "loss": 1.2693,
+      "step": 1130
+    },
+    {
+      "epoch": 0.32571428571428573,
+      "grad_norm": 2.875,
+      "learning_rate": 7.624847462159479e-06,
+      "loss": 1.258,
+      "step": 1140
+    },
+    {
+      "epoch": 0.32857142857142857,
+      "grad_norm": 1.5,
+      "learning_rate": 7.586467042354044e-06,
+      "loss": 1.24,
+      "step": 1150
+    },
+    {
+      "epoch": 0.3314285714285714,
+      "grad_norm": 1.5625,
+      "learning_rate": 7.547877400911798e-06,
+      "loss": 1.2413,
+      "step": 1160
+    },
+    {
+      "epoch": 0.3342857142857143,
+      "grad_norm": 1.4609375,
+      "learning_rate": 7.509081659383417e-06,
+      "loss": 1.232,
+      "step": 1170
+    },
+    {
+      "epoch": 0.33714285714285713,
+      "grad_norm": 1.4140625,
+      "learning_rate": 7.4700829559912005e-06,
+      "loss": 1.2422,
+      "step": 1180
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 1.4921875,
+      "learning_rate": 7.430884445375212e-06,
+      "loss": 1.2341,
+      "step": 1190
+    },
+    {
+      "epoch": 0.34285714285714286,
+      "grad_norm": 1.59375,
+      "learning_rate": 7.391489298338099e-06,
+      "loss": 1.2236,
+      "step": 1200
+    },
+    {
+      "epoch": 0.3457142857142857,
+      "grad_norm": 1.65625,
+      "learning_rate": 7.351900701588612e-06,
+      "loss": 1.2481,
+      "step": 1210
+    },
+    {
+      "epoch": 0.3485714285714286,
+      "grad_norm": 1.765625,
+      "learning_rate": 7.312121857483816e-06,
+      "loss": 1.2099,
+      "step": 1220
+    },
+    {
+      "epoch": 0.3514285714285714,
+      "grad_norm": 1.53125,
+      "learning_rate": 7.272155983770055e-06,
+      "loss": 1.2585,
+      "step": 1230
+    },
+    {
+      "epoch": 0.35428571428571426,
+      "grad_norm": 1.640625,
+      "learning_rate": 7.232006313322668e-06,
+      "loss": 1.2337,
+      "step": 1240
+    },
+    {
+      "epoch": 0.35714285714285715,
+      "grad_norm": 1.34375,
+      "learning_rate": 7.191676093884479e-06,
+      "loss": 1.2221,
+      "step": 1250
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 2.15625,
+      "learning_rate": 7.151168587803074e-06,
+      "loss": 1.2662,
+      "step": 1260
+    },
+    {
+      "epoch": 0.3628571428571429,
+      "grad_norm": 1.5078125,
+      "learning_rate": 7.110487071766924e-06,
+      "loss": 1.2171,
+      "step": 1270
+    },
+    {
+      "epoch": 0.3657142857142857,
+      "grad_norm": 1.65625,
+      "learning_rate": 7.069634836540311e-06,
+      "loss": 1.221,
+      "step": 1280
+    },
+    {
+      "epoch": 0.36857142857142855,
+      "grad_norm": 1.4375,
+      "learning_rate": 7.0286151866971544e-06,
+      "loss": 1.2431,
+      "step": 1290
+    },
+    {
+      "epoch": 0.37142857142857144,
+      "grad_norm": 1.5546875,
+      "learning_rate": 6.987431440353687e-06,
+      "loss": 1.2335,
+      "step": 1300
+    },
+    {
+      "epoch": 0.3742857142857143,
+      "grad_norm": 1.640625,
+      "learning_rate": 6.946086928900054e-06,
+      "loss": 1.1925,
+      "step": 1310
+    },
+    {
+      "epoch": 0.37714285714285717,
+      "grad_norm": 1.53125,
+      "learning_rate": 6.904584996730839e-06,
+      "loss": 1.2303,
+      "step": 1320
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 1.6875,
+      "learning_rate": 6.862929000974523e-06,
+      "loss": 1.2511,
+      "step": 1330
+    },
+    {
+      "epoch": 0.38285714285714284,
+      "grad_norm": 1.328125,
+      "learning_rate": 6.821122311221932e-06,
+      "loss": 1.2166,
+      "step": 1340
+    },
+    {
+      "epoch": 0.38571428571428573,
+      "grad_norm": 1.4453125,
+      "learning_rate": 6.779168309253663e-06,
+      "loss": 1.2181,
+      "step": 1350
+    },
+    {
+      "epoch": 0.38857142857142857,
+      "grad_norm": 1.53125,
+      "learning_rate": 6.73707038876653e-06,
+      "loss": 1.2358,
+      "step": 1360
+    },
+    {
+      "epoch": 0.3914285714285714,
+      "grad_norm": 1.7578125,
+      "learning_rate": 6.6948319550990485e-06,
+      "loss": 1.2176,
+      "step": 1370
+    },
+    {
+      "epoch": 0.3942857142857143,
+      "grad_norm": 1.3828125,
+      "learning_rate": 6.6524564249559645e-06,
+      "loss": 1.2317,
+      "step": 1380
+    },
+    {
+      "epoch": 0.39714285714285713,
+      "grad_norm": 1.609375,
+      "learning_rate": 6.6099472261318864e-06,
+      "loss": 1.306,
+      "step": 1390
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 1.5703125,
+      "learning_rate": 6.567307797233997e-06,
+      "loss": 1.253,
+      "step": 1400
+    },
+    {
+      "epoch": 0.40285714285714286,
+      "grad_norm": 2.75,
+      "learning_rate": 6.524541587403915e-06,
+      "loss": 1.2285,
+      "step": 1410
+    },
+    {
+      "epoch": 0.4057142857142857,
+      "grad_norm": 1.40625,
+      "learning_rate": 6.481652056038672e-06,
+      "loss": 1.2117,
+      "step": 1420
+    },
+    {
+      "epoch": 0.4085714285714286,
+      "grad_norm": 1.65625,
+      "learning_rate": 6.438642672510894e-06,
+      "loss": 1.2791,
+      "step": 1430
+    },
+    {
+      "epoch": 0.4114285714285714,
+      "grad_norm": 1.9296875,
+      "learning_rate": 6.395516915888158e-06,
+      "loss": 1.2448,
+      "step": 1440
+    },
+    {
+      "epoch": 0.4142857142857143,
+      "grad_norm": 1.328125,
+      "learning_rate": 6.352278274651562e-06,
+      "loss": 1.2291,
+      "step": 1450
+    },
+    {
+      "epoch": 0.41714285714285715,
+      "grad_norm": 1.546875,
+      "learning_rate": 6.308930246413541e-06,
+      "loss": 1.2544,
+      "step": 1460
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 1.7890625,
+      "learning_rate": 6.265476337634942e-06,
+      "loss": 1.2402,
+      "step": 1470
+    },
+    {
+      "epoch": 0.4228571428571429,
+      "grad_norm": 1.4296875,
+      "learning_rate": 6.22192006334139e-06,
+      "loss": 1.2525,
+      "step": 1480
+    },
+    {
+      "epoch": 0.4257142857142857,
+      "grad_norm": 1.7265625,
+      "learning_rate": 6.178264946838942e-06,
+      "loss": 1.2481,
+      "step": 1490
+    },
+    {
+      "epoch": 0.42857142857142855,
+      "grad_norm": 1.484375,
+      "learning_rate": 6.13451451942909e-06,
+      "loss": 1.2281,
+      "step": 1500
+    },
+    {
+      "epoch": 0.43142857142857144,
+      "grad_norm": 1.6015625,
+      "learning_rate": 6.090672320123113e-06,
+      "loss": 1.2064,
+      "step": 1510
+    },
+    {
+      "epoch": 0.4342857142857143,
+      "grad_norm": 1.5546875,
+      "learning_rate": 6.046741895355802e-06,
+      "loss": 1.2248,
+      "step": 1520
+    },
+    {
+      "epoch": 0.43714285714285717,
+      "grad_norm": 1.484375,
+      "learning_rate": 6.00272679869858e-06,
+      "loss": 1.2337,
+      "step": 1530
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 1.53125,
+      "learning_rate": 5.958630590572061e-06,
+      "loss": 1.241,
+      "step": 1540
+    },
+    {
+      "epoch": 0.44285714285714284,
+      "grad_norm": 1.4140625,
+      "learning_rate": 5.914456837958032e-06,
+      "loss": 1.222,
+      "step": 1550
+    },
+    {
+      "epoch": 0.44571428571428573,
+      "grad_norm": 1.3515625,
+      "learning_rate": 5.87020911411093e-06,
+      "loss": 1.2441,
+      "step": 1560
+    },
+    {
+      "epoch": 0.44857142857142857,
+      "grad_norm": 1.6171875,
+      "learning_rate": 5.825890998268784e-06,
+      "loss": 1.2635,
+      "step": 1570
+    },
+    {
+      "epoch": 0.4514285714285714,
+      "grad_norm": 1.5,
+      "learning_rate": 5.781506075363702e-06,
+      "loss": 1.243,
+      "step": 1580
+    },
+    {
+      "epoch": 0.4542857142857143,
+      "grad_norm": 1.5,
+      "learning_rate": 5.737057935731868e-06,
+      "loss": 1.2483,
+      "step": 1590
+    },
+    {
+      "epoch": 0.45714285714285713,
+      "grad_norm": 1.4140625,
+      "learning_rate": 5.692550174823129e-06,
+      "loss": 1.2055,
+      "step": 1600
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 1.5078125,
+      "learning_rate": 5.647986392910149e-06,
+      "loss": 1.2177,
+      "step": 1610
+    },
+    {
+      "epoch": 0.46285714285714286,
+      "grad_norm": 2.015625,
+      "learning_rate": 5.6033701947971805e-06,
+      "loss": 1.2275,
+      "step": 1620
+    },
+    {
+      "epoch": 0.4657142857142857,
+      "grad_norm": 1.6328125,
+      "learning_rate": 5.55870518952847e-06,
+      "loss": 1.2497,
+      "step": 1630
+    },
+    {
+      "epoch": 0.4685714285714286,
+      "grad_norm": 1.328125,
+      "learning_rate": 5.513994990096318e-06,
+      "loss": 1.2057,
+      "step": 1640
+    },
+    {
+      "epoch": 0.4714285714285714,
+      "grad_norm": 2.421875,
+      "learning_rate": 5.469243213148821e-06,
+      "loss": 1.2558,
+      "step": 1650
+    },
+    {
+      "epoch": 0.4742857142857143,
+      "grad_norm": 1.6953125,
+      "learning_rate": 5.424453478697321e-06,
+      "loss": 1.2452,
+      "step": 1660
+    },
+    {
+      "epoch": 0.47714285714285715,
+      "grad_norm": 1.7578125,
+      "learning_rate": 5.379629409823571e-06,
+      "loss": 1.253,
+      "step": 1670
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 1.578125,
+      "learning_rate": 5.334774632386672e-06,
+      "loss": 1.2358,
+      "step": 1680
+    },
+    {
+      "epoch": 0.4828571428571429,
+      "grad_norm": 1.828125,
+      "learning_rate": 5.2898927747297665e-06,
+      "loss": 1.2656,
+      "step": 1690
+    },
+    {
+      "epoch": 0.4857142857142857,
+      "grad_norm": 1.6328125,
+      "learning_rate": 5.244987467386536e-06,
+      "loss": 1.2283,
+      "step": 1700
+    },
+    {
+      "epoch": 0.48857142857142855,
+      "grad_norm": 1.6015625,
+      "learning_rate": 5.200062342787534e-06,
+      "loss": 1.217,
+      "step": 1710
+    },
+    {
+      "epoch": 0.49142857142857144,
+      "grad_norm": 1.4921875,
+      "learning_rate": 5.155121034966346e-06,
+      "loss": 1.2441,
+      "step": 1720
+    },
+    {
+      "epoch": 0.4942857142857143,
+      "grad_norm": 1.6171875,
+      "learning_rate": 5.110167179265636e-06,
+      "loss": 1.2197,
+      "step": 1730
+    },
+    {
+      "epoch": 0.49714285714285716,
+      "grad_norm": 1.515625,
+      "learning_rate": 5.065204412043071e-06,
+      "loss": 1.2542,
+      "step": 1740
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 1.5546875,
+      "learning_rate": 5.0202363703771866e-06,
+      "loss": 1.2047,
+      "step": 1750
+    },
+    {
+      "epoch": 0.5028571428571429,
+      "grad_norm": 1.65625,
+      "learning_rate": 4.975266691773168e-06,
+      "loss": 1.2221,
+      "step": 1760
+    },
+    {
+      "epoch": 0.5057142857142857,
+      "grad_norm": 1.484375,
+      "learning_rate": 4.930299013868616e-06,
+      "loss": 1.2402,
+      "step": 1770
+    },
+    {
+      "epoch": 0.5085714285714286,
+      "grad_norm": 1.453125,
+      "learning_rate": 4.885336974139292e-06,
+      "loss": 1.1988,
+      "step": 1780
+    },
+    {
+      "epoch": 0.5114285714285715,
+      "grad_norm": 1.7421875,
+      "learning_rate": 4.840384209604881e-06,
+      "loss": 1.2282,
+      "step": 1790
+    },
+    {
+      "epoch": 0.5142857142857142,
+      "grad_norm": 1.7265625,
+      "learning_rate": 4.795444356534787e-06,
+      "loss": 1.1957,
+      "step": 1800
+    },
+    {
+      "epoch": 0.5171428571428571,
+      "grad_norm": 1.4453125,
+      "learning_rate": 4.750521050153997e-06,
+      "loss": 1.2562,
+      "step": 1810
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 1.4453125,
+      "learning_rate": 4.7056179243490205e-06,
+      "loss": 1.2339,
+      "step": 1820
+    },
+    {
+      "epoch": 0.5228571428571429,
+      "grad_norm": 1.84375,
+      "learning_rate": 4.660738611373942e-06,
+      "loss": 1.2338,
+      "step": 1830
+    },
+    {
+      "epoch": 0.5257142857142857,
+      "grad_norm": 1.546875,
+      "learning_rate": 4.615886741556604e-06,
+      "loss": 1.2672,
+      "step": 1840
+    },
+    {
+      "epoch": 0.5285714285714286,
+      "grad_norm": 1.828125,
+      "learning_rate": 4.5710659430049475e-06,
+      "loss": 1.2383,
+      "step": 1850
+    },
+    {
+      "epoch": 0.5314285714285715,
+      "grad_norm": 1.53125,
+      "learning_rate": 4.5262798413135345e-06,
+      "loss": 1.2147,
+      "step": 1860
+    },
+    {
+      "epoch": 0.5342857142857143,
+      "grad_norm": 1.375,
+      "learning_rate": 4.4815320592702625e-06,
+      "loss": 1.1947,
+      "step": 1870
+    },
+    {
+      "epoch": 0.5371428571428571,
+      "grad_norm": 1.5390625,
+      "learning_rate": 4.436826216563318e-06,
+      "loss": 1.2432,
+      "step": 1880
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 1.4296875,
+      "learning_rate": 4.392165929488381e-06,
+      "loss": 1.2544,
+      "step": 1890
+    },
+    {
+      "epoch": 0.5428571428571428,
+      "grad_norm": 1.59375,
+      "learning_rate": 4.347554810656093e-06,
+      "loss": 1.2482,
+      "step": 1900
+    },
+    {
+      "epoch": 0.5457142857142857,
+      "grad_norm": 1.6015625,
+      "learning_rate": 4.3029964686998335e-06,
+      "loss": 1.2396,
+      "step": 1910
+    },
+    {
+      "epoch": 0.5485714285714286,
+      "grad_norm": 1.53125,
+      "learning_rate": 4.25849450798381e-06,
+      "loss": 1.2299,
+      "step": 1920
+    },
+    {
+      "epoch": 0.5514285714285714,
+      "grad_norm": 1.515625,
+      "learning_rate": 4.2140525283115054e-06,
+      "loss": 1.2296,
+      "step": 1930
+    },
+    {
+      "epoch": 0.5542857142857143,
+      "grad_norm": 4.0,
+      "learning_rate": 4.169674124634482e-06,
+      "loss": 1.2191,
+      "step": 1940
+    },
+    {
+      "epoch": 0.5571428571428572,
+      "grad_norm": 1.578125,
+      "learning_rate": 4.125362886761577e-06,
+      "loss": 1.2496,
+      "step": 1950
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 1.609375,
+      "learning_rate": 4.08112239906853e-06,
+      "loss": 1.2369,
+      "step": 1960
+    },
+    {
+      "epoch": 0.5628571428571428,
+      "grad_norm": 1.6328125,
+      "learning_rate": 4.036956240208039e-06,
+      "loss": 1.2569,
+      "step": 1970
+    },
+    {
+      "epoch": 0.5657142857142857,
+      "grad_norm": 1.625,
+      "learning_rate": 3.992867982820268e-06,
+      "loss": 1.2248,
+      "step": 1980
+    },
+    {
+      "epoch": 0.5685714285714286,
+      "grad_norm": 1.484375,
+      "learning_rate": 3.948861193243867e-06,
+      "loss": 1.2002,
+      "step": 1990
+    },
+    {
+      "epoch": 0.5714285714285714,
+      "grad_norm": 1.921875,
+      "learning_rate": 3.904939431227477e-06,
+      "loss": 1.23,
+      "step": 2000
+    },
+    {
+      "epoch": 0.5742857142857143,
+      "grad_norm": 1.6796875,
+      "learning_rate": 3.8611062496417895e-06,
+      "loss": 1.2209,
+      "step": 2010
+    },
+    {
+      "epoch": 0.5771428571428572,
+      "grad_norm": 1.5859375,
+      "learning_rate": 3.81736519419214e-06,
+      "loss": 1.271,
+      "step": 2020
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 1.4921875,
+      "learning_rate": 3.7737198031317025e-06,
+      "loss": 1.2471,
+      "step": 2030
+    },
+    {
+      "epoch": 0.5828571428571429,
+      "grad_norm": 1.2109375,
+      "learning_rate": 3.7301736069752683e-06,
+      "loss": 1.2174,
+      "step": 2040
+    },
+    {
+      "epoch": 0.5857142857142857,
+      "grad_norm": 1.7578125,
+      "learning_rate": 3.686730128213668e-06,
+      "loss": 1.2369,
+      "step": 2050
+    },
+    {
+      "epoch": 0.5885714285714285,
+      "grad_norm": 1.5625,
+      "learning_rate": 3.6433928810288292e-06,
+      "loss": 1.2319,
+      "step": 2060
+    },
+    {
+      "epoch": 0.5914285714285714,
+      "grad_norm": 1.9375,
+      "learning_rate": 3.6001653710095115e-06,
+      "loss": 1.3194,
+      "step": 2070
+    },
+    {
+      "epoch": 0.5942857142857143,
+      "grad_norm": 1.65625,
+      "learning_rate": 3.557051094867735e-06,
+      "loss": 1.1943,
+      "step": 2080
+    },
+    {
+      "epoch": 0.5971428571428572,
+      "grad_norm": 1.5546875,
+      "learning_rate": 3.514053540155934e-06,
+      "loss": 1.2375,
+      "step": 2090
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 1.78125,
+      "learning_rate": 3.471176184984839e-06,
+      "loss": 1.2498,
+      "step": 2100
+    },
+    {
+      "epoch": 0.6028571428571429,
+      "grad_norm": 1.4609375,
+      "learning_rate": 3.4284224977421333e-06,
+      "loss": 1.2565,
+      "step": 2110
+    },
+    {
+      "epoch": 0.6057142857142858,
+      "grad_norm": 1.5625,
+      "learning_rate": 3.3857959368118874e-06,
+      "loss": 1.1942,
+      "step": 2120
+    },
+    {
+      "epoch": 0.6085714285714285,
+      "grad_norm": 1.40625,
+      "learning_rate": 3.3432999502948156e-06,
+      "loss": 1.2208,
+      "step": 2130
+    },
+    {
+      "epoch": 0.6114285714285714,
+      "grad_norm": 1.6953125,
+      "learning_rate": 3.300937975729347e-06,
+      "loss": 1.2034,
+      "step": 2140
+    },
+    {
+      "epoch": 0.6142857142857143,
+      "grad_norm": 1.7890625,
+      "learning_rate": 3.2587134398135654e-06,
+      "loss": 1.2471,
+      "step": 2150
+    },
+    {
+      "epoch": 0.6171428571428571,
+      "grad_norm": 1.546875,
+      "learning_rate": 3.216629758128018e-06,
+      "loss": 1.2465,
+      "step": 2160
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 1.453125,
+      "learning_rate": 3.1746903348594303e-06,
+      "loss": 1.204,
+      "step": 2170
+    },
+    {
+      "epoch": 0.6228571428571429,
+      "grad_norm": 1.453125,
+      "learning_rate": 3.1328985625253304e-06,
+      "loss": 1.2282,
+      "step": 2180
+    },
+    {
+      "epoch": 0.6257142857142857,
+      "grad_norm": 1.5859375,
+      "learning_rate": 3.0912578216996316e-06,
+      "loss": 1.2622,
+      "step": 2190
+    },
+    {
+      "epoch": 0.6285714285714286,
+      "grad_norm": 1.71875,
+      "learning_rate": 3.0497714807391776e-06,
+      "loss": 1.2507,
+      "step": 2200
+    },
+    {
+      "epoch": 0.6314285714285715,
+      "grad_norm": 1.5,
+      "learning_rate": 3.008442895511263e-06,
+      "loss": 1.2277,
+      "step": 2210
+    },
+    {
+      "epoch": 0.6342857142857142,
+      "grad_norm": 1.65625,
+      "learning_rate": 2.9672754091221807e-06,
+      "loss": 1.2164,
+      "step": 2220
+    },
+    {
+      "epoch": 0.6371428571428571,
+      "grad_norm": 1.6171875,
+      "learning_rate": 2.926272351646794e-06,
+      "loss": 1.2358,
+      "step": 2230
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 1.5546875,
+      "learning_rate": 2.88543703985917e-06,
+      "loss": 1.2415,
+      "step": 2240
+    },
+    {
+      "epoch": 0.6428571428571429,
+      "grad_norm": 1.53125,
+      "learning_rate": 2.8447727769642697e-06,
+      "loss": 1.2594,
+      "step": 2250
+    },
+    {
+      "epoch": 0.6457142857142857,
+      "grad_norm": 1.78125,
+      "learning_rate": 2.8042828523307562e-06,
+      "loss": 1.2364,
+      "step": 2260
+    },
+    {
+      "epoch": 0.6485714285714286,
+      "grad_norm": 1.5234375,
+      "learning_rate": 2.7639705412249185e-06,
+      "loss": 1.2227,
+      "step": 2270
+    },
+    {
+      "epoch": 0.6514285714285715,
+      "grad_norm": 2.1875,
+      "learning_rate": 2.7238391045457273e-06,
+      "loss": 1.2627,
+      "step": 2280
+    },
+    {
+      "epoch": 0.6542857142857142,
+      "grad_norm": 1.546875,
+      "learning_rate": 2.683891788561055e-06,
+      "loss": 1.2754,
+      "step": 2290
+    },
+    {
+      "epoch": 0.6571428571428571,
+      "grad_norm": 1.3046875,
+      "learning_rate": 2.644131824645092e-06,
+      "loss": 1.2215,
+      "step": 2300
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 1.7890625,
+      "learning_rate": 2.604562429016944e-06,
+      "loss": 1.2072,
+      "step": 2310
+    },
+    {
+      "epoch": 0.6628571428571428,
+      "grad_norm": 1.65625,
+      "learning_rate": 2.5651868024804847e-06,
+      "loss": 1.2342,
+      "step": 2320
+    },
+    {
+      "epoch": 0.6657142857142857,
+      "grad_norm": 1.6171875,
+      "learning_rate": 2.526008130165432e-06,
+      "loss": 1.2508,
+      "step": 2330
+    },
+    {
+      "epoch": 0.6685714285714286,
+      "grad_norm": 1.515625,
+      "learning_rate": 2.4870295812696924e-06,
+      "loss": 1.2335,
+      "step": 2340
+    },
+    {
+      "epoch": 0.6714285714285714,
+      "grad_norm": 1.4921875,
+      "learning_rate": 2.4482543088030188e-06,
+      "loss": 1.2186,
+      "step": 2350
+    },
+    {
+      "epoch": 0.6742857142857143,
+      "grad_norm": 1.453125,
+      "learning_rate": 2.4096854493319476e-06,
+      "loss": 1.2424,
+      "step": 2360
+    },
+    {
+      "epoch": 0.6771428571428572,
+      "grad_norm": 1.6015625,
+      "learning_rate": 2.3713261227260815e-06,
+      "loss": 1.2232,
+      "step": 2370
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 1.5546875,
+      "learning_rate": 2.333179431905726e-06,
+      "loss": 1.1973,
+      "step": 2380
+    },
+    {
+      "epoch": 0.6828571428571428,
+      "grad_norm": 1.671875,
+      "learning_rate": 2.295248462590883e-06,
+      "loss": 1.2461,
+      "step": 2390
+    },
+    {
+      "epoch": 0.6857142857142857,
+      "grad_norm": 1.40625,
+      "learning_rate": 2.257536283051651e-06,
+      "loss": 1.17,
+      "step": 2400
+    },
+    {
+      "epoch": 0.6885714285714286,
+      "grad_norm": 2.03125,
+      "learning_rate": 2.2200459438600296e-06,
+      "loss": 1.2275,
+      "step": 2410
+    },
+    {
+      "epoch": 0.6914285714285714,
+      "grad_norm": 1.5703125,
+      "learning_rate": 2.1827804776431478e-06,
+      "loss": 1.2649,
+      "step": 2420
+    },
+    {
+      "epoch": 0.6942857142857143,
+      "grad_norm": 1.5703125,
+      "learning_rate": 2.145742898837964e-06,
+      "loss": 1.2343,
+      "step": 2430
+    },
+    {
+      "epoch": 0.6971428571428572,
+      "grad_norm": 1.5,
+      "learning_rate": 2.1089362034474176e-06,
+      "loss": 1.2209,
+      "step": 2440
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 1.5859375,
+      "learning_rate": 2.07236336879808e-06,
+      "loss": 1.2363,
+      "step": 2450
+    },
+    {
+      "epoch": 0.7028571428571428,
+      "grad_norm": 1.640625,
+      "learning_rate": 2.0360273532993195e-06,
+      "loss": 1.2301,
+      "step": 2460
+    },
+    {
+      "epoch": 0.7057142857142857,
+      "grad_norm": 1.4140625,
+      "learning_rate": 1.9999310962039937e-06,
+      "loss": 1.2704,
+      "step": 2470
+    },
+    {
+      "epoch": 0.7085714285714285,
+      "grad_norm": 1.453125,
+      "learning_rate": 1.964077517370681e-06,
+      "loss": 1.1998,
+      "step": 2480
+    },
+    {
+      "epoch": 0.7114285714285714,
+      "grad_norm": 1.671875,
+      "learning_rate": 1.9284695170275054e-06,
+      "loss": 1.2305,
+      "step": 2490
+    },
+    {
+      "epoch": 0.7142857142857143,
+      "grad_norm": 1.59375,
+      "learning_rate": 1.8931099755375203e-06,
+      "loss": 1.21,
+      "step": 2500
+    },
+    {
+      "epoch": 0.7171428571428572,
+      "grad_norm": 1.4609375,
+      "learning_rate": 1.858001753165724e-06,
+      "loss": 1.2015,
+      "step": 2510
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 1.4375,
+      "learning_rate": 1.823147689847687e-06,
+      "loss": 1.2115,
+      "step": 2520
+    },
+    {
+      "epoch": 0.7228571428571429,
+      "grad_norm": 2.015625,
+      "learning_rate": 1.7885506049598201e-06,
+      "loss": 1.2317,
+      "step": 2530
+    },
+    {
+      "epoch": 0.7257142857142858,
+      "grad_norm": 2.265625,
+      "learning_rate": 1.7542132970913251e-06,
+      "loss": 1.2063,
+      "step": 2540
+    },
+    {
+      "epoch": 0.7285714285714285,
+      "grad_norm": 1.5,
+      "learning_rate": 1.720138543817807e-06,
+      "loss": 1.2621,
+      "step": 2550
+    },
+    {
+      "epoch": 0.7314285714285714,
+      "grad_norm": 1.578125,
+      "learning_rate": 1.6863291014765893e-06,
+      "loss": 1.2355,
+      "step": 2560
+    },
+    {
+      "epoch": 0.7342857142857143,
+      "grad_norm": 1.59375,
+      "learning_rate": 1.6527877049437624e-06,
+      "loss": 1.2525,
+      "step": 2570
+    },
+    {
+      "epoch": 0.7371428571428571,
+      "grad_norm": 7.28125,
+      "learning_rate": 1.6195170674129469e-06,
+      "loss": 1.2434,
+      "step": 2580
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 1.6328125,
+      "learning_rate": 1.586519880175827e-06,
+      "loss": 1.2228,
+      "step": 2590
+    },
+    {
+      "epoch": 0.7428571428571429,
+      "grad_norm": 1.3203125,
+      "learning_rate": 1.5537988124044495e-06,
+      "loss": 1.2388,
+      "step": 2600
+    },
+    {
+      "epoch": 0.7457142857142857,
+      "grad_norm": 1.4609375,
+      "learning_rate": 1.5213565109353045e-06,
+      "loss": 1.2369,
+      "step": 2610
+    },
+    {
+      "epoch": 0.7485714285714286,
+      "grad_norm": 4.09375,
+      "learning_rate": 1.489195600055232e-06,
+      "loss": 1.2123,
+      "step": 2620
+    },
+    {
+      "epoch": 0.7514285714285714,
+      "grad_norm": 1.5234375,
+      "learning_rate": 1.4573186812891344e-06,
+      "loss": 1.2231,
+      "step": 2630
+    },
+    {
+      "epoch": 0.7542857142857143,
+      "grad_norm": 1.4765625,
+      "learning_rate": 1.4257283331895316e-06,
+      "loss": 1.2269,
+      "step": 2640
+    },
+    {
+      "epoch": 0.7571428571428571,
+      "grad_norm": 1.5,
+      "learning_rate": 1.3944271111279938e-06,
+      "loss": 1.2389,
+      "step": 2650
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 1.359375,
+      "learning_rate": 1.3634175470884192e-06,
+      "loss": 1.2535,
+      "step": 2660
+    },
+    {
+      "epoch": 0.7628571428571429,
+      "grad_norm": 1.5703125,
+      "learning_rate": 1.3327021494622327e-06,
+      "loss": 1.2166,
+      "step": 2670
+    },
+    {
+      "epoch": 0.7657142857142857,
+      "grad_norm": 1.46875,
+      "learning_rate": 1.3022834028454746e-06,
+      "loss": 1.2672,
+      "step": 2680
+    },
+    {
+      "epoch": 0.7685714285714286,
+      "grad_norm": 1.6640625,
+      "learning_rate": 1.2721637678378135e-06,
+      "loss": 1.2109,
+      "step": 2690
+    },
+    {
+      "epoch": 0.7714285714285715,
+      "grad_norm": 1.5859375,
+      "learning_rate": 1.242345680843517e-06,
+      "loss": 1.2273,
+      "step": 2700
+    },
+    {
+      "epoch": 0.7742857142857142,
+      "grad_norm": 1.4453125,
+      "learning_rate": 1.2128315538743646e-06,
+      "loss": 1.2322,
+      "step": 2710
+    },
+    {
+      "epoch": 0.7771428571428571,
+      "grad_norm": 1.4375,
+      "learning_rate": 1.183623774354531e-06,
+      "loss": 1.2197,
+      "step": 2720
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 1.890625,
+      "learning_rate": 1.1547247049274767e-06,
+      "loss": 1.2347,
+      "step": 2730
+    },
+    {
+      "epoch": 0.7828571428571428,
+      "grad_norm": 1.640625,
+      "learning_rate": 1.1261366832648258e-06,
+      "loss": 1.2782,
+      "step": 2740
+    },
+    {
+      "epoch": 0.7857142857142857,
+      "grad_norm": 1.5859375,
+      "learning_rate": 1.0978620218772655e-06,
+      "loss": 1.222,
+      "step": 2750
+    },
+    {
+      "epoch": 0.7885714285714286,
+      "grad_norm": 1.625,
+      "learning_rate": 1.0699030079274957e-06,
+      "loss": 1.2103,
+      "step": 2760
+    },
+    {
+      "epoch": 0.7914285714285715,
+      "grad_norm": 1.6875,
+      "learning_rate": 1.0422619030452063e-06,
+      "loss": 1.2241,
+      "step": 2770
+    },
+    {
+      "epoch": 0.7942857142857143,
+      "grad_norm": 1.546875,
+      "learning_rate": 1.0149409431441421e-06,
+      "loss": 1.24,
+      "step": 2780
+    },
+    {
+      "epoch": 0.7971428571428572,
+      "grad_norm": 1.765625,
+      "learning_rate": 9.879423382412328e-07,
+      "loss": 1.2566,
+      "step": 2790
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 1.671875,
+      "learning_rate": 9.612682722778206e-07,
+      "loss": 1.2916,
+      "step": 2800
+    },
+    {
+      "epoch": 0.8028571428571428,
+      "grad_norm": 1.40625,
+      "learning_rate": 9.349209029430051e-07,
+      "loss": 1.212,
+      "step": 2810
+    },
+    {
+      "epoch": 0.8057142857142857,
+      "grad_norm": 1.3828125,
+      "learning_rate": 9.089023614991032e-07,
+      "loss": 1.2317,
+      "step": 2820
+    },
+    {
+      "epoch": 0.8085714285714286,
+      "grad_norm": 1.5703125,
+      "learning_rate": 8.83214752609246e-07,
+      "loss": 1.2229,
+      "step": 2830
+    },
+    {
+      "epoch": 0.8114285714285714,
+      "grad_norm": 1.609375,
+      "learning_rate": 8.578601541671366e-07,
+      "loss": 1.2755,
+      "step": 2840
+    },
+    {
+      "epoch": 0.8142857142857143,
+      "grad_norm": 1.4140625,
+      "learning_rate": 8.328406171289621e-07,
+      "loss": 1.2019,
+      "step": 2850
+    },
+    {
+      "epoch": 0.8171428571428572,
+      "grad_norm": 1.7734375,
+      "learning_rate": 8.081581653474945e-07,
+      "loss": 1.1962,
+      "step": 2860
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 1.78125,
+      "learning_rate": 7.83814795408378e-07,
+      "loss": 1.2464,
+      "step": 2870
+    },
+    {
+      "epoch": 0.8228571428571428,
+      "grad_norm": 1.609375,
+      "learning_rate": 7.598124764686182e-07,
+      "loss": 1.1909,
+      "step": 2880
+    },
+    {
+      "epoch": 0.8257142857142857,
+      "grad_norm": 1.53125,
+      "learning_rate": 7.361531500973052e-07,
+      "loss": 1.2326,
+      "step": 2890
+    },
+    {
+      "epoch": 0.8285714285714286,
+      "grad_norm": 1.6796875,
+      "learning_rate": 7.128387301185502e-07,
+      "loss": 1.2443,
+      "step": 2900
+    },
+    {
+      "epoch": 0.8314285714285714,
+      "grad_norm": 1.4296875,
+      "learning_rate": 6.898711024566762e-07,
+      "loss": 1.2719,
+      "step": 2910
+    },
+    {
+      "epoch": 0.8342857142857143,
+      "grad_norm": 1.515625,
+      "learning_rate": 6.672521249836689e-07,
+      "loss": 1.2577,
+      "step": 2920
+    },
+    {
+      "epoch": 0.8371428571428572,
+      "grad_norm": 1.7109375,
+      "learning_rate": 6.44983627368882e-07,
+      "loss": 1.2606,
+      "step": 2930
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.6875,
+      "learning_rate": 6.230674109310436e-07,
+      "loss": 1.2358,
+      "step": 2940
+    },
+    {
+      "epoch": 0.8428571428571429,
+      "grad_norm": 1.6875,
+      "learning_rate": 6.015052484925426e-07,
+      "loss": 1.2674,
+      "step": 2950
+    },
+    {
+      "epoch": 0.8457142857142858,
+      "grad_norm": 1.671875,
+      "learning_rate": 5.802988842360169e-07,
+      "loss": 1.2425,
+      "step": 2960
+    },
+    {
+      "epoch": 0.8485714285714285,
+      "grad_norm": 1.546875,
+      "learning_rate": 5.594500335632785e-07,
+      "loss": 1.2456,
+      "step": 2970
+    },
+    {
+      "epoch": 0.8514285714285714,
+      "grad_norm": 1.578125,
+      "learning_rate": 5.389603829565393e-07,
+      "loss": 1.2576,
+      "step": 2980
+    },
+    {
+      "epoch": 0.8542857142857143,
+      "grad_norm": 1.65625,
+      "learning_rate": 5.188315898419971e-07,
+      "loss": 1.2568,
+      "step": 2990
+    },
+    {
+      "epoch": 0.8571428571428571,
+      "grad_norm": 1.6484375,
+      "learning_rate": 4.990652824557651e-07,
+      "loss": 1.2363,
+      "step": 3000
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 1.46875,
+      "learning_rate": 4.796630597121616e-07,
+      "loss": 1.2239,
+      "step": 3010
+    },
+    {
+      "epoch": 0.8628571428571429,
+      "grad_norm": 1.5859375,
+      "learning_rate": 4.6062649107436906e-07,
+      "loss": 1.265,
+      "step": 3020
+    },
+    {
+      "epoch": 0.8657142857142858,
+      "grad_norm": 1.5,
+      "learning_rate": 4.4195711642748497e-07,
+      "loss": 1.2642,
+      "step": 3030
+    },
+    {
+      "epoch": 0.8685714285714285,
+      "grad_norm": 1.5859375,
+      "learning_rate": 4.2365644595395163e-07,
+      "loss": 1.2409,
+      "step": 3040
+    },
+    {
+      "epoch": 0.8714285714285714,
+      "grad_norm": 1.7890625,
+      "learning_rate": 4.057259600114072e-07,
+      "loss": 1.187,
+      "step": 3050
+    },
+    {
+      "epoch": 0.8742857142857143,
+      "grad_norm": 1.6640625,
+      "learning_rate": 3.8816710901292467e-07,
+      "loss": 1.2412,
+      "step": 3060
+    },
+    {
+      "epoch": 0.8771428571428571,
+      "grad_norm": 1.625,
+      "learning_rate": 3.709813133096957e-07,
+      "loss": 1.2396,
+      "step": 3070
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.5859375,
+      "learning_rate": 3.5416996307613526e-07,
+      "loss": 1.245,
+      "step": 3080
+    },
+    {
+      "epoch": 0.8828571428571429,
+      "grad_norm": 1.3984375,
+      "learning_rate": 3.377344181974301e-07,
+      "loss": 1.1911,
+      "step": 3090
+    },
+    {
+      "epoch": 0.8857142857142857,
+      "grad_norm": 1.6484375,
+      "learning_rate": 3.2167600815953106e-07,
+      "loss": 1.2691,
+      "step": 3100
+    },
+    {
+      "epoch": 0.8885714285714286,
+      "grad_norm": 1.8203125,
+      "learning_rate": 3.059960319416183e-07,
+      "loss": 1.2316,
+      "step": 3110
+    },
+    {
+      "epoch": 0.8914285714285715,
+      "grad_norm": 1.6171875,
+      "learning_rate": 2.90695757911017e-07,
+      "loss": 1.2343,
+      "step": 3120
+    },
+    {
+      "epoch": 0.8942857142857142,
+      "grad_norm": 1.4765625,
+      "learning_rate": 2.7577642372060676e-07,
+      "loss": 1.2145,
+      "step": 3130
+    },
+    {
+      "epoch": 0.8971428571428571,
+      "grad_norm": 1.5859375,
+      "learning_rate": 2.6123923620869797e-07,
+      "loss": 1.234,
+      "step": 3140
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 1.3515625,
+      "learning_rate": 2.4708537130141484e-07,
+      "loss": 1.2177,
+      "step": 3150
+    },
+    {
+      "epoch": 0.9028571428571428,
+      "grad_norm": 1.7734375,
+      "learning_rate": 2.3331597391757444e-07,
+      "loss": 1.2631,
+      "step": 3160
+    },
+    {
+      "epoch": 0.9057142857142857,
+      "grad_norm": 1.609375,
+      "learning_rate": 2.1993215787606937e-07,
+      "loss": 1.2564,
+      "step": 3170
+    },
+    {
+      "epoch": 0.9085714285714286,
+      "grad_norm": 1.546875,
+      "learning_rate": 2.0693500580577232e-07,
+      "loss": 1.2415,
+      "step": 3180
+    },
+    {
+      "epoch": 0.9114285714285715,
+      "grad_norm": 1.5390625,
+      "learning_rate": 1.9432556905796153e-07,
+      "loss": 1.2711,
+      "step": 3190
+    },
+    {
+      "epoch": 0.9142857142857143,
+      "grad_norm": 1.5390625,
+      "learning_rate": 1.82104867621275e-07,
+      "loss": 1.2392,
+      "step": 3200
+    },
+    {
+      "epoch": 0.9171428571428571,
+      "grad_norm": 2.015625,
+      "learning_rate": 1.7027389003920315e-07,
+      "loss": 1.2394,
+      "step": 3210
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 1.5625,
+      "learning_rate": 1.5883359333012438e-07,
+      "loss": 1.2219,
+      "step": 3220
+    },
+    {
+      "epoch": 0.9228571428571428,
+      "grad_norm": 1.4609375,
+      "learning_rate": 1.4778490290988934e-07,
+      "loss": 1.2376,
+      "step": 3230
+    },
+    {
+      "epoch": 0.9257142857142857,
+      "grad_norm": 1.6015625,
+      "learning_rate": 1.3712871251697012e-07,
+      "loss": 1.2682,
+      "step": 3240
+    },
+    {
+      "epoch": 0.9285714285714286,
+      "grad_norm": 1.6796875,
+      "learning_rate": 1.2686588414015544e-07,
+      "loss": 1.2103,
+      "step": 3250
+    },
+    {
+      "epoch": 0.9314285714285714,
+      "grad_norm": 1.7109375,
+      "learning_rate": 1.169972479488285e-07,
+      "loss": 1.2426,
+      "step": 3260
+    },
+    {
+      "epoch": 0.9342857142857143,
+      "grad_norm": 1.6796875,
+      "learning_rate": 1.0752360222581471e-07,
+      "loss": 1.2023,
+      "step": 3270
+    },
+    {
+      "epoch": 0.9371428571428572,
+      "grad_norm": 2.203125,
+      "learning_rate": 9.84457133028055e-08,
+      "loss": 1.2508,
+      "step": 3280
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 1.4765625,
+      "learning_rate": 8.976431549837239e-08,
+      "loss": 1.2167,
+      "step": 3290
+    },
+    {
+      "epoch": 0.9428571428571428,
+      "grad_norm": 1.6953125,
+      "learning_rate": 8.148011105856168e-08,
+      "loss": 1.2454,
+      "step": 3300
+    },
+    {
+      "epoch": 0.9457142857142857,
+      "grad_norm": 1.734375,
+      "learning_rate": 7.359377010009383e-08,
+      "loss": 1.2451,
+      "step": 3310
+    },
+    {
+      "epoch": 0.9485714285714286,
+      "grad_norm": 1.4296875,
+      "learning_rate": 6.610593055615733e-08,
+      "loss": 1.2328,
+      "step": 3320
+    },
+    {
+      "epoch": 0.9514285714285714,
+      "grad_norm": 1.84375,
+      "learning_rate": 5.9017198124801664e-08,
+      "loss": 1.2635,
+      "step": 3330
+    },
+    {
+      "epoch": 0.9542857142857143,
+      "grad_norm": 1.5390625,
+      "learning_rate": 5.232814621994597e-08,
+      "loss": 1.2532,
+      "step": 3340
+    },
+    {
+      "epoch": 0.9571428571428572,
+      "grad_norm": 1.671875,
+      "learning_rate": 4.603931592499278e-08,
+      "loss": 1.2491,
+      "step": 3350
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 1.5703125,
+      "learning_rate": 4.015121594906024e-08,
+      "loss": 1.2173,
+      "step": 3360
+    },
+    {
+      "epoch": 0.9628571428571429,
+      "grad_norm": 1.4140625,
+      "learning_rate": 3.4664322585831745e-08,
+      "loss": 1.2187,
+      "step": 3370
+    },
+    {
+      "epoch": 0.9657142857142857,
+      "grad_norm": 1.6875,
+      "learning_rate": 2.9579079675028377e-08,
+      "loss": 1.234,
+      "step": 3380
+    },
+    {
+      "epoch": 0.9685714285714285,
+      "grad_norm": 1.609375,
+      "learning_rate": 2.4895898566505982e-08,
+      "loss": 1.2441,
+      "step": 3390
+    },
+    {
+      "epoch": 0.9714285714285714,
+      "grad_norm": 1.4375,
+      "learning_rate": 2.0615158086981225e-08,
+      "loss": 1.2438,
+      "step": 3400
+    },
+    {
+      "epoch": 0.9742857142857143,
+      "grad_norm": 1.46875,
+      "learning_rate": 1.6737204509387206e-08,
+      "loss": 1.1841,
+      "step": 3410
+    },
+    {
+      "epoch": 0.9771428571428571,
+      "grad_norm": 1.5703125,
+      "learning_rate": 1.3262351524864215e-08,
+      "loss": 1.2387,
+      "step": 3420
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 1.921875,
+      "learning_rate": 1.0190880217383348e-08,
+      "loss": 1.2228,
+      "step": 3430
+    },
+    {
+      "epoch": 0.9828571428571429,
+      "grad_norm": 1.4375,
+      "learning_rate": 7.523039041012481e-09,
+      "loss": 1.2125,
+      "step": 3440
+    },
+    {
+      "epoch": 0.9857142857142858,
+      "grad_norm": 1.59375,
+      "learning_rate": 5.259043799813457e-09,
+      "loss": 1.2386,
+      "step": 3450
+    },
+    {
+      "epoch": 0.9885714285714285,
+      "grad_norm": 1.546875,
+      "learning_rate": 3.3990776303910457e-09,
+      "loss": 1.2625,
+      "step": 3460
+    },
+    {
+      "epoch": 0.9914285714285714,
+      "grad_norm": 1.5859375,
+      "learning_rate": 1.9432909870747972e-09,
+      "loss": 1.2377,
+      "step": 3470
+    },
+    {
+      "epoch": 0.9942857142857143,
+      "grad_norm": 1.7421875,
+      "learning_rate": 8.918016297515541e-10,
+      "loss": 1.2364,
+      "step": 3480
+    },
+    {
+      "epoch": 0.9971428571428571,
+      "grad_norm": 1.625,
+      "learning_rate": 2.4469461433751507e-10,
+      "loss": 1.2174,
+      "step": 3490
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 1.546875,
+      "learning_rate": 2.02228589873954e-12,
+      "loss": 1.2482,
+      "step": 3500
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 3500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 9223372036854775807,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.412338023206486e+19,
+  "train_batch_size": 28,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c788dc8de97ffded3503dc2a768968b009c430b2014b30f1894637b8fe26fe88
+size 5905

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff